From a1968edd216617786da41aa9d5fa3c6d45ef6508 Mon Sep 17 00:00:00 2001
From: sshugsc <sshu@bcgsc.ca>
Date: Thu, 29 Jan 2026 12:27:54 -0800
Subject: [PATCH 01/64] checking for project membership before creating the
 report

---
 pori_python/ipr/connection.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index dca4687d..cc126c4a 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -107,6 +107,10 @@ def upload_report(
 
             projects = self.get("project")
             project_names = [item["name"] for item in projects]
+            project_users = {
+                item["name"]: [user["username"] for user in item.get("users", [])]
+                for item in projects
+            }
 
             # if project is not exist, create one
             if content["project"] not in project_names:
@@ -118,6 +122,9 @@ def upload_report(
                 except Exception as err:
                     raise Exception(f"Project creation failed due to {err}")
 
+            if self.username not in project_users[content["project"]]:
+                raise Exception(f"User have no permission to create report in project {content['project']}")
+            
             if ignore_extra_fields:
                 initial_result = self.post("reports-async?ignore_extra_fields=true", content)
             else:

From be3add72f8502b74160e2bd5cecc22d0b1ab8bd1 Mon Sep 17 00:00:00 2001
From: sshugsc <sshu@bcgsc.ca>
Date: Fri, 30 Jan 2026 12:01:43 -0800
Subject: [PATCH 02/64] lint

---
 pori_python/ipr/connection.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index cc126c4a..1cfd260a 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -123,8 +123,10 @@ def upload_report(
                     raise Exception(f"Project creation failed due to {err}")
 
             if self.username not in project_users[content["project"]]:
-                raise Exception(f"User have no permission to create report in project {content['project']}")
-            
+                raise Exception(
+                    f"User have no permission to create report in project {content['project']}"
+                )
+
             if ignore_extra_fields:
                 initial_result = self.post("reports-async?ignore_extra_fields=true", content)
             else:

From d329813b13a060800b4fb93eb9f84f10e57a8cf1 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 23 Feb 2026 13:42:28 -0800
Subject: [PATCH 03/64] add flag field to input types

---
 pori_python/ipr/inputs.py | 9 +++++++--
 pori_python/types.py      | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index 01976603..c2f464c0 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -59,6 +59,7 @@
     'comments',
     'library',
     'germline',
+    'flags'
 ]
 
 SMALL_MUT_REQ = ['gene', 'proteinChange']
@@ -97,6 +98,7 @@
     'tumourRefCount',
     'tumourRefCopies',
     'zygosity',
+    'flags'
 ]
 
 EXP_REQ = ['gene', 'kbCategory']
@@ -129,6 +131,7 @@
     'rnaReads',
     'rpkm',
     'tpm',
+    'flags'
 ]
 
 SV_REQ = [
@@ -161,12 +164,13 @@
     'tumourDepth',
     'germline',
     'mavis_product_id',
+    'flags'
 ]
 
 SIGV_REQ = ['signatureName', 'variantTypeName']
 SIGV_COSMIC = ['signature']  # 1st element used as signatureName key
 SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
-SIGV_OPTIONAL = ['displayName']
+SIGV_OPTIONAL = ['displayName', 'flags']
 SIGV_KEY = SIGV_REQ[:]
 
 
@@ -277,6 +281,7 @@ def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]:
         return tuple(['small mutation'] + key_vals)
 
     result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key)
+
     if not result:
         return []
 
@@ -330,11 +335,11 @@ def preprocess_expression_variants(rows: Iterable[Dict]) -> List[IprExprVariant]
     Validate the input rows contain the minimum required fields and
     generate any default values where possible
     """
-
     def row_key(row: Dict) -> Tuple[str, ...]:
         return tuple(['expression'] + [row[key] for key in EXP_KEY])
 
     variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key)
+
     result = [cast(IprExprVariant, var) for var in variants]
     float_columns = [
         col
diff --git a/pori_python/types.py b/pori_python/types.py
index dd1ab7e5..3840cfc3 100644
--- a/pori_python/types.py
+++ b/pori_python/types.py
@@ -134,11 +134,12 @@ def __hash__(self):
 
 
 class IprVariantBase(TypedDict):
-    """Required properties of all variants for IPR."""
+    """Required or possible properties of all variants for IPR."""
 
     key: str
     variantType: str
     variant: str
+    flags: Optional[List[str]]
 
 
 class IprGeneVariant(IprVariantBase):

From 4859c2096ff26e98ea6684b1a2e949c85b0f0801 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 24 Feb 2026 14:52:47 -0800
Subject: [PATCH 04/64] add func to create observed vars section

---
 pori_python/ipr/ipr.py  | 36 ++++++++++++++++++++++++
 pori_python/ipr/main.py | 61 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index 06d9efbd..01f9a554 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -728,3 +728,39 @@ def get_kb_disease_matches(
         raise ValueError(msg)
 
     return disease_matches
+
+
+def get_variant_flags(variant_sources):
+    def ensure_str_list(val):
+        if isinstance(val, str):
+            return [val]
+        if isinstance(val, list):
+            if not all(isinstance(item, str) for item in val):
+                raise TypeError("All items in flags must be strings")
+            return val
+        raise TypeError(f"Unexpected type in flags field: {type(val).__name__}")
+
+    flags = []
+
+    for item in variant_sources:
+        raw_flags = item.get('flags')
+
+        if not raw_flags:  # skips None and ''
+            continue
+
+        flags.append({
+            'variant': item['key'],
+            'variantType': item['variantType'],
+            'flags': [f for f in ensure_str_list(raw_flags) if f]
+        })
+        item.pop('flags', None)  # remove after extraction
+
+        return flags
+
+    observed_vars_section = [
+        flag
+        for variants in variant_sources
+        for flag in extract_flags(variants)
+    ]
+
+    return observed_vars_section
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index cbb7c128..5980c482 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -46,6 +46,7 @@
     get_kb_disease_matches,
     get_kb_matches_sections,
     select_expression_plots,
+    get_variant_flags
 )
 from .summary import auto_analyst_comments, get_ipr_analyst_comments
 from .therapeutic_options import create_therapeutic_options
@@ -234,7 +235,7 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict
         for key, count in removed_keys.items():
             logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.")
 
-    drop_columns = ['variant', 'variantType', 'histogramImage']
+    drop_columns = ['variant', 'variantType', 'histogramImage', 'flags']
     # DEVSU-2034 - use a 'displayName'
     VARIANT_LIST_KEYS = [
         'expressionVariants',
@@ -281,7 +282,6 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict
 
     # Removing cosmicSignatures. Temporary
     upload_content.pop('cosmicSignatures', None)
-
     return upload_content
 
 
@@ -410,6 +410,7 @@ def ipr_report(
     expression_variants: List[IprExprVariant] = preprocess_expression_variants(
         content.get('expressionVariants', [])
     )
+
     # Additional checks
     if expression_variants:
         check_comparators(content, expression_variants)
@@ -527,10 +528,58 @@ def ipr_report(
         gkb_matches, all_variants, kb_matched_sections['kbMatches']
     )
 
+    if True:
+
+        def extract_flags(variant_list):
+            # convert item to list of str, if it's just a str
+            def ensure_list(val):
+                if isinstance(val, str):
+                    return [val]
+                if not isinstance(val, list):
+                    raise ValueError('Unexpected type in flags field', val, {type(val).__name__})
+                return val
+
+            # extract only the relevant fields for creating an observed variant annotation record
+            flags = [
+                {'variant': item['key'],
+                'variantType': item['variantType'],
+                'flags': ensure_list(item['flags'])
+                }
+                for item in variant_list if item['flags'] is not None and item['flags'] != ''
+                ]
+            _ = [item.pop('flags', '') for item in variant_list]
+
+            return flags
+
+        observed_vars_section = []
+        for varlist in [small_mutations, copy_variants, expression_variants]:
+            flags = extract_flags([item for item in varlist if item['gene'] in genes_with_variants])
+            observed_vars_section.extend(flags)
+        observed_vars_section.extend(extract_flags(signature_variants))
+        observed_vars_section.extend(extract_flags(filter_structural_variants(
+                        structural_variants, gkb_matches, gene_information
+                    )))
+
+    if False:
+        variant_sources = [
+            v
+            for source in [
+                [v for v in small_mutations if v['gene'] in genes_with_variants],
+                [v for v in copy_variants if v['gene'] in genes_with_variants],
+                [v for v in expression_variants if v['gene'] in genes_with_variants],
+                signature_variants,
+                filter_structural_variants(structural_variants, gkb_matches, gene_information),
+            ]
+            for v in source
+        ]
+
+        observed_vars_section = get_variant_flags(variant_sources)
+
     # OUTPUT CONTENT
     # thread safe deep-copy the original content
     output = json.loads(json.dumps(content))
     output.update(kb_matched_sections)
+
     output.update(
         {
             'copyVariants': [
@@ -550,15 +599,20 @@ def ipr_report(
                 for s in filter_structural_variants(
                     structural_variants, gkb_matches, gene_information
                 )
-            ],
+            ],  # TODO NB are we omitting non-matched sv's?
             'signatureVariants': [trim_empty_values(s) for s in signature_variants],
             'genes': gene_information,
             'genomicAlterationsIdentified': key_alterations,
             'variantCounts': variant_counts,
             'analystComments': comments,
             'therapeuticTarget': targets,
+            'observedVariantAnnotations': observed_vars_section
         }
     )
+
+    # TODO there are 13 outliers in the test data; if even only three are matched, why are only those three
+    # shown in the expression section? shouldn't we be seeing the non-kbmatched vars there as well?
+
     output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants))
 
     # if input includes hrdScore field, that is ok to pass to db
@@ -577,6 +631,7 @@ def ipr_report(
         if not ipr_conn:
             raise ValueError('ipr_url required to upload report')
         ipr_spec = ipr_conn.get_spec()
+
         output = clean_unsupported_content(output, ipr_spec)
         try:
             logger.info(f'Uploading to IPR {ipr_conn.url}')

From 97f4740479cb61a41e9bf0518e7e222c5c9c6791 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 24 Feb 2026 15:46:27 -0800
Subject: [PATCH 05/64] add func to prepare observed var section

---
 pori_python/ipr/inputs.py | 10 +++++-----
 pori_python/ipr/ipr.py    | 21 ++++++++++-----------
 pori_python/ipr/main.py   | 28 ++++++++++++++++------------
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index c2f464c0..7d5bac0f 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -59,7 +59,7 @@
     'comments',
     'library',
     'germline',
-    'flags'
+    'flags',
 ]
 
 SMALL_MUT_REQ = ['gene', 'proteinChange']
@@ -98,7 +98,7 @@
     'tumourRefCount',
     'tumourRefCopies',
     'zygosity',
-    'flags'
+    'flags',
 ]
 
 EXP_REQ = ['gene', 'kbCategory']
@@ -131,7 +131,7 @@
     'rnaReads',
     'rpkm',
     'tpm',
-    'flags'
+    'flags',
 ]
 
 SV_REQ = [
@@ -164,7 +164,7 @@
     'tumourDepth',
     'germline',
     'mavis_product_id',
-    'flags'
+    'flags',
 ]
 
 SIGV_REQ = ['signatureName', 'variantTypeName']
@@ -335,6 +335,7 @@ def preprocess_expression_variants(rows: Iterable[Dict]) -> List[IprExprVariant]
     Validate the input rows contain the minimum required fields and
     generate any default values where possible
     """
+
     def row_key(row: Dict) -> Tuple[str, ...]:
         return tuple(['expression'] + [row[key] for key in EXP_KEY])
 
@@ -375,7 +376,6 @@ def row_key(row: Dict) -> Tuple[str, ...]:
 
     if errors:
         raise ValueError(f'{len(errors)} Invalid expression variants in file')
-
     return result
 
 
diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index 01f9a554..91754ce7 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -668,7 +668,6 @@ def get_kb_disease_matches(
     verbose: bool = True,
     useSubgraphsRoute: bool = True,
 ) -> list[Dict]:
-
     disease_matches = []
 
     if not kb_disease_match:
@@ -736,9 +735,9 @@ def ensure_str_list(val):
             return [val]
         if isinstance(val, list):
             if not all(isinstance(item, str) for item in val):
-                raise TypeError("All items in flags must be strings")
+                raise TypeError('All items in flags must be strings')
             return val
-        raise TypeError(f"Unexpected type in flags field: {type(val).__name__}")
+        raise TypeError(f'Unexpected type in flags field: {type(val).__name__}')
 
     flags = []
 
@@ -748,19 +747,19 @@ def ensure_str_list(val):
         if not raw_flags:  # skips None and ''
             continue
 
-        flags.append({
-            'variant': item['key'],
-            'variantType': item['variantType'],
-            'flags': [f for f in ensure_str_list(raw_flags) if f]
-        })
+        flags.append(
+            {
+                'variant': item['key'],
+                'variantType': item['variantType'],
+                'flags': [f for f in ensure_str_list(raw_flags) if f],
+            }
+        )
         item.pop('flags', None)  # remove after extraction
 
         return flags
 
     observed_vars_section = [
-        flag
-        for variants in variant_sources
-        for flag in extract_flags(variants)
+        flag for variants in variant_sources for flag in extract_flags(variants)
     ]
 
     return observed_vars_section
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 5980c482..df316615 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -46,7 +46,7 @@
     get_kb_disease_matches,
     get_kb_matches_sections,
     select_expression_plots,
-    get_variant_flags
+    get_variant_flags,
 )
 from .summary import auto_analyst_comments, get_ipr_analyst_comments
 from .therapeutic_options import create_therapeutic_options
@@ -528,7 +528,7 @@ def ipr_report(
         gkb_matches, all_variants, kb_matched_sections['kbMatches']
     )
 
-    if True:
+    if False:
 
         def extract_flags(variant_list):
             # convert item to list of str, if it's just a str
@@ -541,12 +541,14 @@ def ensure_list(val):
 
             # extract only the relevant fields for creating an observed variant annotation record
             flags = [
-                {'variant': item['key'],
-                'variantType': item['variantType'],
-                'flags': ensure_list(item['flags'])
+                {
+                    'variant': item['key'],
+                    'variantType': item['variantType'],
+                    'flags': ensure_list(item['flags']),
                 }
-                for item in variant_list if item['flags'] is not None and item['flags'] != ''
-                ]
+                for item in variant_list
+                if item['flags'] is not None and item['flags'] != ''
+            ]
             _ = [item.pop('flags', '') for item in variant_list]
 
             return flags
@@ -556,11 +558,13 @@ def ensure_list(val):
             flags = extract_flags([item for item in varlist if item['gene'] in genes_with_variants])
             observed_vars_section.extend(flags)
         observed_vars_section.extend(extract_flags(signature_variants))
-        observed_vars_section.extend(extract_flags(filter_structural_variants(
-                        structural_variants, gkb_matches, gene_information
-                    )))
+        observed_vars_section.extend(
+            extract_flags(
+                filter_structural_variants(structural_variants, gkb_matches, gene_information)
+            )
+        )
 
-    if False:
+    if True:
         variant_sources = [
             v
             for source in [
@@ -606,7 +610,7 @@ def ensure_list(val):
             'variantCounts': variant_counts,
             'analystComments': comments,
             'therapeuticTarget': targets,
-            'observedVariantAnnotations': observed_vars_section
+            'observedVariantAnnotations': observed_vars_section,
         }
     )
 

From 29521a6956ab09faa0a31b99ae44d1a519473ca1 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Wed, 4 Mar 2026 15:35:48 -0800
Subject: [PATCH 06/64] add transcript flags option, fix get_variant_flags

---
 pori_python/ipr/ipr.py  | 49 +++++++++++++++-----------
 pori_python/ipr/main.py | 78 +++++++++++++++--------------------------
 2 files changed, 57 insertions(+), 70 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index 91754ce7..dd90fdcc 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -729,37 +729,44 @@ def get_kb_disease_matches(
     return disease_matches
 
 
-def get_variant_flags(variant_sources):
-    def ensure_str_list(val):
-        if isinstance(val, str):
-            return [val]
-        if isinstance(val, list):
-            if not all(isinstance(item, str) for item in val):
-                raise TypeError('All items in flags must be strings')
-            return val
-        raise TypeError(f'Unexpected type in flags field: {type(val).__name__}')
+def ensure_str_list(val):
+    if isinstance(val, str):
+        return [val]
+    if isinstance(val, list):
+        if not all(isinstance(item, str) for item in val):
+            raise TypeError('All items in flags must be strings')
+        return val
+    raise TypeError(f'Unexpected type in flags field: {type(val).__name__}')
+
+
+def add_transcript_flags(variant_sources, transcript_flags_df):
+    lookup = dict(zip(transcript_flags_df['transcript'], transcript_flags_df['flags']))
+
+    for record in variant_sources:
+        new_flag = lookup.get(record.get('transcript'))
+        if not new_flag:
+            continue
+        flags = ensure_str_list(record.setdefault('flags', []))
+        if new_flag not in flags:
+            flags.append(new_flag)
+            record['flags'] = flags
 
-    flags = []
+    return variant_sources
 
+
+def get_variant_flags(variant_sources):
+    flags = []
     for item in variant_sources:
         raw_flags = item.get('flags')
-
         if not raw_flags:  # skips None and ''
             continue
-
+        # create record; dedupe flags keeping first-seen order (set() order is not stable)
         flags.append(
             {
                 'variant': item['key'],
                 'variantType': item['variantType'],
-                'flags': [f for f in ensure_str_list(raw_flags) if f],
+                'flags': list(dict.fromkeys(f for f in ensure_str_list(raw_flags) if f)),
             }
         )
         item.pop('flags', None)  # remove after extraction
-
-        return flags
-
-    observed_vars_section = [
-        flag for variants in variant_sources for flag in extract_flags(variants)
-    ]
-
-    return observed_vars_section
+    return flags
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index df316615..1fe5ebd4 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -6,6 +6,7 @@
 import jsonschema.exceptions
 import logging
 import os
+import pandas as pd
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 from typing import Callable, Dict, List, Optional, Sequence, Set
 
@@ -47,6 +48,7 @@
     get_kb_matches_sections,
     select_expression_plots,
     get_variant_flags,
+    add_transcript_flags
 )
 from .summary import auto_analyst_comments, get_ipr_analyst_comments
 from .therapeutic_options import create_therapeutic_options
@@ -158,6 +160,12 @@ def command_interface() -> None:
         action='store_true',
         help='True if ignore extra fields in json',
     )
+    parser.add_argument(
+        '--transcript_flags',
+        required=False,
+        type=file_path,
+        help='TSV without header, with columns: gene, transcript, comma-separated list of flags'
+    )
     args = parser.parse_args()
 
     with open(args.content, 'r') as fh:
@@ -182,6 +190,7 @@ def command_interface() -> None:
         upload_json=args.upload_json,
         validate_json=args.validate_json,
         ignore_extra_fields=args.ignore_extra_fields,
+        transcript_flags=args.transcript_flags,
     )
 
 
@@ -318,6 +327,7 @@ def ipr_report(
     validate_json: bool = False,
     ignore_extra_fields: bool = False,
     tmb_high: float = TMB_SIGNATURE_HIGH_THRESHOLD,
+    transcript_flags: str = '',
 ) -> Dict:
     """Run the matching and create the report JSON for upload to IPR.
 
@@ -386,6 +396,10 @@ def ipr_report(
         logger.error('Failed schema check - report variants may be corrupted or unmatched.')
         logger.error(f'Failed schema check: {err}')
 
+    transcript_flags_df = None
+    if transcript_flags:
+        transcript_flags_df = pd.read_csv(transcript_flags, sep='\t', names=['gene', 'transcript', 'flags'])
+
     # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS)
     signature_variants: List[IprSignatureVariant] = preprocess_signature_variants(
         [
@@ -460,6 +474,10 @@ def ipr_report(
         *structural_variants,
     ]  # type: ignore
 
+    # ANNOTATING VARIANTS WITH TRANSCRIPT FLAGS
+    if transcript_flags_df is not None and not transcript_flags_df.empty:
+        all_variants = add_transcript_flags(all_variants, transcript_flags_df)
+
     # GKB_MATCHES FILTERING
     if match_germline:
         # verify germline kb statements matched germline observed variants, not somatic variants
@@ -528,56 +546,18 @@ def ipr_report(
         gkb_matches, all_variants, kb_matched_sections['kbMatches']
     )
 
-    if False:
-
-        def extract_flags(variant_list):
-            # convert item to list of str, if it's just a str
-            def ensure_list(val):
-                if isinstance(val, str):
-                    return [val]
-                if not isinstance(val, list):
-                    raise ValueError('Unexpected type in flags field', val, {type(val).__name__})
-                return val
-
-            # extract only the relevant fields for creating an observed variant annotation record
-            flags = [
-                {
-                    'variant': item['key'],
-                    'variantType': item['variantType'],
-                    'flags': ensure_list(item['flags']),
-                }
-                for item in variant_list
-                if item['flags'] is not None and item['flags'] != ''
-            ]
-            _ = [item.pop('flags', '') for item in variant_list]
-
-            return flags
-
-        observed_vars_section = []
-        for varlist in [small_mutations, copy_variants, expression_variants]:
-            flags = extract_flags([item for item in varlist if item['gene'] in genes_with_variants])
-            observed_vars_section.extend(flags)
-        observed_vars_section.extend(extract_flags(signature_variants))
-        observed_vars_section.extend(
-            extract_flags(
-                filter_structural_variants(structural_variants, gkb_matches, gene_information)
-            )
-        )
-
-    if True:
-        variant_sources = [
-            v
-            for source in [
-                [v for v in small_mutations if v['gene'] in genes_with_variants],
-                [v for v in copy_variants if v['gene'] in genes_with_variants],
-                [v for v in expression_variants if v['gene'] in genes_with_variants],
-                signature_variants,
-                filter_structural_variants(structural_variants, gkb_matches, gene_information),
-            ]
-            for v in source
+    variant_sources = [
+        v
+        for source in [
+            [v for v in small_mutations if v['gene'] in genes_with_variants],
+            [v for v in copy_variants if v['gene'] in genes_with_variants],
+            [v for v in expression_variants if v['gene'] in genes_with_variants],
+            signature_variants,
+            filter_structural_variants(structural_variants, gkb_matches, gene_information),
         ]
-
-        observed_vars_section = get_variant_flags(variant_sources)
+        for v in source
+    ]
+    observed_vars_section = get_variant_flags(variant_sources)
 
     # OUTPUT CONTENT
     # thread safe deep-copy the original content

From caa77be597ae45c4a22cb7eacfce2e6c9f271603 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Wed, 11 Mar 2026 14:17:59 -0700
Subject: [PATCH 07/64] add tests, fix string input formatting

---
 pori_python/ipr/ipr.py     | 17 ++++++----
 tests/test_ipr/test_ipr.py | 67 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index dd90fdcc..de8f2cce 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -160,7 +160,6 @@ def convert_statements_to_alterations(
             )
             if query_result:
                 recruitment_statuses[rid] = query_result[0]['recruitmentStatus']  # type: ignore
-
     for statement in statements:
         variants = [
             cast(Variant, c) for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES
@@ -229,6 +228,7 @@ def convert_statements_to_alterations(
                     row['kbContextId'], 'not found'
                 )
             rows.append(row)
+
     return rows
 
 
@@ -731,7 +731,7 @@ def get_kb_disease_matches(
 
 def ensure_str_list(val):
     if isinstance(val, str):
-        return [val]
+        return [f.strip() for f in val.split(',') if f.strip()]
     if isinstance(val, list):
         if not all(isinstance(item, str) for item in val):
             raise TypeError('All items in flags must be strings')
@@ -743,13 +743,16 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
     lookup = dict(zip(transcript_flags_df['transcript'], transcript_flags_df['flags']))
 
     for record in variant_sources:
-        new_flag = lookup.get(record.get('transcript'))
-        if not new_flag:
+        flags_str = lookup.get(record.get('transcript'))
+        if not flags_str:
             continue
+        # Split on commas and strip whitespace
+        new_flags = ensure_str_list(str(flags_str))
         flags = ensure_str_list(record.setdefault('flags', []))
-        if new_flag not in flags:
-            flags.append(new_flag)
-            record['flags'] = flags
+        for new_flag in new_flags:
+            if new_flag not in flags:
+                flags.append(new_flag)
+        record['flags'] = flags
 
     return variant_sources
 
diff --git a/tests/test_ipr/test_ipr.py b/tests/test_ipr/test_ipr.py
index 3e9b01a3..5023c51a 100644
--- a/tests/test_ipr/test_ipr.py
+++ b/tests/test_ipr/test_ipr.py
@@ -1,4 +1,5 @@
 import pytest
+import pandas as pd
 from unittest.mock import Mock, patch
 
 from pori_python.graphkb import statement as gkb_statement
@@ -415,6 +416,72 @@ def test_approved_therapeutic(self, mock_get_evidencelevel_mapping, graphkb_conn
         assert row['category'] == 'therapeutic'
 
 
+class TestFlagUtilities:
+    def test_ensure_str_list_accepts_string(self):
+        from pori_python.ipr.ipr import ensure_str_list
+
+        assert ensure_str_list('abc') == ['abc']
+
+    def test_ensure_str_list_splits_comma_separated_string(self):
+        from pori_python.ipr.ipr import ensure_str_list
+
+        assert ensure_str_list('a, b , c') == ['a', 'b', 'c']
+
+    def test_ensure_str_list_accepts_list_of_strings(self):
+        from pori_python.ipr.ipr import ensure_str_list
+
+        assert ensure_str_list(['a', 'b']) == ['a', 'b']
+
+    def test_ensure_str_list_rejects_bad_types(self):
+        from pori_python.ipr.ipr import ensure_str_list
+
+        with pytest.raises(TypeError):
+            ensure_str_list([1, 'a'])
+        with pytest.raises(TypeError):
+            ensure_str_list(123)
+
+    def test_add_transcript_flags_basic(self):
+        from pori_python.ipr.ipr import add_transcript_flags
+
+        variant_sources = [
+            {'transcript': 'T1', 'key': 'k1', 'variantType': 'mut'},
+            {'transcript': 'T2', 'flags': 'existing', 'key': 'k2', 'variantType': 'mut'},
+            {'transcript': 'T3', 'flags': ['present'], 'key': 'k3', 'variantType': 'mut'},
+            {'transcript': 'T4', 'key': 'k4', 'variantType': 'mut'},
+        ]
+        df = pd.DataFrame({'transcript': ['T1', 'T2', 'T4'], 'flags': ['flag_a,flag_b', 'existing', 'flag_c, flag_d']})
+        result = add_transcript_flags(variant_sources, df)
+        # T1 should have two flags from comma-separated list
+        assert set(result[0]['flags']) == {'flag_a', 'flag_b'}
+        # T2 had a string flag that gets converted to list and duplicate is avoided
+        assert result[1]['flags'] == ['existing']
+        # T3 unaffected (no matching transcript in df)
+        assert result[2]['flags'] == ['present']
+        # T4 should have two flags with whitespace stripped
+        assert set(result[3]['flags']) == {'flag_c', 'flag_d'}
+
+    def test_get_variant_flags_behaviour(self):
+        from pori_python.ipr.ipr import get_variant_flags
+
+        variants = [
+            {'key': 'k1', 'variantType': 'mut', 'flags': 'foo'},
+            {'key': 'k2', 'variantType': 'mut', 'flags': ['bar', 'bar', '']},
+            {'key': 'k3', 'variantType': 'mut', 'flags': None},
+            {'key': 'k4', 'variantType': 'mut', 'flags': []},
+        ]
+        out = get_variant_flags(variants)
+        # k1 and k2 should be converted to flag records, k3/k4 skipped
+        assert any(item['variant'] == 'k1' and item['flags'] == ['foo'] for item in out)
+        assert any(item['variant'] == 'k2' and set(item['flags']) == {'bar'} for item in out)
+        assert len(out) == 2
+        # processed records should have their flags removed
+        assert 'flags' not in variants[0]
+        assert 'flags' not in variants[1]
+        # skipped records retain original flags key
+        assert 'flags' in variants[2]
+        assert 'flags' in variants[3]
+
+
 class TestKbmatchFilters:
     def test_germline_kb_matches(self):
         assert len(germline_kb_matches(GERMLINE_KB_MATCHES, GERMLINE_VARIANTS)) == len(

From 6305d2f2ffbc8c4edd6660d0a5d8fc528bbb37d6 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 16 Mar 2026 16:01:27 -0700
Subject: [PATCH 08/64] fix transcript matching for svs

---
 pori_python/ipr/ipr.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index de8f2cce..210db6de 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -754,6 +754,26 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
                 flags.append(new_flag)
         record['flags'] = flags
 
+    # fusions: check both transcripts for flags and add to the same record
+    label_map = {
+        'ctermTranscript': 'cterm',
+        'ntermTranscript': 'nterm'
+    }
+
+    for record in variant_sources:
+        flags = ensure_str_list(record.setdefault('flags', []))
+
+        for key, label in label_map.items():
+            transcript = record.get(key)
+            flags_str = lookup.get(transcript)
+            if not flags_str:
+                continue
+
+            for flag in ensure_str_list(str(flags_str)):
+                new_flag = f"{flag} ({label})"
+                if new_flag not in flags:
+                    flags.append(new_flag)
+                record['flags'] = flags
     return variant_sources
 
 

From 36d6ab095b0d4d765af967d526b56c8a62a2f3ad Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 16 Mar 2026 16:02:02 -0700
Subject: [PATCH 09/64] add test for updating sv transcript flags

---
 tests/test_ipr/test_ipr.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/test_ipr/test_ipr.py b/tests/test_ipr/test_ipr.py
index 5023c51a..3232f1f7 100644
--- a/tests/test_ipr/test_ipr.py
+++ b/tests/test_ipr/test_ipr.py
@@ -460,6 +460,29 @@ def test_add_transcript_flags_basic(self):
         # T4 should have two flags with whitespace stripped
         assert set(result[3]['flags']) == {'flag_c', 'flag_d'}
 
+    def test_add_transcript_flags_fusions(self):
+        from pori_python.ipr.ipr import add_transcript_flags
+
+        # Fusion records can have separate nterm/cterm transcripts
+        variant_sources = [
+            {
+                'key': 'f1',
+                'variantType': 'fusion',
+                'ctermTranscript': 'CT1',
+                'ntermTranscript': 'NT1',
+            }
+        ]
+        df = pd.DataFrame(
+            {
+                'transcript': ['CT1', 'NT1'],
+                'flags': ['cterm_flag', 'nterm_flag'],
+            }
+        )
+        result = add_transcript_flags(variant_sources, df)
+        flags = result[0]['flags']
+        assert 'cterm_flag (cterm)' in flags
+        assert 'nterm_flag (nterm)' in flags
+
     def test_get_variant_flags_behaviour(self):
         from pori_python.ipr.ipr import get_variant_flags
 

From bbabdd943f6936b276703c96244cd06a3b92faad Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 16 Mar 2026 16:11:10 -0700
Subject: [PATCH 10/64] add upload test

---
 tests/test_ipr/test_upload_with_flags.py | 345 +++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 tests/test_ipr/test_upload_with_flags.py

diff --git a/tests/test_ipr/test_upload_with_flags.py b/tests/test_ipr/test_upload_with_flags.py
new file mode 100644
index 00000000..b75a922e
--- /dev/null
+++ b/tests/test_ipr/test_upload_with_flags.py
@@ -0,0 +1,345 @@
+import json
+import os
+import pandas as pd
+import pytest
+import sys
+import uuid
+from typing import Generator
+from unittest.mock import patch
+
+from pori_python.ipr.connection import IprConnection
+from pori_python.ipr.main import command_interface
+from pori_python.types import IprGene
+
+from .constants import EXCLUDE_INTEGRATION_TESTS
+
+EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1'
+EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1'
+INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1'
+DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1'
+
+
+def get_test_spec():
+    ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}}
+    ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__
+    for key in ipr_gene_keys:
+        ipr_spec['components']['schemas']['genesCreate']['properties'][key] = ''
+    return ipr_spec
+
+
+def get_test_file(name: str) -> str:
+    return os.path.join(os.path.dirname(__file__), 'test_data', name)  # path of `name` inside the test_data directory next to this module
+
+
+@pytest.fixture(scope='module')
+def loaded_reports(tmp_path_factory) -> Generator:
+    """
+    Load test data with selective flagging to enable end-to-end testing of:
+    1. Flags from variant input TSVs (expressionVariants, smallMutations, etc.)
+    2. Flags from transcript annotation TSV (transcript_flags)
+    3. Variants without flags should NOT have observedVariantAnnotation entries
+
+    This fixture:
+    - Only flags a subset of variants per type (e.g. first expression variant, first mutation)
+    - Creates a transcript flags TSV with 2 flags for APC gene variants
+    - Verifies compatibility with pori_ipr_api's observedVariantAnnotation model
+    """
+    json_file = tmp_path_factory.mktemp('inputs') / 'content.json'
+    async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json'
+    transcript_flags_file = tmp_path_factory.mktemp('inputs') / 'transcript_flags.tsv'
+    patient_id = f'TEST_WITH_FLAGS{str(uuid.uuid4())}'
+    async_patient_id = f'TEST_WITH_FLAGS_ASYNC_{str(uuid.uuid4())}'
+
+    # Load data - only flag SOME variants to test that unflagged ones don't get annotations
+    expvars = pd.read_csv(get_test_file('expression.short.tab'), sep='\t')
+    # Flag only the first expression variant
+    expvars['flags'] = ''
+    expvars_variant_locs = expvars[~pd.isnull(expvars.kbCategory)].index[0:2].tolist()
+    expvars.loc[expvars_variant_locs[0], 'flags'] = 'expression_flag_1'
+    expvars.loc[expvars_variant_locs[1], 'flags'] = 'expression_flag_1,expression_flag_2'  # test multiple flags in one string
+    expvars_json = expvars.to_json(orient='records')
+
+    smallmuts = pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t')
+    # Flag only the first small mutation
+    smallmuts['flags'] = ''
+    smallmuts.loc[0, 'flags'] = 'mutation_flag_1'
+
+    # Find the first small mutation that is not on APC gene to avoid overlap with transcript flags test
+    non_apc_indices = smallmuts[smallmuts['gene'] != 'APC'].index
+    multi_flag_index = non_apc_indices[0]
+    smallmuts.loc[multi_flag_index, 'flags'] = 'mutation_flag_2,mutation_flag_1'  # test multiple flags in one string
+
+    # get transcript for this mutation to match in transcript flags file
+    smallmut_gene = smallmuts.loc[multi_flag_index, 'gene']
+    smallmut_transcript = smallmuts.loc[multi_flag_index, 'transcript']
+    smallmuts_json = smallmuts.to_json(orient='records')
+
+    copyvars = pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t')
+    # Flag only the first copy variant
+    copyvars['flags'] = ''
+    copyvars.loc[0, 'flags'] = 'cnv_flag_1'
+    copyvars.loc[1, 'flags'] = 'cnv_flag_1,cnv_flag_2'  # test multiple flags in one string
+    copyvars_json = copyvars.to_json(orient='records')
+
+    svs = pd.read_csv(get_test_file('fusions.tab'), sep='\t')
+    # Flag only the first SV
+    svs['flags'] = ''
+    svs.loc[0, 'flags'] = 'sv_flag_1'
+    svs.loc[1, 'flags'] = 'sv_flag_1,sv_flag_2'  # test multiple flags in one string
+    svs_json = svs.to_json(orient='records')
+
+    hla = pd.read_csv(get_test_file('hla_variants.tab'), sep='\t')
+    hla_json = hla.to_json(orient='records')
+
+    # Create a transcript flags file with flags for specific transcripts
+    # Match transcripts from small_mutations.short.tab
+    transcript_flags_df = pd.DataFrame({
+        'gene': ['APC', 'APC', smallmut_gene, 'svgene1', 'svgene2', 'svgene3', 'svgene4'],
+        'transcript': ['ENST00000457016', 'ENST00000257430', smallmut_transcript,'ENST00000358273', 'ENST00000397938', 'ENST00000373930', 'ENST00000457710'],
+        'flags': ['transcript_flag_1', 'transcript_flag_2', 'additional_transcript_flag', 'sv_transcript_flag_1', 'sv_transcript_flag_2', 'sv_transcript_flag_3', 'sv_transcript_flag_4'],
+    })
+    transcript_flags_df.to_csv(transcript_flags_file, sep='\t', index=False)
+
+    json_contents = {
+        'comparators': [
+            {'analysisRole': 'expression (disease)', 'name': '1'},
+            {'analysisRole': 'expression (primary site)', 'name': '2'},
+            {'analysisRole': 'expression (biopsy site)', 'name': '3'},
+            {
+                'analysisRole': 'expression (internal pancancer cohort)',
+                'name': '4',
+            },
+        ],
+        'patientId': patient_id,
+        'project': 'TEST',
+        'sampleInfo': [
+            {
+                'sample': 'Constitutional',
+                'biopsySite': 'Normal tissue',
+                'sampleName': 'SAMPLE1-PB',
+                'primarySite': 'Blood-Peripheral',
+                'collectionDate': '11-11-11',
+            },
+            {
+                'sample': 'Tumour',
+                'pathoTc': '90%',
+                'biopsySite': 'hepatic',
+                'sampleName': 'SAMPLE2-FF-1',
+                'primarySite': 'Vena Cava-Hepatic',
+                'collectionDate': '12-12-12',
+            },
+        ],
+        'kbDiseaseMatch': 'colorectal cancer',
+        'msi': [
+            {
+                'score': 1000.0,
+                'kbCategory': 'microsatellite instability',
+            }
+        ],
+        'hrd': {
+            'score': 9999.0,
+            'kbCategory': 'homologous recombination deficiency strong signature',
+        },
+        'expressionVariants': json.loads(expvars_json),
+        'smallMutations': json.loads(smallmuts_json),
+        'copyVariants': json.loads(copyvars_json),
+        'structuralVariants': json.loads(svs_json),
+        'cosmicSignatures': pd.read_csv(
+            get_test_file('cosmic_variants.tab'), sep='\t'
+        ).signature.tolist(),
+        'hlaTypes': json.loads(hla_json),
+    }
+
+    json_contents['patientId'] = async_patient_id
+    async_json_file.write_text(
+        json.dumps(
+            json_contents,
+            allow_nan=False,
+        )
+    )
+
+    argslist = [
+        'ipr',
+        '--username',
+        os.environ.get('IPR_USER', os.environ['USER']),
+        '--password',
+        os.environ['IPR_PASS'],
+        '--graphkb_username',
+        os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])),
+        '--graphkb_password',
+        os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']),
+        '--ipr_url',
+        os.environ['IPR_TEST_URL'],
+        '--graphkb_url',
+        os.environ.get('GRAPHKB_URL', False),
+        '--therapeutics',
+        '--allow_partial_matches',
+        '-o upload_with_flags.json',
+        '--transcript_flags',
+        str(transcript_flags_file),
+    ]
+
+    async_argslist = argslist.copy()
+    async_argslist.extend(['--content', str(async_json_file), '--async_upload'])
+    with patch.object(sys, 'argv', async_argslist):
+        with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()):
+            command_interface()
+
+    ipr_conn = IprConnection(
+        username=os.environ.get('IPR_USER', os.environ['USER']),
+        password=os.environ['IPR_PASS'],
+        url=os.environ['IPR_TEST_URL'],
+    )
+    async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}')
+
+    # Collect expected flagged genes for each variant type
+    expected_flagged = {
+        'expression': expvars[expvars['flags'] != '']['gene'].tolist(),
+        'small_mutations': smallmuts[smallmuts['flags'] != '']['gene'].tolist(),
+        'copy_variants': copyvars[copyvars['flags'] != '']['gene'].tolist(),
+        'structural_variants_cterm': svs[svs['flags'] != '']['gene2'].tolist(),
+        'structural_variants_nterm': svs[svs['flags'] != '']['gene1'].tolist()
+    }
+
+    loaded_reports_result = {
+        'async': (async_patient_id, async_loaded_report),
+        'expected_flagged': expected_flagged,
+    }
+    yield loaded_reports_result
+    if DELETE_UPLOAD_TEST_REPORTS:
+        ipr_conn.delete(uri=f'reports/{async_loaded_report["reports"][0]["ident"]}')
+
+
+def get_section(loaded_report, section_name):
+    ident = loaded_report[1]['reports'][0]['ident']
+    ipr_conn = IprConnection(
+        username=os.environ.get('IPR_USER', os.environ['USER']),
+        password=os.environ['IPR_PASS'],
+        url=os.environ['IPR_TEST_URL'],
+    )
+    return ipr_conn.get(uri=f'reports/{ident}/{section_name}')
+
+
+def stringify_sorted(obj):
+    """
+    stringifies a (json) object
+    in such a way that it can be compared for equality
+    with another json object"""
+    if isinstance(obj, list):
+        obj = [stringify_sorted(item) for item in obj]
+        obj.sort()
+        return str(obj)
+    elif isinstance(obj, dict):
+        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt', 'reportId', 'variantId', 'id'):
+            obj.pop(key, None)
+        keys = obj.keys()
+        for key in keys:
+            if isinstance(obj[key], list):
+                obj[key] = stringify_sorted(obj[key])
+            elif isinstance(obj[key], dict):
+                obj[key] = stringify_sorted(obj[key])
+        return str(obj)
+    elif isinstance(obj, str):
+        return obj
+    else:
+        return str(obj)
+
+
+@pytest.mark.skipif(
+    not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance'
+)
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests')
+class TestCreateReport:
+    def test_patient_id_loaded_once(self, loaded_reports) -> None:
+        async_patient_id = loaded_reports['async'][0]
+        assert loaded_reports['async'][1]['total'] == 1
+        assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id
+
+    def test_observed_variant_annotations_loaded(self, loaded_reports) -> None:
+        """Test that flagged variants have observedVariantAnnotation with correct flags."""
+        variants_section = get_section(loaded_reports['async'], 'variants')
+        expected_flagged = loaded_reports['expected_flagged']
+
+        # Check that expression variant with input flag has annotation
+        exp_vars_with_annot = [v for v in variants_section if v['variantType'] == 'exp' and 'observedVariantAnnotation' in v and v['observedVariantAnnotation'] is not None]
+        assert len(exp_vars_with_annot) > 0, "Should have at least one expression variant with annotation"
+
+        # Find the flagged expression variants
+        flagged_exp_genes = expected_flagged['expression']
+        for gene in flagged_exp_genes:
+            flagged_exp = [v for v in exp_vars_with_annot if v['gene']['name'] == gene]
+            assert len(flagged_exp) >= 1, f"{gene} should be flagged with input flag"
+            for var in flagged_exp:
+                assert any(['expression_flag' in str(var['observedVariantAnnotation'].get('flags', []))])
+
+        # Check that the variant with multiple flags has both flags correctly split
+        multi_flag_exp = [
+            v for v in exp_vars_with_annot if len(v['observedVariantAnnotation']['flags'])>1
+        ]
+        assert len(multi_flag_exp) > 0, "Should have at least one expression variant with multiple flags"
+        for var in multi_flag_exp:
+            flags = var['observedVariantAnnotation'].get('flags', [])
+            if len(flags) > 1:
+                assert 'expression_flag_1' in flags and 'expression_flag_2' in flags, \
+                    f"Variant with multiple flags should have both expression_flag_1 and expression_flag_2, got {flags}"
+
+        # Check that unflagged expression variants don't have observedVariantAnnotation
+        unflagged_exp = [v for v in variants_section if v['variantType'] == 'exp' and v['gene']['name'] not in flagged_exp_genes]
+        for var in unflagged_exp:
+            assert 'observedVariantAnnotation' not in var or var['observedVariantAnnotation'] is None, \
+                f"Unflagged expression variant {var['gene']['name']} should not have annotations"
+
+    def test_variant_transcript_annotations_loaded(self, loaded_reports) -> None:
+        """Test that variants with transcript flags have observedVariantAnnotation from transcript file."""
+        variants_section = get_section(loaded_reports['async'], 'variants')
+
+        # Find small mutation variants
+        mut_vars = [v for v in variants_section if v['variantType'] == 'mut']
+        assert len(mut_vars) > 0, "Should have small mutations"
+
+        # each APC with ENST00000457016 transcript should have transcript_flag_1
+        apc_mut_enst1 = [v for v in mut_vars if v['gene']['name'] == 'APC' and v.get('transcript') == 'ENST00000457016']
+        assert len(apc_mut_enst1) > 0, "Should find at least one APC mutation with ENST00000457016 transcript"
+
+        has_transcript_flag = True
+        for var in apc_mut_enst1:
+            if 'observedVariantAnnotation' in var and var['observedVariantAnnotation'] is not None:
+                if 'transcript_flag_1' not in var['observedVariantAnnotation'].get('flags', []):
+                    has_transcript_flag = False
+                    break
+
+        assert has_transcript_flag, "All mutations with transcript ENST00000457016 should have transcript_flag_1 from transcript file"
+
+        # APC with ENST00000257430 transcript should have transcript_flag_2
+        apc_mut_enst2 = [v for v in mut_vars if v['gene']['name'] == 'APC' and v.get('transcript') == 'ENST00000257430']
+        assert len(apc_mut_enst2) > 0, "Should find APC mutation with ENST00000257430 transcript"
+
+        has_second_flag = True
+        for var in apc_mut_enst2:
+            if 'observedVariantAnnotation' in var and var['observedVariantAnnotation'] is not None:
+                if 'transcript_flag_2' not in var['observedVariantAnnotation'].get('flags', []):
+                    has_second_flag = False
+                    break
+
+        assert has_second_flag, "All APC mutations with transcript ENST00000257430 should have transcript_flag_2 from transcript file"
+
+        # Check that the variant with a transcript flag and multiple input flags
+        # has all flags correctly represented in observedVariantAnnotation
+        annotated_mut_vars = [v for v in mut_vars if 'observedVariantAnnotation' in v and v['observedVariantAnnotation'] is not None]
+        multi_flag_mut = [v for v in annotated_mut_vars if len(v.get('observedVariantAnnotation').get('flags', [])) > 2]
+        assert len(multi_flag_mut) > 0, "Should have at least one small mutation variant with multiple flags"
+        for var in multi_flag_mut:
+            flags = var['observedVariantAnnotation'].get('flags', [])
+            if len(flags) > 2:
+                assert 'mutation_flag_1' in flags and 'mutation_flag_2' in flags, \
+                    f"Variant with multiple flags should have both mutation_flag_1 and mutation_flag_2, got {flags}"
+
+    def test_fusion_variants_have_multiple_transcript_annotations_loaded(self, loaded_reports) -> None:
+        """Test that variants with transcript flags have observedVariantAnnotation from transcript file."""
+        variants_section = get_section(loaded_reports['async'], 'variants')
+
+        # Find small mutation variants
+        svs = [v for v in variants_section if v['variantType'] == 'sv']
+        annotated_svs = [v for v in svs if 'observedVariantAnnotation' in v and v['observedVariantAnnotation'] is not None]
+
+        assert len(annotated_svs) > 0, "Should have annotated svs"

From 90a51bd596085275c54cf462e4f44bd65165a055 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 17 Mar 2026 09:26:23 -0700
Subject: [PATCH 11/64] add flags to test_upload

---
 tests/test_ipr/test_upload.py | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py
index 2c6fb73c..5fc17d2a 100644
--- a/tests/test_ipr/test_upload.py
+++ b/tests/test_ipr/test_upload.py
@@ -19,7 +19,7 @@
 DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1'
 
 
-def get_test_spec():
+def get_test_spec() -> dict:
     ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}}
     ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__
     for key in ipr_gene_keys:
@@ -31,12 +31,35 @@ def get_test_file(name: str) -> str:
     return os.path.join(os.path.dirname(__file__), 'test_data', name)
 
 
+def get_test_transcript_flags(json_contents) -> pd.DataFrame:
+    """ creates a dataframe of transcript flags for test purposes, based on the input json contents """
+    transcript_flags = []
+    for item in json_contents['structuralVariants']:
+        transcript_flags.append((item['gene1'], item['ntermTranscript'], 'TRANSCRIPT FLAG'))
+        transcript_flags.append((item['gene2'], item['ctermTranscript'], 'TRANSCRIPT FLAG'))
+    for item in json_contents['smallMutations']:
+        transcript_flags.append((item['gene'], item['transcript'], 'TRANSCRIPT FLAG'))
+    df = pd.DataFrame(transcript_flags, columns=['gene', 'transcript', 'flags'])
+    df = df.drop_duplicates()
+    return df
+
+
+def add_test_variant_flags_to_input_data(json_contents) -> dict:
+    """ adds flags to the input variants for test purposes """
+    for vtype in ['structuralVariants', 'smallMutations', 'copyVariants', 'expressionVariants']:
+        for item in json_contents[vtype]:
+            item['flags'] = ['TEST FLAG']
+    return json_contents
+
+
 @pytest.fixture(scope='module')
 def loaded_reports(tmp_path_factory) -> Generator:
     json_file = tmp_path_factory.mktemp('inputs') / 'content.json'
     async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json'
+    transcript_flags_file = tmp_path_factory.mktemp('inputs') / 'transcript_flags.tsv'
     patient_id = f'TEST_{str(uuid.uuid4())}'
     async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}'
+
     json_contents = {
         'comparators': [
             {'analysisRole': 'expression (disease)', 'name': '1'},
@@ -109,6 +132,11 @@ def loaded_reports(tmp_path_factory) -> Generator:
         'config': 'test config',
     }
 
+    json_contents = add_test_variant_flags_to_input_data(json_contents)
+
+    transcript_flags_df = get_test_transcript_flags(json_contents)
+    transcript_flags_df.to_csv(transcript_flags_file, sep='\t', index=False)
+
     json_file.write_text(
         json.dumps(
             json_contents,
@@ -140,6 +168,8 @@ def loaded_reports(tmp_path_factory) -> Generator:
         os.environ.get('GRAPHKB_URL', False),
         '--therapeutics',
         '--allow_partial_matches',
+        '--transcript_flags',
+        str(transcript_flags_file),
     ]
 
     sync_argslist = argslist.copy()
@@ -192,7 +222,7 @@ def stringify_sorted(obj):
         obj.sort()
         return str(obj)
     elif isinstance(obj, dict):
-        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'):
+        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt', 'variantId', 'id', 'reportId'):
             obj.pop(key, None)
         keys = obj.keys()
         for key in keys:

From dab3e65f4079f19ac4161b60245d9176c80c2d06 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 17 Mar 2026 11:27:56 -0700
Subject: [PATCH 12/64] fix spec, add tests, format

---
 pori_python/ipr/content.spec.json |  43 ++++++++++-
 pori_python/ipr/ipr.py            |   7 +-
 pori_python/ipr/main.py           |   8 ++-
 tests/test_ipr/test_ipr.py        | 116 ++++++++++++++++++++----------
 tests/test_ipr/test_upload.py     |   4 +-
 5 files changed, 129 insertions(+), 49 deletions(-)

diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
index 5a1793a2..9f031899 100644
--- a/pori_python/ipr/content.spec.json
+++ b/pori_python/ipr/content.spec.json
@@ -202,6 +202,16 @@
                             "number",
                             "null"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
@@ -475,6 +485,16 @@
                             "null",
                             "string"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
@@ -1106,6 +1126,16 @@
                             "string",
                             "null"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
@@ -1161,8 +1191,7 @@
                         "description": "the type of underlying structural variant",
                         "example": "deletion",
                         "type": "string"
-                    },
-                    "exon1": {
+                    },                    "exon1": {
                         "description": "the 5' (n-terminal) exon",
                         "example": 1,
                         "type": [
@@ -1290,6 +1319,16 @@
                             "integer",
                             "null"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index 210db6de..ca1f1c6b 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -755,10 +755,7 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
         record['flags'] = flags
 
     # fusions: check both transcripts for flags and add to the same record
-    label_map = {
-        'ctermTranscript': 'cterm',
-        'ntermTranscript': 'nterm'
-    }
+    label_map = {'ctermTranscript': 'cterm', 'ntermTranscript': 'nterm'}
 
     for record in variant_sources:
         flags = ensure_str_list(record.setdefault('flags', []))
@@ -770,7 +767,7 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
                 continue
 
             for flag in ensure_str_list(str(flags_str)):
-                new_flag = f"{flag} ({label})"
+                new_flag = f'{flag} ({label})'
                 if new_flag not in flags:
                     flags.append(new_flag)
                 record['flags'] = flags
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 1fe5ebd4..c94721a3 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -48,7 +48,7 @@
     get_kb_matches_sections,
     select_expression_plots,
     get_variant_flags,
-    add_transcript_flags
+    add_transcript_flags,
 )
 from .summary import auto_analyst_comments, get_ipr_analyst_comments
 from .therapeutic_options import create_therapeutic_options
@@ -164,7 +164,7 @@ def command_interface() -> None:
         '--transcript_flags',
         required=False,
         type=file_path,
-        help='TSV without header, with columns: gene, transcript, comma-separated list of flags'
+        help='TSV without header, with columns: gene, transcript, comma-separated list of flags',
     )
     args = parser.parse_args()
 
@@ -398,7 +398,9 @@ def ipr_report(
 
     transcript_flags_df = None
     if transcript_flags:
-        transcript_flags_df = pd.read_csv(transcript_flags, sep='\t', names=['gene', 'transcript', 'flags'])
+        transcript_flags_df = pd.read_csv(
+            transcript_flags, sep='\t', names=['gene', 'transcript', 'flags']
+        )
 
     # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS)
     signature_variants: List[IprSignatureVariant] = preprocess_signature_variants(
diff --git a/tests/test_ipr/test_ipr.py b/tests/test_ipr/test_ipr.py
index 3232f1f7..68ea2ccc 100644
--- a/tests/test_ipr/test_ipr.py
+++ b/tests/test_ipr/test_ipr.py
@@ -13,7 +13,11 @@
     get_kb_variants,
     get_kb_matches_sections,
     create_key_alterations,
+    ensure_str_list,
+    add_transcript_flags,
+    get_variant_flags,
 )
+
 from pori_python.types import Statement
 
 DISEASE_RIDS = ['#138:12', '#138:13']
@@ -418,52 +422,53 @@ def test_approved_therapeutic(self, mock_get_evidencelevel_mapping, graphkb_conn
 
 class TestFlagUtilities:
     def test_ensure_str_list_accepts_string(self):
-        from pori_python.ipr.ipr import ensure_str_list
-
         assert ensure_str_list('abc') == ['abc']
 
     def test_ensure_str_list_splits_comma_separated_string(self):
-        from pori_python.ipr.ipr import ensure_str_list
-
         assert ensure_str_list('a, b , c') == ['a', 'b', 'c']
 
     def test_ensure_str_list_accepts_list_of_strings(self):
-        from pori_python.ipr.ipr import ensure_str_list
-
         assert ensure_str_list(['a', 'b']) == ['a', 'b']
 
     def test_ensure_str_list_rejects_bad_types(self):
-        from pori_python.ipr.ipr import ensure_str_list
-
         with pytest.raises(TypeError):
             ensure_str_list([1, 'a'])
         with pytest.raises(TypeError):
             ensure_str_list(123)
 
-    def test_add_transcript_flags_basic(self):
-        from pori_python.ipr.ipr import add_transcript_flags
-
+    def test_add_transcript_flags_basic_adds_flags_from_comma_separated_string(self):
         variant_sources = [
             {'transcript': 'T1', 'key': 'k1', 'variantType': 'mut'},
+        ]
+        df = pd.DataFrame({'transcript': ['T1'], 'flags': ['flag_a,flag_b']})
+        result = add_transcript_flags(variant_sources, df)
+        assert set(result[0]['flags']) == {'flag_a', 'flag_b'}
+
+    def test_add_transcript_flags_basic_converts_string_flag_to_list_avoiding_duplicates(self):
+        variant_sources = [
             {'transcript': 'T2', 'flags': 'existing', 'key': 'k2', 'variantType': 'mut'},
+        ]
+        df = pd.DataFrame({'transcript': ['T2'], 'flags': ['existing']})
+        result = add_transcript_flags(variant_sources, df)
+        assert result[0]['flags'] == ['existing']
+
+    def test_add_transcript_flags_basic_leaves_unmatched_transcripts_unaffected(self):
+        variant_sources = [
             {'transcript': 'T3', 'flags': ['present'], 'key': 'k3', 'variantType': 'mut'},
-            {'transcript': 'T4', 'key': 'k4', 'variantType': 'mut'},
         ]
-        df = pd.DataFrame({'transcript': ['T1', 'T2', 'T4'], 'flags': ['flag_a,flag_b', 'existing', 'flag_c, flag_d']})
+        df = pd.DataFrame({'transcript': ['T1', 'T2'], 'flags': ['flag_a,flag_b', 'existing']})
         result = add_transcript_flags(variant_sources, df)
-        # T1 should have two flags from comma-separated list
-        assert set(result[0]['flags']) == {'flag_a', 'flag_b'}
-        # T2 had a string flag that gets converted to list and duplicate is avoided
-        assert result[1]['flags'] == ['existing']
-        # T3 unaffected (no matching transcript in df)
-        assert result[2]['flags'] == ['present']
-        # T4 should have two flags with whitespace stripped
-        assert set(result[3]['flags']) == {'flag_c', 'flag_d'}
+        assert result[0]['flags'] == ['present']
 
-    def test_add_transcript_flags_fusions(self):
-        from pori_python.ipr.ipr import add_transcript_flags
+    def test_add_transcript_flags_basic_strips_whitespace_from_comma_separated_flags(self):
+        variant_sources = [
+            {'transcript': 'T4', 'key': 'k4', 'variantType': 'mut'},
+        ]
+        df = pd.DataFrame({'transcript': ['T4'], 'flags': ['flag_c, flag_d']})
+        result = add_transcript_flags(variant_sources, df)
+        assert set(result[0]['flags']) == {'flag_c', 'flag_d'}
 
-        # Fusion records can have separate nterm/cterm transcripts
+    def test_add_transcript_flags_fusions_tags_cterm_flags(self):
         variant_sources = [
             {
                 'key': 'f1',
@@ -474,35 +479,72 @@ def test_add_transcript_flags_fusions(self):
         ]
         df = pd.DataFrame(
             {
-                'transcript': ['CT1', 'NT1'],
-                'flags': ['cterm_flag', 'nterm_flag'],
+                'transcript': ['CT1'],
+                'flags': ['cterm_flag'],
             }
         )
         result = add_transcript_flags(variant_sources, df)
         flags = result[0]['flags']
         assert 'cterm_flag (cterm)' in flags
-        assert 'nterm_flag (nterm)' in flags
 
-    def test_get_variant_flags_behaviour(self):
-        from pori_python.ipr.ipr import get_variant_flags
+    def test_add_transcript_flags_fusions_tags_nterm_flags(self):
+        variant_sources = [
+            {
+                'key': 'f1',
+                'variantType': 'fusion',
+                'ctermTranscript': 'CT1',
+                'ntermTranscript': 'NT1',
+            }
+        ]
+        df = pd.DataFrame(
+            {
+                'transcript': ['NT1'],
+                'flags': ['nterm_flag'],
+            }
+        )
+        result = add_transcript_flags(variant_sources, df)
+        flags = result[0]['flags']
+        assert 'nterm_flag (nterm)' in flags
 
+    def test_get_variant_flags_converts_string_flags_to_records(self):
         variants = [
             {'key': 'k1', 'variantType': 'mut', 'flags': 'foo'},
+        ]
+        out = get_variant_flags(variants)
+        assert any(item['variant'] == 'k1' and item['flags'] == ['foo'] for item in out)
+        assert len(out) == 1
+
+    def test_get_variant_flags_deduplicates_and_removes_empty_strings(self):
+        variants = [
             {'key': 'k2', 'variantType': 'mut', 'flags': ['bar', 'bar', '']},
+        ]
+        out = get_variant_flags(variants)
+        assert any(item['variant'] == 'k2' and set(item['flags']) == {'bar'} for item in out)
+
+    def test_get_variant_flags_skips_null_flags(self):
+        variants = [
             {'key': 'k3', 'variantType': 'mut', 'flags': None},
+        ]
+        out = get_variant_flags(variants)
+        assert not any(item['variant'] == 'k3' for item in out)
+        assert len(out) == 0
+
+    def test_get_variant_flags_skips_empty_list_flags(self):
+        variants = [
             {'key': 'k4', 'variantType': 'mut', 'flags': []},
         ]
         out = get_variant_flags(variants)
-        # k1 and k2 should be converted to flag records, k3/k4 skipped
-        assert any(item['variant'] == 'k1' and item['flags'] == ['foo'] for item in out)
-        assert any(item['variant'] == 'k2' and set(item['flags']) == {'bar'} for item in out)
-        assert len(out) == 2
-        # processed records should have their flags removed
+        assert not any(item['variant'] == 'k4' for item in out)
+        assert len(out) == 0
+
+    def test_get_variant_flags_removes_flags_key_from_processed_records(self):
+        variants = [
+            {'key': 'k1', 'variantType': 'mut', 'flags': 'foo'},
+            {'key': 'k2', 'variantType': 'mut', 'flags': ['bar', 'bar', '']},
+        ]
+        get_variant_flags(variants)
         assert 'flags' not in variants[0]
         assert 'flags' not in variants[1]
-        # skipped records retain original flags key
-        assert 'flags' in variants[2]
-        assert 'flags' in variants[3]
 
 
 class TestKbmatchFilters:
diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py
index 5fc17d2a..70837bf3 100644
--- a/tests/test_ipr/test_upload.py
+++ b/tests/test_ipr/test_upload.py
@@ -32,7 +32,7 @@ def get_test_file(name: str) -> str:
 
 
 def get_test_transcript_flags(json_contents) -> pd.DataFrame:
-    """ creates a dataframe of transcript flags for test purposes, based on the input json contents """
+    """creates a dataframe of transcript flags for test purposes, based on the input json contents"""
     transcript_flags = []
     for item in json_contents['structuralVariants']:
         transcript_flags.append((item['gene1'], item['ntermTranscript'], 'TRANSCRIPT FLAG'))
@@ -45,7 +45,7 @@ def get_test_transcript_flags(json_contents) -> pd.DataFrame:
 
 
 def add_test_variant_flags_to_input_data(json_contents) -> dict:
-    """ adds flags to the input variants for test purposes """
+    """adds flags to the input variants for test purposes"""
     for vtype in ['structuralVariants', 'smallMutations', 'copyVariants', 'expressionVariants']:
         for item in json_contents[vtype]:
             item['flags'] = ['TEST FLAG']

From 7fe28dee2c1dfa44f1ebfd0657e8df56d20445fb Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 17 Mar 2026 13:24:45 -0700
Subject: [PATCH 13/64] remove unneeded test file

---
 tests/test_ipr/test_upload_with_flags.py | 345 -----------------------
 1 file changed, 345 deletions(-)
 delete mode 100644 tests/test_ipr/test_upload_with_flags.py

diff --git a/tests/test_ipr/test_upload_with_flags.py b/tests/test_ipr/test_upload_with_flags.py
deleted file mode 100644
index b75a922e..00000000
--- a/tests/test_ipr/test_upload_with_flags.py
+++ /dev/null
@@ -1,345 +0,0 @@
-import json
-import os
-import pandas as pd
-import pytest
-import sys
-import uuid
-from typing import Generator
-from unittest.mock import patch
-
-from pori_python.ipr.connection import IprConnection
-from pori_python.ipr.main import command_interface
-from pori_python.types import IprGene
-
-from .constants import EXCLUDE_INTEGRATION_TESTS
-
-EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1'
-EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1'
-INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1'
-DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1'
-
-
-def get_test_spec():
-    ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}}
-    ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__
-    for key in ipr_gene_keys:
-        ipr_spec['components']['schemas']['genesCreate']['properties'][key] = ''
-    return ipr_spec
-
-
-def get_test_file(name: str) -> str:
-    return os.path.join(os.path.dirname(__file__), 'test_data', name)
-
-
-@pytest.fixture(scope='module')
-def loaded_reports(tmp_path_factory) -> Generator:
-    """
-    Load test data with selective flagging to enable end-to-end testing of:
-    1. Flags from variant input TSVs (expressionVariants, smallMutations, etc.)
-    2. Flags from transcript annotation TSV (transcript_flags)
-    3. Variants without flags should NOT have observedVariantAnnotation entries
-
-    This fixture:
-    - Only flags a subset of variants per type (e.g. first expression variant, first mutation)
-    - Creates a transcript flags TSV with 2 flags for APC gene variants
-    - Verifies compatibility with pori_ipr_api's observedVariantAnnotation model
-    """
-    json_file = tmp_path_factory.mktemp('inputs') / 'content.json'
-    async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json'
-    transcript_flags_file = tmp_path_factory.mktemp('inputs') / 'transcript_flags.tsv'
-    patient_id = f'TEST_WITH_FLAGS{str(uuid.uuid4())}'
-    async_patient_id = f'TEST_WITH_FLAGS_ASYNC_{str(uuid.uuid4())}'
-
-    # Load data - only flag SOME variants to test that unflagged ones don't get annotations
-    expvars = pd.read_csv(get_test_file('expression.short.tab'), sep='\t')
-    # Flag only the first expression variant
-    expvars['flags'] = ''
-    expvars_variant_locs = expvars[~pd.isnull(expvars.kbCategory)].index[0:2].tolist()
-    expvars.loc[expvars_variant_locs[0], 'flags'] = 'expression_flag_1'
-    expvars.loc[expvars_variant_locs[1], 'flags'] = 'expression_flag_1,expression_flag_2'  # test multiple flags in one string
-    expvars_json = expvars.to_json(orient='records')
-
-    smallmuts = pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t')
-    # Flag only the first small mutation
-    smallmuts['flags'] = ''
-    smallmuts.loc[0, 'flags'] = 'mutation_flag_1'
-
-    # Find the first small mutation that is not on APC gene to avoid overlap with transcript flags test
-    non_apc_indices = smallmuts[smallmuts['gene'] != 'APC'].index
-    multi_flag_index = non_apc_indices[0]
-    smallmuts.loc[multi_flag_index, 'flags'] = 'mutation_flag_2,mutation_flag_1'  # test multiple flags in one string
-
-    # get transcript for this mutation to match in transcript flags file
-    smallmut_gene = smallmuts.loc[multi_flag_index, 'gene']
-    smallmut_transcript = smallmuts.loc[multi_flag_index, 'transcript']
-    smallmuts_json = smallmuts.to_json(orient='records')
-
-    copyvars = pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t')
-    # Flag only the first copy variant
-    copyvars['flags'] = ''
-    copyvars.loc[0, 'flags'] = 'cnv_flag_1'
-    copyvars.loc[1, 'flags'] = 'cnv_flag_1,cnv_flag_2'  # test multiple flags in one string
-    copyvars_json = copyvars.to_json(orient='records')
-
-    svs = pd.read_csv(get_test_file('fusions.tab'), sep='\t')
-    # Flag only the first SV
-    svs['flags'] = ''
-    svs.loc[0, 'flags'] = 'sv_flag_1'
-    svs.loc[1, 'flags'] = 'sv_flag_1,sv_flag_2'  # test multiple flags in one string
-    svs_json = svs.to_json(orient='records')
-
-    hla = pd.read_csv(get_test_file('hla_variants.tab'), sep='\t')
-    hla_json = hla.to_json(orient='records')
-
-    # Create a transcript flags file with flags for specific transcripts
-    # Match transcripts from small_mutations.short.tab
-    transcript_flags_df = pd.DataFrame({
-        'gene': ['APC', 'APC', smallmut_gene, 'svgene1', 'svgene2', 'svgene3', 'svgene4'],
-        'transcript': ['ENST00000457016', 'ENST00000257430', smallmut_transcript,'ENST00000358273', 'ENST00000397938', 'ENST00000373930', 'ENST00000457710'],
-        'flags': ['transcript_flag_1', 'transcript_flag_2', 'additional_transcript_flag', 'sv_transcript_flag_1', 'sv_transcript_flag_2', 'sv_transcript_flag_3', 'sv_transcript_flag_4'],
-    })
-    transcript_flags_df.to_csv(transcript_flags_file, sep='\t', index=False)
-
-    json_contents = {
-        'comparators': [
-            {'analysisRole': 'expression (disease)', 'name': '1'},
-            {'analysisRole': 'expression (primary site)', 'name': '2'},
-            {'analysisRole': 'expression (biopsy site)', 'name': '3'},
-            {
-                'analysisRole': 'expression (internal pancancer cohort)',
-                'name': '4',
-            },
-        ],
-        'patientId': patient_id,
-        'project': 'TEST',
-        'sampleInfo': [
-            {
-                'sample': 'Constitutional',
-                'biopsySite': 'Normal tissue',
-                'sampleName': 'SAMPLE1-PB',
-                'primarySite': 'Blood-Peripheral',
-                'collectionDate': '11-11-11',
-            },
-            {
-                'sample': 'Tumour',
-                'pathoTc': '90%',
-                'biopsySite': 'hepatic',
-                'sampleName': 'SAMPLE2-FF-1',
-                'primarySite': 'Vena Cava-Hepatic',
-                'collectionDate': '12-12-12',
-            },
-        ],
-        'kbDiseaseMatch': 'colorectal cancer',
-        'msi': [
-            {
-                'score': 1000.0,
-                'kbCategory': 'microsatellite instability',
-            }
-        ],
-        'hrd': {
-            'score': 9999.0,
-            'kbCategory': 'homologous recombination deficiency strong signature',
-        },
-        'expressionVariants': json.loads(expvars_json),
-        'smallMutations': json.loads(smallmuts_json),
-        'copyVariants': json.loads(copyvars_json),
-        'structuralVariants': json.loads(svs_json),
-        'cosmicSignatures': pd.read_csv(
-            get_test_file('cosmic_variants.tab'), sep='\t'
-        ).signature.tolist(),
-        'hlaTypes': json.loads(hla_json),
-    }
-
-    json_contents['patientId'] = async_patient_id
-    async_json_file.write_text(
-        json.dumps(
-            json_contents,
-            allow_nan=False,
-        )
-    )
-
-    argslist = [
-        'ipr',
-        '--username',
-        os.environ.get('IPR_USER', os.environ['USER']),
-        '--password',
-        os.environ['IPR_PASS'],
-        '--graphkb_username',
-        os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])),
-        '--graphkb_password',
-        os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']),
-        '--ipr_url',
-        os.environ['IPR_TEST_URL'],
-        '--graphkb_url',
-        os.environ.get('GRAPHKB_URL', False),
-        '--therapeutics',
-        '--allow_partial_matches',
-        '-o upload_with_flags.json',
-        '--transcript_flags',
-        str(transcript_flags_file),
-    ]
-
-    async_argslist = argslist.copy()
-    async_argslist.extend(['--content', str(async_json_file), '--async_upload'])
-    with patch.object(sys, 'argv', async_argslist):
-        with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()):
-            command_interface()
-
-    ipr_conn = IprConnection(
-        username=os.environ.get('IPR_USER', os.environ['USER']),
-        password=os.environ['IPR_PASS'],
-        url=os.environ['IPR_TEST_URL'],
-    )
-    async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}')
-
-    # Collect expected flagged genes for each variant type
-    expected_flagged = {
-        'expression': expvars[expvars['flags'] != '']['gene'].tolist(),
-        'small_mutations': smallmuts[smallmuts['flags'] != '']['gene'].tolist(),
-        'copy_variants': copyvars[copyvars['flags'] != '']['gene'].tolist(),
-        'structural_variants_cterm': svs[svs['flags'] != '']['gene2'].tolist(),
-        'structural_variants_nterm': svs[svs['flags'] != '']['gene1'].tolist()
-    }
-
-    loaded_reports_result = {
-        'async': (async_patient_id, async_loaded_report),
-        'expected_flagged': expected_flagged,
-    }
-    yield loaded_reports_result
-    if DELETE_UPLOAD_TEST_REPORTS:
-        ipr_conn.delete(uri=f'reports/{async_loaded_report["reports"][0]["ident"]}')
-
-
-def get_section(loaded_report, section_name):
-    ident = loaded_report[1]['reports'][0]['ident']
-    ipr_conn = IprConnection(
-        username=os.environ.get('IPR_USER', os.environ['USER']),
-        password=os.environ['IPR_PASS'],
-        url=os.environ['IPR_TEST_URL'],
-    )
-    return ipr_conn.get(uri=f'reports/{ident}/{section_name}')
-
-
-def stringify_sorted(obj):
-    """
-    stringifies a (json) object
-    in such a way that it can be compared for equality
-    with another json object"""
-    if isinstance(obj, list):
-        obj = [stringify_sorted(item) for item in obj]
-        obj.sort()
-        return str(obj)
-    elif isinstance(obj, dict):
-        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt', 'reportId', 'variantId', 'id'):
-            obj.pop(key, None)
-        keys = obj.keys()
-        for key in keys:
-            if isinstance(obj[key], list):
-                obj[key] = stringify_sorted(obj[key])
-            elif isinstance(obj[key], dict):
-                obj[key] = stringify_sorted(obj[key])
-        return str(obj)
-    elif isinstance(obj, str):
-        return obj
-    else:
-        return str(obj)
-
-
-@pytest.mark.skipif(
-    not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance'
-)
-@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests')
-class TestCreateReport:
-    def test_patient_id_loaded_once(self, loaded_reports) -> None:
-        async_patient_id = loaded_reports['async'][0]
-        assert loaded_reports['async'][1]['total'] == 1
-        assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id
-
-    def test_observed_variant_annotations_loaded(self, loaded_reports) -> None:
-        """Test that flagged variants have observedVariantAnnotation with correct flags."""
-        variants_section = get_section(loaded_reports['async'], 'variants')
-        expected_flagged = loaded_reports['expected_flagged']
-
-        # Check that expression variant with input flag has annotation
-        exp_vars_with_annot = [v for v in variants_section if v['variantType'] == 'exp' and 'observedVariantAnnotation' in v and v['observedVariantAnnotation'] is not None]
-        assert len(exp_vars_with_annot) > 0, "Should have at least one expression variant with annotation"
-
-        # Find the flagged expression variants
-        flagged_exp_genes = expected_flagged['expression']
-        for gene in flagged_exp_genes:
-            flagged_exp = [v for v in exp_vars_with_annot if v['gene']['name'] == gene]
-            assert len(flagged_exp) >= 1, f"{gene} should be flagged with input flag"
-            for var in flagged_exp:
-                assert any(['expression_flag' in str(var['observedVariantAnnotation'].get('flags', []))])
-
-        # Check that the variant with multiple flags has both flags correctly split
-        multi_flag_exp = [
-            v for v in exp_vars_with_annot if len(v['observedVariantAnnotation']['flags'])>1
-        ]
-        assert len(multi_flag_exp) > 0, "Should have at least one expression variant with multiple flags"
-        for var in multi_flag_exp:
-            flags = var['observedVariantAnnotation'].get('flags', [])
-            if len(flags) > 1:
-                assert 'expression_flag_1' in flags and 'expression_flag_2' in flags, \
-                    f"Variant with multiple flags should have both expression_flag_1 and expression_flag_2, got {flags}"
-
-        # Check that unflagged expression variants don't have observedVariantAnnotation
-        unflagged_exp = [v for v in variants_section if v['variantType'] == 'exp' and v['gene']['name'] not in flagged_exp_genes]
-        for var in unflagged_exp:
-            assert 'observedVariantAnnotation' not in var or var['observedVariantAnnotation'] is None, \
-                f"Unflagged expression variant {var['gene']['name']} should not have annotations"
-
-    def test_variant_transcript_annotations_loaded(self, loaded_reports) -> None:
-        """Test that variants with transcript flags have observedVariantAnnotation from transcript file."""
-        variants_section = get_section(loaded_reports['async'], 'variants')
-
-        # Find small mutation variants
-        mut_vars = [v for v in variants_section if v['variantType'] == 'mut']
-        assert len(mut_vars) > 0, "Should have small mutations"
-
-        # each APC with ENST00000457016 transcript should have transcript_flag_1
-        apc_mut_enst1 = [v for v in mut_vars if v['gene']['name'] == 'APC' and v.get('transcript') == 'ENST00000457016']
-        assert len(apc_mut_enst1) > 0, "Should find at least one APC mutation with ENST00000457016 transcript"
-
-        has_transcript_flag = True
-        for var in apc_mut_enst1:
-            if 'observedVariantAnnotation' in var and var['observedVariantAnnotation'] is not None:
-                if 'transcript_flag_1' not in var['observedVariantAnnotation'].get('flags', []):
-                    has_transcript_flag = False
-                    break
-
-        assert has_transcript_flag, "All mutations with transcript ENST00000457016 should have transcript_flag_1 from transcript file"
-
-        # APC with ENST00000257430 transcript should have transcript_flag_2
-        apc_mut_enst2 = [v for v in mut_vars if v['gene']['name'] == 'APC' and v.get('transcript') == 'ENST00000257430']
-        assert len(apc_mut_enst2) > 0, "Should find APC mutation with ENST00000257430 transcript"
-
-        has_second_flag = True
-        for var in apc_mut_enst2:
-            if 'observedVariantAnnotation' in var and var['observedVariantAnnotation'] is not None:
-                if 'transcript_flag_2' not in var['observedVariantAnnotation'].get('flags', []):
-                    has_second_flag = False
-                    break
-
-        assert has_second_flag, "All APC mutations with transcript ENST00000257430 should have transcript_flag_2 from transcript file"
-
-        # Check that the variant with a transcript flag and multiple input flags
-        # has all flags correctly represented in observedVariantAnnotation
-        annotated_mut_vars = [v for v in mut_vars if 'observedVariantAnnotation' in v and v['observedVariantAnnotation'] is not None]
-        multi_flag_mut = [v for v in annotated_mut_vars if len(v.get('observedVariantAnnotation').get('flags', [])) > 2]
-        assert len(multi_flag_mut) > 0, "Should have at least one small mutation variant with multiple flags"
-        for var in multi_flag_mut:
-            flags = var['observedVariantAnnotation'].get('flags', [])
-            if len(flags) > 2:
-                assert 'mutation_flag_1' in flags and 'mutation_flag_2' in flags, \
-                    f"Variant with multiple flags should have both mutation_flag_1 and mutation_flag_2, got {flags}"
-
-    def test_fusion_variants_have_multiple_transcript_annotations_loaded(self, loaded_reports) -> None:
-        """Test that variants with transcript flags have observedVariantAnnotation from transcript file."""
-        variants_section = get_section(loaded_reports['async'], 'variants')
-
-        # Find small mutation variants
-        svs = [v for v in variants_section if v['variantType'] == 'sv']
-        annotated_svs = [v for v in svs if 'observedVariantAnnotation' in v and v['observedVariantAnnotation'] is not None]
-
-        assert len(annotated_svs) > 0, "Should have annotated svs"

From 4e585abfadabc5bce7403dadcf40edc255c25505 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Wed, 18 Mar 2026 15:48:47 -0700
Subject: [PATCH 14/64] fix sigv pto autogeneration

---
 pori_python/ipr/util.py     |  5 +++++
 tests/test_ipr/test_util.py | 17 +++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/pori_python/ipr/util.py b/pori_python/ipr/util.py
index 69ac7024..52d8fe1c 100644
--- a/pori_python/ipr/util.py
+++ b/pori_python/ipr/util.py
@@ -61,6 +61,11 @@ def create_variant_name_tuple(variant: IprVariant) -> Tuple[str, str]:
         return (gene, str(variant.get('expressionState', '')))
     elif variant_type == 'cnv':
         return (gene, str(variant.get('cnvState', '')))
+    elif variant_type == 'sigv':
+        return (
+            variant.get('signatureName', variant.get('displayName')),
+            str(variant.get('variantTypeName', '')),
+        )
     variant_split = (
         variant['variant'].split(':', 1)[1] if ':' in variant['variant'] else variant['variant']
     )
diff --git a/tests/test_ipr/test_util.py b/tests/test_ipr/test_util.py
index bbae6d98..100cfc3d 100644
--- a/tests/test_ipr/test_util.py
+++ b/tests/test_ipr/test_util.py
@@ -27,3 +27,20 @@ def test_create_variant_name_tuple(variant, result):
     gene, name = create_variant_name_tuple(variant)
     assert name == result
     assert gene == 'GENE'
+
+
+def test_create_signature_variant_name_tuple():
+    v1 = {
+        'variantType': 'sigv',
+        'displayName': 'test signature signature present',
+        'signatureName': 'test signature',
+        'variantTypeName': 'signature present',
+    }
+    gene, name = create_variant_name_tuple(v1)
+    assert name == 'signature present'
+    assert gene == 'test signature'
+
+    v2 = {'variantType': 'sigv', 'displayName': 'test signature signature present'}
+    gene, name = create_variant_name_tuple(v2)
+    assert name == ''
+    assert gene == 'test signature signature present'

From 50539e15fb03fad9d934504ff6ff8108ebb096c1 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Fri, 20 Mar 2026 08:39:27 -0700
Subject: [PATCH 15/64] combine tests

---
 tests/test_ipr/test_util.py | 46 +++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/tests/test_ipr/test_util.py b/tests/test_ipr/test_util.py
index 100cfc3d..9208f51e 100644
--- a/tests/test_ipr/test_util.py
+++ b/tests/test_ipr/test_util.py
@@ -17,30 +17,32 @@ def test_trim_empty_values(input, output_keys):
     [
         [
             {'variantType': 'exp', 'gene': 'GENE', 'expressionState': 'increased expression'},
-            'increased expression',
+            ('GENE', 'increased expression'),
+        ],
+        [
+            {'variantType': 'cnv', 'gene': 'GENE', 'cnvState': 'amplification'},
+            ('GENE', 'amplification'),
+        ],
+        [
+            {'variantType': 'other', 'gene2': 'GENE', 'variant': 'GENE:anything'},
+            ('GENE', 'anything'),
+        ],
+        [
+            {'variantType': 'sigv', 'displayName': 'test signature signature present'},
+            ('test signature signature present', ''),
+        ],
+        [
+            {
+                'variantType': 'sigv',
+                'displayName': 'test signature signature present',
+                'signatureName': 'test signature',
+                'variantTypeName': 'signature present',
+            },
+            ('test signature', 'signature present'),
         ],
-        [{'variantType': 'cnv', 'gene': 'GENE', 'cnvState': 'amplification'}, 'amplification'],
-        [{'variantType': 'other', 'gene2': 'GENE', 'variant': 'GENE:anything'}, 'anything'],
     ],
 )
 def test_create_variant_name_tuple(variant, result):
     gene, name = create_variant_name_tuple(variant)
-    assert name == result
-    assert gene == 'GENE'
-
-
-def test_create_signature_variant_name_tuple():
-    v1 = {
-        'variantType': 'sigv',
-        'displayName': 'test signature signature present',
-        'signatureName': 'test signature',
-        'variantTypeName': 'signature present',
-    }
-    gene, name = create_variant_name_tuple(v1)
-    assert name == 'signature present'
-    assert gene == 'test signature'
-
-    v2 = {'variantType': 'sigv', 'displayName': 'test signature signature present'}
-    gene, name = create_variant_name_tuple(v2)
-    assert name == ''
-    assert gene == 'test signature signature present'
+    assert gene == result[0]
+    assert name == result[1]

From 221b52e077c0f145f539593cf067b5e546e868b5 Mon Sep 17 00:00:00 2001
From: Shirley Shu <147874967+sshugsc@users.noreply.github.com>
Date: Tue, 24 Mar 2026 09:59:41 -0700
Subject: [PATCH 16/64] make it more readable

---
 pori_python/ipr/connection.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index 1cfd260a..92c5c7a0 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -108,7 +108,10 @@ def upload_report(
             projects = self.get("project")
             project_names = [item["name"] for item in projects]
             project_users = {
-                item["name"]: [user["username"] for user in item.get("users", [])]
+                item["name"]: [
+                    user["username"] 
+                    for user in item.get("users", [])
+                ]
                 for item in projects
             }
 

From cf84f961c76839517549ec3af466e4b3acbd8b92 Mon Sep 17 00:00:00 2001
From: sshugsc <sshu@bcgsc.ca>
Date: Tue, 24 Mar 2026 10:05:07 -0700
Subject: [PATCH 17/64] lint

---
 pori_python/ipr/connection.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index e40ba0d4..317ae90b 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -105,13 +105,10 @@ def upload_report(
             # or 'report'. jobStatus is no longer available once the report is successfully
             # uploaded.
 
-            projects = self.get("project")
-            project_names = [item["name"] for item in projects]
+            projects = self.get('project')
+            project_names = [item['name'] for item in projects]
             project_users = {
-                item["name"]: [
-                    user["username"] 
-                    for user in item.get("users", [])
-                ]
+                item['name']: [user['username'] for user in item.get('users', [])]
                 for item in projects
             }
 
@@ -125,9 +122,9 @@ def upload_report(
                 except Exception as err:
                     raise Exception(f'Project creation failed due to {err}')
 
-            if self.username not in project_users[content["project"]]:
+            if self.username not in project_users[content['project']]:
                 raise Exception(
-                    f"User have no permission to create report in project {content['project']}"
+                    f'User have no permission to create report in project {content["project"]}'
                 )
 
             if ignore_extra_fields:

From e76e467b5f928ce57d659abdb0234b864f10d0a3 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Wed, 1 Apr 2026 17:26:00 -0700
Subject: [PATCH 18/64] fix unsorted list, tsv header option, missing dupe
 transcript flags

---
 pori_python/ipr/ipr.py      | 26 +++++++++++++--------
 pori_python/ipr/main.py     | 25 +++++++++++++++++---
 tests/test_ipr/test_ipr.py  | 46 +++++++++++++++++++++++++++++++++++++
 tests/test_ipr/test_main.py | 28 +++++++++++++++++++++-
 4 files changed, 111 insertions(+), 14 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index ca1f1c6b..a251009e 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -740,16 +740,21 @@ def ensure_str_list(val):
 
 
 def add_transcript_flags(variant_sources, transcript_flags_df):
-    lookup = dict(zip(transcript_flags_df['transcript'], transcript_flags_df['flags']))
+    lookup = {}
+    for _, row in transcript_flags_df[['transcript', 'flags']].dropna(subset=['transcript']).iterrows():
+        transcript = row['transcript']
+        transcript_flags = lookup.setdefault(transcript, [])
+        for flag in ensure_str_list(str(row['flags'])):
+            if flag not in transcript_flags:
+                transcript_flags.append(flag)
+    import pdb; pdb.set_trace()
 
     for record in variant_sources:
-        flags_str = lookup.get(record.get('transcript'))
-        if not flags_str:
+        transcript_flags = lookup.get(record.get('transcript'))
+        if not transcript_flags:
             continue
-        # Split on commas and strip whitespace
-        new_flags = ensure_str_list(str(flags_str))
         flags = ensure_str_list(record.setdefault('flags', []))
-        for new_flag in new_flags:
+        for new_flag in transcript_flags:
             if new_flag not in flags:
                 flags.append(new_flag)
         record['flags'] = flags
@@ -762,11 +767,11 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
 
         for key, label in label_map.items():
             transcript = record.get(key)
-            flags_str = lookup.get(transcript)
-            if not flags_str:
+            transcript_flags = lookup.get(transcript)
+            if not transcript_flags:
                 continue
 
-            for flag in ensure_str_list(str(flags_str)):
+            for flag in transcript_flags:
                 new_flag = f'{flag} ({label})'
                 if new_flag not in flags:
                     flags.append(new_flag)
@@ -780,12 +785,13 @@ def get_variant_flags(variant_sources):
         raw_flags = item.get('flags')
         if not raw_flags:  # skips None and ''
             continue
+        unique_flags = list(dict.fromkeys(f for f in ensure_str_list(raw_flags) if f))
         # create record, removing dupes from flags list
         flags.append(
             {
                 'variant': item['key'],
                 'variantType': item['variantType'],
-                'flags': list(set([f for f in ensure_str_list(raw_flags) if f])),
+                'flags': unique_flags,
             }
         )
         item.pop('flags', None)  # remove after extraction
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index c94721a3..4b87d015 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -72,6 +72,27 @@ def timestamp() -> str:
     return datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
 
 
+def load_transcript_flags(path: str) -> pd.DataFrame:
+    transcript_flags_df = pd.read_csv(
+        path,
+        sep='\t',
+        names=['gene', 'transcript', 'flags'],
+        dtype=str,
+        keep_default_na=False,
+    )
+    if transcript_flags_df.empty:
+        return transcript_flags_df
+
+    first_row = transcript_flags_df.iloc[0]
+    if [str(first_row[col]).strip().lower() for col in ['gene', 'transcript', 'flags']] == [
+        'gene',
+        'transcript',
+        'flags',
+    ]:
+        transcript_flags_df = transcript_flags_df.iloc[1:].reset_index(drop=True)
+    return transcript_flags_df
+
+
 def command_interface() -> None:
     """Parse the ipr command from user input based on usage pattern.
     Parsed arguments are used to call the ipr_report() function.
@@ -398,9 +419,7 @@ def ipr_report(
 
     transcript_flags_df = None
     if transcript_flags:
-        transcript_flags_df = pd.read_csv(
-            transcript_flags, sep='\t', names=['gene', 'transcript', 'flags']
-        )
+        transcript_flags_df = load_transcript_flags(transcript_flags)
 
     # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS)
     signature_variants: List[IprSignatureVariant] = preprocess_signature_variants(
diff --git a/tests/test_ipr/test_ipr.py b/tests/test_ipr/test_ipr.py
index 68ea2ccc..687c14b4 100644
--- a/tests/test_ipr/test_ipr.py
+++ b/tests/test_ipr/test_ipr.py
@@ -468,6 +468,20 @@ def test_add_transcript_flags_basic_strips_whitespace_from_comma_separated_flags
         result = add_transcript_flags(variant_sources, df)
         assert set(result[0]['flags']) == {'flag_c', 'flag_d'}
 
+    def test_add_transcript_flags_basic_accumulates_duplicate_transcript_rows(self):
+        variant_sources = [
+            {'transcript': 'T5', 'key': 'k5', 'variantType': 'mut'},
+        ]
+        df = pd.DataFrame(
+            {
+                'gene': ['ENSG1', 'ENSG2'],
+                'transcript': ['T5', 'T5'],
+                'flags': ['flag_a', 'flag_b, flag_c'],
+            }
+        )
+        result = add_transcript_flags(variant_sources, df)
+        assert result[0]['flags'] == ['flag_a', 'flag_b', 'flag_c']
+
     def test_add_transcript_flags_fusions_tags_cterm_flags(self):
         variant_sources = [
             {
@@ -506,6 +520,25 @@ def test_add_transcript_flags_fusions_tags_nterm_flags(self):
         flags = result[0]['flags']
         assert 'nterm_flag (nterm)' in flags
 
+    def test_add_transcript_flags_fusions_accumulates_duplicate_transcript_rows(self):
+        variant_sources = [
+            {
+                'key': 'f2',
+                'variantType': 'fusion',
+                'ctermTranscript': 'CT2',
+                'ntermTranscript': 'NT2',
+            }
+        ]
+        df = pd.DataFrame(
+            {
+                'gene': ['ENSG3', 'ENSG4'],
+                'transcript': ['CT2', 'CT2'],
+                'flags': ['cterm_flag_a', 'cterm_flag_b'],
+            }
+        )
+        result = add_transcript_flags(variant_sources, df)
+        assert result[0]['flags'] == ['cterm_flag_a (cterm)', 'cterm_flag_b (cterm)']
+
     def test_get_variant_flags_converts_string_flags_to_records(self):
         variants = [
             {'key': 'k1', 'variantType': 'mut', 'flags': 'foo'},
@@ -521,6 +554,19 @@ def test_get_variant_flags_deduplicates_and_removes_empty_strings(self):
         out = get_variant_flags(variants)
         assert any(item['variant'] == 'k2' and set(item['flags']) == {'bar'} for item in out)
 
+    def test_get_variant_flags_preserves_input_flag_order_when_deduplicating(self):
+        variants = [
+            {'key': 'k5', 'variantType': 'mut', 'flags': ['flag_b', 'flag_a', 'flag_b', 'flag_c']},
+        ]
+        out = get_variant_flags(variants)
+        assert out == [
+            {
+                'variant': 'k5',
+                'variantType': 'mut',
+                'flags': ['flag_b', 'flag_a', 'flag_c'],
+            }
+        ]
+
     def test_get_variant_flags_skips_null_flags(self):
         variants = [
             {'key': 'k3', 'variantType': 'mut', 'flags': None},
diff --git a/tests/test_ipr/test_main.py b/tests/test_ipr/test_main.py
index 8fe585cd..bf1fe042 100644
--- a/tests/test_ipr/test_main.py
+++ b/tests/test_ipr/test_main.py
@@ -7,7 +7,7 @@
 from unittest.mock import MagicMock, patch
 
 from pori_python.ipr.connection import IprConnection
-from pori_python.ipr.main import command_interface
+from pori_python.ipr.main import command_interface, load_transcript_flags
 from pori_python.types import IprGene
 
 from .constants import EXCLUDE_INTEGRATION_TESTS
@@ -28,6 +28,32 @@ def get_test_file(name: str) -> str:
     return os.path.join(os.path.dirname(__file__), 'test_data', name)
 
 
+class TestLoadTranscriptFlags:
+    def test_accepts_file_without_header(self, tmp_path) -> None:
+        transcript_flags_file = tmp_path / 'transcript_flags.tsv'
+        transcript_flags_file.write_text('ENSG1\tENST1\tflag_a\nENSG2\tENST2\tflag_b, flag_c\n')
+
+        result = load_transcript_flags(str(transcript_flags_file))
+
+        assert result.to_dict(orient='records') == [
+            {'gene': 'ENSG1', 'transcript': 'ENST1', 'flags': 'flag_a'},
+            {'gene': 'ENSG2', 'transcript': 'ENST2', 'flags': 'flag_b, flag_c'},
+        ]
+
+    def test_accepts_file_with_header(self, tmp_path) -> None:
+        transcript_flags_file = tmp_path / 'transcript_flags.tsv'
+        transcript_flags_file.write_text(
+            'gene\ttranscript\tflags\nENSG1\tENST1\tflag_a\nENSG2\tENST2\tflag_b\n'
+        )
+
+        result = load_transcript_flags(str(transcript_flags_file))
+
+        assert result.to_dict(orient='records') == [
+            {'gene': 'ENSG1', 'transcript': 'ENST1', 'flags': 'flag_a'},
+            {'gene': 'ENSG2', 'transcript': 'ENST2', 'flags': 'flag_b'},
+        ]
+
+
 @pytest.fixture(scope='module')
 def report_upload_content(tmp_path_factory) -> Dict:
     mock = MagicMock()

From d3bd260398b28f51f4d4284058ccd46f876e8482 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Wed, 1 Apr 2026 17:26:54 -0700
Subject: [PATCH 19/64] fix json format error

---
 pori_python/ipr/content.spec.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
index 9f031899..2a092db7 100644
--- a/pori_python/ipr/content.spec.json
+++ b/pori_python/ipr/content.spec.json
@@ -1191,7 +1191,8 @@
                         "description": "the type of underlying structural variant",
                         "example": "deletion",
                         "type": "string"
-                    },                    "exon1": {
+                    },
+                    "exon1": {
                         "description": "the 5' (n-terminal) exon",
                         "example": 1,
                         "type": [

From 588b097153e598e6c74665dabf6005c02748b401 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Apr 2026 15:23:24 -0700
Subject: [PATCH 20/64] expect two-col transcript csv, other minor fixes

---
 pori_python/ipr/inputs.py     |  2 +-
 pori_python/ipr/main.py       | 12 ++++--------
 tests/test_ipr/test_upload.py |  8 ++++----
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index 568571b2..68f7759a 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -171,7 +171,7 @@
 SIGV_REQ = ['signatureName', 'variantTypeName']
 SIGV_COSMIC = ['signature']  # 1st element used as signatureName key
 SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
-SIGV_OPTIONAL = ['displayName', 'flags']
+SIGV_OPTIONAL = ['displayName']
 SIGV_KEY = SIGV_REQ[:]
 
 
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 4b87d015..e647f4ab 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -76,7 +76,7 @@ def load_transcript_flags(path: str) -> pd.DataFrame:
     transcript_flags_df = pd.read_csv(
         path,
         sep='\t',
-        names=['gene', 'transcript', 'flags'],
+        names=['transcript', 'flags'],
         dtype=str,
         keep_default_na=False,
     )
@@ -84,8 +84,7 @@ def load_transcript_flags(path: str) -> pd.DataFrame:
         return transcript_flags_df
 
     first_row = transcript_flags_df.iloc[0]
-    if [str(first_row[col]).strip().lower() for col in ['gene', 'transcript', 'flags']] == [
-        'gene',
+    if [str(first_row[col]).strip().lower() for col in ['transcript', 'flags']] == [
         'transcript',
         'flags',
     ]:
@@ -185,7 +184,7 @@ def command_interface() -> None:
         '--transcript_flags',
         required=False,
         type=file_path,
-        help='TSV without header, with columns: gene, transcript, comma-separated list of flags',
+        help='TSV without header, with two columns: transcripts and flags (comma-separated list of flags eg "MANE"). If header is included, it will be skipped. Flags will be added to any observed variants with matching transcript in the report upload; so, the same transcript identifiers should be used in this csv as are used in the input variants.',
     )
     args = parser.parse_args()
 
@@ -604,7 +603,7 @@ def ipr_report(
                 for s in filter_structural_variants(
                     structural_variants, gkb_matches, gene_information
                 )
-            ],  # TODO NB are we omitting non-matched sv's?
+            ],
             'signatureVariants': [trim_empty_values(s) for s in signature_variants],
             'genes': gene_information,
             'genomicAlterationsIdentified': key_alterations,
@@ -615,9 +614,6 @@ def ipr_report(
         }
     )
 
-    # TODO there are 13 outliers in the test data; if even only three are matched, why are only those three
-    # shown in the expression section? shouldn't we be seeing the non-kbmatched vars there as well?
-
     output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants))
 
     # if input includes hrdScore field, that is ok to pass to db
diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py
index 70837bf3..6cf7b789 100644
--- a/tests/test_ipr/test_upload.py
+++ b/tests/test_ipr/test_upload.py
@@ -35,11 +35,11 @@ def get_test_transcript_flags(json_contents) -> pd.DataFrame:
     """creates a dataframe of transcript flags for test purposes, based on the input json contents"""
     transcript_flags = []
     for item in json_contents['structuralVariants']:
-        transcript_flags.append((item['gene1'], item['ntermTranscript'], 'TRANSCRIPT FLAG'))
-        transcript_flags.append((item['gene2'], item['ctermTranscript'], 'TRANSCRIPT FLAG'))
+        transcript_flags.append((item['ntermTranscript'], 'TRANSCRIPT FLAG'))
+        transcript_flags.append((item['ctermTranscript'], 'TRANSCRIPT FLAG'))
     for item in json_contents['smallMutations']:
-        transcript_flags.append((item['gene'], item['transcript'], 'TRANSCRIPT FLAG'))
-    df = pd.DataFrame(transcript_flags, columns=['gene', 'transcript', 'flags'])
+        transcript_flags.append((item['transcript'], 'TRANSCRIPT FLAG'))
+    df = pd.DataFrame(transcript_flags, columns=['transcript', 'flags'])
     df = df.drop_duplicates()
     return df
 

From 14474b9174aa07422d0ac07352d1f5d2012654c5 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Apr 2026 15:26:53 -0700
Subject: [PATCH 21/64] add to docstring

---
 pori_python/ipr/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index e647f4ab..1eddebfc 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -377,6 +377,7 @@ def ipr_report(
         include_nonspecific_template: if include_ipr_variant_text is True, if no template match is found use template-nonspecific variant comment
         allow_partial_matches: allow matches to statements where not all conditions are satisfied
         tmb_high: mutation burden threshold/cutoff to qualify as 'high'
+        transcript_flags: path to a tsv file with two columns (no header) of transcript identifiers and flags to be added to any observed variants with matching transcript in the report upload. If header is included, it will be skipped. Flags will be added to any observed variants with matching transcript in the report upload; so, the same transcript identifiers should be used in this csv as are used in the input variants.
     Returns:
         ipr_conn.upload_report return dictionary
     """

From 7766f80423ea3c9e48a47128ad5b7fc1b221b991 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Apr 2026 16:07:32 -0700
Subject: [PATCH 22/64] minor refactor to transcript flags func

---
 pori_python/ipr/ipr.py | 51 +++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index a251009e..17f809cb 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -741,41 +741,42 @@ def ensure_str_list(val):
 
 def add_transcript_flags(variant_sources, transcript_flags_df):
     lookup = {}
+
+    # create transcript:flags dict from input df
     for _, row in transcript_flags_df[['transcript', 'flags']].dropna(subset=['transcript']).iterrows():
         transcript = row['transcript']
-        transcript_flags = lookup.setdefault(transcript, [])
+        flags = lookup.setdefault(transcript, [])
         for flag in ensure_str_list(str(row['flags'])):
-            if flag not in transcript_flags:
-                transcript_flags.append(flag)
-    import pdb; pdb.set_trace()
+            if flag not in flags:
+                flags.append(flag)
 
-    for record in variant_sources:
-        transcript_flags = lookup.get(record.get('transcript'))
-        if not transcript_flags:
-            continue
-        flags = ensure_str_list(record.setdefault('flags', []))
-        for new_flag in transcript_flags:
-            if new_flag not in flags:
-                flags.append(new_flag)
-        record['flags'] = flags
-
-    # fusions: check both transcripts for flags and add to the same record
+    # for fusions: check both transcripts for flags and add to the same record
     label_map = {'ctermTranscript': 'cterm', 'ntermTranscript': 'nterm'}
 
+    # single pass: add plain transcript flags and labeled fusion transcript flags
     for record in variant_sources:
         flags = ensure_str_list(record.setdefault('flags', []))
 
-        for key, label in label_map.items():
-            transcript = record.get(key)
-            transcript_flags = lookup.get(transcript)
-            if not transcript_flags:
-                continue
+        if record.get('transcript'):
+            # non-fusion: plain transcript only, no cterm/nterm
+            transcript_flags = lookup.get(record['transcript'])
+            if transcript_flags:
+                for new_flag in transcript_flags:
+                    if new_flag not in flags:
+                        flags.append(new_flag)
+        else:
+            # fusion: check cterm/nterm transcripts with labels
+            for key, label in label_map.items():
+                transcript = record.get(key)
+                transcript_flags = lookup.get(transcript)
+                if not transcript_flags:
+                    continue
+                for flag in transcript_flags:
+                    new_flag = f'{flag} ({label})'
+                    if new_flag not in flags:
+                        flags.append(new_flag)
 
-            for flag in transcript_flags:
-                new_flag = f'{flag} ({label})'
-                if new_flag not in flags:
-                    flags.append(new_flag)
-                record['flags'] = flags
+        record['flags'] = flags
     return variant_sources
 
 

From 1d7e84e92ca574743b7ac59da1e356e5563f38ae Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Apr 2026 16:09:07 -0700
Subject: [PATCH 23/64] minor tscpt func refactor; minor dry-ing in main

---
 pori_python/ipr/ipr.py  |  4 +++-
 pori_python/ipr/main.py | 32 ++++++++++++++++++--------------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index 17f809cb..f333a0ce 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -743,7 +743,9 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
     lookup = {}
 
     # create transcript:flags dict from input df
-    for _, row in transcript_flags_df[['transcript', 'flags']].dropna(subset=['transcript']).iterrows():
+    for _, row in (
+        transcript_flags_df[['transcript', 'flags']].dropna(subset=['transcript']).iterrows()
+    ):
         transcript = row['transcript']
         flags = lookup.setdefault(transcript, [])
         for flag in ensure_str_list(str(row['flags'])):
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 1eddebfc..f2140a0d 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -567,19 +567,6 @@ def ipr_report(
         gkb_matches, all_variants, kb_matched_sections['kbMatches']
     )
 
-    variant_sources = [
-        v
-        for source in [
-            [v for v in small_mutations if v['gene'] in genes_with_variants],
-            [v for v in copy_variants if v['gene'] in genes_with_variants],
-            [v for v in expression_variants if v['gene'] in genes_with_variants],
-            signature_variants,
-            filter_structural_variants(structural_variants, gkb_matches, gene_information),
-        ]
-        for v in source
-    ]
-    observed_vars_section = get_variant_flags(variant_sources)
-
     # OUTPUT CONTENT
     # thread safe deep-copy the original content
     output = json.loads(json.dumps(content))
@@ -611,10 +598,27 @@ def ipr_report(
             'variantCounts': variant_counts,
             'analystComments': comments,
             'therapeuticTarget': targets,
-            'observedVariantAnnotations': observed_vars_section,
         }
     )
 
+    # ADD OBSERVED VARIANT ANNOTATIONS SECTION
+    annotatable_variant_sources = [
+        v
+        for source in [
+            output[section]
+            for section in [
+                'smallMutations',
+                'copyVariants',
+                'expressionVariants',
+                'structuralVariants',
+            ]
+            if section in output
+        ]
+        for v in source
+    ]
+
+    output['observedVariantAnnotations'] = get_variant_flags(annotatable_variant_sources)
+
     output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants))
 
     # if input includes hrdScore field, that is ok to pass to db

From 16ae96a7fceb40c36abc30b64a2c6cdde7f52eb7 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Apr 2026 16:32:04 -0700
Subject: [PATCH 24/64] add docstring

---
 pori_python/ipr/ipr.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index f333a0ce..982faf0b 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -740,6 +740,13 @@ def ensure_str_list(val):
 
 
 def add_transcript_flags(variant_sources, transcript_flags_df):
+    """
+    Add flags from the input transcript_flags_df to the variant_sources
+    records based on matching transcript keys.
+     - For non-fusion records with a 'transcript' field, add flags directly based on that field.
+     - For fusion records without a 'transcript' field but with 'ctermTranscript' and
+        'ntermTranscript' fields, add flags based on both transcripts with appropriate labeling
+     """
     lookup = {}
 
     # create transcript:flags dict from input df

From a5490d5fcad17795028e7644d57a4aa4e61203d1 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Tue, 14 Apr 2026 16:59:05 -0700
Subject: [PATCH 25/64] fix transcript_csv handling tests

---
 tests/test_ipr/test_main.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_ipr/test_main.py b/tests/test_ipr/test_main.py
index bf1fe042..9d7df463 100644
--- a/tests/test_ipr/test_main.py
+++ b/tests/test_ipr/test_main.py
@@ -31,26 +31,26 @@ def get_test_file(name: str) -> str:
 class TestLoadTranscriptFlags:
     def test_accepts_file_without_header(self, tmp_path) -> None:
         transcript_flags_file = tmp_path / 'transcript_flags.tsv'
-        transcript_flags_file.write_text('ENSG1\tENST1\tflag_a\nENSG2\tENST2\tflag_b, flag_c\n')
+        transcript_flags_file.write_text('ENST1\tflag_a\nENST2\tflag_b, flag_c\n')
 
         result = load_transcript_flags(str(transcript_flags_file))
 
         assert result.to_dict(orient='records') == [
-            {'gene': 'ENSG1', 'transcript': 'ENST1', 'flags': 'flag_a'},
-            {'gene': 'ENSG2', 'transcript': 'ENST2', 'flags': 'flag_b, flag_c'},
+            {'transcript': 'ENST1', 'flags': 'flag_a'},
+            {'transcript': 'ENST2', 'flags': 'flag_b, flag_c'},
         ]
 
     def test_accepts_file_with_header(self, tmp_path) -> None:
         transcript_flags_file = tmp_path / 'transcript_flags.tsv'
         transcript_flags_file.write_text(
-            'gene\ttranscript\tflags\nENSG1\tENST1\tflag_a\nENSG2\tENST2\tflag_b\n'
+            'transcript\tflags\nENST1\tflag_a\nENST2\tflag_b\n'
         )
 
         result = load_transcript_flags(str(transcript_flags_file))
 
         assert result.to_dict(orient='records') == [
-            {'gene': 'ENSG1', 'transcript': 'ENST1', 'flags': 'flag_a'},
-            {'gene': 'ENSG2', 'transcript': 'ENST2', 'flags': 'flag_b'},
+            {'transcript': 'ENST1', 'flags': 'flag_a'},
+            {'transcript': 'ENST2', 'flags': 'flag_b'},
         ]
 
 

From dca2c94cb5467c5cdffbddf0d58936bb4b3606c1 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Fri, 17 Apr 2026 09:44:16 -0700
Subject: [PATCH 26/64] clearer docstring, formatting

---
 pori_python/ipr/ipr.py      | 2 +-
 pori_python/ipr/main.py     | 4 ++--
 tests/test_ipr/test_main.py | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
index 982faf0b..9a9afb38 100644
--- a/pori_python/ipr/ipr.py
+++ b/pori_python/ipr/ipr.py
@@ -746,7 +746,7 @@ def add_transcript_flags(variant_sources, transcript_flags_df):
      - For non-fusion records with a 'transcript' field, add flags directly based on that field.
      - For fusion records without a 'transcript' field but with 'ctermTranscript' and
         'ntermTranscript' fields, add flags based on both transcripts with appropriate labeling
-     """
+    """
     lookup = {}
 
     # create transcript:flags dict from input df
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index f2140a0d..3319c811 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -184,7 +184,7 @@ def command_interface() -> None:
         '--transcript_flags',
         required=False,
         type=file_path,
-        help='TSV without header, with two columns: transcripts and flags (comma-separated list of flags eg "MANE"). If header is included, it will be skipped. Flags will be added to any observed variants with matching transcript in the report upload; so, the same transcript identifiers should be used in this csv as are used in the input variants.',
+        help='TSV without header, with two columns: transcripts and flags (comma-separated list of flags eg "MANE"). If header is included, it will be skipped. Matching uses direct string comparison, so transcript identifiers must match exactly, including version numbers (e.g., if input variants use ENST00000390477.1, this file must also use ENST00000390477.1, not ENST00000390477).',
     )
     args = parser.parse_args()
 
@@ -377,7 +377,7 @@ def ipr_report(
         include_nonspecific_template: if include_ipr_variant_text is True, if no template match is found use template-nonspecific variant comment
         allow_partial_matches: allow matches to statements where not all conditions are satisfied
         tmb_high: mutation burden threshold/cutoff to qualify as 'high'
-        transcript_flags: path to a tsv file with two columns (no header) of transcript identifiers and flags to be added to any observed variants with matching transcript in the report upload. If header is included, it will be skipped. Flags will be added to any observed variants with matching transcript in the report upload; so, the same transcript identifiers should be used in this csv as are used in the input variants.
+        transcript_flags: path to a tsv file with two columns (no header) of transcript identifiers and flags to be added to any observed variants with matching transcript in the report upload. If header is included, it will be skipped. Matching uses direct string comparison, so transcript identifiers must match exactly, including version numbers (e.g., if input variants use ENST00000390477.1, this file must also use ENST00000390477.1, not ENST00000390477).
     Returns:
         ipr_conn.upload_report return dictionary
     """
diff --git a/tests/test_ipr/test_main.py b/tests/test_ipr/test_main.py
index 9d7df463..fd8e8bb8 100644
--- a/tests/test_ipr/test_main.py
+++ b/tests/test_ipr/test_main.py
@@ -42,9 +42,7 @@ def test_accepts_file_without_header(self, tmp_path) -> None:
 
     def test_accepts_file_with_header(self, tmp_path) -> None:
         transcript_flags_file = tmp_path / 'transcript_flags.tsv'
-        transcript_flags_file.write_text(
-            'transcript\tflags\nENST1\tflag_a\nENST2\tflag_b\n'
-        )
+        transcript_flags_file.write_text('transcript\tflags\nENST1\tflag_a\nENST2\tflag_b\n')
 
         result = load_transcript_flags(str(transcript_flags_file))
 

From 3a0eec3a502678a5aaed94ce00477dc47a6eb7ef Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Fri, 17 Apr 2026 15:29:41 -0700
Subject: [PATCH 27/64] add seqqc section and test

---
 pori_python/ipr/content.spec.json |  96 ++++++++
 tests/test_ipr/test_upload.py     | 394 +++++++++++++++++-------------
 2 files changed, 319 insertions(+), 171 deletions(-)

diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
index 5a1793a2..711f9eb5 100644
--- a/pori_python/ipr/content.spec.json
+++ b/pori_python/ipr/content.spec.json
@@ -892,6 +892,102 @@
             "example": "POG",
             "type": "string"
         },
+        "seqQC": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "reads": {
+                        "description": "Number of reads",
+                        "example": "2534M",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "bioQC": {
+                        "description": "Biological QC status",
+                        "example": "passed",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "labQC": {
+                        "description": "Lab QC status",
+                        "example": "passed",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "sample": {
+                        "description": "Sample identifier, e.g. Tumour DNA, Constitutional DNA",
+                        "example": "Tumour DNA",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "library": {
+                        "description": "Library identifier",
+                        "example": "LIB0001",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "coverage": {
+                        "description": "Sequencing coverage",
+                        "example": "80x",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "inputNg": {
+                        "description": "Input amount in nanograms",
+                        "example": "500",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "inputUg": {
+                        "description": "Input amount in micrograms",
+                        "example": "0.5",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "protocol": {
+                        "description": "Sequencing protocol",
+                        "example": "WGS",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "sampleName": {
+                        "description": "Full sample name",
+                        "example": "SAMPLE1-FF-1",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    },
+                    "duplicateReadsPerc": {
+                        "description": "Percentage of duplicate reads",
+                        "example": "12.3",
+                        "type": [
+                            "string",
+                            "null"
+                        ]
+                    }
+                }
+            }
+        },
         "smallMutations": {
             "default": [],
             "items": {
diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py
index 2c6fb73c..80bdd4b5 100644
--- a/tests/test_ipr/test_upload.py
+++ b/tests/test_ipr/test_upload.py
@@ -13,100 +13,132 @@
 
 from .constants import EXCLUDE_INTEGRATION_TESTS
 
-EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1'
-EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1'
-INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1'
-DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1'
+EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1"
+EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1"
+INCLUDE_UPLOAD_TESTS = os.environ.get("INCLUDE_UPLOAD_TESTS", "0") == "1"
+DELETE_UPLOAD_TEST_REPORTS = os.environ.get("DELETE_UPLOAD_TEST_REPORTS", "1") == "1"
 
 
 def get_test_spec():
-    ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}}
+    ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}}
     ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__
     for key in ipr_gene_keys:
-        ipr_spec['components']['schemas']['genesCreate']['properties'][key] = ''
+        ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = ""
     return ipr_spec
 
 
 def get_test_file(name: str) -> str:
-    return os.path.join(os.path.dirname(__file__), 'test_data', name)
+    return os.path.join(os.path.dirname(__file__), "test_data", name)
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def loaded_reports(tmp_path_factory) -> Generator:
-    json_file = tmp_path_factory.mktemp('inputs') / 'content.json'
-    async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json'
-    patient_id = f'TEST_{str(uuid.uuid4())}'
-    async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}'
+    json_file = tmp_path_factory.mktemp("inputs") / "content.json"
+    async_json_file = tmp_path_factory.mktemp("inputs") / "async_content.json"
+    patient_id = f"TEST_{str(uuid.uuid4())}"
+    async_patient_id = f"TEST_ASYNC_{str(uuid.uuid4())}"
     json_contents = {
-        'comparators': [
-            {'analysisRole': 'expression (disease)', 'name': '1'},
-            {'analysisRole': 'expression (primary site)', 'name': '2'},
-            {'analysisRole': 'expression (biopsy site)', 'name': '3'},
+        "comparators": [
+            {"analysisRole": "expression (disease)", "name": "1"},
+            {"analysisRole": "expression (primary site)", "name": "2"},
+            {"analysisRole": "expression (biopsy site)", "name": "3"},
             {
-                'analysisRole': 'expression (internal pancancer cohort)',
-                'name': '4',
+                "analysisRole": "expression (internal pancancer cohort)",
+                "name": "4",
             },
         ],
-        'patientId': patient_id,
-        'project': 'TEST',
-        'sampleInfo': [
+        "patientId": patient_id,
+        "project": "TEST",
+        "sampleInfo": [
             {
-                'sample': 'Constitutional',
-                'biopsySite': 'Normal tissue',
-                'sampleName': 'SAMPLE1-PB',
-                'primarySite': 'Blood-Peripheral',
-                'collectionDate': '11-11-11',
+                "sample": "Constitutional",
+                "biopsySite": "Normal tissue",
+                "sampleName": "SAMPLE1-PB",
+                "primarySite": "Blood-Peripheral",
+                "collectionDate": "11-11-11",
             },
             {
-                'sample': 'Tumour',
-                'pathoTc': '90%',
-                'biopsySite': 'hepatic',
-                'sampleName': 'SAMPLE2-FF-1',
-                'primarySite': 'Vena Cava-Hepatic',
-                'collectionDate': '12-12-12',
+                "sample": "Tumour",
+                "pathoTc": "90%",
+                "biopsySite": "hepatic",
+                "sampleName": "SAMPLE2-FF-1",
+                "primarySite": "Vena Cava-Hepatic",
+                "collectionDate": "12-12-12",
             },
         ],
-        'msi': [
+        "msi": [
             {
-                'score': 1000.0,
-                'kbCategory': 'microsatellite instability',
+                "score": 1000.0,
+                "kbCategory": "microsatellite instability",
             }
         ],
-        'hrd': {
-            'score': 9999.0,
-            'cutoff': 5,
+        "hrd": {
+            "score": 9999.0,
+            "cutoff": 5,
         },
-        'expressionVariants': json.loads(
-            pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json(orient='records')
+        "expressionVariants": json.loads(
+            pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json(
+                orient="records"
+            )
         ),
-        'smallMutations': json.loads(
-            pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json(
-                orient='records'
+        "smallMutations": json.loads(
+            pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json(
+                orient="records"
             )
         ),
-        'copyVariants': json.loads(
-            pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json(
-                orient='records'
+        "copyVariants": json.loads(
+            pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json(
+                orient="records"
             )
         ),
-        'structuralVariants': json.loads(
-            pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records')
+        "structuralVariants": json.loads(
+            pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json(
+                orient="records"
+            )
         ),
-        'kbDiseaseMatch': 'colorectal cancer',
-        'cosmicSignatures': pd.read_csv(
-            get_test_file('cosmic_variants.tab'), sep='\t'
+        "kbDiseaseMatch": "colorectal cancer",
+        "cosmicSignatures": pd.read_csv(
+            get_test_file("cosmic_variants.tab"), sep="\t"
         ).signature.tolist(),
-        'hlaTypes': json.loads(
-            pd.read_csv(get_test_file('hla_variants.tab'), sep='\t').to_json(orient='records')
+        "hlaTypes": json.loads(
+            pd.read_csv(get_test_file("hla_variants.tab"), sep="\t").to_json(
+                orient="records"
+            )
         ),
-        'images': [
+        "images": [
             {
-                'key': 'cnvLoh.circos',
-                'path': 'test/testData/images/cnvLoh.png',
-                'caption': 'Test adding a caption to an image',
+                "key": "cnvLoh.circos",
+                "path": "test/testData/images/cnvLoh.png",
+                "caption": "Test adding a caption to an image",
             }
         ],
-        'config': 'test config',
+        "seqQC": [
+            {
+                "sample": "Tumour DNA",
+                "reads": "2534M",
+                "library": "LIB0001",
+                "coverage": "80x",
+                "inputNg": "500",
+                "protocol": "WGS",
+                "sampleName": "SAMPLE2-FF-1",
+                "bioQC": "passed",
+                "labQC": "passed",
+                "duplicateReadsPerc": "12.3",
+            },
+            {
+                "sample": "Constitutional DNA",
+                "reads": "1200M",
+                "library": "LIB0002",
+                "coverage": "40x",
+                "inputNg": "300",
+                "protocol": "WGS",
+                "sampleName": "SAMPLE1-PB",
+                "bioQC": "passed",
+                "labQC": "passed",
+                "duplicateReadsPerc": "8.1",
+            },
+        ],
+        "config": "test config",
     }
 
     json_file.write_text(
@@ -116,7 +148,7 @@ def loaded_reports(tmp_path_factory) -> Generator:
         )
     )
 
-    json_contents['patientId'] = async_patient_id
+    json_contents["patientId"] = async_patient_id
     async_json_file.write_text(
         json.dumps(
             json_contents,
@@ -125,46 +157,46 @@ def loaded_reports(tmp_path_factory) -> Generator:
     )
 
     argslist = [
-        'ipr',
-        '--username',
-        os.environ.get('IPR_USER', os.environ['USER']),
-        '--password',
-        os.environ['IPR_PASS'],
-        '--graphkb_username',
-        os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])),
-        '--graphkb_password',
-        os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']),
-        '--ipr_url',
-        os.environ['IPR_TEST_URL'],
-        '--graphkb_url',
-        os.environ.get('GRAPHKB_URL', False),
-        '--therapeutics',
-        '--allow_partial_matches',
+        "ipr",
+        "--username",
+        os.environ.get("IPR_USER", os.environ["USER"]),
+        "--password",
+        os.environ["IPR_PASS"],
+        "--graphkb_username",
+        os.environ.get("GRAPHKB_USER", os.environ.get("IPR_USER", os.environ["USER"])),
+        "--graphkb_password",
+        os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]),
+        "--ipr_url",
+        os.environ["IPR_TEST_URL"],
+        "--graphkb_url",
+        os.environ.get("GRAPHKB_URL", False),
+        "--therapeutics",
+        "--allow_partial_matches",
     ]
 
     sync_argslist = argslist.copy()
-    sync_argslist.extend(['--content', str(json_file)])
-    with patch.object(sys, 'argv', sync_argslist):
-        with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()):
+    sync_argslist.extend(["--content", str(json_file)])
+    with patch.object(sys, "argv", sync_argslist):
+        with patch.object(IprConnection, "get_spec", return_value=get_test_spec()):
             command_interface()
 
     async_argslist = argslist.copy()
-    async_argslist.extend(['--content', str(async_json_file), '--async_upload'])
-    with patch.object(sys, 'argv', async_argslist):
-        with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()):
+    async_argslist.extend(["--content", str(async_json_file), "--async_upload"])
+    with patch.object(sys, "argv", async_argslist):
+        with patch.object(IprConnection, "get_spec", return_value=get_test_spec()):
             command_interface()
 
     ipr_conn = IprConnection(
-        username=os.environ.get('IPR_USER', os.environ['USER']),
-        password=os.environ['IPR_PASS'],
-        url=os.environ['IPR_TEST_URL'],
+        username=os.environ.get("IPR_USER", os.environ["USER"]),
+        password=os.environ["IPR_PASS"],
+        url=os.environ["IPR_TEST_URL"],
     )
-    loaded_report = ipr_conn.get(uri=f'reports?searchText={patient_id}')
-    async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}')
+    loaded_report = ipr_conn.get(uri=f"reports?searchText={patient_id}")
+    async_loaded_report = ipr_conn.get(uri=f"reports?searchText={async_patient_id}")
 
     loaded_reports_result = {
-        'sync': (patient_id, loaded_report),
-        'async': (async_patient_id, async_loaded_report),
+        "sync": (patient_id, loaded_report),
+        "async": (async_patient_id, async_loaded_report),
     }
     yield loaded_reports_result
     if DELETE_UPLOAD_TEST_REPORTS:
@@ -173,13 +205,13 @@ def loaded_reports(tmp_path_factory) -> Generator:
 
 
 def get_section(loaded_report, section_name):
-    ident = loaded_report[1]['reports'][0]['ident']
+    ident = loaded_report[1]["reports"][0]["ident"]
     ipr_conn = IprConnection(
-        username=os.environ.get('IPR_USER', os.environ['USER']),
-        password=os.environ['IPR_PASS'],
-        url=os.environ['IPR_TEST_URL'],
+        username=os.environ.get("IPR_USER", os.environ["USER"]),
+        password=os.environ["IPR_PASS"],
+        url=os.environ["IPR_TEST_URL"],
     )
-    return ipr_conn.get(uri=f'reports/{ident}/{section_name}')
+    return ipr_conn.get(uri=f"reports/{ident}/{section_name}")
 
 
 def stringify_sorted(obj):
@@ -192,7 +224,7 @@ def stringify_sorted(obj):
         obj.sort()
         return str(obj)
     elif isinstance(obj, dict):
-        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'):
+        for key in ("ident", "updatedAt", "createdAt", "deletedAt"):
             obj.pop(key, None)
         keys = obj.keys()
         for key in keys:
@@ -208,135 +240,145 @@ def stringify_sorted(obj):
 
 
 @pytest.mark.skipif(
-    not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance'
+    not INCLUDE_UPLOAD_TESTS, reason="excluding tests of upload to live ipr instance"
+)
+@pytest.mark.skipif(
+    EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
 )
-@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests')
 class TestCreateReport:
     def test_patient_id_loaded_once(self, loaded_reports) -> None:
-        sync_patient_id = loaded_reports['sync'][0]
-        assert loaded_reports['sync'][1]['total'] == 1
-        assert loaded_reports['sync'][1]['reports'][0]['patientId'] == sync_patient_id
-        async_patient_id = loaded_reports['async'][0]
-        assert loaded_reports['async'][1]['total'] == 1
-        assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id
+        sync_patient_id = loaded_reports["sync"][0]
+        assert loaded_reports["sync"][1]["total"] == 1
+        assert loaded_reports["sync"][1]["reports"][0]["patientId"] == sync_patient_id
+        async_patient_id = loaded_reports["async"][0]
+        assert loaded_reports["async"][1]["total"] == 1
+        assert loaded_reports["async"][1]["reports"][0]["patientId"] == async_patient_id
 
     def test_expression_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'expression-variants')
-        kbmatched = [item for item in section if item['kbMatches']]
-        assert 'PTP4A3' in [item['gene']['name'] for item in kbmatched]
-        async_section = get_section(loaded_reports['async'], 'expression-variants')
+        section = get_section(loaded_reports["sync"], "expression-variants")
+        kbmatched = [item for item in section if item["kbMatches"]]
+        assert "PTP4A3" in [item["gene"]["name"] for item in kbmatched]
+        async_section = get_section(loaded_reports["async"], "expression-variants")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_structural_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'structural-variants')
-        kbmatched = [item for item in section if item['kbMatches']]
-        assert '(EWSR1,FLI1):fusion(e.7,e.4)' in [item['displayName'] for item in kbmatched]
-        async_section = get_section(loaded_reports['async'], 'structural-variants')
+        section = get_section(loaded_reports["sync"], "structural-variants")
+        kbmatched = [item for item in section if item["kbMatches"]]
+        assert "(EWSR1,FLI1):fusion(e.7,e.4)" in [
+            item["displayName"] for item in kbmatched
+        ]
+        async_section = get_section(loaded_reports["async"], "structural-variants")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_small_mutations_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'small-mutations')
-        kbmatched = [item for item in section if item['kbMatches']]
-        assert 'FGFR2:p.R421C' in [item['displayName'] for item in kbmatched]
-        assert 'CDKN2A:p.T18M' in [item['displayName'] for item in kbmatched]
-        async_section = get_section(loaded_reports['async'], 'small-mutations')
+        section = get_section(loaded_reports["sync"], "small-mutations")
+        kbmatched = [item for item in section if item["kbMatches"]]
+        assert "FGFR2:p.R421C" in [item["displayName"] for item in kbmatched]
+        assert "CDKN2A:p.T18M" in [item["displayName"] for item in kbmatched]
+        async_section = get_section(loaded_reports["async"], "small-mutations")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_copy_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'copy-variants')
-        kbmatched = [item for item in section if item['kbMatches']]
-        assert ('ERBB2', 'amplification') in [
-            (item['gene']['name'], item['displayName']) for item in kbmatched
+        section = get_section(loaded_reports["sync"], "copy-variants")
+        kbmatched = [item for item in section if item["kbMatches"]]
+        assert ("ERBB2", "amplification") in [
+            (item["gene"]["name"], item["displayName"]) for item in kbmatched
         ]
-        async_section = get_section(loaded_reports['async'], 'copy-variants')
+        async_section = get_section(loaded_reports["async"], "copy-variants")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_signature_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'signature-variants')
-        kbmatched = [item for item in section if item['kbMatches']]
+        section = get_section(loaded_reports["sync"], "signature-variants")
+        kbmatched = [item for item in section if item["kbMatches"]]
         # Check for COSMIC signatures
-        assert ('SBS2', 'high signature') in [
-            (item['signatureName'], item['variantTypeName']) for item in kbmatched
+        assert ("SBS2", "high signature") in [
+            (item["signatureName"], item["variantTypeName"]) for item in kbmatched
         ]
         # Check for HRD signature (score 9999 > cutoff 5, so strong signature)
-        assert ('homologous recombination deficiency', 'strong signature') in [
-            (item['signatureName'], item['variantTypeName']) for item in kbmatched
+        assert ("homologous recombination deficiency", "strong signature") in [
+            (item["signatureName"], item["variantTypeName"]) for item in kbmatched
         ]
         # Check for MSI signature
-        assert ('microsatellite instability', 'high signature') in [
-            (item['signatureName'], item['variantTypeName']) for item in kbmatched
+        assert ("microsatellite instability", "high signature") in [
+            (item["signatureName"], item["variantTypeName"]) for item in kbmatched
         ]
-        async_section = get_section(loaded_reports['async'], 'signature-variants')
+        async_section = get_section(loaded_reports["async"], "signature-variants")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_hrd_score_in_report(self, loaded_reports) -> None:
         """Test that HRD score is present in the loaded report."""
-        report = loaded_reports['sync'][1]['reports'][0]
-        assert 'hrdScore' in report
-        assert report['hrdScore'] == 9999.0
+        report = loaded_reports["sync"][1]["reports"][0]
+        assert "hrdScore" in report
+        assert report["hrdScore"] == 9999.0
 
     def test_kb_matches_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'kb-matches')
+        section = get_section(loaded_reports["sync"], "kb-matches")
         observed_and_matched = set(
-            [(item['kbVariant'], item['variant']['displayName']) for item in section]
+            [(item["kbVariant"], item["variant"]["displayName"]) for item in section]
         )
         for pair in [
-            ('ERBB2 amplification', 'amplification'),
-            ('FGFR2 mutation', 'FGFR2:p.R421C'),
-            ('PTP4A3 overexpression', 'increased expression'),
-            ('EWSR1 and FLI1 fusion', '(EWSR1,FLI1):fusion(e.7,e.4)'),
-            ('CDKN2A mutation', 'CDKN2A:p.T18M'),
+            ("ERBB2 amplification", "amplification"),
+            ("FGFR2 mutation", "FGFR2:p.R421C"),
+            ("PTP4A3 overexpression", "increased expression"),
+            ("EWSR1 and FLI1 fusion", "(EWSR1,FLI1):fusion(e.7,e.4)"),
+            ("CDKN2A mutation", "CDKN2A:p.T18M"),
         ]:
             assert pair in observed_and_matched
-        async_section = get_section(loaded_reports['async'], 'kb-matches')
+        async_section = get_section(loaded_reports["async"], "kb-matches")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_therapeutic_targets_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'therapeutic-targets')
-        therapeutic_target_genes = set([item['gene'] for item in section])
-        for gene in ['CDKN2A', 'ERBB2', 'FGFR2', 'PTP4A3']:
+        section = get_section(loaded_reports["sync"], "therapeutic-targets")
+        therapeutic_target_genes = set([item["gene"] for item in section])
+        for gene in ["CDKN2A", "ERBB2", "FGFR2", "PTP4A3"]:
             assert gene in therapeutic_target_genes
-        async_section = get_section(loaded_reports['async'], 'therapeutic-targets')
+        async_section = get_section(loaded_reports["async"], "therapeutic-targets")
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_genomic_alterations_identified_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports['sync'], 'summary/genomic-alterations-identified')
-        variants = set([item['geneVariant'] for item in section])
+        section = get_section(
+            loaded_reports["sync"], "summary/genomic-alterations-identified"
+        )
+        variants = set([item["geneVariant"] for item in section])
         for variant in [
-            'FGFR2:p.R421C',
-            'PTP4A3 (high_percentile)',
-            'ERBB2 (Amplification)',
-            '(EWSR1,FLI1):fusion(e.7,e.4)',
-            'CDKN2A:p.T18M',
+            "FGFR2:p.R421C",
+            "PTP4A3 (high_percentile)",
+            "ERBB2 (Amplification)",
+            "(EWSR1,FLI1):fusion(e.7,e.4)",
+            "CDKN2A:p.T18M",
         ]:
             assert variant in variants
         async_section = get_section(
-            loaded_reports['async'], 'summary/genomic-alterations-identified'
+            loaded_reports["async"], "summary/genomic-alterations-identified"
         )
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_analyst_comments_loaded(self, loaded_reports) -> None:
-        sync_section = get_section(loaded_reports['sync'], 'summary/analyst-comments')
-        assert sync_section['comments']
-        async_section = get_section(loaded_reports['async'], 'summary/analyst-comments')
-        assert async_section['comments']
-        assert sync_section['comments'] == async_section['comments']
+        sync_section = get_section(loaded_reports["sync"], "summary/analyst-comments")
+        assert sync_section["comments"]
+        async_section = get_section(loaded_reports["async"], "summary/analyst-comments")
+        assert async_section["comments"]
+        assert sync_section["comments"] == async_section["comments"]
 
     def test_sample_info_loaded(self, loaded_reports) -> None:
-        sync_section = get_section(loaded_reports['sync'], 'sample-info')
-        async_section = get_section(loaded_reports['async'], 'sample-info')
-        async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(async_section)
+        sync_section = get_section(loaded_reports["sync"], "sample-info")
+        async_section = get_section(loaded_reports["async"], "sample-info")
+        async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(
+            async_section
+        )
         assert async_equals_sync
 
-    def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) -> None:
+    def test_multivariant_multiconditionset_statements_loaded(
+        self, loaded_reports
+    ) -> None:
         """
         Checks that multivariant statements and multiple condition sets prepared correctly
         by this package are handled as expected by the api.
@@ -348,31 +390,41 @@ def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports)
         are met.
         This is also a test of multiple condition sets since there are two variants
         in the test data that satisfy one of the conditions (the APC mutation)."""
-        section = get_section(loaded_reports['sync'], 'kb-matches/kb-matched-statements')
-        multivariant_stmts = [item for item in section if item['reference'] == 'pmid:27302369']
+        section = get_section(
+            loaded_reports["sync"], "kb-matches/kb-matched-statements"
+        )
+        multivariant_stmts = [
+            item for item in section if item["reference"] == "pmid:27302369"
+        ]
 
         # if this statement is entered more than once there may be multiple sets of records to
         # check, so to make sure the count checks work, go stmt_id by stmt_id:
-        stmt_ids = list(set([item['kbStatementId'] for item in multivariant_stmts]))
+        stmt_ids = list(set([item["kbStatementId"] for item in multivariant_stmts]))
         for stmt_id in stmt_ids:
-            stmts = [item for item in multivariant_stmts if item['kbStatementId'] == stmt_id]
+            stmts = [
+                item for item in multivariant_stmts if item["kbStatementId"] == stmt_id
+            ]
 
             # we expect three stmts, one for each condition set
             assert len(stmts) == 3
 
             # we expect each condition set to have two kb variants in it
             # we expect the two kb variants to be the same in each stmt
-            assert len(stmts[0]['kbMatches']) == 2
-            assert len(stmts[1]['kbMatches']) == 2
-            kbmatches1 = [item['kbVariant'] for item in stmts[0]['kbMatches']]
-            kbmatches2 = [item['kbVariant'] for item in stmts[1]['kbMatches']]
+            assert len(stmts[0]["kbMatches"]) == 2
+            assert len(stmts[1]["kbMatches"]) == 2
+            kbmatches1 = [item["kbVariant"] for item in stmts[0]["kbMatches"]]
+            kbmatches2 = [item["kbVariant"] for item in stmts[1]["kbMatches"]]
             kbmatches1.sort()
             kbmatches2.sort()
-            assert kbmatches1 == kbmatches2 == ['APC mutation', 'KRAS mutation']
+            assert kbmatches1 == kbmatches2 == ["APC mutation", "KRAS mutation"]
 
             # we expect the two stmts to have different observed variant sets
-            observedVariants1 = [item['variant']['ident'] for item in stmts[0]['kbMatches']]
-            observedVariants2 = [item['variant']['ident'] for item in stmts[1]['kbMatches']]
+            observedVariants1 = [
+                item["variant"]["ident"] for item in stmts[0]["kbMatches"]
+            ]
+            observedVariants2 = [
+                item["variant"]["ident"] for item in stmts[1]["kbMatches"]
+            ]
             observedVariants1.sort()
             observedVariants2.sort()
             assert observedVariants1 != observedVariants2

From 72d6805e1c51e73b627997971b0d07910dc91b39 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 11:04:34 -0700
Subject: [PATCH 28/64] commit to save

---
 pori_python/ipr/content.spec.json |  1 +
 tests/test_ipr/test_upload.py     | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
index 711f9eb5..dfbfb324 100644
--- a/pori_python/ipr/content.spec.json
+++ b/pori_python/ipr/content.spec.json
@@ -893,6 +893,7 @@
             "type": "string"
         },
         "seqQC": {
+            "default": [],
             "type": "array",
             "items": {
                 "type": "object",
diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py
index 80bdd4b5..f8eaa246 100644
--- a/tests/test_ipr/test_upload.py
+++ b/tests/test_ipr/test_upload.py
@@ -368,6 +368,18 @@ def test_analyst_comments_loaded(self, loaded_reports) -> None:
         assert async_section["comments"]
         assert sync_section["comments"] == async_section["comments"]
 
+    def test_seqqc_loaded(self, loaded_reports) -> None:
+        """Test that seqQC data is present in the loaded report."""
+        sync_report = loaded_reports["sync"][1]["reports"][0]
+        assert "seqQC" in sync_report
+        assert len(sync_report["seqQC"]) == 2
+        samples = [item["sample"] for item in sync_report["seqQC"]]
+        assert "Tumour DNA" in samples
+        assert "Constitutional DNA" in samples
+        async_report = loaded_reports["async"][1]["reports"][0]
+        assert "seqQC" in async_report
+        assert len(async_report["seqQC"]) == 2
+
     def test_sample_info_loaded(self, loaded_reports) -> None:
         sync_section = get_section(loaded_reports["sync"], "sample-info")
         async_section = get_section(loaded_reports["async"], "sample-info")

From 4d0181bdcefe95e363cbbfde486d4b2b6f0ea0d7 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 11:08:47 -0700
Subject: [PATCH 29/64] format

---
 tests/test_ipr/test_upload.py | 428 ++++++++++++++++------------------
 1 file changed, 201 insertions(+), 227 deletions(-)

diff --git a/tests/test_ipr/test_upload.py b/tests/test_ipr/test_upload.py
index f8eaa246..06d60eb2 100644
--- a/tests/test_ipr/test_upload.py
+++ b/tests/test_ipr/test_upload.py
@@ -13,132 +13,126 @@
 
 from .constants import EXCLUDE_INTEGRATION_TESTS
 
-EXCLUDE_BCGSC_TESTS = os.environ.get("EXCLUDE_BCGSC_TESTS") == "1"
-EXCLUDE_ONCOKB_TESTS = os.environ.get("EXCLUDE_ONCOKB_TESTS") == "1"
-INCLUDE_UPLOAD_TESTS = os.environ.get("INCLUDE_UPLOAD_TESTS", "0") == "1"
-DELETE_UPLOAD_TEST_REPORTS = os.environ.get("DELETE_UPLOAD_TEST_REPORTS", "1") == "1"
+EXCLUDE_BCGSC_TESTS = os.environ.get('EXCLUDE_BCGSC_TESTS') == '1'
+EXCLUDE_ONCOKB_TESTS = os.environ.get('EXCLUDE_ONCOKB_TESTS') == '1'
+INCLUDE_UPLOAD_TESTS = os.environ.get('INCLUDE_UPLOAD_TESTS', '0') == '1'
+DELETE_UPLOAD_TEST_REPORTS = os.environ.get('DELETE_UPLOAD_TEST_REPORTS', '1') == '1'
 
 
 def get_test_spec():
-    ipr_spec = {"components": {"schemas": {"genesCreate": {"properties": {}}}}}
+    ipr_spec = {'components': {'schemas': {'genesCreate': {'properties': {}}}}}
     ipr_gene_keys = IprGene.__required_keys__ | IprGene.__optional_keys__
     for key in ipr_gene_keys:
-        ipr_spec["components"]["schemas"]["genesCreate"]["properties"][key] = ""
+        ipr_spec['components']['schemas']['genesCreate']['properties'][key] = ''
     return ipr_spec
 
 
 def get_test_file(name: str) -> str:
-    return os.path.join(os.path.dirname(__file__), "test_data", name)
+    return os.path.join(os.path.dirname(__file__), 'test_data', name)
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
 def loaded_reports(tmp_path_factory) -> Generator:
-    json_file = tmp_path_factory.mktemp("inputs") / "content.json"
-    async_json_file = tmp_path_factory.mktemp("inputs") / "async_content.json"
-    patient_id = f"TEST_{str(uuid.uuid4())}"
-    async_patient_id = f"TEST_ASYNC_{str(uuid.uuid4())}"
+    json_file = tmp_path_factory.mktemp('inputs') / 'content.json'
+    async_json_file = tmp_path_factory.mktemp('inputs') / 'async_content.json'
+    patient_id = f'TEST_{str(uuid.uuid4())}'
+    async_patient_id = f'TEST_ASYNC_{str(uuid.uuid4())}'
     json_contents = {
-        "comparators": [
-            {"analysisRole": "expression (disease)", "name": "1"},
-            {"analysisRole": "expression (primary site)", "name": "2"},
-            {"analysisRole": "expression (biopsy site)", "name": "3"},
+        'comparators': [
+            {'analysisRole': 'expression (disease)', 'name': '1'},
+            {'analysisRole': 'expression (primary site)', 'name': '2'},
+            {'analysisRole': 'expression (biopsy site)', 'name': '3'},
             {
-                "analysisRole": "expression (internal pancancer cohort)",
-                "name": "4",
+                'analysisRole': 'expression (internal pancancer cohort)',
+                'name': '4',
             },
         ],
-        "patientId": patient_id,
-        "project": "TEST",
-        "sampleInfo": [
+        'patientId': patient_id,
+        'project': 'TEST',
+        'sampleInfo': [
             {
-                "sample": "Constitutional",
-                "biopsySite": "Normal tissue",
-                "sampleName": "SAMPLE1-PB",
-                "primarySite": "Blood-Peripheral",
-                "collectionDate": "11-11-11",
+                'sample': 'Constitutional',
+                'biopsySite': 'Normal tissue',
+                'sampleName': 'SAMPLE1-PB',
+                'primarySite': 'Blood-Peripheral',
+                'collectionDate': '11-11-11',
             },
             {
-                "sample": "Tumour",
-                "pathoTc": "90%",
-                "biopsySite": "hepatic",
-                "sampleName": "SAMPLE2-FF-1",
-                "primarySite": "Vena Cava-Hepatic",
-                "collectionDate": "12-12-12",
+                'sample': 'Tumour',
+                'pathoTc': '90%',
+                'biopsySite': 'hepatic',
+                'sampleName': 'SAMPLE2-FF-1',
+                'primarySite': 'Vena Cava-Hepatic',
+                'collectionDate': '12-12-12',
             },
         ],
-        "msi": [
+        'msi': [
             {
-                "score": 1000.0,
-                "kbCategory": "microsatellite instability",
+                'score': 1000.0,
+                'kbCategory': 'microsatellite instability',
             }
         ],
-        "hrd": {
-            "score": 9999.0,
-            "cutoff": 5,
+        'hrd': {
+            'score': 9999.0,
+            'cutoff': 5,
         },
-        "expressionVariants": json.loads(
-            pd.read_csv(get_test_file("expression.short.tab"), sep="\t").to_json(
-                orient="records"
-            )
+        'expressionVariants': json.loads(
+            pd.read_csv(get_test_file('expression.short.tab'), sep='\t').to_json(orient='records')
         ),
-        "smallMutations": json.loads(
-            pd.read_csv(get_test_file("small_mutations.short.tab"), sep="\t").to_json(
-                orient="records"
+        'smallMutations': json.loads(
+            pd.read_csv(get_test_file('small_mutations.short.tab'), sep='\t').to_json(
+                orient='records'
             )
         ),
-        "copyVariants": json.loads(
-            pd.read_csv(get_test_file("copy_variants.short.tab"), sep="\t").to_json(
-                orient="records"
+        'copyVariants': json.loads(
+            pd.read_csv(get_test_file('copy_variants.short.tab'), sep='\t').to_json(
+                orient='records'
             )
         ),
-        "structuralVariants": json.loads(
-            pd.read_csv(get_test_file("fusions.tab"), sep="\t").to_json(
-                orient="records"
-            )
+        'structuralVariants': json.loads(
+            pd.read_csv(get_test_file('fusions.tab'), sep='\t').to_json(orient='records')
         ),
-        "kbDiseaseMatch": "colorectal cancer",
-        "cosmicSignatures": pd.read_csv(
-            get_test_file("cosmic_variants.tab"), sep="\t"
+        'kbDiseaseMatch': 'colorectal cancer',
+        'cosmicSignatures': pd.read_csv(
+            get_test_file('cosmic_variants.tab'), sep='\t'
         ).signature.tolist(),
-        "hlaTypes": json.loads(
-            pd.read_csv(get_test_file("hla_variants.tab"), sep="\t").to_json(
-                orient="records"
-            )
+        'hlaTypes': json.loads(
+            pd.read_csv(get_test_file('hla_variants.tab'), sep='\t').to_json(orient='records')
         ),
-        "images": [
+        'images': [
             {
-                "key": "cnvLoh.circos",
-                "path": "test/testData/images/cnvLoh.png",
-                "caption": "Test adding a caption to an image",
+                'key': 'cnvLoh.circos',
+                'path': 'test/testData/images/cnvLoh.png',
+                'caption': 'Test adding a caption to an image',
             }
         ],
-        "seqQC": [
+        'seqQC': [
             {
-                "sample": "Tumour DNA",
-                "reads": "2534M",
-                "library": "LIB0001",
-                "coverage": "80x",
-                "inputNg": "500",
-                "protocol": "WGS",
-                "sampleName": "SAMPLE2-FF-1",
-                "bioQC": "passed",
-                "labQC": "passed",
-                "duplicateReadsPerc": "12.3",
+                'sample': 'Tumour DNA',
+                'reads': '2534M',
+                'library': 'LIB0001',
+                'coverage': '80x',
+                'inputNg': '500',
+                'protocol': 'WGS',
+                'sampleName': 'SAMPLE2-FF-1',
+                'bioQC': 'passed',
+                'labQC': 'passed',
+                'duplicateReadsPerc': '12.3',
             },
             {
-                "sample": "Constitutional DNA",
-                "reads": "1200M",
-                "library": "LIB0002",
-                "coverage": "40x",
-                "inputNg": "300",
-                "protocol": "WGS",
-                "sampleName": "SAMPLE1-PB",
-                "bioQC": "passed",
-                "labQC": "passed",
-                "duplicateReadsPerc": "8.1",
+                'sample': 'Constitutional DNA',
+                'reads': '1200M',
+                'library': 'LIB0002',
+                'coverage': '40x',
+                'inputNg': '300',
+                'protocol': 'WGS',
+                'sampleName': 'SAMPLE1-PB',
+                'bioQC': 'passed',
+                'labQC': 'passed',
+                'duplicateReadsPerc': '8.1',
             },
         ],
-        "config": "test config",
+        'config': 'test config',
     }
 
     json_file.write_text(
@@ -148,7 +142,7 @@ def loaded_reports(tmp_path_factory) -> Generator:
         )
     )
 
-    json_contents["patientId"] = async_patient_id
+    json_contents['patientId'] = async_patient_id
     async_json_file.write_text(
         json.dumps(
             json_contents,
@@ -157,46 +151,46 @@ def loaded_reports(tmp_path_factory) -> Generator:
     )
 
     argslist = [
-        "ipr",
-        "--username",
-        os.environ.get("IPR_USER", os.environ["USER"]),
-        "--password",
-        os.environ["IPR_PASS"],
-        "--graphkb_username",
-        os.environ.get("GRAPHKB_USER", os.environ.get("IPR_USER", os.environ["USER"])),
-        "--graphkb_password",
-        os.environ.get("GRAPHKB_PASS", os.environ["IPR_PASS"]),
-        "--ipr_url",
-        os.environ["IPR_TEST_URL"],
-        "--graphkb_url",
-        os.environ.get("GRAPHKB_URL", False),
-        "--therapeutics",
-        "--allow_partial_matches",
+        'ipr',
+        '--username',
+        os.environ.get('IPR_USER', os.environ['USER']),
+        '--password',
+        os.environ['IPR_PASS'],
+        '--graphkb_username',
+        os.environ.get('GRAPHKB_USER', os.environ.get('IPR_USER', os.environ['USER'])),
+        '--graphkb_password',
+        os.environ.get('GRAPHKB_PASS', os.environ['IPR_PASS']),
+        '--ipr_url',
+        os.environ['IPR_TEST_URL'],
+        '--graphkb_url',
+        os.environ.get('GRAPHKB_URL', False),
+        '--therapeutics',
+        '--allow_partial_matches',
     ]
 
     sync_argslist = argslist.copy()
-    sync_argslist.extend(["--content", str(json_file)])
-    with patch.object(sys, "argv", sync_argslist):
-        with patch.object(IprConnection, "get_spec", return_value=get_test_spec()):
+    sync_argslist.extend(['--content', str(json_file)])
+    with patch.object(sys, 'argv', sync_argslist):
+        with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()):
             command_interface()
 
     async_argslist = argslist.copy()
-    async_argslist.extend(["--content", str(async_json_file), "--async_upload"])
-    with patch.object(sys, "argv", async_argslist):
-        with patch.object(IprConnection, "get_spec", return_value=get_test_spec()):
+    async_argslist.extend(['--content', str(async_json_file), '--async_upload'])
+    with patch.object(sys, 'argv', async_argslist):
+        with patch.object(IprConnection, 'get_spec', return_value=get_test_spec()):
             command_interface()
 
     ipr_conn = IprConnection(
-        username=os.environ.get("IPR_USER", os.environ["USER"]),
-        password=os.environ["IPR_PASS"],
-        url=os.environ["IPR_TEST_URL"],
+        username=os.environ.get('IPR_USER', os.environ['USER']),
+        password=os.environ['IPR_PASS'],
+        url=os.environ['IPR_TEST_URL'],
     )
-    loaded_report = ipr_conn.get(uri=f"reports?searchText={patient_id}")
-    async_loaded_report = ipr_conn.get(uri=f"reports?searchText={async_patient_id}")
+    loaded_report = ipr_conn.get(uri=f'reports?searchText={patient_id}')
+    async_loaded_report = ipr_conn.get(uri=f'reports?searchText={async_patient_id}')
 
     loaded_reports_result = {
-        "sync": (patient_id, loaded_report),
-        "async": (async_patient_id, async_loaded_report),
+        'sync': (patient_id, loaded_report),
+        'async': (async_patient_id, async_loaded_report),
     }
     yield loaded_reports_result
     if DELETE_UPLOAD_TEST_REPORTS:
@@ -205,13 +199,13 @@ def loaded_reports(tmp_path_factory) -> Generator:
 
 
 def get_section(loaded_report, section_name):
-    ident = loaded_report[1]["reports"][0]["ident"]
+    ident = loaded_report[1]['reports'][0]['ident']
     ipr_conn = IprConnection(
-        username=os.environ.get("IPR_USER", os.environ["USER"]),
-        password=os.environ["IPR_PASS"],
-        url=os.environ["IPR_TEST_URL"],
+        username=os.environ.get('IPR_USER', os.environ['USER']),
+        password=os.environ['IPR_PASS'],
+        url=os.environ['IPR_TEST_URL'],
     )
-    return ipr_conn.get(uri=f"reports/{ident}/{section_name}")
+    return ipr_conn.get(uri=f'reports/{ident}/{section_name}')
 
 
 def stringify_sorted(obj):
@@ -224,7 +218,7 @@ def stringify_sorted(obj):
         obj.sort()
         return str(obj)
     elif isinstance(obj, dict):
-        for key in ("ident", "updatedAt", "createdAt", "deletedAt"):
+        for key in ('ident', 'updatedAt', 'createdAt', 'deletedAt'):
             obj.pop(key, None)
         keys = obj.keys()
         for key in keys:
@@ -240,157 +234,147 @@ def stringify_sorted(obj):
 
 
 @pytest.mark.skipif(
-    not INCLUDE_UPLOAD_TESTS, reason="excluding tests of upload to live ipr instance"
-)
-@pytest.mark.skipif(
-    EXCLUDE_INTEGRATION_TESTS, reason="excluding long running integration tests"
+    not INCLUDE_UPLOAD_TESTS, reason='excluding tests of upload to live ipr instance'
 )
+@pytest.mark.skipif(EXCLUDE_INTEGRATION_TESTS, reason='excluding long running integration tests')
 class TestCreateReport:
     def test_patient_id_loaded_once(self, loaded_reports) -> None:
-        sync_patient_id = loaded_reports["sync"][0]
-        assert loaded_reports["sync"][1]["total"] == 1
-        assert loaded_reports["sync"][1]["reports"][0]["patientId"] == sync_patient_id
-        async_patient_id = loaded_reports["async"][0]
-        assert loaded_reports["async"][1]["total"] == 1
-        assert loaded_reports["async"][1]["reports"][0]["patientId"] == async_patient_id
+        sync_patient_id = loaded_reports['sync'][0]
+        assert loaded_reports['sync'][1]['total'] == 1
+        assert loaded_reports['sync'][1]['reports'][0]['patientId'] == sync_patient_id
+        async_patient_id = loaded_reports['async'][0]
+        assert loaded_reports['async'][1]['total'] == 1
+        assert loaded_reports['async'][1]['reports'][0]['patientId'] == async_patient_id
 
     def test_expression_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "expression-variants")
-        kbmatched = [item for item in section if item["kbMatches"]]
-        assert "PTP4A3" in [item["gene"]["name"] for item in kbmatched]
-        async_section = get_section(loaded_reports["async"], "expression-variants")
+        section = get_section(loaded_reports['sync'], 'expression-variants')
+        kbmatched = [item for item in section if item['kbMatches']]
+        assert 'PTP4A3' in [item['gene']['name'] for item in kbmatched]
+        async_section = get_section(loaded_reports['async'], 'expression-variants')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_structural_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "structural-variants")
-        kbmatched = [item for item in section if item["kbMatches"]]
-        assert "(EWSR1,FLI1):fusion(e.7,e.4)" in [
-            item["displayName"] for item in kbmatched
-        ]
-        async_section = get_section(loaded_reports["async"], "structural-variants")
+        section = get_section(loaded_reports['sync'], 'structural-variants')
+        kbmatched = [item for item in section if item['kbMatches']]
+        assert '(EWSR1,FLI1):fusion(e.7,e.4)' in [item['displayName'] for item in kbmatched]
+        async_section = get_section(loaded_reports['async'], 'structural-variants')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_small_mutations_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "small-mutations")
-        kbmatched = [item for item in section if item["kbMatches"]]
-        assert "FGFR2:p.R421C" in [item["displayName"] for item in kbmatched]
-        assert "CDKN2A:p.T18M" in [item["displayName"] for item in kbmatched]
-        async_section = get_section(loaded_reports["async"], "small-mutations")
+        section = get_section(loaded_reports['sync'], 'small-mutations')
+        kbmatched = [item for item in section if item['kbMatches']]
+        assert 'FGFR2:p.R421C' in [item['displayName'] for item in kbmatched]
+        assert 'CDKN2A:p.T18M' in [item['displayName'] for item in kbmatched]
+        async_section = get_section(loaded_reports['async'], 'small-mutations')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_copy_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "copy-variants")
-        kbmatched = [item for item in section if item["kbMatches"]]
-        assert ("ERBB2", "amplification") in [
-            (item["gene"]["name"], item["displayName"]) for item in kbmatched
+        section = get_section(loaded_reports['sync'], 'copy-variants')
+        kbmatched = [item for item in section if item['kbMatches']]
+        assert ('ERBB2', 'amplification') in [
+            (item['gene']['name'], item['displayName']) for item in kbmatched
         ]
-        async_section = get_section(loaded_reports["async"], "copy-variants")
+        async_section = get_section(loaded_reports['async'], 'copy-variants')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_signature_variants_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "signature-variants")
-        kbmatched = [item for item in section if item["kbMatches"]]
+        section = get_section(loaded_reports['sync'], 'signature-variants')
+        kbmatched = [item for item in section if item['kbMatches']]
         # Check for COSMIC signatures
-        assert ("SBS2", "high signature") in [
-            (item["signatureName"], item["variantTypeName"]) for item in kbmatched
+        assert ('SBS2', 'high signature') in [
+            (item['signatureName'], item['variantTypeName']) for item in kbmatched
         ]
         # Check for HRD signature (score 9999 > cutoff 5, so strong signature)
-        assert ("homologous recombination deficiency", "strong signature") in [
-            (item["signatureName"], item["variantTypeName"]) for item in kbmatched
+        assert ('homologous recombination deficiency', 'strong signature') in [
+            (item['signatureName'], item['variantTypeName']) for item in kbmatched
         ]
         # Check for MSI signature
-        assert ("microsatellite instability", "high signature") in [
-            (item["signatureName"], item["variantTypeName"]) for item in kbmatched
+        assert ('microsatellite instability', 'high signature') in [
+            (item['signatureName'], item['variantTypeName']) for item in kbmatched
         ]
-        async_section = get_section(loaded_reports["async"], "signature-variants")
+        async_section = get_section(loaded_reports['async'], 'signature-variants')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_hrd_score_in_report(self, loaded_reports) -> None:
         """Test that HRD score is present in the loaded report."""
-        report = loaded_reports["sync"][1]["reports"][0]
-        assert "hrdScore" in report
-        assert report["hrdScore"] == 9999.0
+        report = loaded_reports['sync'][1]['reports'][0]
+        assert 'hrdScore' in report
+        assert report['hrdScore'] == 9999.0
 
     def test_kb_matches_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "kb-matches")
+        section = get_section(loaded_reports['sync'], 'kb-matches')
         observed_and_matched = set(
-            [(item["kbVariant"], item["variant"]["displayName"]) for item in section]
+            [(item['kbVariant'], item['variant']['displayName']) for item in section]
         )
         for pair in [
-            ("ERBB2 amplification", "amplification"),
-            ("FGFR2 mutation", "FGFR2:p.R421C"),
-            ("PTP4A3 overexpression", "increased expression"),
-            ("EWSR1 and FLI1 fusion", "(EWSR1,FLI1):fusion(e.7,e.4)"),
-            ("CDKN2A mutation", "CDKN2A:p.T18M"),
+            ('ERBB2 amplification', 'amplification'),
+            ('FGFR2 mutation', 'FGFR2:p.R421C'),
+            ('PTP4A3 overexpression', 'increased expression'),
+            ('EWSR1 and FLI1 fusion', '(EWSR1,FLI1):fusion(e.7,e.4)'),
+            ('CDKN2A mutation', 'CDKN2A:p.T18M'),
         ]:
             assert pair in observed_and_matched
-        async_section = get_section(loaded_reports["async"], "kb-matches")
+        async_section = get_section(loaded_reports['async'], 'kb-matches')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_therapeutic_targets_loaded(self, loaded_reports) -> None:
-        section = get_section(loaded_reports["sync"], "therapeutic-targets")
-        therapeutic_target_genes = set([item["gene"] for item in section])
-        for gene in ["CDKN2A", "ERBB2", "FGFR2", "PTP4A3"]:
+        section = get_section(loaded_reports['sync'], 'therapeutic-targets')
+        therapeutic_target_genes = set([item['gene'] for item in section])
+        for gene in ['CDKN2A', 'ERBB2', 'FGFR2', 'PTP4A3']:
             assert gene in therapeutic_target_genes
-        async_section = get_section(loaded_reports["async"], "therapeutic-targets")
+        async_section = get_section(loaded_reports['async'], 'therapeutic-targets')
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_genomic_alterations_identified_loaded(self, loaded_reports) -> None:
-        section = get_section(
-            loaded_reports["sync"], "summary/genomic-alterations-identified"
-        )
-        variants = set([item["geneVariant"] for item in section])
+        section = get_section(loaded_reports['sync'], 'summary/genomic-alterations-identified')
+        variants = set([item['geneVariant'] for item in section])
         for variant in [
-            "FGFR2:p.R421C",
-            "PTP4A3 (high_percentile)",
-            "ERBB2 (Amplification)",
-            "(EWSR1,FLI1):fusion(e.7,e.4)",
-            "CDKN2A:p.T18M",
+            'FGFR2:p.R421C',
+            'PTP4A3 (high_percentile)',
+            'ERBB2 (Amplification)',
+            '(EWSR1,FLI1):fusion(e.7,e.4)',
+            'CDKN2A:p.T18M',
         ]:
             assert variant in variants
         async_section = get_section(
-            loaded_reports["async"], "summary/genomic-alterations-identified"
+            loaded_reports['async'], 'summary/genomic-alterations-identified'
         )
         async_equals_sync = stringify_sorted(section) == stringify_sorted(async_section)
         assert async_equals_sync
 
     def test_analyst_comments_loaded(self, loaded_reports) -> None:
-        sync_section = get_section(loaded_reports["sync"], "summary/analyst-comments")
-        assert sync_section["comments"]
-        async_section = get_section(loaded_reports["async"], "summary/analyst-comments")
-        assert async_section["comments"]
-        assert sync_section["comments"] == async_section["comments"]
+        sync_section = get_section(loaded_reports['sync'], 'summary/analyst-comments')
+        assert sync_section['comments']
+        async_section = get_section(loaded_reports['async'], 'summary/analyst-comments')
+        assert async_section['comments']
+        assert sync_section['comments'] == async_section['comments']
 
     def test_seqqc_loaded(self, loaded_reports) -> None:
         """Test that seqQC data is present in the loaded report."""
-        sync_report = loaded_reports["sync"][1]["reports"][0]
-        assert "seqQC" in sync_report
-        assert len(sync_report["seqQC"]) == 2
-        samples = [item["sample"] for item in sync_report["seqQC"]]
-        assert "Tumour DNA" in samples
-        assert "Constitutional DNA" in samples
-        async_report = loaded_reports["async"][1]["reports"][0]
-        assert "seqQC" in async_report
-        assert len(async_report["seqQC"]) == 2
+        sync_report = loaded_reports['sync'][1]['reports'][0]
+        assert 'seqQC' in sync_report
+        assert len(sync_report['seqQC']) == 2
+        samples = [item['sample'] for item in sync_report['seqQC']]
+        assert 'Tumour DNA' in samples
+        assert 'Constitutional DNA' in samples
+        async_report = loaded_reports['async'][1]['reports'][0]
+        assert 'seqQC' in async_report
+        assert len(async_report['seqQC']) == 2
 
     def test_sample_info_loaded(self, loaded_reports) -> None:
-        sync_section = get_section(loaded_reports["sync"], "sample-info")
-        async_section = get_section(loaded_reports["async"], "sample-info")
-        async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(
-            async_section
-        )
+        sync_section = get_section(loaded_reports['sync'], 'sample-info')
+        async_section = get_section(loaded_reports['async'], 'sample-info')
+        async_equals_sync = stringify_sorted(sync_section) == stringify_sorted(async_section)
         assert async_equals_sync
 
-    def test_multivariant_multiconditionset_statements_loaded(
-        self, loaded_reports
-    ) -> None:
+    def test_multivariant_multiconditionset_statements_loaded(self, loaded_reports) -> None:
         """
         Checks that multivariant statements and multiple condition sets prepared correctly
         by this package are handled as expected by the api.
@@ -402,41 +386,31 @@ def test_multivariant_multiconditionset_statements_loaded(
         are met.
         This is also a test of multiple condition sets since there are two variants
         in the test data that satisfy one of the conditions (the APC mutation)."""
-        section = get_section(
-            loaded_reports["sync"], "kb-matches/kb-matched-statements"
-        )
-        multivariant_stmts = [
-            item for item in section if item["reference"] == "pmid:27302369"
-        ]
+        section = get_section(loaded_reports['sync'], 'kb-matches/kb-matched-statements')
+        multivariant_stmts = [item for item in section if item['reference'] == 'pmid:27302369']
 
         # if this statement is entered more than once there may be multiple sets of records to
         # check, so to make sure the count checks work, go stmt_id by stmt_id:
-        stmt_ids = list(set([item["kbStatementId"] for item in multivariant_stmts]))
+        stmt_ids = list(set([item['kbStatementId'] for item in multivariant_stmts]))
         for stmt_id in stmt_ids:
-            stmts = [
-                item for item in multivariant_stmts if item["kbStatementId"] == stmt_id
-            ]
+            stmts = [item for item in multivariant_stmts if item['kbStatementId'] == stmt_id]
 
             # we expect three stmts, one for each condition set
             assert len(stmts) == 3
 
             # we expect each condition set to have two kb variants in it
             # we expect the two kb variants to be the same in each stmt
-            assert len(stmts[0]["kbMatches"]) == 2
-            assert len(stmts[1]["kbMatches"]) == 2
-            kbmatches1 = [item["kbVariant"] for item in stmts[0]["kbMatches"]]
-            kbmatches2 = [item["kbVariant"] for item in stmts[1]["kbMatches"]]
+            assert len(stmts[0]['kbMatches']) == 2
+            assert len(stmts[1]['kbMatches']) == 2
+            kbmatches1 = [item['kbVariant'] for item in stmts[0]['kbMatches']]
+            kbmatches2 = [item['kbVariant'] for item in stmts[1]['kbMatches']]
             kbmatches1.sort()
             kbmatches2.sort()
-            assert kbmatches1 == kbmatches2 == ["APC mutation", "KRAS mutation"]
+            assert kbmatches1 == kbmatches2 == ['APC mutation', 'KRAS mutation']
 
             # we expect the two stmts to have different observed variant sets
-            observedVariants1 = [
-                item["variant"]["ident"] for item in stmts[0]["kbMatches"]
-            ]
-            observedVariants2 = [
-                item["variant"]["ident"] for item in stmts[1]["kbMatches"]
-            ]
+            observedVariants1 = [item['variant']['ident'] for item in stmts[0]['kbMatches']]
+            observedVariants2 = [item['variant']['ident'] for item in stmts[1]['kbMatches']]
             observedVariants1.sort()
             observedVariants2.sort()
             assert observedVariants1 != observedVariants2

From 1e9d46d5b1a20a432f59cc7766875f5c1c62a158 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 11:42:23 -0700
Subject: [PATCH 30/64] handle existing input format

---
 pori_python/ipr/inputs.py     |  40 ++++++++++++
 pori_python/ipr/main.py       |   2 +
 tests/test_ipr/test_inputs.py | 115 ++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index f14fc696..6dbb9401 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -796,6 +796,46 @@ def check_null(checker, instance):
 DefaultValidatingDraft7Validator = extend_with_default(jsonschema.Draft7Validator)
 
 
+def normalize_seqqc(content: Dict) -> Dict:
+    """
+    Normalize seqQC field names from production report format to schema format.
+
+    Renames the legacy keys listed in field_mapping, for example 'Reads' -> 'reads'
+    and 'Sample Name' -> 'sampleName'; keys without a mapping are kept unchanged.
+
+    Args:
+        content: Report content dictionary that may contain a seqQC list
+
+    Returns:
+        The same content dictionary, with its seqQC list updated in place
+    """
+    # Field name mapping from production/legacy format to schema format
+    field_mapping = {
+        'Reads': 'reads',
+        'Sample': 'sample',
+        'Library': 'library',
+        'Coverage': 'coverage',
+        'Input_ng': 'inputNg',
+        'Input_ug': 'inputUg',
+        'Protocol': 'protocol',
+        'Sample Name': 'sampleName',
+        'Duplicate_Reads_Perc': 'duplicateReadsPerc',
+    }
+
+    seqqc = content.get('seqQC')
+    if isinstance(seqqc, list):
+        # Replace by position: list.index() is an equality search, so it is O(n) per
+        # item and can resolve to an earlier, equal entry instead of the current one.
+        for idx, item in enumerate(seqqc):
+            # Leave non-dict entries untouched so schema validation can report them
+            if isinstance(item, dict):
+                seqqc[idx] = {
+                    field_mapping.get(key, key): value for key, value in item.items()
+                }
+
+    return content
+
+
 def validate_report_content(content: Dict, schema_file: str = SPECIFICATION) -> None:
     """
     Validate a report content input JSON object against the schema specification
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index cbb7c128..eea1987e 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -27,6 +27,7 @@
 from .inputs import (
     check_comparators,
     check_variant_links,
+    normalize_seqqc,
     preprocess_copy_variants,
     preprocess_cosmic,
     preprocess_expression_variants,
@@ -380,6 +381,7 @@ def ipr_report(
         return ipr_result
 
     # validate the JSON content follows the specification
+    normalize_seqqc(content)
     try:
         validate_report_content(content)
     except jsonschema.exceptions.ValidationError as err:
diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py
index 4bdd6b6d..d6e12493 100644
--- a/tests/test_ipr/test_inputs.py
+++ b/tests/test_ipr/test_inputs.py
@@ -17,6 +17,7 @@
     check_comparators,
     check_variant_links,
     create_graphkb_sv_notation,
+    normalize_seqqc,
     preprocess_copy_variants,
     preprocess_cosmic,
     preprocess_expression_variants,
@@ -558,3 +559,117 @@ def test_valid_json_inputs(example_name: str):
     with open(os.path.join(DATA_DIR, 'json_examples', f'{example_name}.json'), 'r') as fh:
         content = json.load(fh)
     validate_report_content(content)
+
+
+class TestNormalizeSeqQC:
+    """Test seqQC field name normalization from production format to schema format."""
+
+    def test_normalize_seqqc_production_format(self):
+        """Test normalization of production report field names."""
+        content = {
+            'seqQC': [
+                {
+                    'Reads': '2407M',
+                    'Sample': 'Tumour DNA',
+                    'Library': 'LIB0001',
+                    'Coverage': '96X',
+                    'Input_ng': 400,
+                    'Input_ug': '',
+                    'Protocol': 'Genome Shotgun FFPE 4.2',
+                    'Sample Name': 'SAMPLE-T-01',
+                    'bioQC': 'Passed',
+                    'labQC': 'Approved',
+                    'Duplicate_Reads_Perc': 18,
+                }
+            ]
+        }
+
+        result = normalize_seqqc(content)
+
+        assert result['seqQC'][0]['reads'] == '2407M'
+        assert result['seqQC'][0]['sample'] == 'Tumour DNA'
+        assert result['seqQC'][0]['library'] == 'LIB0001'
+        assert result['seqQC'][0]['coverage'] == '96X'
+        assert result['seqQC'][0]['inputNg'] == 400
+        assert result['seqQC'][0]['inputUg'] == ''
+        assert result['seqQC'][0]['protocol'] == 'Genome Shotgun FFPE 4.2'
+        assert result['seqQC'][0]['sampleName'] == 'SAMPLE-T-01'
+        assert result['seqQC'][0]['bioQC'] == 'Passed'
+        assert result['seqQC'][0]['labQC'] == 'Approved'
+        assert result['seqQC'][0]['duplicateReadsPerc'] == 18
+        # Old keys should be gone
+        assert 'Reads' not in result['seqQC'][0]
+        assert 'Sample' not in result['seqQC'][0]
+
+    def test_normalize_seqqc_already_normalized(self):
+        """Test that already-normalized field names are preserved."""
+        content = {
+            'seqQC': [
+                {
+                    'reads': '1200M',
+                    'sample': 'Constitutional DNA',
+                    'library': 'LIB0002',
+                    'coverage': '40x',
+                    'inputNg': '300',
+                    'protocol': 'WGS',
+                    'sampleName': 'SAMPLE-N-01',
+                    'bioQC': 'passed',
+                    'labQC': 'passed',
+                    'duplicateReadsPerc': '8.1',
+                }
+            ]
+        }
+
+        result = normalize_seqqc(content)
+
+        # All normalized keys should still exist with same values
+        assert result['seqQC'][0]['reads'] == '1200M'
+        assert result['seqQC'][0]['sample'] == 'Constitutional DNA'
+        assert result['seqQC'][0]['inputNg'] == '300'
+
+    def test_normalize_seqqc_no_seqqc_field(self):
+        """Test that content without seqQC is unchanged."""
+        content = {
+            'patientId': 'TEST001',
+            'project': 'TEST',
+        }
+
+        result = normalize_seqqc(content)
+
+        assert result == content
+        assert 'seqQC' not in result
+
+    def test_normalize_seqqc_empty_seqqc(self):
+        """Test that empty seqQC array is handled."""
+        content = {'seqQC': []}
+
+        result = normalize_seqqc(content)
+
+        assert result['seqQC'] == []
+
+    def test_normalize_seqqc_multiple_items(self):
+        """Test normalization of multiple seqQC items."""
+        content = {
+            'seqQC': [
+                {
+                    'Reads': '2534M',
+                    'Sample': 'Tumour DNA',
+                    'Duplicate_Reads_Perc': 12.3,
+                },
+                {
+                    'Reads': '1200M',
+                    'Sample': 'Constitutional DNA',
+                    'Duplicate_Reads_Perc': 8.1,
+                },
+            ]
+        }
+
+        result = normalize_seqqc(content)
+
+        assert len(result['seqQC']) == 2
+        assert result['seqQC'][0]['reads'] == '2534M'
+        assert result['seqQC'][0]['sample'] == 'Tumour DNA'
+        assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3
+        assert result['seqQC'][1]['reads'] == '1200M'
+        assert result['seqQC'][1]['sample'] == 'Constitutional DNA'
+        assert result['seqQC'][1]['duplicateReadsPerc'] == 8.1

From 1c8153c32362eecfefda138eb9ee951539b795ca Mon Sep 17 00:00:00 2001
From: sshugsc <sshu@bcgsc.ca>
Date: Mon, 20 Apr 2026 15:13:38 -0700
Subject: [PATCH 31/64] typo

---
 pori_python/ipr/connection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index 317ae90b..4847c9ee 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -124,7 +124,7 @@ def upload_report(
 
             if self.username not in project_users[content['project']]:
                 raise Exception(
-                    f'User have no permission to create report in project {content["project"]}'
+                    f'User has no permission to create report in project {content["project"]}'
                 )
 
             if ignore_extra_fields:

From 8ab54b96bed33cd4b7c8d25bf71c348f675302cb Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 15:44:11 -0700
Subject: [PATCH 32/64] check permission first, increase mins_to_wait

---
 pori_python/ipr/connection.py     |  69 +++++++++++------
 pori_python/ipr/main.py           |   8 +-
 tests/test_ipr/test_connection.py | 123 ++++++++++++++++++++++++++++++
 3 files changed, 175 insertions(+), 25 deletions(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index 317ae90b..53eb2827 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -93,10 +93,55 @@ def delete(self, uri: str, data: Dict = {}, **kwargs) -> Dict:
             **kwargs,
         )
 
+    def check_upload_permission(self, project_name: str) -> None:
+        """Check that the current user has permission to upload to the given project.
+
+        Fetches all projects and the current user info (including groups and
+        projects) up front. Checks for admin, manager, create report access,
+        all projects access, and project membership. Creates the project if
+        it does not yet exist.
+        """
+        projects = self.get('project')
+        project_exists = any(p['name'] == project_name for p in projects)
+
+        user = self.get('user/me')
+        user_groups = user.get('groups', []) if isinstance(user, dict) else []
+        group_names = {
+            group.get('name', '').strip().lower()
+            if isinstance(group, dict)
+            else group.strip().lower()
+            for group in user_groups
+        }
+
+        is_admin = 'admin' in group_names
+        is_manager = 'manager' in group_names
+        has_create_report_access = 'create report access' in group_names
+        has_all_projects_access = 'all projects access' in group_names
+
+        # admins and managers can always create reports
+        can_create_report = is_admin or is_manager or has_create_report_access
+
+        user_projects = user.get('projects', []) if isinstance(user, dict) else []
+        has_project_access = (
+            is_admin
+            or has_all_projects_access
+            or any(isinstance(p, dict) and p.get('name') == project_name for p in user_projects)
+        )
+
+        if not can_create_report:
+            raise Exception(
+                f'User does not have report creation permission'
+            )
+
+        if not has_project_access:
+            raise Exception(
+                f'User has no permission to create report in project {project_name}'
+            )
+
     def upload_report(
         self,
         content: Dict,
-        mins_to_wait: int = 5,
+        mins_to_wait: int = 10,
         async_upload: bool = False,
         ignore_extra_fields: bool = False,
     ) -> Dict:
@@ -105,28 +150,6 @@ def upload_report(
             # or 'report'. jobStatus is no longer available once the report is successfully
             # uploaded.
 
-            projects = self.get('project')
-            project_names = [item['name'] for item in projects]
-            project_users = {
-                item['name']: [user['username'] for user in item.get('users', [])]
-                for item in projects
-            }
-
-            # if project is not exist, create one
-            if content['project'] not in project_names:
-                logger.info(
-                    f'Project not found - attempting to create project {content["project"]}'
-                )
-                try:
-                    self.post('project', {'name': content['project']})
-                except Exception as err:
-                    raise Exception(f'Project creation failed due to {err}')
-
-            if self.username not in project_users[content['project']]:
-                raise Exception(
-                    f'User have no permission to create report in project {content["project"]}'
-                )
-
             if ignore_extra_fields:
                 initial_result = self.post('reports-async?ignore_extra_fields=true', content)
             else:
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 3319c811..0d95efe9 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -152,7 +152,7 @@ def command_interface() -> None:
     )
     parser.add_argument(
         '--mins_to_wait',
-        default=5,
+        default=10,
         action='store',
         help='is using reports-async, number of minutes to wait before throwing error',
     )
@@ -337,7 +337,7 @@ def ipr_report(
     match_germline: bool = False,
     custom_kb_match_filter: Optional[Callable] = None,
     async_upload: bool = False,
-    mins_to_wait: int = 5,
+    mins_to_wait: int = 10,
     include_ipr_variant_text: bool = True,
     include_nonspecific_disease: bool = False,
     include_nonspecific_project: bool = False,
@@ -396,6 +396,10 @@ def ipr_report(
     else:
         logger.warning('No ipr_url given')
 
+    # Verify upload permission before doing any expensive processing
+    if ipr_upload and ipr_conn:
+        ipr_conn.check_upload_permission(content['project'])
+
     if validate_json:
         if not ipr_conn:
             raise ValueError('ipr_url required to validate json')
diff --git a/tests/test_ipr/test_connection.py b/tests/test_ipr/test_connection.py
index d83ac79a..3cd851a9 100644
--- a/tests/test_ipr/test_connection.py
+++ b/tests/test_ipr/test_connection.py
@@ -95,3 +95,126 @@ def request(*args, **kwargs):
                         )
                     },
                 )
+
+
+class TestCheckUploadPermission:
+    def _user_response(self, groups=None, projects=None):
+        return {
+            'groups': [{'name': g} for g in (groups or [])],
+            'projects': [{'name': p} for p in (projects or [])],
+        }
+
+    def test_rejects_user_without_create_report_access(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[[{'name': 'TEST'}], self._user_response(projects=['TEST'])]
+        )
+        conn.post = mock.MagicMock()
+
+        with pytest.raises(Exception, match='User does not have report creation permission'):
+            conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_rejects_user_without_project_access(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'TEST'}],
+                self._user_response(groups=['create report access'], projects=['OTHER']),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        with pytest.raises(Exception, match='User has no permission to create report in project TEST'):
+            conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_allows_user_with_project_and_create_report_access(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'TEST'}],
+                self._user_response(groups=['create report access'], projects=['TEST']),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_manager_has_implicit_create_report_access(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'TEST'}],
+                self._user_response(groups=['manager'], projects=['TEST']),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_admin_bypasses_all_checks(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'TEST'}],
+                self._user_response(groups=['admin'], projects=[]),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_admin_creates_missing_project(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'OTHER'}],
+                self._user_response(groups=['admin'], projects=[]),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        conn.check_upload_permission('TEST')
+
+        conn.post.assert_called_once_with('project', {'name': 'TEST'})
+
+    def test_all_projects_access_without_project_membership(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'TEST'}],
+                self._user_response(
+                    groups=['create report access', 'all projects access'], projects=[]
+                ),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_creates_missing_project_for_all_projects_access_user(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'OTHER'}],
+                self._user_response(
+                    groups=['create report access', 'all projects access'], projects=[]
+                ),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        conn.check_upload_permission('TEST')
+
+        conn.post.assert_called_once_with('project', {'name': 'TEST'})

From acb3f8ee135390882804a713bc064d55040cec9e Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 15:44:55 -0700
Subject: [PATCH 33/64] format

---
 pori_python/ipr/connection.py     | 8 ++------
 tests/test_ipr/test_connection.py | 4 +++-
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index 53eb2827..88a4caa3 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -129,14 +129,10 @@ def check_upload_permission(self, project_name: str) -> None:
         )
 
         if not can_create_report:
-            raise Exception(
-                f'User does not have report creation permission'
-            )
+            raise Exception(f'User does not have report creation permission')
 
         if not has_project_access:
-            raise Exception(
-                f'User has no permission to create report in project {project_name}'
-            )
+            raise Exception(f'User has no permission to create report in project {project_name}')
 
     def upload_report(
         self,
diff --git a/tests/test_ipr/test_connection.py b/tests/test_ipr/test_connection.py
index 3cd851a9..b5156f29 100644
--- a/tests/test_ipr/test_connection.py
+++ b/tests/test_ipr/test_connection.py
@@ -126,7 +126,9 @@ def test_rejects_user_without_project_access(self):
         )
         conn.post = mock.MagicMock()
 
-        with pytest.raises(Exception, match='User has no permission to create report in project TEST'):
+        with pytest.raises(
+            Exception, match='User has no permission to create report in project TEST'
+        ):
             conn.check_upload_permission('TEST')
 
         conn.post.assert_not_called()

From 39469e69b094eb0b4351d4658e0fe57b7246374a Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 16:18:37 -0700
Subject: [PATCH 34/64] fix: create missing project in check_upload_permission
 and add user/me mock to tests

---
 pori_python/ipr/connection.py | 3 +++
 tests/test_ipr/test_main.py   | 5 +++++
 tests/test_ipr/test_probe.py  | 5 +++++
 3 files changed, 13 insertions(+)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index 88a4caa3..cc21a993 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -134,6 +134,9 @@ def check_upload_permission(self, project_name: str) -> None:
         if not has_project_access:
             raise Exception(f'User has no permission to create report in project {project_name}')
 
+        if not project_exists and can_create_report and has_project_access:
+            self.post('project', {'name': project_name})
+
     def upload_report(
         self,
         content: Dict,
diff --git a/tests/test_ipr/test_main.py b/tests/test_ipr/test_main.py
index fd8e8bb8..3dbd0aa5 100644
--- a/tests/test_ipr/test_main.py
+++ b/tests/test_ipr/test_main.py
@@ -110,6 +110,11 @@ def side_effect_function(*args, **kwargs):
             return [{'name': 'genomic', 'ident': '001'}]
         elif args[0] == 'project':
             return [{'name': 'TEST', 'ident': '001'}]
+        elif args[0] == 'user/me':
+            return {
+                'groups': [{'name': 'admin'}],
+                'projects': [{'name': 'TEST'}],
+            }
         else:
             return []
 
diff --git a/tests/test_ipr/test_probe.py b/tests/test_ipr/test_probe.py
index 43ead9f1..ec93599c 100644
--- a/tests/test_ipr/test_probe.py
+++ b/tests/test_ipr/test_probe.py
@@ -25,6 +25,11 @@ def side_effect_function(*args, **kwargs):
             return [{'name': 'genomic', 'ident': '001'}]
         elif args[0] == 'project':
             return [{'name': 'TEST', 'ident': '001'}]
+        elif args[0] == 'user/me':
+            return {
+                'groups': [{'name': 'admin'}],
+                'projects': [{'name': 'TEST'}],
+            }
         else:
             return []
 

From 37ced1ee570360cc1baa6f2d289838c4181d2201 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 16:48:32 -0700
Subject: [PATCH 35/64] remove project creation option

---
 pori_python/ipr/connection.py     |  5 +----
 tests/test_ipr/test_connection.py | 30 ------------------------------
 2 files changed, 1 insertion(+), 34 deletions(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index cc21a993..c7d444d1 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -98,8 +98,7 @@ def check_upload_permission(self, project_name: str) -> None:
 
         Fetches all projects and the current user info (including groups and
         projects) up front. Checks for admin, manager, create report access,
-        all projects access, and project membership. Creates the project if
-        it does not yet exist.
+        all projects access, and project membership.
         """
         projects = self.get('project')
         project_exists = any(p['name'] == project_name for p in projects)
@@ -134,8 +133,6 @@ def check_upload_permission(self, project_name: str) -> None:
         if not has_project_access:
             raise Exception(f'User has no permission to create report in project {project_name}')
 
-        if not project_exists and can_create_report and has_project_access:
-            self.post('project', {'name': project_name})
 
     def upload_report(
         self,
diff --git a/tests/test_ipr/test_connection.py b/tests/test_ipr/test_connection.py
index b5156f29..aea50d38 100644
--- a/tests/test_ipr/test_connection.py
+++ b/tests/test_ipr/test_connection.py
@@ -175,20 +175,6 @@ def test_admin_bypasses_all_checks(self):
 
         conn.post.assert_not_called()
 
-    def test_admin_creates_missing_project(self):
-        conn = IprConnection('user', 'pass')
-        conn.get = mock.MagicMock(
-            side_effect=[
-                [{'name': 'OTHER'}],
-                self._user_response(groups=['admin'], projects=[]),
-            ]
-        )
-        conn.post = mock.MagicMock()
-
-        conn.check_upload_permission('TEST')
-
-        conn.post.assert_called_once_with('project', {'name': 'TEST'})
-
     def test_all_projects_access_without_project_membership(self):
         conn = IprConnection('user', 'pass')
         conn.get = mock.MagicMock(
@@ -204,19 +190,3 @@ def test_all_projects_access_without_project_membership(self):
         conn.check_upload_permission('TEST')
 
         conn.post.assert_not_called()
-
-    def test_creates_missing_project_for_all_projects_access_user(self):
-        conn = IprConnection('user', 'pass')
-        conn.get = mock.MagicMock(
-            side_effect=[
-                [{'name': 'OTHER'}],
-                self._user_response(
-                    groups=['create report access', 'all projects access'], projects=[]
-                ),
-            ]
-        )
-        conn.post = mock.MagicMock()
-
-        conn.check_upload_permission('TEST')
-
-        conn.post.assert_called_once_with('project', {'name': 'TEST'})

From a788ee4aae5035f6162a62fb15294da153aee4cd Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 20 Apr 2026 16:48:50 -0700
Subject: [PATCH 36/64] format

---
 pori_python/ipr/connection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index c7d444d1..a9010864 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -133,7 +133,6 @@ def check_upload_permission(self, project_name: str) -> None:
         if not has_project_access:
             raise Exception(f'User has no permission to create report in project {project_name}')
 
-
     def upload_report(
         self,
         content: Dict,

From 0f1fccb85600da5f539aa17453359eb34085aef3 Mon Sep 17 00:00:00 2001
From: Dustin Bleile <dbleile@bcgsc.ca>
Date: Thu, 23 Apr 2026 13:50:42 -0700
Subject: [PATCH 37/64] SDEV-5340 - output_json - use json.dump so output is
 formatted.

---
 pori_python/ipr/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 3319c811..2ed1d3b0 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -654,7 +654,7 @@ def ipr_report(
     if always_write_output_json:
         logger.info(f'Writing IPR upload json to: {output_json_path}')
         with open(output_json_path, 'w') as fh:
-            fh.write(json.dumps(output))
+            json.dump(output, fh, indent=4)
 
     logger.info(f'made {graphkb_conn.request_count} requests to graphkb')
     logger.info(f'average load {int(graphkb_conn.load or 0)} req/s')

From 6ab42bc66ac0ead37ffc9a555598b27ea0feb509 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Tue, 28 Apr 2026 11:44:35 -0700
Subject: [PATCH 38/64] Add GraphKBConnection.version

---
 pori_python/graphkb/util.py     | 12 ++++++++++++
 tests/test_graphkb/test_util.py | 15 +++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py
index 23c28963..075084ff 100644
--- a/pori_python/graphkb/util.py
+++ b/pori_python/graphkb/util.py
@@ -354,6 +354,18 @@ def get_source(self, name: str) -> Record:
             raise AssertionError(f'Unable to unqiuely identify source with name {name}')
         return source[0]
 
+    @property
+    def version(self) -> Dict[str, str]:
+        """
+        Retrieve GraphKB components version
+
+        Returns:
+            Dict[str, str]: component keys with version values
+
+            e.g. > {"api":"3.17.3","db":"production","parser":"2.1.0","schema":"4.1.1"}
+        """
+        return self.request('version')
+
 
 def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:
     """
diff --git a/tests/test_graphkb/test_util.py b/tests/test_graphkb/test_util.py
index 36760b2a..e0173a0f 100644
--- a/tests/test_graphkb/test_util.py
+++ b/tests/test_graphkb/test_util.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import re
 
 from pori_python.graphkb import GraphKBConnection, util
 
@@ -149,3 +150,17 @@ def test_stringifyVariant_positional(self, conn, rid, createdAt, stringifiedVari
         variant = conn.get_record_by_id(rid)
         if variant and variant.get('createdAt', None) == createdAt:
             assert util.stringifyVariant(variant=variant, **opt) == stringifiedVariant
+
+
+class TestVersion:
+    def test_version(self, conn):
+        version = conn.version
+        assert version['db'] in [
+            'production',
+            'production-sync-dev',
+            'production-sync-staging',
+        ]
+        SEMANTIC_VERSIONING_REGEX = re.compile(r'^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)$')
+        assert SEMANTIC_VERSIONING_REGEX.match(version['api'])
+        assert SEMANTIC_VERSIONING_REGEX.match(version['parser'])
+        assert SEMANTIC_VERSIONING_REGEX.match(version['schema'])

From e0ee8ec7cdc8e7901cbde5e53877898f45de3bc1 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Fri, 8 May 2026 10:49:43 -0700
Subject: [PATCH 39/64] format with ruff

---
 pori_python/ipr/inputs.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index f76843e0..2dfac499 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -804,13 +804,13 @@ def check_null(checker, instance):
 def normalize_seqqc(content: Dict) -> Dict:
     """
     Normalize seqQC field names from production report format to schema format.
-    
+
     Maps inconsistent casing and underscores in field names to match content.spec.json requirements.
     For example: 'Reads' -> 'reads', 'Sample_Name' -> 'sampleName', etc.
-    
+
     Args:
         content: Report content dictionary that may contain seqQC array
-        
+
     Returns:
         The content dictionary with seqQC fields normalized in-place
     """
@@ -826,7 +826,7 @@ def normalize_seqqc(content: Dict) -> Dict:
         'Sample Name': 'sampleName',
         'Duplicate_Reads_Perc': 'duplicateReadsPerc',
     }
-    
+
     if 'seqQC' in content and isinstance(content['seqQC'], list):
         for item in content['seqQC']:
             # Create a new dict with normalized keys
@@ -837,7 +837,7 @@ def normalize_seqqc(content: Dict) -> Dict:
                 normalized_item[new_key] = value
             # Replace the item with normalized version
             content['seqQC'][content['seqQC'].index(item)] = normalized_item
-    
+
     return content
 
 

From dcedd6390c095baac3c7750fc89bf8b7c2e0e1c0 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 11 May 2026 13:01:55 -0700
Subject: [PATCH 40/64] fix issues raised in pr

---
 pori_python/ipr/content.spec.json |  6 +++++
 pori_python/ipr/inputs.py         |  8 +++---
 tests/test_ipr/test_inputs.py     | 45 +++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
index 830218ae..c994ba6f 100644
--- a/pori_python/ipr/content.spec.json
+++ b/pori_python/ipr/content.spec.json
@@ -971,6 +971,8 @@
                         "example": "500",
                         "type": [
                             "string",
+                            "number",
+                            "integer",
                             "null"
                         ]
                     },
@@ -979,6 +981,8 @@
                         "example": "0.5",
                         "type": [
                             "string",
+                            "number",
+                            "integer",
                             "null"
                         ]
                     },
@@ -1003,6 +1007,8 @@
                         "example": "12.3",
                         "type": [
                             "string",
+                            "number",
+                            "integer",
                             "null"
                         ]
                     }
diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index 2dfac499..5c7f10fb 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -806,7 +806,7 @@ def normalize_seqqc(content: Dict) -> Dict:
     Normalize seqQC field names from production report format to schema format.
 
     Maps inconsistent casing and underscores in field names to match content.spec.json requirements.
-    For example: 'Reads' -> 'reads', 'Sample_Name' -> 'sampleName', etc.
+    For example: 'Reads' -> 'reads', 'Sample Name' -> 'sampleName', etc.
 
     Args:
         content: Report content dictionary that may contain seqQC array
@@ -828,7 +828,9 @@ def normalize_seqqc(content: Dict) -> Dict:
     }
 
     if 'seqQC' in content and isinstance(content['seqQC'], list):
-        for item in content['seqQC']:
+        for i, item in enumerate(content['seqQC']):
+            if not isinstance(item, dict):
+                continue
             # Create a new dict with normalized keys
             normalized_item = {}
             for old_key, value in item.items():
@@ -836,7 +838,7 @@ def normalize_seqqc(content: Dict) -> Dict:
                 new_key = field_mapping.get(old_key, old_key)
                 normalized_item[new_key] = value
             # Replace the item with normalized version
-            content['seqQC'][content['seqQC'].index(item)] = normalized_item
+            content['seqQC'][i] = normalized_item
 
     return content
 
diff --git a/tests/test_ipr/test_inputs.py b/tests/test_ipr/test_inputs.py
index d6e12493..f3cd6f99 100644
--- a/tests/test_ipr/test_inputs.py
+++ b/tests/test_ipr/test_inputs.py
@@ -673,3 +673,48 @@ def test_normalize_seqqc_multiple_items(self):
         assert result['seqQC'][1]['reads'] == '1200M'
         assert result['seqQC'][1]['sample'] == 'Constitutional DNA'
         assert result['seqQC'][1]['duplicateReadsPerc'] == 8.1
+
+    def test_normalize_seqqc_numeric_fields_pass_validation(self):
+        """Test that integer/float values for inputNg, inputUg, duplicateReadsPerc pass schema validation."""
+        content = {
+            'patientId': 'PATIENT001',
+            'kbDiseaseMatch': 'colorectal cancer',
+            'project': 'TEST',
+            'template': 'genomic',
+            'seqQC': [
+                {
+                    'reads': '2407M',
+                    'sample': 'Tumour DNA',
+                    'library': 'LIB0001',
+                    'inputNg': 400,
+                    'inputUg': 0.4,
+                    'duplicateReadsPerc': 18,
+                }
+            ],
+        }
+        # Should not raise
+        validate_report_content(content)
+
+    def test_normalize_seqqc_numeric_float_duplicateReadsPerc_passes_validation(self):
+        """Test that a float duplicateReadsPerc value passes schema validation after normalization."""
+        content = {
+            'patientId': 'PATIENT001',
+            'kbDiseaseMatch': 'colorectal cancer',
+            'project': 'TEST',
+            'template': 'genomic',
+            'seqQC': [
+                {
+                    'Reads': '2534M',
+                    'Sample': 'Tumour DNA',
+                    'Duplicate_Reads_Perc': 12.3,
+                    'Input_ng': 500,
+                    'Input_ug': 0.5,
+                }
+            ],
+        }
+        result = normalize_seqqc(content)
+        assert result['seqQC'][0]['duplicateReadsPerc'] == 12.3
+        assert result['seqQC'][0]['inputNg'] == 500
+        assert result['seqQC'][0]['inputUg'] == 0.5
+        # Should not raise after normalization
+        validate_report_content(result)

From 416146e0ba5b9b67d73ef89e65cf2cfad4c3153c Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 11 May 2026 13:14:41 -0700
Subject: [PATCH 41/64] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 pori_python/ipr/inputs.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index 5c7f10fb..25901fc3 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -826,17 +826,25 @@ def normalize_seqqc(content: Dict) -> Dict:
         'Sample Name': 'sampleName',
         'Duplicate_Reads_Perc': 'duplicateReadsPerc',
     }
+    normalized_keys = set(field_mapping.values())
 
     if 'seqQC' in content and isinstance(content['seqQC'], list):
         for i, item in enumerate(content['seqQC']):
             if not isinstance(item, dict):
                 continue
-            # Create a new dict with normalized keys
+            # Preserve already-normalized keys (and unrelated keys) first so
+            # legacy aliases cannot overwrite them based on insertion order.
             normalized_item = {}
-            for old_key, value in item.items():
-                # Use mapped key if it exists, otherwise keep original
-                new_key = field_mapping.get(old_key, old_key)
-                normalized_item[new_key] = value
+            for key, value in item.items():
+                if key in normalized_keys or key not in field_mapping:
+                    normalized_item[key] = value
+
+            # Add legacy aliases only when the normalized key is not already
+            # present. This makes collision handling explicit and stable.
+            for old_key, new_key in field_mapping.items():
+                if old_key in item and new_key not in normalized_item:
+                    normalized_item[new_key] = item[old_key]
+
             # Replace the item with normalized version
             content['seqQC'][i] = normalized_item
 

From e2c25efa7017b42af99c19805eee475a7a31402e Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Mon, 11 May 2026 13:15:35 -0700
Subject: [PATCH 42/64] normalize before upload_json as well

---
 pori_python/ipr/main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index 9d8f0568..c249b791 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -403,6 +403,10 @@ def ipr_report(
         ipr_result = ipr_conn.validate_json(content)
         return ipr_result
 
+    # seqqc normalization is a bridging measure only;
+    # validate_json should be called on non-normalized json
+    normalize_seqqc(content)
+
     if upload_json:
         if not ipr_conn:
             raise ValueError('ipr_url required to upload json')
@@ -412,7 +416,6 @@ def ipr_report(
         return ipr_result
 
     # validate the JSON content follows the specification
-    normalize_seqqc(content)
     try:
         validate_report_content(content)
     except jsonschema.exceptions.ValidationError as err:

From ca8d5f096f6467f7cb5f1622e3b8d2e4ca136119 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Mon, 11 May 2026 15:16:41 -0700
Subject: [PATCH 43/64] Update get_cancer_genes() to fit consensus definition
 of cancer gene

---
 pori_python/graphkb/genes.py     | 31 ++++++++++++++++++++++++++-----
 tests/test_graphkb/test_genes.py | 20 ++++++++++++--------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 09da3ed7..f2899ede 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -24,11 +24,11 @@
 )
 from .match import get_equivalent_features
 from .util import get_rid, logger, looks_like_rid
-from .vocab import get_terms_set
+from .vocab import convert_to_rid_list, get_terms_set, query_by_name
 
 
 def _get_tumourigenesis_genes_list(
-    conn: GraphKBConnection, relevance: str, sources: List[str], ignore_cache: bool = False
+    conn: GraphKBConnection, relevance: str | List[str], sources: str | List[str], ignore_cache: bool = False
 ) -> List[Ontology]:
     statements = cast(
         List[Statement],
@@ -66,7 +66,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
     Returns:
         gene (Feature) records
     """
-    return _get_tumourigenesis_genes_list(conn, ONCOGENE, [ONCOKB_SOURCE_NAME])
+    return _get_tumourigenesis_genes_list(conn, ONCOGENE, ONCOKB_SOURCE_NAME)
 
 
 def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
@@ -78,20 +78,41 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
     Returns:
         gene (Feature) records
     """
-    return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, [ONCOKB_SOURCE_NAME])
+    return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, ONCOKB_SOURCE_NAME)
 
 
 def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
     """Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500.
 
+    Cancer genes include oncogenes, tumour supressor genes and other cancer genes.
+
     Args:
         conn: the graphkb connection object
 
     Returns:
         gene (Feature) records
     """
+    cancer_gene_rid = convert_to_rid_list(
+        conn.query(
+            query_by_name('Vocabulary', CANCER_GENE)
+        )
+    )
+    associated_terms = conn.post(
+        '/subgraphs/Vocabulary',
+        {
+            'subgraphType': 'children',
+            'base': cancer_gene_rid,
+        },
+    )
+    associated_term_names = list(
+        map(
+            lambda x: x['name'],
+            associated_terms['result']['g']['nodes'].values(),
+        ),
+    )
+
     return _get_tumourigenesis_genes_list(
-        conn, CANCER_GENE, [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME]
+        conn, associated_term_names, [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME]
     )
 
 
diff --git a/tests/test_graphkb/test_genes.py b/tests/test_graphkb/test_genes.py
index 90efe5d4..f3440f83 100644
--- a/tests/test_graphkb/test_genes.py
+++ b/tests/test_graphkb/test_genes.py
@@ -27,7 +27,7 @@
 
 CANONICAL_ONCOGENES = ['kras', 'nras', 'alk']
 CANONICAL_TS = ['cdkn2a', 'tp53']
-CANONICAL_CG = ['alb']
+CANONICAL_OTHER_CG = ['alb']
 CANONICAL_FUSION_GENES = ['alk', 'ewsr1', 'fli1']
 CANONICAL_STRUCTURAL_VARIANT_GENES = ['brca1', 'dpyd', 'pten']
 CANNONICAL_THERAPY_GENES = ['erbb2', 'brca2', 'egfr']
@@ -119,7 +119,7 @@ def test_oncogene(conn):
         assert gene in names
     for gene in CANONICAL_TS:
         assert gene not in names
-    for gene in CANONICAL_CG:
+    for gene in CANONICAL_OTHER_CG:
         assert gene not in names
 
 
@@ -131,7 +131,7 @@ def test_tumour_supressors(conn):
         assert gene in names
     for gene in CANONICAL_ONCOGENES:
         assert gene not in names
-    for gene in CANONICAL_CG:
+    for gene in CANONICAL_OTHER_CG:
         assert gene not in names
 
 
@@ -142,12 +142,12 @@ def test_tumour_supressors(conn):
 def test_cancer_genes(conn):
     result = get_cancer_genes(conn)
     names = {row['name'] for row in result}
-    for gene in CANONICAL_CG:
+    for gene in CANONICAL_OTHER_CG:
         assert gene in names
     for gene in CANONICAL_TS:
-        assert gene not in names
+        assert gene in names
     for gene in CANONICAL_ONCOGENES:
-        assert gene not in names
+        assert gene in names
 
 
 @pytest.mark.skipif(
@@ -254,7 +254,7 @@ def test_get_gene_information(conn):
         conn,
         CANONICAL_ONCOGENES
         + CANONICAL_TS
-        + CANONICAL_CG
+        + CANONICAL_OTHER_CG
         + CANONICAL_FUSION_GENES
         + CANONICAL_STRUCTURAL_VARIANT_GENES
         + CANNONICAL_THERAPY_GENES
@@ -300,7 +300,11 @@ def test_get_gene_information(conn):
             f'Missed kbStatementRelated {gene}'
         )
 
-    for gene in CANONICAL_CG:
+    for gene in (
+        CANONICAL_ONCOGENES
+        + CANONICAL_TS
+        + CANONICAL_OTHER_CG
+    ):
         assert gene in [g['name'] for g in gene_info if g.get('cancerGeneListMatch')], (
             f'Missed cancerGeneListMatch {gene}'
         )

From bca3e1d912dc3bcf5d694d0e5a100e26bfe157bf Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Mon, 11 May 2026 15:23:10 -0700
Subject: [PATCH 44/64] linting

---
 pori_python/graphkb/genes.py     | 11 +++++------
 tests/test_graphkb/test_genes.py |  6 +-----
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index f2899ede..4f6de23a 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -28,7 +28,10 @@
 
 
 def _get_tumourigenesis_genes_list(
-    conn: GraphKBConnection, relevance: str | List[str], sources: str | List[str], ignore_cache: bool = False
+    conn: GraphKBConnection,
+    relevance: str | List[str],
+    sources: str | List[str],
+    ignore_cache: bool = False,
 ) -> List[Ontology]:
     statements = cast(
         List[Statement],
@@ -92,11 +95,7 @@ def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
     Returns:
         gene (Feature) records
     """
-    cancer_gene_rid = convert_to_rid_list(
-        conn.query(
-            query_by_name('Vocabulary', CANCER_GENE)
-        )
-    )
+    cancer_gene_rid = convert_to_rid_list(conn.query(query_by_name('Vocabulary', CANCER_GENE)))
     associated_terms = conn.post(
         '/subgraphs/Vocabulary',
         {
diff --git a/tests/test_graphkb/test_genes.py b/tests/test_graphkb/test_genes.py
index f3440f83..5986b2f0 100644
--- a/tests/test_graphkb/test_genes.py
+++ b/tests/test_graphkb/test_genes.py
@@ -300,11 +300,7 @@ def test_get_gene_information(conn):
             f'Missed kbStatementRelated {gene}'
         )
 
-    for gene in (
-        CANONICAL_ONCOGENES
-        + CANONICAL_TS
-        + CANONICAL_OTHER_CG
-    ):
+    for gene in CANONICAL_ONCOGENES + CANONICAL_TS + CANONICAL_OTHER_CG:
         assert gene in [g['name'] for g in gene_info if g.get('cancerGeneListMatch')], (
             f'Missed cancerGeneListMatch {gene}'
         )

From d6f7c9aef32e0ac6629ac2e5f1669afb3db06b41 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Tue, 12 May 2026 08:30:27 -0700
Subject: [PATCH 45/64] Add tumourigenesis to CANCER_GENE for backward
 compatibility

---
 pori_python/graphkb/constants.py | 5 ++++-
 pori_python/graphkb/genes.py     | 6 ++++--
 pori_python/graphkb/vocab.py     | 2 +-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/pori_python/graphkb/constants.py b/pori_python/graphkb/constants.py
index fe22f4a0..07f686b9 100644
--- a/pori_python/graphkb/constants.py
+++ b/pori_python/graphkb/constants.py
@@ -59,7 +59,10 @@
 TSO500_SOURCE_NAME = 'tso500'
 ONCOGENE = 'oncogenic'
 TUMOUR_SUPPRESSIVE = 'tumour suppressive'
-CANCER_GENE = 'cancer gene'
+CANCER_GENE = [
+    'cancer gene',
+    'tumourigenesis',
+]  # KBDEV-1532. tumourigenesis for backward compatibility
 FUSION_NAMES = ['structural variant', 'fusion']
 
 GSC_PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ['cancer genome interpreter', 'civic']
diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 4f6de23a..833536fe 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -95,12 +95,14 @@ def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
     Returns:
         gene (Feature) records
     """
-    cancer_gene_rid = convert_to_rid_list(conn.query(query_by_name('Vocabulary', CANCER_GENE)))
+    cancer_gene_rids = convert_to_rid_list(
+        conn.query(query_by_name('Vocabulary', CANCER_GENE)),
+    )
     associated_terms = conn.post(
         '/subgraphs/Vocabulary',
         {
             'subgraphType': 'children',
-            'base': cancer_gene_rid,
+            'base': cancer_gene_rids,
         },
     )
     associated_term_names = list(
diff --git a/pori_python/graphkb/vocab.py b/pori_python/graphkb/vocab.py
index e9242a7a..2beec5b1 100644
--- a/pori_python/graphkb/vocab.py
+++ b/pori_python/graphkb/vocab.py
@@ -6,7 +6,7 @@
 from .util import convert_to_rid_list
 
 
-def query_by_name(ontology_class: str, base_term_name: str) -> Dict:
+def query_by_name(ontology_class: str, base_term_name: str | list[str]) -> Dict:
     return {'target': ontology_class, 'filters': {'name': base_term_name}}
 
 

From 458f88fe042c253a2c3026c1594becc5be2ec8d8 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Tue, 12 May 2026 09:43:17 -0700
Subject: [PATCH 46/64] Use Union in type hint instead of pipe

---
 pori_python/graphkb/genes.py | 6 +++---
 pori_python/graphkb/vocab.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 833536fe..bdc4b17c 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Any, Dict, List, Sequence, Set, Tuple, cast
+from typing import Any, Dict, List, Sequence, Set, Tuple, cast, Union
 from typing_extensions import deprecated
 
 from pori_python.types import IprGene, Ontology, Record, Statement, Variant
@@ -29,8 +29,8 @@
 
 def _get_tumourigenesis_genes_list(
     conn: GraphKBConnection,
-    relevance: str | List[str],
-    sources: str | List[str],
+    relevance: Union[str, list[str]],
+    sources: Union[str, list[str]],
     ignore_cache: bool = False,
 ) -> List[Ontology]:
     statements = cast(
diff --git a/pori_python/graphkb/vocab.py b/pori_python/graphkb/vocab.py
index 2beec5b1..bb96e5f5 100644
--- a/pori_python/graphkb/vocab.py
+++ b/pori_python/graphkb/vocab.py
@@ -1,4 +1,4 @@
-from typing import Callable, Dict, Iterable, List, Set, cast
+from typing import Callable, Dict, Iterable, List, Set, cast, Union
 
 from pori_python.types import Ontology
 
@@ -6,7 +6,7 @@
 from .util import convert_to_rid_list
 
 
-def query_by_name(ontology_class: str, base_term_name: str | list[str]) -> Dict:
+def query_by_name(ontology_class: str, base_term_name: Union[str, list[str]]) -> Dict:
     return {'target': ontology_class, 'filters': {'name': base_term_name}}
 
 

From aa3b8849d54241ca01c2e7a60ca33009235586d2 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Wed, 13 May 2026 15:29:36 -0700
Subject: [PATCH 47/64] add project_exists check and tests

---
 pori_python/ipr/connection.py     |  6 +++++-
 tests/test_ipr/test_connection.py | 29 ++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/pori_python/ipr/connection.py b/pori_python/ipr/connection.py
index a9010864..75458d41 100644
--- a/pori_python/ipr/connection.py
+++ b/pori_python/ipr/connection.py
@@ -102,6 +102,10 @@ def check_upload_permission(self, project_name: str) -> None:
         """
         projects = self.get('project')
         project_exists = any(p['name'] == project_name for p in projects)
+        if not project_exists:
+            raise Exception(
+                f'Project {project_name} does not exist or user does not have permission to view it'
+            )
 
         user = self.get('user/me')
         user_groups = user.get('groups', []) if isinstance(user, dict) else []
@@ -128,7 +132,7 @@ def check_upload_permission(self, project_name: str) -> None:
         )
 
         if not can_create_report:
-            raise Exception(f'User does not have report creation permission')
+            raise Exception('User does not have report creation permission')
 
         if not has_project_access:
             raise Exception(f'User has no permission to create report in project {project_name}')
diff --git a/tests/test_ipr/test_connection.py b/tests/test_ipr/test_connection.py
index aea50d38..e2f8d4f5 100644
--- a/tests/test_ipr/test_connection.py
+++ b/tests/test_ipr/test_connection.py
@@ -147,7 +147,34 @@ def test_allows_user_with_project_and_create_report_access(self):
 
         conn.post.assert_not_called()
 
-    def test_manager_has_implicit_create_report_access(self):
+    def test_project_not_found_raises(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(side_effect=[[{'name': 'OTHER'}]])
+        conn.post = mock.MagicMock()
+
+        with pytest.raises(Exception, match='Project TEST does not exist'):
+            conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_manager_without_project_membership_raises(self):
+        conn = IprConnection('user', 'pass')
+        conn.get = mock.MagicMock(
+            side_effect=[
+                [{'name': 'TEST'}],
+                self._user_response(groups=['manager'], projects=[]),
+            ]
+        )
+        conn.post = mock.MagicMock()
+
+        with pytest.raises(
+            Exception, match='User has no permission to create report in project TEST'
+        ):
+            conn.check_upload_permission('TEST')
+
+        conn.post.assert_not_called()
+
+    def test_manager_with_project_membership_allowed(self):
         conn = IprConnection('user', 'pass')
         conn.get = mock.MagicMock(
             side_effect=[

From 9942425176f261f2c3e2ede4f61d142090c083ce Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 13 May 2026 16:20:06 -0700
Subject: [PATCH 48/64] Add get_related_records() and get_related_terms() to
 GraphKBConnection

---
 pori_python/graphkb/util.py     | 58 +++++++++++++++++++++++++++++++++
 tests/test_graphkb/test_util.py | 29 ++++++++++++++++-
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py
index 075084ff..2508b30b 100644
--- a/pori_python/graphkb/util.py
+++ b/pori_python/graphkb/util.py
@@ -366,6 +366,64 @@ def version(self) -> Dict[str, str]:
         """
         return self.request('version')
 
+    def get_related_records(
+        self,
+        base: Union[str, list[str]],
+        ontology: str,
+        subgraphType: str,
+        returnProperties=[],
+    ):
+        """
+        Given some base node RIDs, an ontology class and a subgraph type,
+        leverage the subgraphs route to return the list of related nodes.
+
+        Args:
+            base: the base node RIDs to start the graph traversal from
+            ontology: the ontology class to traverse
+            subgraphType: the type of traversal. See options in API specs
+            returnProperties: additional record properties to return
+
+        Returns:
+            list of related node record(s) traversed
+        """
+        related = self.post(
+            uri=f'/subgraphs/{ontology}',
+            data={
+                'base': base if isinstance(base, list) else [base],
+                'subgraphType': subgraphType,
+                'returnProperties': returnProperties,
+            },
+        )
+        return related['result']['g']['nodes']
+
+    def get_related_terms(
+        self,
+        terms: Union[str, list[str]],
+        ontology: str = 'Vocabulary',
+        subgraphType: str = 'similar',
+    ) -> list[str]:
+        """
+        Given some base term name(s), an ontology class and a subgraph type,
+        leverage the subgraphs route to return the list of related term name(s)
+
+        Args:
+            terms: the base term name(s) to start the graph traversal from
+            ontology: the ontology class to traverse
+            subgraphType: the type of traversal
+
+        Returns:
+            list of related term name(s)
+        """
+        rids = convert_to_rid_list(self.query({'target': ontology, 'filters': {'name': terms}}))
+        nodes = self.get_related_records(
+            base=rids,
+            ontology=ontology,
+            subgraphType=subgraphType,
+        )
+        return list(
+            map(lambda x: x['name'], nodes.values()),
+        )
+
 
 def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:
     """
diff --git a/tests/test_graphkb/test_util.py b/tests/test_graphkb/test_util.py
index e0173a0f..dbbb2c2b 100644
--- a/tests/test_graphkb/test_util.py
+++ b/tests/test_graphkb/test_util.py
@@ -152,7 +152,7 @@ def test_stringifyVariant_positional(self, conn, rid, createdAt, stringifiedVari
             assert util.stringifyVariant(variant=variant, **opt) == stringifiedVariant
 
 
-class TestVersion:
+class TestGraphKBConnection:
     def test_version(self, conn):
         version = conn.version
         assert version['db'] in [
@@ -164,3 +164,30 @@ def test_version(self, conn):
         assert SEMANTIC_VERSIONING_REGEX.match(version['api'])
         assert SEMANTIC_VERSIONING_REGEX.match(version['parser'])
         assert SEMANTIC_VERSIONING_REGEX.match(version['schema'])
+
+    def test_get_related_records(self, conn):
+        base = util.convert_to_rid_list(
+            conn.query({'target': 'Vocabulary', 'filters': {'name': 'missense'}})
+        )
+        records = conn.get_related_records(
+            base=base,
+            ontology='Vocabulary',
+            subgraphType='similar',
+            returnProperties=['displayName'],
+        )
+        assert 'missense mutation' in list(map(lambda x: x['displayName'], records.values()))
+
+    def test_get_related_terms(self, conn):
+        # with defaults
+        vocab_terms = conn.get_related_terms(
+            terms='missense',
+        )
+        assert 'missense mutation' in vocab_terms
+
+        # overriding ontology & subgraphType defaults
+        disease_terms = conn.get_related_terms(
+            terms='all solid tumors',
+            ontology='Disease',
+            subgraphType='parents',
+        )
+        assert 'cancer' in disease_terms

From f13cf6bfb43bf8e677cae587266bc9d1fb06e21a Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 13 May 2026 16:23:34 -0700
Subject: [PATCH 49/64] Refactor get_cancer_genes() to use get_related_terms()

---
 pori_python/graphkb/genes.py | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index bdc4b17c..4d109818 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -24,9 +24,7 @@
 )
 from .match import get_equivalent_features
 from .util import get_rid, logger, looks_like_rid
-from .vocab import convert_to_rid_list, get_terms_set, query_by_name
-
-
+from .vocab import get_terms_set
 def _get_tumourigenesis_genes_list(
     conn: GraphKBConnection,
     relevance: Union[str, list[str]],
@@ -85,8 +83,8 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
 
 
 def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
-    """Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500.
-
+    """
+    Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500.
     Cancer genes include oncogenes, tumour supressor genes and other cancer genes.
 
     Args:
@@ -95,25 +93,12 @@ def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
     Returns:
         gene (Feature) records
     """
-    cancer_gene_rids = convert_to_rid_list(
-        conn.query(query_by_name('Vocabulary', CANCER_GENE)),
-    )
-    associated_terms = conn.post(
-        '/subgraphs/Vocabulary',
-        {
-            'subgraphType': 'children',
-            'base': cancer_gene_rids,
-        },
+    cancer_gene_terms = conn.get_related_terms(
+        terms=CANCER_GENE,
+        subgraphType='children',
     )
-    associated_term_names = list(
-        map(
-            lambda x: x['name'],
-            associated_terms['result']['g']['nodes'].values(),
-        ),
-    )
-
     return _get_tumourigenesis_genes_list(
-        conn, associated_term_names, [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME]
+        conn, cancer_gene_terms, [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME]
     )
 
 

From f3a7f53a1719b4ef034ac4acd58fb6d7682a62be Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 13 May 2026 16:25:19 -0700
Subject: [PATCH 50/64] Add get_cancer_gene_flags()

---
 pori_python/graphkb/genes.py     | 106 +++++++++++++++++++++++++++++++
 tests/test_graphkb/test_genes.py |  22 +++++++
 2 files changed, 128 insertions(+)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 4d109818..09b3d9f7 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -25,6 +25,112 @@
 from .match import get_equivalent_features
 from .util import get_rid, logger, looks_like_rid
 from .vocab import get_terms_set
+
+
+def get_cancer_gene_flags(
+    conn: GraphKBConnection,
+    flags: bool = False,
+    ignore_cache: bool = False,
+) -> Union[List, Dict]:
+    """
+    Return all cancer genes, optionally sorted by flags.
+
+    Flag definitions:
+        oncogenic: relevance 'oncogenic' from OncoKB
+        tumourSuppressive: relevance 'tumour suppressive' from OncoKB
+        cancerGene: relevance 'cancer gene' AND child terms ('oncogenic', 'tumour suppressive', 'other cancer gene'), from OncoKB AND TSO500
+
+    Args:
+        conn: the graphkb connection object
+        flags: if True, return a dict of gene records keyed by flag instead of a flat list
+
+    Returns (if flags=False; default): list of unique gene records
+        [ <record>, <record>, ... ]
+
+    Returns (if flags=True): dict of flags as keys, and list of gene records as value
+        {
+            'oncogenic': [ <record>, <record>, ... ],
+            'tumourSuppressive': [ <record>, <record>, ... ],
+            'cancerGene': [ <record>, <record>, ... ],
+        }
+    """
+    # all cancer gene statements
+    CANCER_GENES = conn.get_related_terms(
+        terms=CANCER_GENE,
+        subgraphType='children',
+    )
+    statements = cast(
+        List[Statement],
+        conn.query(
+            {
+                'target': 'Statement',
+                'filters': {
+                    'relevance': {'target': 'Vocabulary', 'filters': {'name': CANCER_GENES}}
+                },
+                'returnProperties': [
+                    'source.name',
+                    'relevance.name',
+                    *[f'subject.{prop}' for prop in GENE_RETURN_PROPERTIES],
+                ],
+            },
+            ignore_cache=ignore_cache,
+        ),
+    )
+
+    # post-query filtering (faster)
+    cancerGeneStms = list(
+        filter(
+            lambda r: (
+                r['subject']['@class'] == 'Feature'
+                and r['subject']['biotype'] == 'gene'
+                and r['source']['name'] in [ONCOKB_SOURCE_NAME, TSO500_SOURCE_NAME]
+            ),
+            statements,
+        )
+    )
+    oncogenicStms = list(
+        filter(
+            lambda r: (
+                r['relevance']['name'] == ONCOGENE and r['source']['name'] == ONCOKB_SOURCE_NAME
+            ),
+            cancerGeneStms,
+        )
+    )
+    tumourSuppressiveStms = list(
+        filter(
+            lambda r: (
+                r['relevance']['name'] == TUMOUR_SUPPRESSIVE
+                and r['source']['name'] == ONCOKB_SOURCE_NAME
+            ),
+            cancerGeneStms,
+        )
+    )
+
+    # Returning a sorted list of unique gene records, based on iProbe requirements
+    # Unique by name, sorted by displayName
+    names = set()  # for unique gene names tracking
+    if not flags:
+        return cast(
+            List[Record],
+            sorted(
+                [
+                    r['subject']
+                    for r in cancerGeneStms
+                    if r['subject']['name'] not in names and not names.add(r['subject']['name'])
+                ],
+                key=lambda gene: gene['displayName'],
+            ),
+        )
+
+    # Returning a Dict of flags, with list of associated gene records
+    # Duplicates are ok
+    return {
+        'cancerGene': [r['subject'] for r in cancerGeneStms],
+        'oncogenic': [r['subject'] for r in oncogenicStms],
+        'tumourSuppressive': [r['subject'] for r in tumourSuppressiveStms],
+    }
+
+
 def _get_tumourigenesis_genes_list(
     conn: GraphKBConnection,
     relevance: Union[str, list[str]],
diff --git a/tests/test_graphkb/test_genes.py b/tests/test_graphkb/test_genes.py
index 5986b2f0..d53b4e9d 100644
--- a/tests/test_graphkb/test_genes.py
+++ b/tests/test_graphkb/test_genes.py
@@ -8,6 +8,7 @@
 from pori_python.graphkb import GraphKBConnection
 from pori_python.graphkb.genes import (
     get_cancer_genes,
+    get_cancer_gene_flags,
     get_cancer_predisposition_info,
     get_gene_information,
     get_gene_linked_cancer_predisposition_info,
@@ -111,6 +112,27 @@ def conn():
     return conn
 
 
+@pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data')
+def test_cancer_gene_flags(conn):
+    # wo/ flags
+    result = get_cancer_gene_flags(conn)
+    for gene in [*CANONICAL_OTHER_CG, *CANONICAL_TS, *CANONICAL_ONCOGENES]:
+        assert gene in {row['name'] for row in result}
+    # w/ flags
+    result = get_cancer_gene_flags(conn, flags=True)
+    for gene in [*CANONICAL_OTHER_CG, *CANONICAL_TS, *CANONICAL_ONCOGENES]:
+        assert gene in {row['name'] for row in result['cancerGene']}
+    for gene in CANONICAL_TS:
+        assert gene in {row['name'] for row in result['tumourSuppressive']}
+        assert gene not in {row['name'] for row in result['oncogenic']}
+    for gene in CANONICAL_ONCOGENES:
+        assert gene in {row['name'] for row in result['oncogenic']}
+        assert gene not in {row['name'] for row in result['tumourSuppressive']}
+    for gene in [*CANONICAL_OTHER_CG]:
+        assert gene not in {row['name'] for row in result['oncogenic']}
+        assert gene not in {row['name'] for row in result['tumourSuppressive']}
+
+
 @pytest.mark.skipif(EXCLUDE_ONCOKB_TESTS, reason='excluding tests that depend on oncokb data')
 def test_oncogene(conn):
     result = get_oncokb_oncogenes(conn)

From 27ec42952e29a299b729213ab66ecae2852377b9 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 13 May 2026 16:26:28 -0700
Subject: [PATCH 51/64] Deprecate _get_tumourigenesis_genes_list(),
 get_oncokb_oncogenes(), get_oncokb_tumour_supressors() and get_cancer_genes()

---
 pori_python/graphkb/genes.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 09b3d9f7..82fd9c7f 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -131,6 +131,7 @@ def get_cancer_gene_flags(
     }
 
 
+@deprecated('fuctionality replaced by get_cancer_gene_flags')
 def _get_tumourigenesis_genes_list(
     conn: GraphKBConnection,
     relevance: Union[str, list[str]],
@@ -164,6 +165,7 @@ def _get_tumourigenesis_genes_list(
     return [gene for gene in genes.values()]
 
 
+@deprecated('fuctionality replaced by get_cancer_gene_flags')
 def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
     """Get the list of oncogenes stored in GraphKB derived from OncoKB.
 
@@ -176,6 +178,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
     return _get_tumourigenesis_genes_list(conn, ONCOGENE, ONCOKB_SOURCE_NAME)
 
 
+@deprecated('fuctionality replaced by get_cancer_gene_flags')
 def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
     """Get the list of tumour supressor genes stored in GraphKB derived from OncoKB.
 
@@ -188,6 +191,7 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
     return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, ONCOKB_SOURCE_NAME)
 
 
+@deprecated('fuctionality replaced by get_cancer_gene_flags')
 def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
     """
     Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500.

From 22f5b412697f2975fdd542951ee8c6b6c2bcb2bc Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 13 May 2026 16:27:19 -0700
Subject: [PATCH 52/64] Update get_gene_information() to use
 get_cancer_gene_flags()

---
 pori_python/graphkb/genes.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 82fd9c7f..63d14a9b 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -630,12 +630,12 @@ def get_gene_information(
                     # PositionalVariant without a reference2 implies a smallMutation type
                     gene_flags['knownSmallMutation'].add(condition['reference1'])  # type: ignore
 
-    logger.info('fetching oncogenes list')
-    gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
-    logger.info('fetching tumour supressors list')
-    gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))
-    logger.info('fetching cancerGeneListMatch list')
-    gene_flags['cancerGeneListMatch'] = convert_to_rid_set(get_cancer_genes(graphkb_conn))
+    # cancer gene flags
+    logger.info('fetching cancer genes')
+    cancer_gene_flags = get_cancer_gene_flags(graphkb_conn, flags=True)
+    gene_flags['oncogene'] = convert_to_rid_set(cancer_gene_flags['oncogenic'])
+    gene_flags['tumourSuppressor'] = convert_to_rid_set(cancer_gene_flags['tumourSuppressive'])
+    gene_flags['cancerGeneListMatch'] = convert_to_rid_set(cancer_gene_flags['cancerGene'])
 
     logger.info('fetching therapeutic associated genes lists')
     gene_flags['therapeuticAssociated'] = convert_to_rid_set(

From 2e56ea8e36b1cbb6c1a33b68ef4113040c7207d0 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 13 May 2026 16:29:47 -0700
Subject: [PATCH 53/64] Add equivalent gene name caching to
 get_gene_information()

---
 pori_python/graphkb/genes.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 63d14a9b..6c62e601 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -644,8 +644,14 @@ def get_gene_information(
 
     logger.info(f'Setting gene_info flags on {len(gene_names)} genes')
     result: List[IprGene] = []
+    EQUIVALENT_CACHE = {}
     for gene_name in gene_names:
-        equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
+        if gene_name not in EQUIVALENT_CACHE:
+            EQUIVALENT_CACHE[gene_name] = convert_to_rid_set(
+                get_equivalent_features(graphkb_conn, gene_name)
+            )
+        equivalent = EQUIVALENT_CACHE[gene_name]
+
         row: Dict[str, str | bool] = {'name': gene_name}
         flagged = False
         for flag in gene_flags:

From 07139315f7b7c971e94303d7e267bb36b55c07a6 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Thu, 14 May 2026 08:47:57 -0700
Subject: [PATCH 54/64] Revert equivalent gene names caching in
 get_gene_information()

---
 pori_python/graphkb/genes.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 6c62e601..63d14a9b 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -644,14 +644,8 @@ def get_gene_information(
 
     logger.info(f'Setting gene_info flags on {len(gene_names)} genes')
     result: List[IprGene] = []
-    EQUIVALENT_CACHE = {}
     for gene_name in gene_names:
-        if gene_name not in EQUIVALENT_CACHE:
-            EQUIVALENT_CACHE[gene_name] = convert_to_rid_set(
-                get_equivalent_features(graphkb_conn, gene_name)
-            )
-        equivalent = EQUIVALENT_CACHE[gene_name]
-
+        equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
         row: Dict[str, str | bool] = {'name': gene_name}
         flagged = False
         for flag in gene_flags:

From 5c3fe7a8b4e9d40bed3ee463a55c946a521876e3 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Thu, 14 May 2026 08:50:11 -0700
Subject: [PATCH 55/64] Add test to test_cancer_gene_flags()

---
 pori_python/graphkb/genes.py     | 2 +-
 tests/test_graphkb/test_genes.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 63d14a9b..4ae4710e 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -108,8 +108,8 @@ def get_cancer_gene_flags(
 
     # Returning a sorted list of unique gene records, based on iProbe requirements
     # Unique by name, sorted by displayName
-    names = set()  # for unique gene names tracking
     if not flags:
+        names = set()  # for unique gene names tracking
         return cast(
             List[Record],
             sorted(
diff --git a/tests/test_graphkb/test_genes.py b/tests/test_graphkb/test_genes.py
index d53b4e9d..c2ba87a7 100644
--- a/tests/test_graphkb/test_genes.py
+++ b/tests/test_graphkb/test_genes.py
@@ -116,6 +116,9 @@ def conn():
 def test_cancer_gene_flags(conn):
     # wo/ flags
     result = get_cancer_gene_flags(conn)
+    assert [r['displayName'] for r in result] == sorted(
+        list({r['displayName'] for r in result}),  # makes displayName unique and sorted
+    )
     for gene in [*CANONICAL_OTHER_CG, *CANONICAL_TS, *CANONICAL_ONCOGENES]:
         assert gene in {row['name'] for row in result}
     # w/ flags

From 0444682f3463b3908694005240d2627ef6bbd602 Mon Sep 17 00:00:00 2001
From: Eleanor Lewis <elewis@bcgsc.ca>
Date: Thu, 14 May 2026 17:11:26 -0700
Subject: [PATCH 56/64] do not rely on side effect

---
 pori_python/ipr/inputs.py | 4 +++-
 pori_python/ipr/main.py   | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
index 25901fc3..beec4f3f 100644
--- a/pori_python/ipr/inputs.py
+++ b/pori_python/ipr/inputs.py
@@ -812,8 +812,9 @@ def normalize_seqqc(content: Dict) -> Dict:
         content: Report content dictionary that may contain seqQC array
 
     Returns:
-        The content dictionary with seqQC fields normalized in-place
+        A new content dictionary with seqQC fields normalized
     """
+    content = {**content}
     # Field name mapping from production/legacy format to schema format
     field_mapping = {
         'Reads': 'reads',
@@ -829,6 +830,7 @@ def normalize_seqqc(content: Dict) -> Dict:
     normalized_keys = set(field_mapping.values())
 
     if 'seqQC' in content and isinstance(content['seqQC'], list):
+        content['seqQC'] = list(content['seqQC'])
         for i, item in enumerate(content['seqQC']):
             if not isinstance(item, dict):
                 continue
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
index c249b791..3df5819e 100644
--- a/pori_python/ipr/main.py
+++ b/pori_python/ipr/main.py
@@ -405,7 +405,7 @@ def ipr_report(
 
     # seqqc normalization is a bridging measure only;
     # validate_json should be called on non-normalized json
-    normalize_seqqc(content)
+    content = normalize_seqqc(content)
 
     if upload_json:
         if not ipr_conn:

From 977c1a8630a1548d10bbeed134fce80e30fde4a0 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 09:08:38 -0700
Subject: [PATCH 57/64] Fix typo

---
 pori_python/graphkb/genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 4ae4710e..3ba090c3 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -131,7 +131,7 @@ def get_cancer_gene_flags(
     }
 
 
-@deprecated('fuctionality replaced by get_cancer_gene_flags')
+@deprecated('functionality replaced by get_cancer_gene_flags')
 def _get_tumourigenesis_genes_list(
     conn: GraphKBConnection,
     relevance: Union[str, list[str]],

From 482ea30f123fc9b30730ee6e2fcba62cd6729764 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 10:21:52 -0700
Subject: [PATCH 58/64] Fix formatting in get_cancer_gene_flags()

---
 pori_python/graphkb/genes.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 3ba090c3..1b4f903e 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -31,7 +31,7 @@ def get_cancer_gene_flags(
     conn: GraphKBConnection,
     flags: bool = False,
     ignore_cache: bool = False,
-) -> Union[List, Dict]:
+) -> Union[List[Record], Dict[str, List[Record]]]:
     """
     Return all cancer genes, optionally sorted by flags.
 
@@ -42,7 +42,8 @@ def get_cancer_gene_flags(
 
     Args:
         conn: the graphkb connection object
-        namesOnly: if only the gene names should be returned
+        flags: if the results should be sorted by flags
+        ignore_cache: if cache should be ignored when querying GraphKB API
 
     Returns (if flags=False; default): list of unique gene records
         [ <record>, <record>, ... ]
@@ -50,12 +51,12 @@ def get_cancer_gene_flags(
     Returns (if flags=True): dict of flags as keys, and list of gene records as value
         {
             'oncogenic': [ <record>, <record>, ... ],
-            'tumourSuppressive' = [ <record>, <record>, ... ],
-            'cancerGene' = [ <record>, <record>, ... ],
+            'tumourSuppressive': [ <record>, <record>, ... ],
+            'cancerGene': [ <record>, <record>, ... ],
         }
     """
     # all cancer gene statements
-    CANCER_GENES = conn.get_related_terms(
+    cancer_genes = conn.get_related_terms(
         terms=CANCER_GENE,
         subgraphType='children',
     )
@@ -65,7 +66,7 @@ def get_cancer_gene_flags(
             {
                 'target': 'Statement',
                 'filters': {
-                    'relevance': {'target': 'Vocabulary', 'filters': {'name': CANCER_GENES}}
+                    'relevance': {'target': 'Vocabulary', 'filters': {'name': cancer_genes}}
                 },
                 'returnProperties': [
                     'source.name',

From d8a68fa64cadd8b98bbd150b96a4ff445fab10f8 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 10:23:32 -0700
Subject: [PATCH 59/64] Refactor unique gene filtering in
 get_cancer_gene_flags()

---
 pori_python/graphkb/genes.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 1b4f903e..3ce5c6c3 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -110,17 +110,17 @@ def get_cancer_gene_flags(
     # Returning a sorted list of unique gene records, based on iProbe requirements
     # Unique by name, sorted by displayName
     if not flags:
-        names = set()  # for unique gene names tracking
+        seen: set = set()
+        unique_genes: List[Record] = []
+        for r in cancerGeneStms:
+            name = r['subject']['name']
+            if name not in seen:
+                seen.add(name)
+                unique_genes.append(r['subject'])
+
         return cast(
             List[Record],
-            sorted(
-                [
-                    r['subject']
-                    for r in cancerGeneStms
-                    if r['subject']['name'] not in names and not names.add(r['subject']['name'])
-                ],
-                key=lambda gene: gene['displayName'],
-            ),
+            sorted(unique_genes, key=lambda gene: gene['displayName']),
         )
 
     # Returning a Dict of flags, with list of associated gene records

From 49aaeaa05cd552adea59c8dc4b9ad90558a21596 Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 10:24:23 -0700
Subject: [PATCH 60/64] Fix _get_tumourigenesis_genes_list() signature

---
 pori_python/graphkb/genes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 3ce5c6c3..3548523b 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -135,8 +135,8 @@ def get_cancer_gene_flags(
 @deprecated('functionality replaced by get_cancer_gene_flags')
 def _get_tumourigenesis_genes_list(
     conn: GraphKBConnection,
-    relevance: Union[str, list[str]],
-    sources: Union[str, list[str]],
+    relevance: Union[str, List[str]],
+    sources: Union[str, List[str]],
     ignore_cache: bool = False,
 ) -> List[Ontology]:
     statements = cast(

From aa2bb7f15563959260a24059167b7b836e68107e Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 10:24:39 -0700
Subject: [PATCH 61/64] Fix typo

---
 pori_python/graphkb/genes.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pori_python/graphkb/genes.py b/pori_python/graphkb/genes.py
index 3548523b..376693af 100644
--- a/pori_python/graphkb/genes.py
+++ b/pori_python/graphkb/genes.py
@@ -166,7 +166,7 @@ def _get_tumourigenesis_genes_list(
     return [gene for gene in genes.values()]
 
 
-@deprecated('fuctionality replaced by get_cancer_gene_flags')
+@deprecated('functionality replaced by get_cancer_gene_flags')
 def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
     """Get the list of oncogenes stored in GraphKB derived from OncoKB.
 
@@ -179,7 +179,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
     return _get_tumourigenesis_genes_list(conn, ONCOGENE, ONCOKB_SOURCE_NAME)
 
 
-@deprecated('fuctionality replaced by get_cancer_gene_flags')
+@deprecated('functionality replaced by get_cancer_gene_flags')
 def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
     """Get the list of tumour supressor genes stored in GraphKB derived from OncoKB.
 
@@ -192,7 +192,7 @@ def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
     return _get_tumourigenesis_genes_list(conn, TUMOUR_SUPPRESSIVE, ONCOKB_SOURCE_NAME)
 
 
-@deprecated('fuctionality replaced by get_cancer_gene_flags')
+@deprecated('functionality replaced by get_cancer_gene_flags')
 def get_cancer_genes(conn: GraphKBConnection) -> List[Ontology]:
     """
     Get the list of cancer genes stored in GraphKB derived from OncoKB & TSO500.

From 9a5eb038b1a675b73511223ddc34a520dab253be Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 10:25:44 -0700
Subject: [PATCH 62/64] Fix get_related_records() & get_related_terms()
 signatures

---
 pori_python/graphkb/util.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py
index 2508b30b..d52508a2 100644
--- a/pori_python/graphkb/util.py
+++ b/pori_python/graphkb/util.py
@@ -368,11 +368,11 @@ def version(self) -> Dict[str, str]:
 
     def get_related_records(
         self,
-        base: Union[str, list[str]],
+        base: Union[str, List[str]],
         ontology: str,
         subgraphType: str,
-        returnProperties=[],
-    ):
+        returnProperties: Optional[List[str]] = None,
+    ) -> List[Record]:
         """
         Given some base node RIDs, an ontology class and a subgraph type,
         leverage the subgraphs route to return the list of related nodes.
@@ -391,17 +391,17 @@ def get_related_records(
             data={
                 'base': base if isinstance(base, list) else [base],
                 'subgraphType': subgraphType,
-                'returnProperties': returnProperties,
+                'returnProperties': returnProperties or [],
             },
         )
         return related['result']['g']['nodes']
 
     def get_related_terms(
         self,
-        terms: Union[str, list[str]],
+        terms: Union[str, List[str]],
         ontology: str = 'Vocabulary',
         subgraphType: str = 'similar',
-    ) -> list[str]:
+    ) -> List[str]:
         """
         Given some base term name(s), an ontology class and a subgraph type,
         leverage the subgraphs route to return the list of related term name(s)

From 27b8ed572e47b4363f827aa90a0cd0c02ac3eabe Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Fri, 15 May 2026 10:26:09 -0700
Subject: [PATCH 63/64] Simplify filtering in get_related_terms()

---
 pori_python/graphkb/util.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pori_python/graphkb/util.py b/pori_python/graphkb/util.py
index d52508a2..a2a9bb14 100644
--- a/pori_python/graphkb/util.py
+++ b/pori_python/graphkb/util.py
@@ -420,9 +420,7 @@ def get_related_terms(
             ontology=ontology,
             subgraphType=subgraphType,
         )
-        return list(
-            map(lambda x: x['name'], nodes.values()),
-        )
+        return [x['name'] for x in nodes.values()]
 
 
 def get_rid(conn: GraphKBConnection, target: str, name: str) -> str:

From e54ebfe93fc7d9827ae572ebdd8e4bb1b7119f1b Mon Sep 17 00:00:00 2001
From: mathieulemieux <mathieu@mathieulemieux.com>
Date: Wed, 20 May 2026 08:18:06 -0700
Subject: [PATCH 64/64] Bump version from 1.4.0 to 1.5.0

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index e3b7d63c..0cdeb341 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,7 +18,7 @@ known_standard_library = requests
 
 [metadata]
 name = pori_python
-version = 1.4.0
+version = 1.5.0
 url = https://github.com/bcgsc/pori_python
 author_email = dat@bcgsc.ca
 maintainer_email = dat@bcgsc.ca