perf: fix low-hanging performance issues in MetaCAT and linking (#400)

bgriffen · web-flow · commit 8a630ba767cf · 2026-04-07T14:52:07.000+01:00
* perf(metacat): scope max_seq_len and batch slice to current batch

create_batch_piped_data was computing max_seq_len over the entire
dataset on every batch call, and slicing data[start_ind:end_ind]
three times. Scope both to a single batch slice — reduces padding
overhead and eliminates redundant iteration.

* perf(linking): update similarities in-place during disambiguation

Replace list copy + clear + rebuild with a simple in-place loop.
Eliminates the two intermediate list allocations (the copy and the
rebuilt list) in the disambiguation hot path.

* perf(metacat): replace O(n) dict values scan with O(1) key lookup

undersample_data and encode_category_values both checked membership
against category_value2id.values() (linear scan) on every iteration.
Since label_data dicts are keyed by the same IDs, check membership
against the dict itself (O(1) hash lookup).

* perf(metacat): use append instead of list concatenation in eval

dict.get(key, []) + [item] allocates a new list on every iteration
and copies the examples already stored under that key, so collecting
n examples into lists of up to k entries costs O(n*k). Use
setdefault + append for O(1) amortized per insertion.
diff --git a/medcat-v2/medcat/components/addons/meta_cat/data_utils.py b/medcat-v2/medcat/components/addons/meta_cat/data_utils.py
@@ -319,10 +319,9 @@ def undersample_data(data: list, category_value2id: dict, label_data_,
             label_data_counter[sample[-1]] += 1
 
     label_data = {v: 0 for v in category_value2id.values()}
-    for i in range(len(data_undersampled)):
-        if data_undersampled[i][2] in category_value2id.values():
-            label_data[data_undersampled[i][2]] = (
-                label_data[data_undersampled[i][2]] + 1)
+    for sample in data_undersampled:
+        if sample[2] in label_data:
+            label_data[sample[2]] += 1
     logger.info("Updated number of samples per label (for 2-phase learning):"
                 " %s", label_data)
     return data_undersampled
@@ -414,9 +413,9 @@ def encode_category_values(data: list[tuple[list, list, str]],
 
     # Creating dict with labels and its number of samples
     label_data_ = {v: 0 for v in category_value2id.values()}
-    for i in range(len(data)):
-        if data[i][2] in category_value2id.values():
-            label_data_[data[i][2]] = label_data_[data[i][2]] + 1
+    for sample in data:
+        if sample[2] in label_data_:
+            label_data_[sample[2]] += 1
 
     logger.info("Original number of samples per label: %s", label_data_)
 
diff --git a/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py b/medcat-v2/medcat/components/addons/meta_cat/ml_utils.py
@@ -63,14 +63,15 @@ def create_batch_piped_data(data: list[tuple[list[int], int, Optional[int]]],
         y (Optional[torch.Tensor]):
             class label of the data
     """
-    max_seq_len = max([len(x[0]) for x in data])
+    batch = data[start_ind:end_ind]
+    max_seq_len = max(len(x[0]) for x in batch)
     x = [x[0][0:max_seq_len] + [pad_id] * max(0, max_seq_len - len(x[0]))
-         for x in data[start_ind:end_ind]]
-    cpos = [x[1] for x in data[start_ind:end_ind]]
+         for x in batch]
+    cpos = [x[1] for x in batch]
     y = None
     if len(data[0]) == 3:
         # Means we have the y column
-        y = torch.tensor([x[2] for x in data[start_ind:end_ind]],
+        y = torch.tensor([x[2] for x in batch],
                          dtype=torch.long).to(device)
 
     x2 = torch.tensor(x, dtype=torch.long).to(device)
@@ -511,10 +512,10 @@ def _eval_predictions(
         info = "Predicted: {}, True: {}".format(pred, y)
         if pred != y:
             # We made a mistake
-            examples['FN'][y] = examples['FN'].get(y, []) + [(info, text)]
-            examples['FP'][pred] = examples['FP'].get(pred, []) + [(info, text)]
+            examples['FN'].setdefault(y, []).append((info, text))
+            examples['FP'].setdefault(pred, []).append((info, text))
         else:
-            examples['TP'][y] = examples['TP'].get(y, []) + [(info, text)]
+            examples['TP'].setdefault(y, []).append((info, text))
 
     return {'precision': precision, 'recall': recall, 'f1': f1,
             'examples': examples, 'confusion matrix': confusion}
diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py
@@ -231,10 +231,9 @@ def _preprocess_disamb_similarities(self, entity: MutableEntity,
             pref_freq = self.config.prefer_frequent_concepts
             scales = [np.log10(cnt / m) * pref_freq if cnt > 10 else 0
                       for cnt in cnts]
-            old_sims = list(similarities)
-            similarities.clear()
-            similarities += [float(min(0.99, sim + sim * scale))
-                             for sim, scale in zip(old_sims, scales)]
+            for i, scale in enumerate(scales):
+                similarities[i] = float(min(0.99,
+                                            similarities[i] + similarities[i] * scale))
 
     def get_all_similarities(self, cuis: list[str], entity: MutableEntity,
                              name: str, doc: MutableDocument,