From 12b7c5286b18a325271bd791c29d5c9ae5789d7f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Apr 2026 14:57:57 +0100 Subject: [PATCH 01/10] CU-869cw9zmj: Use faster way to calculate unit vector --- medcat-v2/medcat/utils/matutils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-v2/medcat/utils/matutils.py b/medcat-v2/medcat/utils/matutils.py index 1fcce1629..ac05ee2d4 100644 --- a/medcat-v2/medcat/utils/matutils.py +++ b/medcat-v2/medcat/utils/matutils.py @@ -15,7 +15,8 @@ def unitvec(vec: np.ndarray) -> np.ndarray: Returns: np.ndarray: The new unit vector. """ - return vec / np.linalg.norm(vec) + vec /= np.sqrt(vec @ vec) + return vec @overload From 93fe94cc4a55d3dd8b686a8b718113d697ff873c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Apr 2026 15:14:03 +0100 Subject: [PATCH 02/10] CU-869cw9zmj: Speed up context vector obtaining --- .../linking/vector_context_model.py | 63 +++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index 9815c171e..acf9f18f3 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -91,9 +91,10 @@ def get_context_tokens(self, entity: MutableEntity, doc: MutableDocument, return tokens_left, tokens_center, tokens_right - def _tokens2vecs(self, tokens: Sequence[Union[MutableToken, str]] - ) -> Iterable[np.ndarray]: - for step, tkn in enumerate(tokens): + def _tokens2vecs(self, tokens: Sequence[Union[MutableToken, str]], + step_start: int = 0 + ) -> Iterable[np.ndarray]: + for step, tkn in enumerate(tokens, start=step_start): lower = tkn.lower() if isinstance(tkn, str) else tkn.base.lower if lower not in self.vocab: continue @@ -137,26 +138,52 @@ def get_context_vectors(self, entity: MutableEntity, """ vectors: dict[str, np.ndarray] = {} - context_vector_sizes = self.config.context_vector_sizes - for context_type, window_size in context_vector_sizes.items(): - tokens_left, tokens_center, tokens_right = self.get_context_tokens( - entity, doc, window_size, per_doc_valid_token_cache) + # Sort ascending so each iteration is a superset of the previous + sorted_contexts = sorted( + self.config.context_vector_sizes.items(), key=lambda x: x[1]) - values: list[np.ndarray] = [] - # Add left - values.extend(self._tokens2vecs(tokens_left)) + prev_left: list[MutableToken] = [] + prev_right: list[MutableToken] = [] + # Accumulated weighted vecs from previous (smaller) windows, + # excluding center (center is the same for all window sizes) + prev_left_vecs: list[np.ndarray] = [] + prev_right_vecs: list[np.ndarray] = [] + center_vecs: Optional[list[np.ndarray]] = None # same for all windows - if not self.config.context_ignore_center_tokens: - # Add center - values.extend( - self._preprocess_center_tokens(cui, tokens_center)) + for context_type, window_size in sorted_contexts: + tokens_left, tokens_center, tokens_right = self.get_context_tokens( + entity, doc, window_size, per_doc_valid_token_cache) - # Add right - values.extend(self._tokens2vecs(tokens_right)) + # New outer tokens only — the inner ones were already processed + new_left = tokens_left[:len(tokens_left) - len(prev_left)] + new_right = tokens_right[len(prev_right):] + + # step_start for new left tokens: they are further from centre + # so their step index is + # len(tokens_left) - len(new_left) ... len(tokens_left)-1 + # i.e. the new tokens are the outermost, highest-step ones + new_left_vecs = list(self._tokens2vecs( + new_left, step_start=len(prev_left))) + new_right_vecs = list(self._tokens2vecs( + new_right, step_start=len(prev_right))) + + prev_left_vecs = new_left_vecs + prev_left_vecs + prev_right_vecs = prev_right_vecs + new_right_vecs + prev_left = tokens_left + prev_right = tokens_right + + # Center is identical for all window sizes, only compute once + if center_vecs is None: + if not self.config.context_ignore_center_tokens: + center_vecs = list( + self._preprocess_center_tokens(cui, tokens_center)) + else: + center_vecs = [] + values = prev_left_vecs + center_vecs + prev_right_vecs if values: - value = np.average(values, axis=0) - vectors[context_type] = value + vectors[context_type] = np.average(values, axis=0) + return vectors def similarity(self, cui: str, entity: MutableEntity, doc: MutableDocument, From 7856c12f2d622021dfb6fa60f54e12491cf9cde6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Apr 2026 16:51:05 +0100 Subject: [PATCH 03/10] CU-869ctq789: Avoid leaking normalized vectors --- medcat-v2/medcat/components/linking/vector_context_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index acf9f18f3..14434279a 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -511,7 +511,7 @@ def update_context_vectors(to_update: dict[str, np.ndarray], cui: str, "Is Negative: %s, LR: %.5f, b: %.3f", cui, context_type, similarity, negative, lr, b) cv = to_update[context_type] - similarity_after = np.dot(unitvec(cv), unitvec(vector)) + similarity_after = np.dot(unitvec(cv.copy()), unitvec(vector)) logger.debug("Similarity before vs after: %.5f vs %.5f", similarity, similarity_after) else: From 7759c65d86149611869298049579d665417db922 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 14 Apr 2026 14:19:18 +0100 Subject: [PATCH 04/10] Revert "CU-869ctq789: Avoid leaking normalized vectors" This reverts commit 7856c12f2d622021dfb6fa60f54e12491cf9cde6. --- medcat-v2/medcat/components/linking/vector_context_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index 14434279a..acf9f18f3 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -511,7 +511,7 @@ def update_context_vectors(to_update: dict[str, np.ndarray], cui: str, "Is Negative: %s, LR: %.5f, b: %.3f", cui, context_type, similarity, negative, lr, b) cv = to_update[context_type] - similarity_after = np.dot(unitvec(cv.copy()), unitvec(vector)) + similarity_after = np.dot(unitvec(cv), unitvec(vector)) logger.debug("Similarity before vs after: %.5f vs %.5f", similarity, similarity_after) else: From 916b7a58c3cbf2c63bbd6db2a6ab76b576aff7d3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 14 Apr 2026 14:19:19 +0100 Subject: [PATCH 05/10] Revert "CU-869cw9zmj: Use faster way to calculate unit vector" This reverts commit 12b7c5286b18a325271bd791c29d5c9ae5789d7f. --- medcat-v2/medcat/utils/matutils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/medcat-v2/medcat/utils/matutils.py b/medcat-v2/medcat/utils/matutils.py index ac05ee2d4..1fcce1629 100644 --- a/medcat-v2/medcat/utils/matutils.py +++ b/medcat-v2/medcat/utils/matutils.py @@ -15,8 +15,7 @@ def unitvec(vec: np.ndarray) -> np.ndarray: Returns: np.ndarray: The new unit vector. """ - vec /= np.sqrt(vec @ vec) - return vec + return vec / np.linalg.norm(vec) @overload From b391b251ac485a1c8060d313cae35c0415e522d1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 14 Apr 2026 15:33:40 +0100 Subject: [PATCH 06/10] CU-869cw9zmj: Fix usage of tokens in the correct order --- medcat-v2/medcat/components/linking/vector_context_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index acf9f18f3..1f18eaf6f 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -155,7 +155,7 @@ def get_context_vectors(self, entity: MutableEntity, entity, doc, window_size, per_doc_valid_token_cache) # New outer tokens only — the inner ones were already processed - new_left = tokens_left[:len(tokens_left) - len(prev_left)] + new_left = tokens_left[len(prev_left):] new_right = tokens_right[len(prev_right):] # step_start for new left tokens: they are further from centre From 201c94e891e10df74693c795f5645ff5f1c75f29 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 15 Apr 2026 08:58:36 +0100 Subject: [PATCH 07/10] CU-869cw9zmj: Add small comment on left token slicing --- medcat-v2/medcat/components/linking/vector_context_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index 1f18eaf6f..ddd5133d5 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -155,6 +155,8 @@ def get_context_vectors(self, entity: MutableEntity, entity, doc, window_size, per_doc_valid_token_cache) # New outer tokens only — the inner ones were already processed + # NOTE: left hand tokens are in order of closest first, which is why + # we're slicing from the start of the list new_left = tokens_left[len(prev_left):] new_right = tokens_right[len(prev_right):] From 675d5b49cfc76f223f71bbe3877c959f2c37253b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 24 Apr 2026 10:00:50 +0100 Subject: [PATCH 08/10] CU-869cw9zmj: Separate centre context vectors calculation outside the loop --- .../components/linking/vector_context_model.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index ddd5133d5..e67ca35de 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -148,7 +148,13 @@ def get_context_vectors(self, entity: MutableEntity, # excluding center (center is the same for all window sizes) prev_left_vecs: list[np.ndarray] = [] prev_right_vecs: list[np.ndarray] = [] - center_vecs: Optional[list[np.ndarray]] = None # same for all windows + + # Center is identical for all window sizes, only compute once + if not self.config.context_ignore_center_tokens: + center_vecs = list( + self._preprocess_center_tokens(cui, tokens_center)) + else: + center_vecs = [] for context_type, window_size in sorted_contexts: tokens_left, tokens_center, tokens_right = self.get_context_tokens( @@ -174,14 +180,6 @@ def get_context_vectors(self, entity: MutableEntity, prev_left = tokens_left prev_right = tokens_right - # Center is identical for all window sizes, only compute once - if center_vecs is None: - if not self.config.context_ignore_center_tokens: - center_vecs = list( - self._preprocess_center_tokens(cui, tokens_center)) - else: - center_vecs = [] - values = prev_left_vecs + center_vecs + prev_right_vecs if values: vectors[context_type] = np.average(values, axis=0) From e3c5fa286e8773a7defdb420d0e168de49d08f9e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 24 Apr 2026 10:11:38 +0100 Subject: [PATCH 09/10] CU-869cw9zmj: Get centre tokens separately only if they're required --- .../components/linking/vector_context_model.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index e67ca35de..27f636a35 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -58,6 +58,7 @@ def __init__(self, cui2info: dict[str, CUIInfo], def get_context_tokens(self, entity: MutableEntity, doc: MutableDocument, size: int, per_doc_valid_token_cache: 'PerDocumentTokenCache', + fill_centre_tokens: bool = True, ) -> tuple[list[MutableToken], list[MutableToken], list[MutableToken]]: @@ -83,8 +84,11 @@ def get_context_tokens(self, entity: MutableEntity, doc: MutableDocument, per_doc_valid_token_cache[tkn]] # Reverse because the first token should be the one closest to center tokens_left.reverse() - tokens_center: list[MutableToken] = list( - cast(Iterable[MutableToken], entity)) + if fill_centre_tokens: + tokens_center: list[MutableToken] = list( + cast(Iterable[MutableToken], entity)) + else: + tokens_center = [] _right_tokens = doc[end_ind + 1:end_ind + 1 + size] tokens_right = [tkn for tkn in _right_tokens if per_doc_valid_token_cache[tkn]] @@ -151,14 +155,17 @@ def get_context_vectors(self, entity: MutableEntity, # Center is identical for all window sizes, only compute once if not self.config.context_ignore_center_tokens: + tokens_center = list( + cast(Iterable[MutableToken], entity)) center_vecs = list( self._preprocess_center_tokens(cui, tokens_center)) else: center_vecs = [] for context_type, window_size in sorted_contexts: - tokens_left, tokens_center, tokens_right = self.get_context_tokens( - entity, doc, window_size, per_doc_valid_token_cache) + tokens_left, _, tokens_right = self.get_context_tokens( + entity, doc, window_size, per_doc_valid_token_cache, + fill_centre_tokens=False) # New outer tokens only — the inner ones were already processed # NOTE: left hand tokens are in order of closest first, which is why From 0281c224c48731528385e93f2c71d5fc02f92017 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 24 Apr 2026 10:21:49 +0100 Subject: [PATCH 10/10] CU-869cw9zmj: Fix linting issue --- medcat-v2/medcat/components/linking/vector_context_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index 27f636a35..c009d4cee 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -155,7 +155,7 @@ def get_context_vectors(self, entity: MutableEntity, # Center is identical for all window sizes, only compute once if not self.config.context_ignore_center_tokens: - tokens_center = list( + tokens_center = list( cast(Iterable[MutableToken], entity)) center_vecs = list( self._preprocess_center_tokens(cui, tokens_center))