From fc0b189b375fce51f1714bab4c14c7ebc396f673 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 3 Feb 2026 10:24:07 +0000 Subject: [PATCH 1/4] CU-869c0g9f7: Add test for embedding linker disambiguation --- .../linking/test_embedding_linker.py | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/medcat-v2/tests/components/linking/test_embedding_linker.py b/medcat-v2/tests/components/linking/test_embedding_linker.py index 187bc2189..658ecc52f 100644 --- a/medcat-v2/tests/components/linking/test_embedding_linker.py +++ b/medcat-v2/tests/components/linking/test_embedding_linker.py @@ -1,13 +1,17 @@ from medcat.components.linking import embedding_linker from medcat.components import types from medcat.config import Config +from medcat.data.entities import Entity from medcat.vocab import Vocab +from medcat.cat import CAT from medcat.cdb.concepts import CUIInfo, NameInfo from medcat.components.types import TrainableComponent from medcat.components.types import _DEFAULT_LINKING as DEF_LINKING import unittest from ..helper import ComponentInitTests +from ... 
import UNPACKED_EXAMPLE_MODEL_PACK_PATH + class FakeDocument: linked_ents = [] ner_ents = [] @@ -64,4 +68,36 @@ def test_linker_is_not_trainable(self): def test_linker_processes_document(self): doc = FakeDocument("Test Document") - self.linker(doc) \ No newline at end of file + self.linker(doc) + + +class EmbeddingModelDisambiguationTests(unittest.TestCase): + PLACEHOLDER = "{SOME_PLACEHOLDER}" + TEXT = f"""The issue has a lot to do with the {PLACEHOLDER}""" + + @classmethod + def setUpClass(cls) -> None: + cls.model = CAT.load_model_pack(UNPACKED_EXAMPLE_MODEL_PACK_PATH) + cls.model.config.components.linking = embedding_linker.EmbeddingLinking() + cls.model._recreate_pipe() + linker: embedding_linker.Linker = cls.model.pipe.get_component( + types.CoreComponentType.linking) + linker.create_embeddings() + + def assert_has_name(self, out_ents: dict[int, Entity], name: str): + self.assertTrue( + any(ent["source_value"] == name for ent in out_ents.values()) + ) + + def test_does_disambiguation(self): + used_names = 0 + for name, info in self.model.cdb.name2info.items(): + if len(info['per_cui_status']) <= 1: + continue + used_names += 1 + with self.subTest(name): + cur_text = self.TEXT.replace(self.PLACEHOLDER, name) + out_ents = self.model.get_entities(cur_text)["entities"] + self.assert_has_name(out_ents, name) + self.assertGreater(used_names, 0) + From a71550b6bf20556657f7aef9221be356f8300b46 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 3 Feb 2026 10:25:37 +0000 Subject: [PATCH 2/4] CU-869c0g9f7: Add some improved typing to embedding linker --- medcat-v2/medcat/components/linking/embedding_linker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py index 80d495dd8..b3094ee9f 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-v2/medcat/components/linking/embedding_linker.py @@ -619,7 +619,8 @@ def 
_generate_link_candidates( entity.link_candidates = list(cuis) - def _pre_inference(self, doc: MutableDocument) -> tuple[list, list]: + def _pre_inference(self, doc: MutableDocument + ) -> tuple[list[MutableEntity], list[MutableEntity]]: """Checking all entities for entites with only a single link candidate and to avoid full inference step. If we want to calculate similarities, or not use link candidates then just return the entities""" @@ -643,8 +644,8 @@ def _pre_inference(self, doc: MutableDocument) -> tuple[list, list]: if self.cnf_l.always_calculate_similarity: return [], filtered_ents - le = [] - to_infer = [] + le: list[MutableEntity] = [] + to_infer: list[MutableEntity] = [] for entity in all_ents: if len(entity.link_candidates) == 1: # if the include filter exists and the only cui is in it From 097b3270223812d58d5e16e1c659a3e0f59f59fe Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 3 Feb 2026 10:28:51 +0000 Subject: [PATCH 3/4] CU-869c0g9f7: Fix issue with embedding linker doing disambiguation --- medcat-v2/medcat/components/linking/embedding_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py index b3094ee9f..1966b6e9c 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-v2/medcat/components/linking/embedding_linker.py @@ -654,7 +654,7 @@ def _pre_inference(self, doc: MutableDocument entity.context_similarity = 1 le.append(entity) continue - elif self.cnf_l.use_ner_link_candidates: + elif self.cnf_l.use_ner_link_candidates and not len(entity.link_candidates): continue # it has to be inferred due to filters or number of link candidates to_infer.append(entity) From 893e5561abb1d64709bacb28ee297cbce9a96252 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 3 Feb 2026 12:38:25 +0000 Subject: [PATCH 4/4] CU-869c0g9f7: Simplify condition a little --- 
medcat-v2/medcat/components/linking/embedding_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py index 1966b6e9c..c0e8b594c 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-v2/medcat/components/linking/embedding_linker.py @@ -654,7 +654,7 @@ def _pre_inference(self, doc: MutableDocument entity.context_similarity = 1 le.append(entity) continue - elif self.cnf_l.use_ner_link_candidates and not len(entity.link_candidates): + elif self.cnf_l.use_ner_link_candidates and not entity.link_candidates: continue # it has to be inferred due to filters or number of link candidates to_infer.append(entity)