diff --git a/.semversioner/next-release/patch-20260308082306195595.json b/.semversioner/next-release/patch-20260308082306195595.json new file mode 100644 index 000000000..16cc06617 --- /dev/null +++ b/.semversioner/next-release/patch-20260308082306195595.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Add Top-K and min co-occurrence filters to NLP edge extraction to prevent O(N^2) relationship explosion" +} diff --git a/packages/graphrag/graphrag/config/defaults.py b/packages/graphrag/graphrag/config/defaults.py index 640933581..db64e8302 100644 --- a/packages/graphrag/graphrag/config/defaults.py +++ b/packages/graphrag/graphrag/config/defaults.py @@ -188,6 +188,8 @@ class ExtractGraphNLPDefaults: text_analyzer: TextAnalyzerDefaults = field(default_factory=TextAnalyzerDefaults) concurrent_requests: int = 25 async_mode: AsyncType = AsyncType.Threaded + max_entities_per_chunk: int = 0 + min_co_occurrence: int = 1 @dataclass diff --git a/packages/graphrag/graphrag/config/models/extract_graph_nlp_config.py b/packages/graphrag/graphrag/config/models/extract_graph_nlp_config.py index 5ab587cf2..bafe99c98 100644 --- a/packages/graphrag/graphrag/config/models/extract_graph_nlp_config.py +++ b/packages/graphrag/graphrag/config/models/extract_graph_nlp_config.py @@ -72,3 +72,16 @@ class ExtractGraphNLPConfig(BaseModel): description="The async mode to use.", default=graphrag_config_defaults.extract_graph_nlp.async_mode, ) + max_entities_per_chunk: int = Field( + description="Maximum number of noun-phrase entities to retain per text chunk " + "when building co-occurrence edges. Entities are ranked by global frequency " + "and only the top-K are paired, reducing edges from O(N^2) to O(K^2). " + "Set to 0 to disable (keep all entities).", + default=graphrag_config_defaults.extract_graph_nlp.max_entities_per_chunk, + ) + min_co_occurrence: int = Field( + description="Minimum number of text units in which an edge must co-occur " + "to be retained. 
Edges appearing in fewer text units are discarded as " + "likely coincidental. Set to 1 to disable filtering.", + default=graphrag_config_defaults.extract_graph_nlp.min_co_occurrence, + ) diff --git a/packages/graphrag/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/packages/graphrag/graphrag/index/operations/build_noun_graph/build_noun_graph.py index ece890a57..c57840164 100644 --- a/packages/graphrag/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/packages/graphrag/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -25,6 +25,8 @@ async def build_noun_graph( text_analyzer: BaseNounPhraseExtractor, normalize_edge_weights: bool, cache: Cache, + max_entities_per_chunk: int = 0, + min_co_occurrence: int = 1, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Build a noun graph from text units.""" title_to_ids = await _extract_nodes( @@ -49,6 +51,8 @@ async def build_noun_graph( title_to_ids, nodes_df=nodes_df, normalize_edge_weights=normalize_edge_weights, + max_entities_per_chunk=max_entities_per_chunk, + min_co_occurrence=min_co_occurrence, ) return (nodes_df, edges_df) @@ -100,10 +104,23 @@ def _extract_edges( title_to_ids: dict[str, list[str]], nodes_df: pd.DataFrame, normalize_edge_weights: bool = True, + max_entities_per_chunk: int = 0, + min_co_occurrence: int = 1, ) -> pd.DataFrame: """Build co-occurrence edges between noun phrases. Nodes that appear in the same text unit are connected. + + Two optional filters reduce O(N^2) edge explosion in + entity-dense corpora (e.g. scientific/technical text): + + * ``max_entities_per_chunk`` – When > 0, only the K most + globally-frequent entities per text unit are paired, + capping edges at C(K,2) instead of C(N,2). + * ``min_co_occurrence`` – When > 1, edges that appear in + fewer than this many text units are discarded, removing + coincidental co-occurrences. + Returns edges with schema [source, target, weight, text_unit_ids]. 
""" if not title_to_ids: @@ -111,6 +128,10 @@ def _extract_edges( columns=["source", "target", "weight", "text_unit_ids"], ) + entity_freq: dict[str, int] = { + t: len(ids) for t, ids in title_to_ids.items() + } + text_unit_to_titles: dict[str, list[str]] = defaultdict(list) for title, tu_ids in title_to_ids.items(): for tu_id in tu_ids: @@ -118,9 +139,17 @@ def _extract_edges( edge_map: dict[tuple[str, str], list[str]] = defaultdict(list) for tu_id, titles in text_unit_to_titles.items(): - if len(titles) < 2: + unique_titles = sorted(set(titles)) + if len(unique_titles) < 2: continue - for pair in combinations(sorted(set(titles)), 2): + if max_entities_per_chunk > 0 and len(unique_titles) > max_entities_per_chunk: + unique_titles = sorted( + unique_titles, + key=lambda t: entity_freq.get(t, 0), + reverse=True, + )[:max_entities_per_chunk] + unique_titles.sort() + for pair in combinations(unique_titles, 2): edge_map[pair].append(tu_id) records = [ @@ -131,7 +160,17 @@ def _extract_edges( "text_unit_ids": tu_ids, } for (src, tgt), tu_ids in edge_map.items() + if len(tu_ids) >= min_co_occurrence ] + + if len(records) < len(edge_map): + logger.info( + "Edge co-occurrence filter: %d -> %d edges (min_co_occurrence=%d)", + len(edge_map), + len(records), + min_co_occurrence, + ) + edges_df = pd.DataFrame( records, columns=["source", "target", "weight", "text_unit_ids"], diff --git a/packages/graphrag/graphrag/index/workflows/extract_graph_nlp.py b/packages/graphrag/graphrag/index/workflows/extract_graph_nlp.py index d4cae458b..8e4724107 100644 --- a/packages/graphrag/graphrag/index/workflows/extract_graph_nlp.py +++ b/packages/graphrag/graphrag/index/workflows/extract_graph_nlp.py @@ -52,6 +52,10 @@ async def run_workflow( relationships_table=relationships_table, text_analyzer=text_analyzer, normalize_edge_weights=(config.extract_graph_nlp.normalize_edge_weights), + max_entities_per_chunk=( + config.extract_graph_nlp.max_entities_per_chunk + ), + 
min_co_occurrence=config.extract_graph_nlp.min_co_occurrence, ) logger.info("Workflow completed: extract_graph_nlp") @@ -65,6 +69,8 @@ async def extract_graph_nlp( relationships_table: Table, text_analyzer: BaseNounPhraseExtractor, normalize_edge_weights: bool, + max_entities_per_chunk: int = 0, + min_co_occurrence: int = 1, ) -> dict[str, list[dict[str, Any]]]: """Extract noun-phrase graph and stream results to output tables.""" extracted_nodes, extracted_edges = await build_noun_graph( @@ -72,6 +78,8 @@ async def extract_graph_nlp( text_analyzer=text_analyzer, normalize_edge_weights=normalize_edge_weights, cache=cache, + max_entities_per_chunk=max_entities_per_chunk, + min_co_occurrence=min_co_occurrence, ) if len(extracted_nodes) == 0: diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py index ae898d456..8625038b0 100644 --- a/tests/unit/config/utils.py +++ b/tests/unit/config/utils.py @@ -213,6 +213,8 @@ def assert_extract_graph_nlp_configs( assert actual.normalize_edge_weights == expected.normalize_edge_weights assert_text_analyzer_configs(actual.text_analyzer, expected.text_analyzer) assert actual.concurrent_requests == expected.concurrent_requests + assert actual.max_entities_per_chunk == expected.max_entities_per_chunk + assert actual.min_co_occurrence == expected.min_co_occurrence def assert_prune_graph_configs( diff --git a/tests/unit/indexing/operations/test_build_noun_graph.py b/tests/unit/indexing/operations/test_build_noun_graph.py new file mode 100644 index 000000000..e416e0128 --- /dev/null +++ b/tests/unit/indexing/operations/test_build_noun_graph.py @@ -0,0 +1,266 @@ +# Copyright (C) 2026 Microsoft Corporation. +# Licensed under the MIT License + +"""Tests for build_noun_graph edge extraction with Top-K and co-occurrence filters. + +Validates that _extract_edges correctly applies max_entities_per_chunk (Top-K) +and min_co_occurrence filters to control the O(N^2) co-occurrence edge explosion +in entity-dense corpora. 
import pandas as pd

from graphrag.index.operations.build_noun_graph.build_noun_graph import (
    _extract_edges,
)


def _make_nodes_df(title_to_ids: dict[str, list[str]]) -> pd.DataFrame:
    """Build a nodes DataFrame from a title_to_ids mapping.

    Each title becomes one row; ``frequency`` is the number of text unit ids
    listed for that title.
    """
    return pd.DataFrame(
        [
            {"title": title, "frequency": len(ids), "text_unit_ids": ids}
            for title, ids in title_to_ids.items()
        ],
        columns=["title", "frequency", "text_unit_ids"],
    )


def _edges_for(title_to_ids: dict[str, list[str]], **filters: int) -> pd.DataFrame:
    """Run ``_extract_edges`` with raw (un-normalized) weights.

    ``filters`` is forwarded verbatim, so a test passes only the filter
    arguments it wants to exercise (``max_entities_per_chunk`` and/or
    ``min_co_occurrence``) and otherwise relies on the function defaults.
    """
    return _extract_edges(
        title_to_ids,
        _make_nodes_df(title_to_ids),
        normalize_edge_weights=False,
        **filters,
    )


def _edge_weights(edges: pd.DataFrame) -> dict[tuple[str, str], int]:
    """Map every edge to its weight, keyed by the alphabetically ordered pair.

    Ordering the pair makes assertions independent of which endpoint ended up
    in the ``source`` column.
    """
    weights: dict[tuple[str, str], int] = {}
    for source, target, weight in zip(
        edges["source"], edges["target"], edges["weight"]
    ):
        pair = (source, target) if source <= target else (target, source)
        weights[pair] = int(weight)
    return weights


class TestExtractEdgesDefaults:
    """Baseline behaviour with default parameters (no filtering)."""

    def test_empty_input(self):
        """Empty title_to_ids produces an empty edges DataFrame."""
        edges = _extract_edges({}, pd.DataFrame(), normalize_edge_weights=False)
        assert len(edges) == 0
        assert list(edges.columns) == ["source", "target", "weight", "text_unit_ids"]

    def test_single_entity_no_edges(self):
        """A single entity cannot form any pairs."""
        edges = _edges_for({"alpha": ["tu1", "tu2"]})
        assert len(edges) == 0

    def test_two_entities_one_chunk(self):
        """Two entities in the same chunk produce exactly one edge."""
        edges = _edges_for({"alpha": ["tu1"], "beta": ["tu1"]})
        assert _edge_weights(edges) == {("alpha", "beta"): 1}

    def test_co_occurrence_weight(self):
        """Weight equals the number of shared text units."""
        edges = _edges_for({
            "alpha": ["tu1", "tu2", "tu3"],
            "beta": ["tu1", "tu2", "tu3"],
        })
        assert _edge_weights(edges) == {("alpha", "beta"): 3}

    def test_all_pairs_generated(self):
        """N entities in one chunk produce C(N,2) edges."""
        edges = _edges_for({name: ["tu1"] for name in "abcd"})
        # C(4,2) = 6
        assert len(edges) == 6


class TestMaxEntitiesPerChunk:
    """Top-K entity filtering per text unit."""

    def test_disabled_when_zero(self):
        """max_entities_per_chunk=0 keeps all entities (default)."""
        edges = _edges_for(
            {name: ["tu1"] for name in "abcde"},
            max_entities_per_chunk=0,
        )
        # C(5,2) = 10
        assert len(edges) == 10

    def test_caps_entities_per_chunk(self):
        """Only top-K most frequent entities are paired per chunk."""
        # Frequencies: a=3, b=3, c=1, d=1, e=1 (all five present in tu1).
        edges = _edges_for(
            {
                "a": ["tu1", "tu2", "tu3"],
                "b": ["tu1", "tu2", "tu3"],
                "c": ["tu1"],
                "d": ["tu1"],
                "e": ["tu1"],
            },
            max_entities_per_chunk=2,
        )
        # Only a and b survive the top-2 cut in tu1; tu2 and tu3 hold just
        # a and b, so the single edge accumulates all three text units.
        assert _edge_weights(edges) == {("a", "b"): 3}

    def test_no_effect_when_below_limit(self):
        """Top-K has no effect when chunk has fewer entities than K."""
        edges = _edges_for(
            {"a": ["tu1"], "b": ["tu1"]},
            max_entities_per_chunk=10,
        )
        assert len(edges) == 1

    def test_selects_by_global_frequency(self):
        """Top-K selection ranks entities by corpus-wide frequency."""
        # tu1 holds a, b, c, d; global frequencies are a=5, b=4, c=1, d=1,
        # so the top-2 cut in tu1 keeps a and b.
        edges = _edges_for(
            {
                "a": ["tu1", "tu2", "tu3", "tu4", "tu5"],
                "b": ["tu1", "tu2", "tu3", "tu4"],
                "c": ["tu1"],
                "d": ["tu1"],
            },
            max_entities_per_chunk=2,
        )
        # a and b share tu1..tu4; tu5 contains a alone and adds nothing.
        assert _edge_weights(edges) == {("a", "b"): 4}

    def test_reduces_quadratic_explosion(self):
        """Top-K significantly reduces edges in dense chunks."""
        # 20 entities in one chunk: C(20,2) = 190 edges without a limit.
        title_to_ids = {chr(65 + i): ["tu1"] for i in range(20)}

        edges_all = _edges_for(title_to_ids, max_entities_per_chunk=0)
        edges_k5 = _edges_for(title_to_ids, max_entities_per_chunk=5)
        # C(20,2) = 190, C(5,2) = 10
        assert len(edges_all) == 190
        assert len(edges_k5) == 10

    def test_limit_of_one_yields_no_edges(self):
        """K=1 leaves a single entity per chunk, so nothing can be paired."""
        edges = _edges_for(
            {name: ["tu1"] for name in "abc"},
            max_entities_per_chunk=1,
        )
        assert len(edges) == 0

    def test_ties_resolved_deterministically(self):
        """Equal-frequency entities are cut in alphabetical order."""
        # All 20 entities have frequency 1; pinning which ones survive guards
        # the reproducibility of the graph across runs.
        title_to_ids = {chr(65 + i): ["tu1"] for i in range(20)}
        edges = _edges_for(title_to_ids, max_entities_per_chunk=5)
        survivors = set(edges["source"]) | set(edges["target"])
        assert survivors == {"A", "B", "C", "D", "E"}


class TestMinCoOccurrence:
    """Minimum co-occurrence threshold filtering."""

    def test_default_keeps_all(self):
        """min_co_occurrence=1 keeps all edges (default)."""
        edges = _edges_for({"a": ["tu1"], "b": ["tu1"]}, min_co_occurrence=1)
        assert len(edges) == 1

    def test_filters_low_co_occurrence(self):
        """Edges appearing in fewer than min_co_occurrence chunks are removed."""
        edges = _edges_for(
            {
                "a": ["tu1", "tu2"],
                "b": ["tu1", "tu2"],
                "c": ["tu1"],
            },
            min_co_occurrence=2,
        )
        # a-b co-occur in tu1,tu2 (weight=2) -> kept
        # a-c co-occur in tu1 only (weight=1) -> removed
        # b-c co-occur in tu1 only (weight=1) -> removed
        assert _edge_weights(edges) == {("a", "b"): 2}

    def test_removes_all_when_threshold_too_high(self):
        """All edges removed when threshold exceeds max weight."""
        edges = _edges_for({"a": ["tu1"], "b": ["tu1"]}, min_co_occurrence=5)
        assert len(edges) == 0


class TestCombinedFilters:
    """Top-K and co-occurrence filters work together."""

    def test_both_filters_applied(self):
        """Each filter removes edges the other one would have kept."""
        # Global frequencies: a=6, b=4, c=3, d=2, e=f=g=1.
        #   tu1: a b c d e f    tu2: a b c d    tu3: a b c
        #   tu4: a b            tu5: a          tu6: a g
        title_to_ids = {
            "a": ["tu1", "tu2", "tu3", "tu4", "tu5", "tu6"],
            "b": ["tu1", "tu2", "tu3", "tu4"],
            "c": ["tu1", "tu2", "tu3"],
            "d": ["tu1", "tu2"],
            "e": ["tu1"],
            "f": ["tu1"],
            "g": ["tu6"],
        }
        core = {("a", "b"): 4, ("a", "c"): 3, ("b", "c"): 3}

        # Co-occurrence filter alone: d pairs with a, b and c in both tu1 and
        # tu2, so those weight-2 edges survive the threshold.
        only_min = _edge_weights(_edges_for(title_to_ids, min_co_occurrence=2))
        assert only_min == {**core, ("a", "d"): 2, ("b", "d"): 2, ("c", "d"): 2}

        # Top-K alone: d, e and f are cut from tu1/tu2, but the one-off a-g
        # pair in tu6 (only two entities, below the cap) still gets through.
        only_top_k = _edge_weights(
            _edges_for(title_to_ids, max_entities_per_chunk=3)
        )
        assert only_top_k == {**core, ("a", "g"): 1}

        # Together: Top-K drops the d edges, the threshold drops a-g.
        both = _edge_weights(
            _edges_for(
                title_to_ids,
                max_entities_per_chunk=3,
                min_co_occurrence=2,
            )
        )
        assert both == core

    def test_backward_compatible_defaults(self):
        """Omitting the filter arguments equals passing the 'disabled' values."""
        title_to_ids = {
            "x": ["tu1", "tu2"],
            "y": ["tu1"],
            "z": ["tu2"],
        }

        edges_default = _edges_for(title_to_ids)
        edges_explicit = _edges_for(
            title_to_ids,
            max_entities_per_chunk=0,
            min_co_occurrence=1,
        )
        # x-y share tu1 and x-z share tu2; y and z never meet.
        expected = {("x", "y"): 1, ("x", "z"): 1}
        assert _edge_weights(edges_default) == expected
        assert _edge_weights(edges_explicit) == expected
        assert len(edges_default) == len(edges_explicit)