From 7483659bed34e6dbbae28efaae3947e4d86436eb Mon Sep 17 00:00:00 2001 From: jichao wang Date: Sat, 20 Jun 2026 04:05:34 +0100 Subject: [PATCH] fix(compiler): preserve frontmatter when closing "---" has no trailing newline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `frontmatter.split` returns the frontmatter block ending in a bare "\n---" (not "\n---\n") for a page that ends right at the closing delimiter with no trailing newline — e.g. a frontmatter-only concept/entity page. Both `_prepend_source_to_frontmatter` and `_remove_source_from_frontmatter` located the closing delimiter with `fm_block.rpartition("\n---\n")`, which finds nothing in that case: `fm_prefix` becomes "" and every existing frontmatter line (the opening "---", `type`, `description`, and the prior `sources:` list) is dropped — silently corrupting the page and losing source provenance. Strip whichever closing form is actually present ("\n---\n" or a bare "\n---") and re-append the same one. Pages that have a body (the common case) are byte-for-byte unaffected. This is reached in practice via `_add_related_link` (run for every related concept/entity during compile) and the `openkb remove` flow (`remove_doc_from_{concept,entity}_pages`). Adds `TestFrontmatterSourceMutation` covering prepend + remove on a no-trailing-newline page, plus a with-body regression guard. --- openkb/agent/compiler.py | 20 +++++++++++++------- tests/test_compiler.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index c2b92619..35e8ad84 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -940,12 +940,15 @@ def _prepend_source_to_frontmatter(text: str, source_file: str) -> str: return text fm_block, body = parts - # Split the fm_block into lines for per-line manipulation. fm_block ends - # with "\n---\n"; strip the trailing closing delimiter + newline to get - # the prefix lines (opening "---" + content lines), then re-append after. - fm_prefix, _, _ = fm_block.rpartition("\n---\n") + # Strip the trailing closing delimiter to get the prefix lines (opening + # "---" + content lines), then re-append it. `frontmatter.split` leaves the + # closing at the end of fm_block as either "\n---\n" or a bare "\n---" (when + # the page ends at the delimiter with no trailing newline). Assuming only + # "\n---\n" would, for the bare form, make the strip below collapse the + # whole block and drop every existing frontmatter key. + closing = "\n---\n" if fm_block.endswith("\n---\n") else "\n---" + fm_prefix = fm_block[: -len(closing)] fm_lines = fm_prefix.split("\n") - closing = "\n---\n" for i, line in enumerate(fm_lines): if not line.lstrip().startswith("sources:"): @@ -981,9 +984,12 @@ def _remove_source_from_frontmatter(text: str, source_file: str) -> tuple[str, b return text, False fm_block, body = parts - fm_prefix, _, _ = fm_block.rpartition("\n---\n") + # See _prepend_source_to_frontmatter: the closing delimiter may be "\n---\n" + # or a bare "\n---" (no trailing newline); strip whichever is present so the + # existing frontmatter lines (and the sources: line we need) are preserved. + closing = "\n---\n" if fm_block.endswith("\n---\n") else "\n---" + fm_prefix = fm_block[: -len(closing)] fm_lines = fm_prefix.split("\n") - closing = "\n---\n" for i, line in enumerate(fm_lines): if not line.lstrip().startswith("sources:"): diff --git a/tests/test_compiler.py b/tests/test_compiler.py index dfc6d6ff..6ec387a9 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -27,11 +27,44 @@ _parse_entities_plan, _filter_entity_items, _ENTITY_TYPE_LIST, + _prepend_source_to_frontmatter, + _remove_source_from_frontmatter, remove_doc_from_entity_pages, ) from openkb.config import resolve_entity_types +class TestFrontmatterSourceMutation: + """``_prepend``/``_remove_source_from_frontmatter`` must preserve existing + frontmatter even when the page ends at the closing ``---`` with no trailing + newline — ``frontmatter.split`` then returns a block ending in a bare + ``\\n---`` rather than ``\\n---\\n``. + """ + + def test_prepend_preserves_keys_without_trailing_newline(self): + text = '---\nsources: ["summaries/p1.md"]\ntype: "Concept"\ndescription: "Focus"\n---' + out = _prepend_source_to_frontmatter(text, "summaries/p2.md") + assert out.startswith("---\n") # opening delimiter kept + assert 'type: "Concept"' in out # other keys kept + assert 'description: "Focus"' in out + assert "summaries/p1.md" in out # existing source kept + assert "summaries/p2.md" in out # new source prepended + + def test_remove_preserves_keys_without_trailing_newline(self): + text = '---\ntype: "Organization"\nsources: ["summaries/doc.md"]\n---' + out, now_empty = _remove_source_from_frontmatter(text, "summaries/doc.md") + assert now_empty is True # it was the only source + assert 'type: "Organization"' in out # other key preserved + assert "summaries/doc.md" not in out # source removed + + def test_prepend_with_body_is_unchanged(self): + text = '---\nsources: ["a.md"]\ntype: "Concept"\n---\n\nBody.\n' + out = _prepend_source_to_frontmatter(text, "b.md") + assert out.startswith("---\n") + assert "b.md" in out and "a.md" in out + assert out.endswith("\n\nBody.\n") # body + closing untouched + + class TestParseJson: def test_plain_json(self): assert _parse_json('[{"name": "foo"}]') == [{"name": "foo"}]