diff --git a/src/harmony/matching/matcher.py b/src/harmony/matching/matcher.py
index 46a59db..911fb6c 100644
--- a/src/harmony/matching/matcher.py
+++ b/src/harmony/matching/matcher.py
@@ -28,6 +28,7 @@
 import os
 import pathlib
 import statistics
+import warnings
 from collections import Counter, OrderedDict
 from typing import List, Callable, Optional, Union
 
@@ -57,6 +58,14 @@
 DetectorFactory.seed = 0
 
 
+_CATALOGUE_DEPRECATION_MESSAGE = (
+    "The catalogue-matching code path is deprecated and will be removed in a "
+    "future release. The hosted catalogue search is now backed by a Weaviate "
+    "index (see https://harmonydata.ac.uk/search). This function is retained "
+    "only for backwards compatibility with existing callers."
+)
+
+
 # This has been tested on 16 GB RAM production server, 1000 seems a safe number (TW, 15 Dec 2024)
 def get_batch_size(default=1000):
     try:
@@ -216,6 +225,11 @@ def match_instruments_with_catalogue_instruments(
     """
     Match instruments with catalogue instruments.
 
+    .. deprecated::
+        The catalogue path was replaced by a Weaviate index
+        (https://harmonydata.ac.uk/search) because it did not scale. This
+        function will be removed in a future release.
+
     :param instruments: The instruments.
     :param catalogue_data: The catalogue data.
     :param vectorisation_function: A function to vectorize a text.
@@ -224,6 +238,8 @@ def match_instruments_with_catalogue_instruments(
         Index 1 in the tuple contains a list of closest instrument matches from the catalog for all the instruments.
     """
 
+    warnings.warn(_CATALOGUE_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)
+
     # Gather all questions
     all_questions: List[str] = []
     for instrument in instruments:
@@ -275,6 +291,11 @@ def match_questions_with_catalogue_instruments(
     Each question from the list will receive the closest instrument match for it.
     The closest instrument match for all questions is returned as a result of this function.
 
+    .. deprecated::
+        The catalogue path was replaced by a Weaviate index
+        (https://harmonydata.ac.uk/search) because it did not scale. This
+        function will be removed in a future release.
+
     :param questions: The questions.
     :param catalogue_data: The catalogue data.
     :param all_instruments_text_vectors: A list of text vectors of all questions found in all the instruments uploaded.
@@ -283,6 +304,8 @@ def match_questions_with_catalogue_instruments(
     :return: A list of closest instrument matches for the questions provided.
     """
 
+    warnings.warn(_CATALOGUE_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)
+
     # Catalogue data
     catalogue_instrument_idx_to_catalogue_questions_idx: List[List[int]] = catalogue_data[
         "instrument_idx_to_question_idx"
@@ -516,6 +539,11 @@ def match_query_with_catalogue_instruments(
     """
     Match query with catalogue instruments.
 
+    .. deprecated::
+        The catalogue path was replaced by a Weaviate index
+        (https://harmonydata.ac.uk/search) because it did not scale. This
+        function will be removed in a future release.
+
     :param query: The query.
     :param catalogue_data: The catalogue data.
     :param vectorisation_function: A function to vectorize a text.
@@ -525,6 +553,8 @@ def match_query_with_catalogue_instruments(
         E.g. {"instruments": [...], "new_text_vectors": {...}}.
     """
 
+    warnings.warn(_CATALOGUE_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2)
+
     response = {"instruments": [], "new_text_vectors": {}}
 
     # Catalogue data
diff --git a/tests/test_catalogue_deprecation_warnings.py b/tests/test_catalogue_deprecation_warnings.py
new file mode 100644
index 0000000..67e8b38
--- /dev/null
+++ b/tests/test_catalogue_deprecation_warnings.py
@@ -0,0 +1,99 @@
+"""Verify the three catalogue-matching entry points emit DeprecationWarning.
+
+The catalogue path was replaced by a Weaviate
+index and these functions are slated for removal. They must emit a
+DeprecationWarning whose message references Weaviate so library users can
+discover the replacement.
+"""
+import sys
+import warnings
+
+import numpy as np
+
+sys.path.append("../src")
+
+from harmony.matching.matcher import (
+    match_instruments_with_catalogue_instruments,
+    match_query_with_catalogue_instruments,
+    match_questions_with_catalogue_instruments,
+)
+from harmony.schemas.requests.text import Instrument, Question
+from harmony.schemas.text_vector import TextVector
+
+
+def _minimal_catalogue_data():
+    return {
+        "instrument_idx_to_question_idx": [[0]],
+        "all_embeddings_concatenated": np.array([[1.0, 0.0]]),
+        "all_instruments": [{"instrument_name": "X", "metadata": {"source": "ref"}}],
+        "all_questions": ["q"],
+    }
+
+
+def _assert_deprecation_mentions_weaviate(records):
+    dep = [r for r in records if issubclass(r.category, DeprecationWarning)]
+    assert dep, "expected at least one DeprecationWarning"
+    assert any("weaviate" in str(r.message).lower() for r in dep), (
+        f"DeprecationWarning should mention Weaviate; got: "
+        f"{[str(r.message) for r in dep]}"
+    )
+
+
+def test_match_questions_with_catalogue_instruments_is_deprecated():
+    questions = [Question(question_text="q")]
+    vectors = [TextVector(text="q", vector=[1.0, 0.0], is_negated=False, is_query=False)]
+    with warnings.catch_warnings(record=True) as recs:
+        warnings.simplefilter("always")
+        try:
+            match_questions_with_catalogue_instruments(
+                questions=questions,
+                catalogue_data=_minimal_catalogue_data(),
+                all_instruments_text_vectors=vectors,
+                questions_are_from_one_instrument=True,
+            )
+        except Exception:
+            pass
+    _assert_deprecation_mentions_weaviate(recs)
+
+
+def test_match_query_with_catalogue_instruments_is_deprecated():
+    with warnings.catch_warnings(record=True) as recs:
+        warnings.simplefilter("always")
+        try:
+            match_query_with_catalogue_instruments(
+                query="hello",
+                catalogue_data=_minimal_catalogue_data(),
+                vectorisation_function=lambda texts: np.array([[1.0, 0.0]] * len(texts)),
+                texts_cached_vectors={},
+            )
+        except Exception:
+            pass
+    _assert_deprecation_mentions_weaviate(recs)
+
+
+def test_match_instruments_with_catalogue_instruments_is_deprecated():
+    instruments = [
+        Instrument(
+            file_id="f",
+            instrument_id="i",
+            instrument_name="I",
+            file_name="f.pdf",
+            file_type="pdf",
+            file_section="s",
+            language="en",
+            questions=[Question(question_text="q")],
+        )
+    ]
+    with warnings.catch_warnings(record=True) as recs:
+        warnings.simplefilter("always")
+        try:
+            match_instruments_with_catalogue_instruments(
+                instruments=instruments,
+                catalogue_data=_minimal_catalogue_data(),
+                vectorisation_function=lambda texts: np.array([[1.0, 0.0]] * len(texts)),
+                texts_cached_vectors={},
+                is_negate=False,
+            )
+        except Exception:
+            pass
+    _assert_deprecation_mentions_weaviate(recs)
\ No newline at end of file
diff --git a/tests/test_match_catalogue_instruments.py b/tests/test_match_catalogue_instruments.py
new file mode 100644
index 0000000..0111d93
--- /dev/null
+++ b/tests/test_match_catalogue_instruments.py
@@ -0,0 +1,271 @@
+"""Characterization tests for match_questions_with_catalogue_instruments.
+
+These tests pin down the function's current observable behavior using a small,
+deterministic synthetic catalogue. They are scoped to the catalogue deprecation
+window (see PR #133) and will be removed alongside the catalogue functions.
+"""
+import numpy as np
+import pytest
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:The catalogue-matching code path is deprecated:DeprecationWarning"
+)
+
+from harmony.matching.matcher import match_questions_with_catalogue_instruments
+from harmony.schemas.requests.text import Question
+from harmony.schemas.text_vector import TextVector
+
+
+def _catalogue_data():
+    """Synthetic catalogue with 3 instruments sharing some questions.
+
+    Catalogue question texts (and their indices):
+        0: "anxious"
+        1: "nervous"
+        2: "sad"
+        3: "tired"
+
+    Embeddings are 2-D and chosen so cosine similarity is predictable:
+        anxious -> [1, 0]
+        nervous -> [0.9, 0.1]   (close to anxious)
+        sad     -> [0, 1]
+        tired   -> [0.1, 0.9]   (close to sad)
+
+    Instrument membership:
+        inst 0 ("GAD"):  questions [0, 1]            -> anxiety
+        inst 1 ("PHQ"):  questions [2, 3]            -> depression
+        inst 2 ("MIXED"): questions [1, 2]           -> shared
+    """
+    all_questions = ["anxious", "nervous", "sad", "tired"]
+    all_embeddings = np.array(
+        [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]],
+        dtype=np.float64,
+    )
+    instrument_idx_to_question_idx = [[0, 1], [2, 3], [1, 2]]
+    all_instruments = [
+        {"instrument_name": "GAD", "metadata": {"source": "ref", "url": "u0", "sweep_id": "s0"}},
+        {"instrument_name": "PHQ", "metadata": {"source": "ref", "url": "u1", "sweep_id": "s1"}},
+        {"instrument_name": "MIXED", "metadata": {"source": "ref", "url": "u2"}},
+    ]
+    return {
+        "instrument_idx_to_question_idx": instrument_idx_to_question_idx,
+        "all_embeddings_concatenated": all_embeddings,
+        "all_instruments": all_instruments,
+        "all_questions": all_questions,
+    }
+
+
+def _input_questions_and_vectors():
+    """Two input questions; first matches 'anxious' best, second matches 'sad' best."""
+    questions = [
+        Question(question_text="I feel anxious"),
+        Question(question_text="I feel sad"),
+    ]
+    text_vectors = [
+        TextVector(text="I feel anxious", vector=[1.0, 0.0], is_negated=False, is_query=False),
+        TextVector(text="I feel sad",     vector=[0.0, 1.0], is_negated=False, is_query=False),
+    ]
+    return questions, text_vectors
+
+
+def test_empty_catalogue_returns_empty_list():
+    catalogue = _catalogue_data()
+    catalogue["all_embeddings_concatenated"] = np.zeros((0, 0))
+    questions, vectors = _input_questions_and_vectors()
+    result = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=True,
+    )
+    assert result == []
+
+
+def test_each_matched_instrument_has_required_metadata_fields():
+    """First input top-matches 'anxious' (in GAD); second top-matches 'sad' (in PHQ and MIXED).
+    All three catalogue instruments end up in the result; each result carries the full
+    metadata bundle the downstream consumers depend on."""
+    catalogue = _catalogue_data()
+    questions, vectors = _input_questions_and_vectors()
+    result = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=True,
+    )
+    names = [r.instrument_name for r in result]
+    assert set(names) == {"GAD", "PHQ", "MIXED"}
+    for r in result:
+        assert r.metadata["num_matched_questions"] >= 1
+        assert r.metadata["num_ref_instrument_questions"] >= 1
+        assert "info" in r.metadata
+        assert r.metadata["mean_cosine_similarity"] is not None
+
+
+def test_full_output_snapshot_for_synthetic_catalogue():
+    """Pin the full structure of the result against a known input.
+
+    Any field drift — info string format, url, sweep, counts, mean similarity —
+    breaks this test. This is the byte-identical guarantee the refactor promises.
+    """
+    catalogue = _catalogue_data()
+    questions, vectors = _input_questions_and_vectors()
+    result = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=True,
+    )
+    serialized = [r.model_dump() for r in result]
+    expected = [
+        {
+            "instrument_name": "GAD",
+            "instrument_url": "u0",
+            "source": "REF",
+            "sweep": "s0",
+            "metadata": {
+                "info": (
+                    "GAD Sweep s0 matched 1 question(s) in your instrument, "
+                    "your instrument contains 2 question(s). "
+                    "The reference instrument contains 2 question(s)."
+                ),
+                "num_matched_questions": 1,
+                "num_ref_instrument_questions": 2,
+                "mean_cosine_similarity": pytest.approx(1.0),
+            },
+        },
+        {
+            "instrument_name": "PHQ",
+            "instrument_url": "u1",
+            "source": "REF",
+            "sweep": "s1",
+            "metadata": {
+                "info": (
+                    "PHQ Sweep s1 matched 1 question(s) in your instrument, "
+                    "your instrument contains 2 question(s). "
+                    "The reference instrument contains 2 question(s)."
+                ),
+                "num_matched_questions": 1,
+                "num_ref_instrument_questions": 2,
+                "mean_cosine_similarity": pytest.approx(1.0),
+            },
+        },
+        {
+            "instrument_name": "MIXED",
+            "instrument_url": "u2",
+            "source": "REF",
+            "sweep": "",
+            "metadata": {
+                "info": (
+                    "MIXED Sweep UNKNOWN matched 1 question(s) in your instrument, "
+                    "your instrument contains 2 question(s). "
+                    "The reference instrument contains 2 question(s)."
+                ),
+                "num_matched_questions": 1,
+                "num_ref_instrument_questions": 2,
+                "mean_cosine_similarity": pytest.approx(1.0),
+            },
+        },
+    ]
+    assert serialized == expected
+
+
+def test_two_inputs_with_same_top_match_each_contribute_similarity_to_owning_instruments():
+    """Both input questions top-match Q2 ('sad'), which is contained in PHQ and MIXED.
+
+    The second nested loop in `match_questions_with_catalogue_instruments` appends
+    similarity per input question (not per instrument), so each owning instrument
+    receives TWO similarity entries here. This invariant is exactly what the
+    reverse-index refactor of that loop must preserve. GAD contains neither top
+    match and must not appear in the result.
+    """
+    catalogue = _catalogue_data()
+    questions = [
+        Question(question_text="I am sad"),
+        Question(question_text="feeling down"),
+    ]
+    vectors = [
+        TextVector(text="I am sad", vector=[0.0, 1.0], is_negated=False, is_query=False),
+        TextVector(text="feeling down", vector=[0.0, 1.0], is_negated=False, is_query=False),
+    ]
+    result = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=False,
+    )
+    by_name = {r.instrument_name: r for r in result}
+    assert by_name["PHQ"].metadata["num_matched_questions"] == 2
+    assert by_name["MIXED"].metadata["num_matched_questions"] == 2
+    assert "GAD" not in by_name
+
+
+def test_closest_question_attached_to_each_input_question():
+    catalogue = _catalogue_data()
+    questions, vectors = _input_questions_and_vectors()
+    match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=True,
+    )
+    assert questions[0].closest_catalogue_question_match.question == "anxious"
+    assert questions[1].closest_catalogue_question_match.question == "sad"
+    # 'anxious' lives in GAD (idx 0) -> seen_in_instruments contains GAD
+    assert any(
+        si.instrument_name == "GAD"
+        for si in questions[0].closest_catalogue_question_match.seen_in_instruments
+    )
+    # 'sad' lives in PHQ (idx 1) and MIXED (idx 2)
+    seen_names_q1 = {
+        si.instrument_name
+        for si in questions[1].closest_catalogue_question_match.seen_in_instruments
+    }
+    assert seen_names_q1 == {"PHQ", "MIXED"}
+
+
+def test_info_string_one_vs_many_instruments():
+    catalogue = _catalogue_data()
+    questions, vectors = _input_questions_and_vectors()
+
+    res_single = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=True,
+    )
+    res_multi = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=False,
+    )
+    # The wording differs by branch — pin it down
+    assert "in your instrument," in res_single[0].metadata["info"]
+    assert "in all of your instruments," in res_multi[0].metadata["info"]
+
+
+def test_orphan_top_match_yields_empty_seen_in_and_empty_result():
+    """An isolated catalogue question (in no instrument) is a legal top-match target.
+    `seen_in_instruments` must be `[]` and the top-instruments list must be `[]`
+    rather than crashing on a missing key."""
+    catalogue = _catalogue_data()
+    # Add a 5th catalogue question that no instrument references
+    catalogue["all_questions"] = catalogue["all_questions"] + ["orphan"]
+    catalogue["all_embeddings_concatenated"] = np.vstack(
+        [catalogue["all_embeddings_concatenated"], np.array([[0.5, 0.5]])]
+    )
+    # Input vector deliberately closest to the orphan
+    questions = [Question(question_text="balanced")]
+    vectors = [TextVector(text="balanced", vector=[0.5, 0.5], is_negated=False, is_query=False)]
+    result = match_questions_with_catalogue_instruments(
+        questions=questions,
+        catalogue_data=catalogue,
+        all_instruments_text_vectors=vectors,
+        questions_are_from_one_instrument=True,
+    )
+    # No instrument contains the orphan, so closest_catalogue_question_match.seen_in_instruments is empty,
+    # and the top-instruments list is also empty (no instrument got a match).
+    assert questions[0].closest_catalogue_question_match.question == "orphan"
+    assert questions[0].closest_catalogue_question_match.seen_in_instruments == []
+    assert result == []