From b0638c1a65bf5ab78e0e1b53d088ac22b70fc1f7 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Mon, 23 Feb 2026 16:43:14 +0100 Subject: [PATCH 1/5] Add chunk prioritization and always RAG support - Add configurable RAG strategies: always RAG which is performed at each query (OKP Solr + BYOK) and tool RAG can be used independently or together - Add chunk prioritization with score multipliers per vector store for always RAG - Added knobs in config to select the RAG strategy - Tool RAG defaults to enabled=True for backward compatibility - Update lightspeed stack configuration enrichment script to build the solr section in llama stack and fix bugs in building the vector stores - Updated byok and rag documentation - Updated unit tests --- docs/byok_guide.md | 97 +++- docs/config.md | 63 ++- docs/rag_guide.md | 26 +- examples/lightspeed-stack-byok-rag.yaml | 22 +- lightspeed-stack.yaml | 19 +- run.yaml | 3 + src/app/endpoints/query.py | 43 +- src/app/endpoints/streaming_query.py | 50 +- src/configuration.py | 26 +- src/constants.py | 33 +- src/llama_stack_configuration.py | 161 +++++- src/models/config.py | 119 ++++- src/utils/responses.py | 79 ++- src/utils/types.py | 20 + src/utils/vector_search.py | 492 ++++++++++++++---- .../app/endpoints/test_streaming_query.py | 26 +- tests/unit/models/config/test_byok_rag.py | 36 +- .../models/config/test_dump_configuration.py | 51 +- tests/unit/models/responses/test_rag_chunk.py | 86 ++- tests/unit/test_configuration.py | 80 +++ tests/unit/test_llama_stack_configuration.py | 133 ++++- tests/unit/utils/test_responses.py | 97 +++- 22 files changed, 1478 insertions(+), 284 deletions(-) diff --git a/docs/byok_guide.md b/docs/byok_guide.md index 29ac81151..a5b95a359 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -16,7 +16,7 @@ The BYOK (Bring Your Own Knowledge) feature in Lightspeed Core enables users to * [Step 2: Create Vector Database](#step-2-create-vector-database) * [Step 3: 
Configure Embedding Model](#step-3-configure-embedding-model) * [Step 4: Configure Llama Stack](#step-4-configure-llama-stack) - * [Step 5: Enable RAG Tools](#step-5-enable-rag-tools) + * [Step 5: Configure RAG Strategy](#step-5-configure-rag-strategy) * [Supported Vector Database Types](#supported-vector-database-types) * [Configuration Examples](#configuration-examples) * [Conclusion](#conclusion) @@ -34,27 +34,52 @@ BYOK (Bring Your Own Knowledge) is Lightspeed Core's implementation of Retrieval ## How BYOK Works -The BYOK system operates through a sophisticated chain of components: +BYOK knowledge sources can be queried in two complementary modes, configured independently: -1. **Agent Orchestrator**: The AI agent acts as the central coordinator, using the LLM as its reasoning engine -2. **RAG Tool**: When the agent needs external information, it queries your custom vector database -3. **Vector Database**: Your indexed knowledge sources, stored as vector embeddings for semantic search -4. **Embedding Model**: Converts queries and documents into vector representations for similarity matching -5. **Context Integration**: Retrieved knowledge is integrated into the AI's response generation process +### Always RAG (pre-query injection) + +Context is fetched from your BYOK vector stores and/or Solr **before** the LLM generates a response, and injected into every query automatically. No tool calls are required. + +```mermaid +graph TD + A[User Query] --> B[Fetch Context] + B --> C[BYOK Vector Stores] + B --> D[Solr OKP] + C --> E[Retrieved Chunks] + D --> E + E --> F[Inject Context into Prompt Context] + F --> G[LLM Generates Response] + G --> H[Response to User] +``` + +### Tool RAG (on-demand retrieval) + +The LLM can call the `file_search` tool during generation when it decides external knowledge is needed. Only BYOK vector stores are supported in Tool RAG mode. 
```mermaid graph TD - A[User Query] --> B[AI Agent] + A[User Query] --> P{Always RAG enabled?} + P -->|Yes| Q[Fetch Context] + Q --> R[BYOK Vector Stores / Solr OKP] + R --> S[Inject Context into Prompt Context] + S --> B[LLM] + P -->|No| B B --> C{Need External Knowledge?} - C -->|Yes| D[RAG Tool] + C -->|Yes| D[file_search Tool] C -->|No| E[Generate Response] - D --> F[Vector Database] + D --> F[BYOK Vector Stores] F --> G[Retrieve Relevant Context] - G --> H[Integrate Context] - H --> E - E --> I[Response to User] + G --> B + E --> H[Response to User] ``` +Both modes rely on: +- **Vector Database**: Your indexed knowledge sources stored as vector embeddings +- **Embedding Model**: Converts queries and documents into vector representations for similarity matching + +Always RAG additionally supports: +- **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. + --- ## Prerequisites @@ -244,12 +269,50 @@ registered_resources: **⚠️ Important**: The `vector_store_id` value must exactly match the ID you provided when creating the vector database using the rag-content tool. This identifier links your Llama Stack configuration to the specific vector database index you created. -### Step 5: Enable RAG Tools +> [!TIP] +> Instead of manually editing `run.yaml`, you can declare your knowledge sources in the `byok_rag` +> section of `lightspeed-stack.yaml`. The service automatically generates the required configuration +> at startup. 
+> +> ```yaml +> byok_rag: +> - rag_id: my-docs # Unique identifier for this knowledge source +> rag_type: inline::faiss +> embedding_model: sentence-transformers/all-mpnet-base-v2 +> embedding_dimension: 768 +> vector_db_id: your-index-id # Llama Stack vector store ID (from index generation) +> db_path: /path/to/vector_db/faiss_store.db +> score_multiplier: 1.0 # Optional: weight results when mixing multiple sources +> ``` +> +> When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of +> each store's results during Always RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it. + +### Step 5: Configure RAG Strategy + +Add a `rag` section to your `lightspeed-stack.yaml` to choose how BYOK knowledge is used: + +```yaml +rag: + # Always RAG: inject context before every LLM response (no tool calls needed) + always: + byok: + enabled: true # fetch and inject BYOK vector store context pre-query + solr: + enabled: true # fetch and inject Solr OKP context pre-query + + # Tool RAG: the LLM can call file_search to retrieve context on demand + tool: + byok: + enabled: true # expose BYOK vector stores as the file_search tool +``` -The configuration above automatically enables the RAG tools. The system will: +Both modes can be enabled simultaneously. Choose based on your latency and control preferences: -1. **Detect RAG availability**: Automatically identify when RAG is available -2. 
**Enhance prompts**: Encourage the AI to use RAG tools +| Mode | When context is fetched | Tool call needed | Supported sources | score_multiplier | +|------|------------------------|------------------|-------------------|-----------------| +| Always RAG | Before every query | No | BYOK + Solr | Yes (BYOK only) | +| Tool RAG | On LLM demand | Yes | BYOK only | No | --- diff --git a/docs/config.md b/docs/config.md index 6d1dde908..0dc14d6fd 100644 --- a/docs/config.md +++ b/docs/config.md @@ -114,11 +114,12 @@ BYOK (Bring Your Own Knowledge) RAG configuration. | Field | Type | Description | |-------|------|-------------| | rag_id | string | Unique RAG ID | -| rag_type | string | Type of RAG database. | +| rag_type | string | Type of RAG database (e.g. `inline::faiss`). | | embedding_model | string | Embedding model identification | | embedding_dimension | integer | Dimensionality of embedding vectors. | | vector_db_id | string | Vector database identification. | | db_path | string | Path to RAG database. | +| score_multiplier | number | Multiplier applied to relevance scores from this vector store when querying multiple sources. Values > 1 boost results; values < 1 reduce them. Default: 1.0. | ## CORSConfiguration @@ -170,7 +171,7 @@ Global service configuration. | azure_entra_id | | | | splunk | | Splunk HEC configuration for sending telemetry events. | | deployment_environment | string | Deployment environment name (e.g., 'development', 'staging', 'production'). Used in telemetry events. | -| solr | | Configuration for Solr vector search operations. | +| rag | | RAG strategy configuration (Solr and BYOK). Controls pre-query (Always RAG) and tool-based (Tool RAG) retrieval. | ## ConversationHistoryConfiguration @@ -520,10 +521,42 @@ the service can handle requests concurrently. | cors | | Cross-Origin Resource Sharing configuration for cross-domain requests | -## SolrConfiguration +## RagConfiguration -Solr configuration for vector search queries. 
+Top-level RAG strategy configuration. Controls two complementary retrieval modes: + +- **Always RAG**: context is fetched from Solr and/or BYOK vector stores and injected + into every query before the LLM responds. +- **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve + context on demand from BYOK vector stores. + + +| Field | Type | Description | +|-------|------|-------------| +| always | | Pre-query RAG from Solr and BYOK. See AlwaysRagConfiguration. | +| tool | | Tool-based RAG that the LLM can invoke. See ToolRagConfiguration. | + + +## AlwaysRagConfiguration + + +Pre-query RAG configuration that injects context before the LLM generates a response. + +Both Solr and BYOK sources can be enabled independently. When enabled, retrieved +chunks are added as context on every query. + + +| Field | Type | Description | +|-------|------|-------------| +| solr | | Solr RAG configuration for pre-query context injection. | +| byok | | BYOK RAG configuration for pre-query context injection. | + + +## SolrRagConfiguration + + +Solr configuration for Always RAG (pre-query context injection). Controls whether to use offline or online mode when building document URLs from vector search results, and enables/disables Solr vector IO functionality. @@ -535,6 +568,28 @@ from vector search results, and enables/disables Solr vector IO functionality. | offline | boolean | When True, use parent_id for chunk source URLs. When False, use reference_url for chunk source URLs. | +## ByokRagConfiguration + + +Configuration to enable or disable BYOK RAG retrieval. + + +| Field | Type | Description | +|-------|------|-------------| +| enabled | boolean | When True, queries BYOK vector stores for RAG context. Default: False. | + + +## ToolRagConfiguration + + +Configuration for exposing RAG as a tool the LLM can call during generation. + + +| Field | Type | Description | +|-------|------|-------------| +| byok | | BYOK RAG configuration for tool-based retrieval. 
Default: enabled. | + + ## SplunkConfiguration diff --git a/docs/rag_guide.md b/docs/rag_guide.md index d07a03b0d..9d968a952 100644 --- a/docs/rag_guide.md +++ b/docs/rag_guide.md @@ -5,7 +5,7 @@ This document explains how to configure and customize your RAG pipeline using th * Initialize a vector store * Download and point to a local embedding model * Configure an inference provider (LLM) -* Enable Agent-based RAG querying +* Choose a RAG strategy (Always RAG or Tool RAG) --- @@ -26,12 +26,17 @@ This document explains how to configure and customize your RAG pipeline using th # Introduction -RAG in Lightspeed Core Stack (LCS) is yet only supported via the Agents API. The agent is responsible for planning and deciding when to query the vector index. +Lightspeed Core Stack (LCS) supports two complementary RAG strategies: -The system operates a chain of command. The **Agent** is the orchestrator, using the LLM as its reasoning engine. When a plan requires external information, the Agent queries the **Vector Store**. This is your database of indexed knowledge, which you are responsible for creating before running the stack. The **Embedding Model** is used to convert the queries to vectors. +- **Always RAG**: context is fetched from Solr and/or BYOK vector stores and injected into every query before the LLM responds. No tool calls are required. +- **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve context on demand from BYOK vector stores. + +Both strategies can be enabled independently via the `rag` section of `lightspeed-stack.yaml`. See [BYOK Feature Documentation](byok_guide.md) for configuration details. + +The **Embedding Model** is used to convert queries and documents into vector representations for similarity matching. > [!NOTE] -> The same Embedding Model should be used to both create the store and to query it. +> The same Embedding Model should be used to both create the vector store and to query it. 
--- @@ -318,19 +323,20 @@ Note: if the vector database (portal-rag) is not in the persistent data store wi **2. Configure Lightspeed Stack (`lightspeed-stack.yaml`):** ```yaml -solr: - enabled: true # Enable Solr vector IO functionality - offline: true # Use parent_id for document URLs (offline mode) - # Set to false to use reference_url (online mode) +rag: + always: + solr: + enabled: true # Enable Solr vector IO (Always RAG - pre-query injection) + offline: true # Use parent_id for document URLs (offline mode) + # Set to false to use reference_url (online mode) ``` **Query Request Example:** ``` curl -sX POST http://localhost:8080/v1/query \ -H "Content-Type: application/json" \ - -d '{"query" : "how do I secure a nodejs application with keycloak?", "no_tools":true}' | jq . + -d '{"query" : "how do I secure a nodejs application with keycloak?"}' | jq . ``` -Note: Solr does not currently work with RAG tools. You will need to specify "no_tools": true in request. **Query Processing:** diff --git a/examples/lightspeed-stack-byok-rag.yaml b/examples/lightspeed-stack-byok-rag.yaml index 7780ac21f..575b3fcf4 100644 --- a/examples/lightspeed-stack-byok-rag.yaml +++ b/examples/lightspeed-stack-byok-rag.yaml @@ -38,10 +38,28 @@ byok_rag: - rag_id: ocp_docs rag_type: inline::faiss embedding_dimension: 1024 - vector_db_id: vector_byok_1 + vector_db_id: vs_123 # Llama-stack vector_store_id db_path: /tmp/ocp.faiss + score_multiplier: 1.0 # Weight for this vector store's results - rag_id: knowledge_base rag_type: inline::faiss embedding_dimension: 384 - vector_db_id: vector_byok_2 + vector_db_id: vs_456 # Llama-stack vector_store_id db_path: /tmp/kb.faiss + score_multiplier: 1.2 # Weight for this vector store's results + +# RAG configuration +rag: + # Always RAG: context injected before every LLM response (no tool calls needed) + # Supports both Solr and BYOK sources. Score multipliers apply here only. 
+  always: +    solr: +      enabled: false   # Enable Solr OKP pre-query context injection +      offline: false   # Controls how document URLs are built from Solr results +    byok: +      enabled: false   # Enable BYOK pre-query context injection (weighted by score_multiplier) +  # Tool RAG: LLM calls file_search on demand to retrieve BYOK context +  # Note: Solr is not available in Tool RAG; score_multiplier does not apply here +  tool: +    byok: +      enabled: true    # Enable BYOK vector stores as the file_search tool (default: true) \ No newline at end of file diff --git a/lightspeed-stack.yaml b/lightspeed-stack.yaml index 98b2555a8..5ed176d8c 100644 --- a/lightspeed-stack.yaml +++ b/lightspeed-stack.yaml @@ -32,7 +32,18 @@ authentication:   module: "noop"  -# OKP Solr for supplementary RAG -solr: -  enabled: false -  offline: true \ No newline at end of file +# RAG configuration +rag: +  # Always RAG (inject context pre-query with RAG from Solr and BYOK vector stores) +  always: +    solr: +      enabled: false +      offline: false +    # Supports weighted scoring +    byok: +      enabled: false +  # Tool RAG (LLM can call file_search tool during generation) +  tool: +    byok: +      # Default is true for backward compatibility +      enabled: true diff --git a/run.yaml b/run.yaml index 29ce3cae3..79d4609f2 100644 --- a/run.yaml +++ b/run.yaml @@ -171,6 +171,9 @@ registered_resources:   tool_groups:   - toolgroup_id: builtin::rag  # Register the RAG tool     provider_id: rag-runtime +# REQUIRED: This section is necessary for file_search tool calls to work. +# Without it, llama-stack's rag-runtime silently fails all file_search operations +# with no error logged. 
vector_stores: default_provider_id: faiss default_embedding_model: # Define the default embedding model for RAG diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index cbd06e0e7..865d790a2 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -9,8 +9,8 @@ from llama_stack_api.openai_responses import OpenAIResponseObject from llama_stack_client import ( APIConnectionError, - AsyncLlamaStackClient, APIStatusError as LLSApiStatusError, + AsyncLlamaStackClient, ) from openai._exceptions import ( APIStatusError as OpenAIAPIStatusError, @@ -22,9 +22,9 @@ from authorization.middleware import authorize from client import AsyncLlamaStackClientHolder from configuration import configuration +from log import get_logger from models.config import Action from models.requests import QueryRequest - from models.responses import ( ForbiddenResponse, InternalServerErrorResponse, @@ -40,7 +40,7 @@ check_configuration_loaded, validate_and_retrieve_conversation, ) -from utils.mcp_headers import mcp_headers_dependency, McpHeaders +from utils.mcp_headers import McpHeaders, mcp_headers_dependency from utils.query import ( consume_query_tokens, handle_known_apistatus_errors, @@ -67,8 +67,7 @@ ResponsesApiParams, TurnSummary, ) -from utils.vector_search import perform_vector_search, format_rag_context_for_injection -from log import get_logger +from utils.vector_search import build_rag_context logger = get_logger(__name__) router = APIRouter(tags=["query"]) @@ -155,15 +154,14 @@ async def query_endpoint_handler( client = AsyncLlamaStackClientHolder().get_client() - _, _, doc_ids_from_chunks, pre_rag_chunks = await perform_vector_search( - client, query_request.query, query_request.solr - ) + # Build RAG context from BYOK and Solr sources + rag_context = await build_rag_context(client, query_request, configuration) - rag_context = format_rag_context_for_injection(pre_rag_chunks) - if rag_context: - # safest: mutate a local copy so we don't surprise other 
logic - query_request = query_request.model_copy(deep=True) # pydantic v2 - query_request.query = query_request.query + rag_context + # Inject RAG context into query + if rag_context.context_text: + # Mutate a local copy to avoid surprising other logic + query_request = query_request.model_copy(deep=True) + query_request.query = query_request.query + rag_context.context_text # Prepare API request parameters responses_params = await prepare_responses_params( @@ -199,13 +197,18 @@ async def query_endpoint_handler( rag_id_mapping, ) - if pre_rag_chunks: - turn_summary.rag_chunks = pre_rag_chunks + (turn_summary.rag_chunks or []) - - if doc_ids_from_chunks: - turn_summary.referenced_documents = deduplicate_referenced_documents( - doc_ids_from_chunks + turn_summary.referenced_documents - ) + # Merge RAG chunks (BYOK + Solr) with tool-based RAG chunks + rag_chunks = rag_context.rag_chunks + tool_rag_chunks = turn_summary.rag_chunks or [] + logger.info("RAG as a tool retrieved %d chunks", len(tool_rag_chunks)) + turn_summary.rag_chunks = rag_chunks + tool_rag_chunks + + # Add tool-based RAG documents and chunks + rag_documents = rag_context.referenced_documents + tool_rag_documents = turn_summary.referenced_documents or [] + turn_summary.referenced_documents = deduplicate_referenced_documents( + rag_documents + tool_rag_documents + ) # Get topic summary for new conversation if not user_conversation and query_request.generate_topic_summary: diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index 81f0c1dcf..4f3a3d84c 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -3,7 +3,6 @@ import asyncio import datetime import json - from typing import Annotated, Any, AsyncIterator, Optional, cast from fastapi import APIRouter, Depends, HTTPException, Request @@ -11,18 +10,33 @@ from llama_stack_api.openai_responses import ( OpenAIResponseObject, OpenAIResponseObjectStream, +) +from 
llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseMcpCallArgumentsDone as MCPArgsDoneChunk, +) +from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputItemAdded as OutputItemAddedChunk, +) +from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputItemDone as OutputItemDoneChunk, +) +from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputTextDelta as TextDeltaChunk, +) +from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputTextDone as TextDoneChunk, +) +from llama_stack_api.openai_responses import ( OpenAIResponseOutputMessageMCPCall as MCPCall, ) from llama_stack_client import ( APIConnectionError, +) +from llama_stack_client import ( APIStatusError as LLSApiStatusError, ) from openai._exceptions import APIStatusError as OpenAIAPIStatusError + import metrics from authentication import get_auth_dependency from authentication.interface import AuthTuple @@ -40,6 +54,7 @@ MEDIA_TYPE_JSON, MEDIA_TYPE_TEXT, ) +from log import get_logger from models.config import Action from models.context import ResponseGeneratorContext from models.requests import QueryRequest @@ -55,12 +70,15 @@ UnauthorizedResponse, UnprocessableEntityResponse, ) +<<<<<<< HEAD from utils.types import ReferencedDocument +======= +>>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) from utils.endpoints import ( check_configuration_loaded, validate_and_retrieve_conversation, ) -from utils.mcp_headers import mcp_headers_dependency, McpHeaders +from utils.mcp_headers import McpHeaders, mcp_headers_dependency from utils.query import ( consume_query_tokens, extract_provider_and_model_from_model_id, @@ -90,9 +108,8 @@ from utils.stream_interrupts import get_stream_interrupt_registry from utils.suid import get_suid, normalize_conversation_id from utils.token_counter import TokenCounter -from utils.types import ResponsesApiParams, TurnSummary 
-from utils.vector_search import format_rag_context_for_injection, perform_vector_search -from log import get_logger +from utils.types import ReferencedDocument, ResponsesApiParams, TurnSummary +from utils.vector_search import build_rag_context logger = get_logger(__name__) router = APIRouter(tags=["streaming_query"]) @@ -185,14 +202,23 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals client = AsyncLlamaStackClientHolder().get_client() +<<<<<<< HEAD _, _, doc_ids_from_chunks, pre_rag_chunks = await perform_vector_search( client, query_request.query, query_request.solr ) rag_context = format_rag_context_for_injection(pre_rag_chunks) if rag_context: +======= + # Build RAG context from BYOK and Solr sources + rag_context = await build_rag_context(client, query_request, configuration) + + # Inject RAG context into query + if rag_context.context_text: + # Mutate a local copy to avoid surprising other logic +>>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) query_request = query_request.model_copy(deep=True) - query_request.query = query_request.query + rag_context + query_request.query = query_request.query + rag_context.context_text # Prepare API request parameters responses_params = await prepare_responses_params( @@ -240,7 +266,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals generator, turn_summary = await retrieve_response_generator( responses_params=responses_params, context=context, - doc_ids_from_chunks=doc_ids_from_chunks, + pre_rag_documents=rag_context.referenced_documents, ) response_media_type = ( @@ -263,7 +289,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals async def retrieve_response_generator( responses_params: ResponsesApiParams, context: ResponseGeneratorContext, - doc_ids_from_chunks: list[ReferencedDocument], + pre_rag_documents: list[ReferencedDocument], ) -> tuple[AsyncIterator[str], TurnSummary]: """ Retrieve the appropriate 
response generator. @@ -275,7 +301,11 @@ async def retrieve_response_generator( Args: responses_params: The Responses API parameters context: The response generator context +<<<<<<< HEAD doc_ids_from_chunks: List of ReferencedDocument objects extracted from static RAG +======= + pre_rag_documents: Referenced documents from pre-query RAG (BYOK + Solr) +>>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) Returns: tuple[AsyncIterator[str], TurnSummary]: The response generator and turn summary @@ -305,8 +335,8 @@ async def retrieve_response_generator( response = await context.client.responses.create( **responses_params.model_dump(exclude_none=True) ) - # Store pre-RAG documents for later merging - turn_summary.pre_rag_documents = doc_ids_from_chunks + # Store pre-RAG documents for later merging with tool-based RAG + turn_summary.pre_rag_documents = pre_rag_documents return response_generator(response, context, turn_summary), turn_summary # Handle know LLS client errors only at stream creation time and shield execution diff --git a/src/configuration.py b/src/configuration.py index c918be9ea..c231515f7 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -14,6 +14,7 @@ Configuration, Customization, LlamaStackConfiguration, + RagConfiguration, UserDataCollection, ServiceConfiguration, ModelContextProtocolServer, @@ -22,7 +23,6 @@ DatabaseConfiguration, ConversationHistoryConfiguration, QuotaHandlersConfiguration, - SolrConfiguration, SplunkConfiguration, ) @@ -365,11 +365,11 @@ def deployment_environment(self) -> str: return self._configuration.deployment_environment @property - def solr(self) -> Optional[SolrConfiguration]: - """Return Solr configuration, or None if not provided.""" + def rag(self) -> "RagConfiguration": + """Return RAG configuration.""" if self._configuration is None: raise LogicError("logic error: configuration is not loaded") - return self._configuration.solr + return self._configuration.rag @property def rag_id_mapping(self) 
-> dict[str, str]: @@ -386,6 +386,24 @@ def rag_id_mapping(self) -> dict[str, str]: raise LogicError("logic error: configuration is not loaded") return {brag.vector_db_id: brag.rag_id for brag in self._configuration.byok_rag} + @property + def score_multiplier_mapping(self) -> dict[str, float]: + """Return mapping from vector_db_id to score_multiplier from BYOK RAG config. + + Returns: + dict[str, float]: Mapping where keys are llama-stack vector_db_ids + and values are score multipliers from configuration. + + Raises: + LogicError: If the configuration has not been loaded. + """ + if self._configuration is None: + raise LogicError("logic error: configuration is not loaded") + return { + brag.vector_db_id: brag.score_multiplier + for brag in self._configuration.byok_rag + } + def resolve_index_name( self, vector_store_id: str, rag_id_mapping: Optional[dict[str, str]] = None ) -> str: diff --git a/src/constants.py b/src/constants.py index 902db920c..f12377bb3 100644 --- a/src/constants.py +++ b/src/constants.py @@ -131,9 +131,6 @@ MCP_AUTH_CLIENT = "client" MCP_AUTH_OAUTH = "oauth" -# default RAG tool value -DEFAULT_RAG_TOOL = "file_search" - # Media type constants for streaming responses MEDIA_TYPE_JSON = "application/json" MEDIA_TYPE_TEXT = "text/plain" @@ -174,14 +171,36 @@ USER_QUOTA_LIMITER = "user_limiter" CLUSTER_QUOTA_LIMITER = "cluster_limiter" -# Vector search constants -VECTOR_SEARCH_DEFAULT_K = 5 -VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.0 -VECTOR_SEARCH_DEFAULT_MODE = "hybrid" +# RAG as a tool constants +DEFAULT_RAG_TOOL = "file_search" +TOOL_RAG_MAX_CHUNKS = 10 # retrieved from RAG as a tool + +# BYOK RAG constants +BYOK_RAG_MAX_CHUNKS = 10 # retrieved from BYOK RAG (Always RAG strategy) + +# Solr OKP constants +SOLR_RAG_MAX_CHUNKS = 5 # retrieved from the Solr OKP RAG (Always RAG strategy) +SOLR_VECTOR_SEARCH_DEFAULT_K = 5 +SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.0 +SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid" # SOLR OKP RAG MIMIR_DOC_URL = 
"https://mimir.corp.redhat.com" +SOLR_PROVIDER_ID = "okp_solr" + +# Solr default configuration values (can be overridden via environment variables) +SOLR_DEFAULT_VECTOR_STORE_ID = "portal-rag" +SOLR_DEFAULT_VECTOR_FIELD = "chunk_vector" +SOLR_DEFAULT_CONTENT_FIELD = "chunk" +SOLR_DEFAULT_EMBEDDING_MODEL = ( + "sentence-transformers/ibm-granite/granite-embedding-30m-english" +) +SOLR_DEFAULT_EMBEDDING_DIMENSION = 384 + +# Default score multiplier for BYOK RAG vector stores +DEFAULT_SCORE_MULTIPLIER = 1.0 + # Logging configuration constants # Environment variable name for configurable log level LIGHTSPEED_STACK_LOG_LEVEL_ENV_VAR = "LIGHTSPEED_STACK_LOG_LEVEL" diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index b74fbd7f9..aba46262b 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -14,6 +14,8 @@ from azure.core.exceptions import ClientAuthenticationError from azure.identity import ClientSecretCredential, CredentialUnavailableError from llama_stack.core.stack import replace_env_vars + +import constants from log import get_logger logger = get_logger(__name__) @@ -137,11 +139,11 @@ def construct_storage_backends_section( # add new backends for each BYOK RAG for brag in byok_rag: - vector_db_id = brag.get("vector_db_id", "") - backend_name = f"byok_{vector_db_id}_storage" + rag_id = brag.get("rag_id", "") + backend_name = f"byok_{rag_id}_storage" output[backend_name] = { "type": "kv_sqlite", - "db_path": brag.get("db_path", f".llama/{vector_db_id}.db"), + "db_path": brag.get("db_path", f".llama/{rag_id}.db"), } logger.info( "Added %s backends into storage.backends section, total backends %s", @@ -183,16 +185,18 @@ def construct_vector_stores_section( existing_store_ids = {vs.get("vector_store_id") for vs in output} added = 0 for brag in byok_rag: + rag_id = brag.get("rag_id", "") vector_db_id = brag.get("vector_db_id", "") if vector_db_id in existing_store_ids: continue 
existing_store_ids.add(vector_db_id) added += 1 + embedding_model = brag.get("embedding_model", constants.DEFAULT_EMBEDDING_MODEL) output.append( { "vector_store_id": vector_db_id, - "provider_id": f"byok_{vector_db_id}", - "embedding_model": brag.get("embedding_model", ""), + "provider_id": f"byok_{rag_id}", + "embedding_model": embedding_model, "embedding_dimension": brag.get("embedding_dimension"), } ) @@ -227,10 +231,14 @@ def construct_models_section( # add embedding models for each BYOK RAG for brag in byok_rag: - embedding_model = brag.get("embedding_model", "") - vector_db_id = brag.get("vector_db_id", "") + embedding_model = brag.get("embedding_model", constants.DEFAULT_EMBEDDING_MODEL) + rag_id = brag.get("rag_id", "") embedding_dimension = brag.get("embedding_dimension") + # Skip if no embedding model specified + if not embedding_model: + continue + # Strip sentence-transformers/ prefix if present provider_model_id = embedding_model if provider_model_id.startswith("sentence-transformers/"): @@ -243,7 +251,7 @@ def construct_models_section( output.append( { - "model_id": f"byok_{vector_db_id}_embedding", + "model_id": f"byok_{rag_id}_embedding", "model_type": "embedding", "provider_id": "sentence-transformers", "provider_model_id": provider_model_id, @@ -290,9 +298,9 @@ def construct_vector_io_providers_section( # append new vector_io entries for brag in byok_rag: - vector_db_id = brag.get("vector_db_id", "") - backend_name = f"byok_{vector_db_id}_storage" - provider_id = f"byok_{vector_db_id}" + rag_id = brag.get("rag_id", "") + backend_name = f"byok_{rag_id}_storage" + provider_id = f"byok_{rag_id}" output.append( { "provider_id": provider_id, @@ -353,6 +361,133 @@ def enrich_byok_rag(ls_config: dict[str, Any], byok_rag: list[dict[str, Any]]) - ) +# ============================================================================= +# Enrichment: Solr +# ============================================================================= + + +def 
enrich_solr(ls_config: dict[str, Any], solr_config: dict[str, Any]) -> None: + """Enrich Llama Stack config with Solr settings. + + Args: + ls_config: Llama Stack configuration dict (modified in place) + solr_config: Solr configuration dict + """ + if not solr_config or not solr_config.get("enabled"): + logger.info("Solr is not enabled: skipping") + return + + logger.info("Enriching Llama Stack config with Solr") + + # Add vector_io provider for Solr + if "providers" not in ls_config: + ls_config["providers"] = {} + if "vector_io" not in ls_config["providers"]: + ls_config["providers"]["vector_io"] = [] + + # Add Solr provider if not already present + existing_providers = [ + p.get("provider_id") for p in ls_config["providers"]["vector_io"] + ] + if constants.SOLR_PROVIDER_ID not in existing_providers: + # Build environment variable expressions + solr_url_env = "${env.SOLR_URL:=http://localhost:8081/solr}" + collection_env = ( + f"${{env.SOLR_COLLECTION:={constants.SOLR_DEFAULT_VECTOR_STORE_ID}}}" + ) + vector_field_env = ( + f"${{env.SOLR_VECTOR_FIELD:={constants.SOLR_DEFAULT_VECTOR_FIELD}}}" + ) + content_field_env = ( + f"${{env.SOLR_CONTENT_FIELD:={constants.SOLR_DEFAULT_CONTENT_FIELD}}}" + ) + embedding_model_env = ( + f"${{env.SOLR_EMBEDDING_MODEL:={constants.SOLR_DEFAULT_EMBEDDING_MODEL}}}" + ) + embedding_dim_env = ( + f"${{env.SOLR_EMBEDDING_DIM:={constants.SOLR_DEFAULT_EMBEDDING_DIMENSION}}}" + ) + + ls_config["providers"]["vector_io"].append( + { + "provider_id": constants.SOLR_PROVIDER_ID, + "provider_type": "remote::solr_vector_io", + "config": { + "solr_url": solr_url_env, + "collection_name": collection_env, + "vector_field": vector_field_env, + "content_field": content_field_env, + "embedding_model": embedding_model_env, + "embedding_dimension": embedding_dim_env, + "persistence": { + "namespace": constants.SOLR_DEFAULT_VECTOR_STORE_ID, + "backend": "kv_default", + }, + }, + } + ) + logger.info("Added Solr provider to providers/vector_io") + + # Add 
vector store registration for Solr + if "registered_resources" not in ls_config: + ls_config["registered_resources"] = {} + if "vector_stores" not in ls_config["registered_resources"]: + ls_config["registered_resources"]["vector_stores"] = [] + + # Add Solr vector store if not already present + existing_stores = [ + vs.get("vector_store_id") + for vs in ls_config["registered_resources"]["vector_stores"] + ] + if constants.SOLR_DEFAULT_VECTOR_STORE_ID not in existing_stores: + # Build environment variable expression + embedding_model_env = ( + f"${{env.SOLR_EMBEDDING_MODEL:={constants.SOLR_DEFAULT_EMBEDDING_MODEL}}}" + ) + + ls_config["registered_resources"]["vector_stores"].append( + { + "vector_store_id": constants.SOLR_DEFAULT_VECTOR_STORE_ID, + "provider_id": constants.SOLR_PROVIDER_ID, + "embedding_model": embedding_model_env, + "embedding_dimension": constants.SOLR_DEFAULT_EMBEDDING_DIMENSION, + } + ) + logger.info( + "Added %s vector store to registered_resources", + constants.SOLR_DEFAULT_VECTOR_STORE_ID, + ) + + # Add Solr embedding model to registered_resources.models if not already present + if "models" not in ls_config["registered_resources"]: + ls_config["registered_resources"]["models"] = [] + + # Strip sentence-transformers/ prefix from constant for provider_model_id + provider_model_id = constants.SOLR_DEFAULT_EMBEDDING_MODEL + if provider_model_id.startswith("sentence-transformers/"): + provider_model_id = provider_model_id[len("sentence-transformers/") :] + + # Check if already registered + registered_models = ls_config["registered_resources"]["models"] + existing_model_ids = [m.get("provider_model_id") for m in registered_models] + if provider_model_id not in existing_model_ids: + # Build environment variable expression + provider_model_env = f"${{env.SOLR_EMBEDDING_MODEL:={provider_model_id}}}" + + ls_config["registered_resources"]["models"].append( + { + "model_id": "solr_embedding", + "model_type": "embedding", + "provider_id": 
"sentence-transformers", + "provider_model_id": provider_model_env, + "metadata": { + "embedding_dimension": constants.SOLR_DEFAULT_EMBEDDING_DIMENSION, + }, + } + ) + logger.info("Added Solr embedding model to registered_resources.models") + + # ============================================================================= # Main Generation Function (service/container mode only) # ============================================================================= @@ -383,6 +518,10 @@ def generate_configuration( # Enrichment: BYOK RAG enrich_byok_rag(ls_config, config.get("byok_rag", [])) + # Enrichment: Solr + solr_config = config.get("rag", {}).get("always", {}).get("solr", {}) + enrich_solr(ls_config, solr_config) + logger.info("Writing Llama Stack configuration into file %s", output_file) with open(output_file, "w", encoding="utf-8") as file: diff --git a/src/models/config.py b/src/models/config.py index 1aca828ad..cc2301975 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -2,37 +2,35 @@ # pylint: disable=too-many-lines -from pathlib import Path -from typing import Optional, Any, Pattern +import re from enum import Enum from functools import cached_property -import re -import yaml +from pathlib import Path +from typing import Any, Optional, Pattern import jsonpath_ng +import yaml from jsonpath_ng.exceptions import JSONPathError from pydantic import ( + AnyHttpUrl, BaseModel, ConfigDict, Field, - field_validator, - model_validator, FilePath, - AnyHttpUrl, - PositiveInt, NonNegativeInt, - SecretStr, + PositiveInt, PrivateAttr, + SecretStr, + field_validator, + model_validator, ) - from pydantic.dataclasses import dataclass -from typing_extensions import Self, Literal +from typing_extensions import Literal, Self import constants - +from log import get_logger from utils import checks from utils.mcp_auth_headers import resolve_authorization_headers -from log import get_logger logger = get_logger(__name__) @@ -1565,6 +1563,15 @@ class 
ByokRag(ConfigurationBase): description="Path to RAG database.", ) + score_multiplier: float = Field( + constants.DEFAULT_SCORE_MULTIPLIER, + gt=0, + title="Score multiplier", + description="Multiplier applied to relevance scores from this vector store. " + "Used to weight results when querying multiple knowledge sources. " + "Values > 1 boost this store's results; values < 1 reduce them.", + ) + class QuotaLimiterConfiguration(ConfigurationBase): """Configuration for one quota limiter. @@ -1687,28 +1694,86 @@ class QuotaHandlersConfiguration(ConfigurationBase): ) -class SolrConfiguration(ConfigurationBase): - """Solr configuration for vector search queries. +class ByokRagConfiguration(ConfigurationBase): + """BYOK RAG configuration.""" - Controls whether to use offline or online mode when building document URLs - from vector search results, and enables/disables Solr vector IO functionality. - """ + enabled: bool = Field( + default=False, + title="BYOK RAG enabled", + description="When True, queries BYOK vector stores for RAG context.", + ) + + +class SolrRagConfiguration(ConfigurationBase): + """Solr RAG configuration.""" enabled: bool = Field( - False, - title="Solr enabled", - description="When True, enables Solr vector IO functionality for vector search queries. " - "When False, disables Solr vector search processing.", + default=False, + title="Solr RAG enabled", + description="When True, queries Solr OKP for RAG context.", ) offline: bool = Field( - True, + default=True, title="Offline mode", description="When True, use parent_id for chunk source URLs. " "When False, use reference_url for chunk source URLs.", ) +class AlwaysRagConfiguration(ConfigurationBase): + """Always RAG configuration. + + Controls pre-query RAG from Solr and BYOK vector stores. 
+ """ + + solr: SolrRagConfiguration = Field( + default_factory=lambda: SolrRagConfiguration(), # pylint: disable=unnecessary-lambda + title="Solr RAG configuration", + description="Configuration for Solr RAG (pre-query).", + ) + + byok: ByokRagConfiguration = Field( + default_factory=lambda: ByokRagConfiguration(), # pylint: disable=unnecessary-lambda + title="BYOK RAG configuration", + description="Configuration for BYOK RAG (pre-query).", + ) + + +class ToolRagConfiguration(ConfigurationBase): + """Tool RAG configuration. + + Controls whether RAG functionality is exposed as a tool that the LLM can call. + """ + + byok: ByokRagConfiguration = Field( + default_factory=lambda: ByokRagConfiguration( + enabled=True + ), # defaults True for backward compatibility + title="BYOK RAG configuration", + description="Configuration for BYOK RAG as a tool.", + ) + + +class RagConfiguration(ConfigurationBase): + """RAG strategy configuration. + + Controls different RAG strategies: pre-query (always) and tool-based. 
+ """ + + always: AlwaysRagConfiguration = Field( + default_factory=lambda: AlwaysRagConfiguration(), # pylint: disable=unnecessary-lambda + title="Always RAG configuration", + description="Configuration for pre-query RAG from Solr and BYOK vector stores.", + ) + + tool: ToolRagConfiguration = Field( + default_factory=lambda: ToolRagConfiguration(), # pylint: disable=unnecessary-lambda + title="Tool RAG configuration", + description="Configuration for exposing RAG as a tool that the LLM can call.", + ) + + class AzureEntraIdConfiguration(ConfigurationBase): """Microsoft Entra ID authentication attributes for Azure.""" @@ -1847,10 +1912,10 @@ class Configuration(ConfigurationBase): "Used in telemetry events.", ) - solr: Optional[SolrConfiguration] = Field( - default=None, - title="Solr configuration", - description="Configuration for Solr vector search operations.", + rag: RagConfiguration = Field( + default_factory=RagConfiguration, + title="RAG configuration", + description="Configuration for all RAG strategies (pre-query and tool-based).", ) @model_validator(mode="after") diff --git a/src/utils/responses.py b/src/utils/responses.py index 71e8afbe9..dce076d41 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -11,8 +11,11 @@ OpenAIResponseContentPartRefusal as ContentPartRefusal, OpenAIResponseInputMessageContent as InputMessageContent, OpenAIResponseInputMessageContentText as InputTextPart, + OpenAIResponseInputTool as InputTool, OpenAIResponseInputToolFileSearch as InputToolFileSearch, OpenAIResponseInputToolMCP as InputToolMCP, + OpenAIResponseMCPApprovalRequest as MCPApprovalRequest, + OpenAIResponseMCPApprovalResponse as MCPApprovalResponse, OpenAIResponseMessage as ResponseMessage, OpenAIResponseObject as ResponseObject, OpenAIResponseOutput as ResponseOutput, @@ -23,10 +26,7 @@ OpenAIResponseOutputMessageMCPCall as MCPCall, OpenAIResponseOutputMessageMCPListTools as MCPListTools, OpenAIResponseOutputMessageWebSearchToolCall as 
WebSearchCall, - OpenAIResponseMCPApprovalRequest as MCPApprovalRequest, - OpenAIResponseMCPApprovalResponse as MCPApprovalResponse, OpenAIResponseUsage as ResponseUsage, - OpenAIResponseInputTool as InputTool, ) from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient @@ -34,6 +34,7 @@ import metrics from configuration import configuration from constants import DEFAULT_RAG_TOOL +from log import get_logger from models.config import ByokRag from models.database.conversations import UserConversation from models.requests import QueryRequest @@ -42,6 +43,7 @@ NotFoundResponse, ServiceUnavailableResponse, ) +from utils.mcp_headers import McpHeaders, extract_propagated_headers from utils.mcp_oauth_probe import probe_mcp_oauth_and_raise_401 from utils.prompts import get_system_prompt, get_topic_summary_system_prompt from utils.query import ( @@ -49,7 +51,6 @@ handle_known_apistatus_errors, prepare_input, ) -from utils.mcp_headers import McpHeaders, extract_propagated_headers from utils.suid import to_llama_stack_conversation_id from utils.token_counter import TokenCounter from utils.types import ( @@ -61,12 +62,49 @@ ToolResultSummary, TurnSummary, ) -from log import get_logger logger = get_logger(__name__) -async def get_topic_summary( +async def get_vector_store_ids( + client: AsyncLlamaStackClient, + vector_store_ids: Optional[list[str]] = None, +) -> list[str]: + """Get vector store IDs for querying. + + If vector_store_ids are provided, returns them. Otherwise fetches all + available vector stores from Llama Stack. + + Args: + client: The AsyncLlamaStackClient to use for fetching stores + vector_store_ids: Optional list of vector store IDs. If provided, + returns this list. If None, fetches all available vector stores. 
+ + Returns: + List of vector store IDs to query + + Raises: + HTTPException: With ServiceUnavailableResponse if connection fails, + or InternalServerErrorResponse if API returns an error status + """ + if vector_store_ids: + return vector_store_ids + + try: + vector_stores = await client.vector_stores.list() + return [vector_store.id for vector_store in vector_stores.data] + except APIConnectionError as e: + error_response = ServiceUnavailableResponse( + backend_name="Llama Stack", + cause=str(e), + ) + raise HTTPException(**error_response.model_dump()) from e + except APIStatusError as e: + error_response = InternalServerErrorResponse.generic() + raise HTTPException(**error_response.model_dump()) from e + + +async def get_topic_summary( # pylint: disable=too-many-nested-blocks question: str, client: AsyncLlamaStackClient, model_id: str ) -> str: """Get a topic summary for a question using Responses API. @@ -129,25 +167,8 @@ async def prepare_tools( # pylint: disable=too-many-arguments,too-many-position return None toolgroups: list[InputTool] = [] - # Get all vector stores if vector stores are not restricted by request - if vector_store_ids is None: - try: - vector_stores = await client.vector_stores.list() - vector_store_ids = [vector_store.id for vector_store in vector_stores.data] - except APIConnectionError as e: - error_response = ServiceUnavailableResponse( - backend_name="Llama Stack", - cause=str(e), - ) - raise HTTPException(**error_response.model_dump()) from e - except APIStatusError as e: - error_response = InternalServerErrorResponse.generic() - raise HTTPException(**error_response.model_dump()) from e - else: - # Translate customer-facing BYOK rag_ids to llama-stack vector_db_ids - vector_store_ids = resolve_vector_store_ids( - vector_store_ids, configuration.configuration.byok_rag - ) + # Get vector stores for RAG tools - use specified ones or fetch all + vector_store_ids = await get_vector_store_ids(client, vector_store_ids) # Add RAG tools if 
vector stores are available rag_tools = get_rag_tools(vector_store_ids) @@ -344,8 +365,12 @@ def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSea vector_store_ids: List of vector store identifiers Returns: - List containing file_search tool configuration, or None if no vector stores provided + List containing file_search tool configuration, or None if RAG as tool is disabled """ + # Check if Tool RAG is enabled in configuration + if not (configuration and configuration.rag.tool.byok.enabled): + return None + if not vector_store_ids: return None @@ -353,7 +378,7 @@ def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSea InputToolFileSearch( type="file_search", vector_store_ids=vector_store_ids, - max_num_results=10, + max_num_results=constants.TOOL_RAG_MAX_CHUNKS, ) ] diff --git a/src/utils/types.py b/src/utils/types.py index 220a85239..6134f42b8 100644 --- a/src/utils/types.py +++ b/src/utils/types.py @@ -285,6 +285,26 @@ class ReferencedDocument(BaseModel): ) +class RAGContext(BaseModel): + """Result of building RAG context from all enabled pre-query RAG sources. + + Attributes: + context_text: Formatted RAG context string for injection into the query. + rag_chunks: RAG chunks from pre-query sources (BYOK + Solr). + referenced_documents: Referenced documents from pre-query sources. 
+ """ + + context_text: str = Field(default="", description="Formatted context for injection") + rag_chunks: list[RAGChunk] = Field( + default_factory=list, + description="RAG chunks from pre-query sources", + ) + referenced_documents: list[ReferencedDocument] = Field( + default_factory=list, + description="Documents from pre-query sources", + ) + + class TurnSummary(BaseModel): """Summary of a turn in llama stack.""" diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index e39e9ec04..737e05a7c 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -4,61 +4,180 @@ and processing RAG chunks that is shared between query_v2.py and streaming_query_v2.py. """ +import asyncio import traceback from typing import Any, Optional from urllib.parse import urljoin from llama_stack_client import AsyncLlamaStackClient -from llama_stack_client.types.query_chunks_response import Chunk from pydantic import AnyUrl import constants from configuration import configuration from log import get_logger from models.responses import ReferencedDocument -from utils.types import RAGChunk +from utils.responses import get_vector_store_ids +from utils.types import RAGChunk, RAGContext logger = get_logger(__name__) def _is_solr_enabled() -> bool: """Check if Solr is enabled in configuration.""" - return bool(configuration.solr and configuration.solr.enabled) + return bool(configuration.rag.always.solr.enabled) -def _get_vector_store_ids(solr_enabled: bool) -> list[str]: +def _get_solr_vector_store_ids() -> list[str]: """Get vector store IDs based on Solr configuration.""" - if solr_enabled: - vector_store_ids = ["portal-rag"] - logger.info( - "Using portal-rag vector store for Solr query: %s", - vector_store_ids, - ) - return vector_store_ids - return [] + vector_store_ids = [constants.SOLR_DEFAULT_VECTOR_STORE_ID] + logger.info( + "Using %s vector store for Solr query: %s", + constants.SOLR_DEFAULT_VECTOR_STORE_ID, + vector_store_ids, + ) + return 
vector_store_ids
 
 
 def _build_query_params(solr: Optional[dict[str, Any]] = None) -> dict[str, Any]:
     """Build query parameters for vector search."""
     params = {
-        "k": constants.VECTOR_SEARCH_DEFAULT_K,
-        "score_threshold": constants.VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD,
-        "mode": constants.VECTOR_SEARCH_DEFAULT_MODE,
+        "k": constants.SOLR_VECTOR_SEARCH_DEFAULT_K,
+        "score_threshold": constants.SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD,
+        "mode": constants.SOLR_VECTOR_SEARCH_DEFAULT_MODE,
     }
-    logger.info("Initial params: %s", params)
-    logger.info("solr: %s", solr)
+    logger.debug("Initial params: %s", params)
+    logger.debug("solr filters: %s", solr)
 
-    if solr:
-        params["solr"] = solr
+    if solr:
+        params["solr"] = solr
-        logger.info("Final params with solr filters: %s", params)
+        logger.debug("Final params with solr filters: %s", params)
     else:
-        logger.info("No solr filters provided")
+        logger.debug("No solr filters provided")
 
-    logger.info("Final params being sent to vector_io.query: %s", params)
+    logger.debug("Final params being sent to vector_io.query: %s", params)
     return params
 
 
-def _extract_document_metadata(
+def _extract_byok_rag_chunks(
+    search_response: Any, vector_store_id: str, weight: float
+) -> list[dict[str, Any]]:
+    """Extract and weight result chunks from vector search for BYOK RAG. 
+ + Args: + search_response: Response from vector_io.query + vector_store_id: ID of the vector store that produced these results + weight: Score multiplier to apply to this store's results + + Returns: + List of result dictionaries with weighted scores + """ + result_chunks = [] + for chunk, score in zip( + search_response.chunks, search_response.scores, strict=True + ): + weighted_score = score * weight + doc_id = ( + chunk.metadata.get("document_id", chunk.chunk_id) + if chunk.metadata + else chunk.chunk_id + ) + logger.debug( + " [%s] score=%.4f weighted=%.4f", + vector_store_id, + score, + weighted_score, + ) + result_chunks.append( + { + "content": chunk.content, + "score": score, + "weighted_score": weighted_score, + "source": vector_store_id, + "doc_id": doc_id, + "metadata": chunk.metadata or {}, + } + ) + return result_chunks + + +def _format_rag_context(rag_chunks: list[RAGChunk], query: str) -> str: + """Format RAG chunks for pre-query context injection. + + This format is used for both BYOK RAG and Solr RAG chunks. + Format is inspired by llama-stack file_search tool implementation. 
+ + Args: + rag_chunks: List of RAG chunks from pre-query sources (BYOK + Solr) + query: The original search query + + Returns: + Formatted string with RAG context metadata attributes + """ + if not rag_chunks: + return "" + + output = f"file_search found {len(rag_chunks)} chunks:\n" + output += "BEGIN of file_search results.\n" + + for i, chunk in enumerate(rag_chunks, 1): + # Build metadata text with source and score + metadata_parts = [] + if chunk.source: + metadata_parts.append(f"document_id: {chunk.source}") + if chunk.score is not None: + metadata_parts.append(f"score: {chunk.score:.4f}") + + metadata_text = ", ".join(metadata_parts) + + # Add additional attributes if present + if chunk.attributes: + metadata_text += f", attributes: {chunk.attributes}" + + # Format chunk with metadata and content + output += f"[{i}] {metadata_text}\n{chunk.content}\n\n" + + output += "END of file_search results.\n" + + output += ( + f'The above results were retrieved to help answer the user\'s query: "{query}". ' + "Use them as supporting information only in answering this query. " + ) + return output + + +async def _query_store_for_byok_rag( + client: AsyncLlamaStackClient, + vector_store_id: str, + query: str, + weight: float, +) -> list[dict[str, Any]]: + """Query a single vector store for BYOK RAG. 
+ + Args: + client: AsyncLlamaStackClient for vector_io queries + vector_store_id: ID of the vector store to query + query: Search query string + weight: Score multiplier to apply + + Returns: + List of weighted result dictionaries, or empty list on error + """ + try: + search_response = await client.vector_io.query( + vector_store_id=vector_store_id, + query=query, + params={ + "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, + "mode": "vector", + }, + ) + return _extract_byok_rag_chunks(search_response, vector_store_id, weight) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning("Failed to search '%s': %s", vector_store_id, e) + return [] + + +def _extract_solr_document_metadata( chunk: Any, ) -> tuple[Optional[str], Optional[str], Optional[str]]: """Extract document ID, title, and reference URL from chunk metadata.""" @@ -86,17 +205,81 @@ def _extract_document_metadata( return doc_id, title, reference_url -def _process_chunks_for_documents( +def _process_byok_rag_chunks_for_documents( + result_chunks: list[dict[str, Any]], +) -> list[ReferencedDocument]: + """Process BYOK RAG result chunks to extract referenced documents. 
+ + Args: + result_chunks: Processed result dictionaries from BYOK RAG + (output of _extract_byok_rag_chunks) + + Returns: + List of referenced documents extracted from BYOK RAG chunks + """ + referenced_documents = [] + seen_doc_ids = set() + + for result in result_chunks: + metadata = result.get("metadata", {}) + doc_id = result.get("doc_id") or metadata.get("document_id") + title = metadata.get("title") + reference_url = ( + metadata.get("reference_url") + or metadata.get("doc_url") + or metadata.get("docs_url") + ) + + if not doc_id and not reference_url: + continue + + # Use doc_id or reference_url as deduplication key + dedup_key = reference_url or doc_id + if dedup_key and dedup_key not in seen_doc_ids: + seen_doc_ids.add(dedup_key) + + # Build document URL + parsed_url: Optional[AnyUrl] = None + if reference_url: + try: + parsed_url = AnyUrl(reference_url) + except Exception: # pylint: disable=broad-exception-caught + parsed_url = None + + referenced_documents.append( + ReferencedDocument( + doc_title=title, + doc_url=parsed_url, + source=result.get("source"), # Vector store ID + ) + ) + + logger.info( + "Extracted %d unique documents from BYOK RAG", + len(referenced_documents), + ) + return referenced_documents + + +def _process_solr_chunks_for_documents( chunks: list[Any], offline: bool ) -> list[ReferencedDocument]: - """Process chunks to extract referenced documents.""" + """Process Solr chunks to extract referenced documents. 
+ + Args: + chunks: Raw chunks from Solr vector store + offline: Whether to use offline mode for URL construction + + Returns: + List of referenced documents extracted from Solr chunks + """ doc_ids_from_chunks = [] metadata_doc_ids = set() for chunk in chunks: - logger.info("Extract doc ids from chunk: %s", chunk) + logger.debug("Extract doc ids from chunk: %s", chunk) - doc_id, title, reference_url = _extract_document_metadata(chunk) + doc_id, title, reference_url = _extract_solr_document_metadata(chunk) if not doc_id and not reference_url: continue @@ -118,23 +301,112 @@ def _process_chunks_for_documents( ReferencedDocument( doc_title=title, doc_url=parsed_url, + source="OKP Solr", ) ) - logger.info( - "Extracted %d unique document IDs from chunks", + logger.debug( + "Extracted %d unique document IDs from Solr chunks", len(doc_ids_from_chunks), ) return doc_ids_from_chunks -async def perform_vector_search( +async def _fetch_byok_rag( client: AsyncLlamaStackClient, query: str, - solr: Optional[dict[str, Any]] = None, -) -> tuple[list[Any], list[float], list[ReferencedDocument], list[RAGChunk]]: + configuration: AppConfig, + vector_store_ids: Optional[list[str]] = None, +) -> tuple[list[RAGChunk], list[ReferencedDocument]]: + """Fetch chunks and documents from BYOK RAG sources. + + Args: + client: The AsyncLlamaStackClient to use for the request + query: The search query + configuration: Application configuration + vector_store_ids: Optional list of vector store IDs to query. + If provided, only these stores will be queried. If None, all stores + (excluding Solr) will be queried. + + Returns: + Tuple containing: + - rag_chunks: RAG chunks from BYOK RAG + - referenced_documents: Documents referenced in BYOK RAG results """ - Perform vector search and extract RAG chunks and referenced documents. 
+ rag_chunks: list[RAGChunk] = [] + referenced_documents: list[ReferencedDocument] = [] + + if not configuration.rag.always.byok.enabled: + logger.info("Always RAG (BYOK) disabled, skipping BYOK RAG search") + return rag_chunks, referenced_documents + + try: + # Get score multiplier and rag_id mappings + score_multiplier_mapping = configuration.score_multiplier_mapping + rag_id_mapping = configuration.rag_id_mapping + + # Filter out Solr vector stores from available stores + vector_store_ids_to_query = [ + vs_id + for vs_id in await get_vector_store_ids(client, vector_store_ids) + if vs_id != constants.SOLR_DEFAULT_VECTOR_STORE_ID + ] + + # Query all vector stores in parallel + results_per_store = await asyncio.gather( + *[ + _query_store_for_byok_rag( + client, + vector_store_id, + query, + score_multiplier_mapping.get(vector_store_id, 1.0), + ) + for vector_store_id in vector_store_ids_to_query + ] + ) + + # Flatten, sort by weighted score, and take top results + all_results: list[dict[str, Any]] = [] + for store_results in results_per_store: + all_results.extend(store_results) + all_results.sort(key=lambda x: x["weighted_score"], reverse=True) + top_results = all_results[: constants.BYOK_RAG_MAX_CHUNKS] + + # Resolve source, log, and convert to RAGChunk in a single pass + logger.info("Filtered top %d chunks from BYOK RAG", len(top_results)) + for result in top_results: + result["source"] = rag_id_mapping.get(result["source"], result["source"]) + logger.debug( + " [%s] score=%.4f weighted=%.4f", + result["source"], + result["score"], + result["weighted_score"], + ) + rag_chunks.append( + RAGChunk( + content=result["content"], + source=result["source"], + score=result["weighted_score"], + attributes=result.get("metadata", {}), + ) + ) + + # Extract referenced documents from BYOK RAG chunks (now with resolved sources) + referenced_documents = _process_byok_rag_chunks_for_documents(top_results) + + except Exception as e: # pylint: disable=broad-exception-caught + 
logger.warning("Failed to perform BYOK RAG search: %s", e)
+        logger.debug("BYOK RAG error details: %s", traceback.format_exc())
+
+    return rag_chunks, referenced_documents
+
+
+async def _fetch_solr_rag(
+    client: AsyncLlamaStackClient,
+    query_request: QueryRequest,
+    configuration: AppConfig,
+) -> tuple[list[RAGChunk], list[ReferencedDocument]]:
+    """Fetch chunks and documents from Solr RAG source.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
@@ -143,28 +415,24 @@ async def perform_vector_search(
 
     Returns:
         Tuple containing:
-        - retrieved_chunks: Raw chunks from vector store
-        - retrieved_scores: Scores for each chunk
-        - doc_ids_from_chunks: Referenced documents extracted from chunks
-        - rag_chunks: Processed RAG chunks ready for use
+        - rag_chunks: RAG chunks from Solr
+        - referenced_documents: Documents referenced in Solr results
     """
-    retrieved_chunks: list[Chunk] = []
-    retrieved_scores: list[float] = []
-    doc_ids_from_chunks: list[ReferencedDocument] = []
     rag_chunks: list[RAGChunk] = []
+    referenced_documents: list[ReferencedDocument] = []
 
-    # Check if Solr is enabled in configuration
-    if not _is_solr_enabled():
-        logger.info("Solr vector IO is disabled, skipping vector search")
-        return retrieved_chunks, retrieved_scores, doc_ids_from_chunks, rag_chunks
+    if not _is_solr_enabled():
+        logger.info("Solr vector IO is disabled, skipping Solr search")
+        return rag_chunks, referenced_documents
 
     # Get offline setting from configuration
-    offline = configuration.solr.offline if configuration.solr else True
+    offline = configuration.rag.always.solr.offline
 
     try:
-        vector_store_ids = _get_vector_store_ids(True)
+        vector_store_ids = _get_solr_vector_store_ids()
         if vector_store_ids:
+            solr = query_request.solr  # assumes a single registered Solr vector store
             vector_store_id = vector_store_ids[0]
             params = _build_query_params(solr)
@@ -174,31 +442,84 @@
             query_response = await client.vector_io.query(
                 vector_store_id=vector_store_id,
                 params=params,
             )
 
-            logger.info("The query response total payload: %s", 
query_response) + logger.debug("Solr query response: %s", query_response) if query_response.chunks: - retrieved_chunks = query_response.chunks retrieved_scores = ( query_response.scores if hasattr(query_response, "scores") else [] ) - # Extract doc_ids from chunks for referenced_documents - doc_ids_from_chunks = _process_chunks_for_documents( - query_response.chunks, offline + # Limit to top N chunks + top_chunks = query_response.chunks[: constants.SOLR_RAG_MAX_CHUNKS] + top_scores = retrieved_scores[: constants.SOLR_RAG_MAX_CHUNKS] + + # Extract referenced documents from Solr chunks + referenced_documents = _process_solr_chunks_for_documents( + top_chunks, offline ) # Convert retrieved chunks to RAGChunk format - rag_chunks = _convert_chunks_to_rag_format( - retrieved_chunks, retrieved_scores, offline + rag_chunks = _convert_solr_chunks_to_rag_format( + top_chunks, top_scores, offline + ) + logger.info( + "Filtered top %d chunks from Solr OKP RAG (%d were retrieved)", + constants.SOLR_RAG_MAX_CHUNKS, + len(rag_chunks), ) - logger.info("Retrieved %d chunks from vector DB", len(rag_chunks)) except Exception as e: # pylint: disable=broad-exception-caught - logger.warning("Failed to query vector database for chunks: %s", e) - logger.debug("Vector DB query error details: %s", traceback.format_exc()) - # Continue without RAG chunks + logger.warning("Failed to query Solr for chunks: %s", e) + logger.debug("Solr query error details: %s", traceback.format_exc()) - return retrieved_chunks, retrieved_scores, doc_ids_from_chunks, rag_chunks + return rag_chunks, referenced_documents + + +async def build_rag_context( + client: AsyncLlamaStackClient, + query_request: QueryRequest, + configuration: AppConfig, +) -> RAGContext: + """Build RAG context by fetching and merging chunks from all enabled sources. + + Enabled sources can be BYOK and/or Solr OKP. 
+ + Args: + client: The AsyncLlamaStackClient to use for the request + query_request: The user's query request + configuration: Application configuration + + Returns: + RAGContext containing formatted context text and referenced documents + """ + # Fetch from all enabled RAG sources in parallel + byok_chunks_task = _fetch_byok_rag( + client, query_request.query, configuration, query_request.vector_store_ids + ) + solr_chunks_task = _fetch_solr_rag(client, query_request, configuration) + + (byok_chunks, byok_docs), (solr_chunks, solr_docs) = await asyncio.gather( + byok_chunks_task, solr_chunks_task + ) + + # Merge chunks from all sources (BYOK + Solr) + context_chunks = byok_chunks + solr_chunks + + context_text = _format_rag_context(context_chunks, query_request.query) + + logger.debug("=" * 80) + logger.debug("RAG context built for pre-query injection:") + logger.debug(context_text) + logger.debug("=" * 80) + + # Merge referenced documents from all sources (BYOK + Solr) + top_documents = byok_docs + solr_docs + + return RAGContext( + context_text=context_text, + rag_chunks=context_chunks, + referenced_documents=top_documents, + ) def _build_document_url( @@ -233,13 +554,13 @@ def _build_document_url( return doc_url, reference_doc -def _convert_chunks_to_rag_format( +def _convert_solr_chunks_to_rag_format( retrieved_chunks: list[Any], retrieved_scores: list[float], offline: bool, ) -> list[RAGChunk]: """ - Convert retrieved chunks to RAGChunk format. + Convert retrieved chunks to RAGChunk format for Solr OKP. 
Args: retrieved_chunks: Raw chunks from vector store @@ -252,15 +573,28 @@ def _convert_chunks_to_rag_format( rag_chunks = [] for i, chunk in enumerate(retrieved_chunks): - # Extract source from chunk metadata based on offline flag - source = None + # Build attributes with document metadata + attributes = {} + + # Legacy logic: extract doc_url from chunk metadata based on offline flag if chunk.metadata: if offline: parent_id = chunk.metadata.get("parent_id") if parent_id: - source = urljoin(constants.MIMIR_DOC_URL, parent_id) + attributes["doc_url"] = urljoin(constants.MIMIR_DOC_URL, parent_id) else: - source = chunk.metadata.get("reference_url") + reference_url = chunk.metadata.get("reference_url") + if reference_url: + attributes["doc_url"] = reference_url + + # For Solr chunks, also extract from chunk_metadata + if hasattr(chunk, "chunk_metadata") and chunk.chunk_metadata: + if hasattr(chunk.chunk_metadata, "document_id"): + doc_id = chunk.chunk_metadata.document_id + attributes["document_id"] = doc_id + # Build URL if not already set + if "doc_url" not in attributes and offline and doc_id: + attributes["doc_url"] = urljoin(constants.MIMIR_DOC_URL, doc_id) # Get score from retrieved_scores list if available score = retrieved_scores[i] if i < len(retrieved_scores) else None @@ -268,36 +602,10 @@ def _convert_chunks_to_rag_format( rag_chunks.append( RAGChunk( content=chunk.content, - source=source, + source="OKP Solr", # Hardcoded source for Solr chunks score=score, + attributes=attributes if attributes else None, ) ) return rag_chunks - - -def format_rag_context_for_injection( - rag_chunks: list[RAGChunk], max_chunks: int = 5 -) -> str: - """ - Format RAG context for injection into user message. 
- - Args: - rag_chunks: List of RAG chunks to format - max_chunks: Maximum number of chunks to include (default: 5) - - Returns: - Formatted RAG context string ready for injection - """ - if not rag_chunks: - return "" - - context_chunks = [] - for chunk in rag_chunks[:max_chunks]: # Limit to top chunks - chunk_text = f"Source: {chunk.source or 'Unknown'}\n{chunk.content}" - context_chunks.append(chunk_text) - - rag_context = "\n\nRelevant documentation:\n" + "\n\n".join(context_chunks) - logger.info("Injecting %d RAG chunks into user message", len(context_chunks)) - - return rag_context diff --git a/tests/unit/app/endpoints/test_streaming_query.py b/tests/unit/app/endpoints/test_streaming_query.py index eebed248f..6f10467ee 100644 --- a/tests/unit/app/endpoints/test_streaming_query.py +++ b/tests/unit/app/endpoints/test_streaming_query.py @@ -52,7 +52,7 @@ from models.responses import InternalServerErrorResponse from utils.token_counter import TokenCounter from utils.stream_interrupts import StreamInterruptRegistry -from utils.types import ReferencedDocument, ResponsesApiParams, TurnSummary +from utils.types import RAGContext, ReferencedDocument, ResponsesApiParams, TurnSummary MOCK_AUTH_STREAMING = ( "00000001-0001-0001-0001-000000000001", @@ -330,12 +330,8 @@ async def test_successful_streaming_query( mocker.patch("app.endpoints.streaming_query.check_tokens_available") mocker.patch("app.endpoints.streaming_query.validate_model_provider_override") mocker.patch( - "app.endpoints.streaming_query.perform_vector_search", - new=mocker.AsyncMock(return_value=([], [], [], [])), - ) - mocker.patch( - "app.endpoints.streaming_query.perform_vector_search", - new=mocker.AsyncMock(return_value=([], [], [], [])), + "app.endpoints.streaming_query.build_rag_context", + new=mocker.AsyncMock(return_value=RAGContext()), ) mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) @@ -417,8 +413,8 @@ async def test_streaming_query_text_media_type_header( 
mocker.patch("app.endpoints.streaming_query.check_tokens_available") mocker.patch("app.endpoints.streaming_query.validate_model_provider_override") mocker.patch( - "app.endpoints.streaming_query.perform_vector_search", - new=mocker.AsyncMock(return_value=([], [], [], [])), + "app.endpoints.streaming_query.build_rag_context", + new=mocker.AsyncMock(return_value=RAGContext()), ) mock_client = mocker.AsyncMock(spec=AsyncLlamaStackClient) @@ -503,8 +499,8 @@ async def test_streaming_query_with_conversation( mocker.patch("app.endpoints.streaming_query.check_tokens_available") mocker.patch("app.endpoints.streaming_query.validate_model_provider_override") mocker.patch( - "app.endpoints.streaming_query.perform_vector_search", - new=mocker.AsyncMock(return_value=([], [], [], [])), + "app.endpoints.streaming_query.build_rag_context", + new=mocker.AsyncMock(return_value=RAGContext()), ) mocker.patch( "app.endpoints.streaming_query.normalize_conversation_id", @@ -600,8 +596,8 @@ async def test_streaming_query_with_attachments( mocker.patch("app.endpoints.streaming_query.check_tokens_available") mocker.patch("app.endpoints.streaming_query.validate_model_provider_override") mocker.patch( - "app.endpoints.streaming_query.perform_vector_search", - new=mocker.AsyncMock(return_value=([], [], [], [])), + "app.endpoints.streaming_query.build_rag_context", + new=mocker.AsyncMock(return_value=RAGContext()), ) mock_validate = mocker.patch( "app.endpoints.streaming_query.validate_attachments_metadata" @@ -685,8 +681,8 @@ async def test_streaming_query_azure_token_refresh( mocker.patch("app.endpoints.streaming_query.check_tokens_available") mocker.patch("app.endpoints.streaming_query.validate_model_provider_override") mocker.patch( - "app.endpoints.streaming_query.perform_vector_search", - new=mocker.AsyncMock(return_value=([], [], [], [])), + "app.endpoints.streaming_query.build_rag_context", + new=mocker.AsyncMock(return_value=RAGContext()), ) mock_client = 
mocker.AsyncMock(spec=AsyncLlamaStackClient) diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py index e0cb7a8fb..832d99a42 100644 --- a/tests/unit/models/config/test_byok_rag.py +++ b/tests/unit/models/config/test_byok_rag.py @@ -3,16 +3,15 @@ from pathlib import Path import pytest - from pydantic import ValidationError -from models.config import ByokRag - from constants import ( - DEFAULT_RAG_TYPE, - DEFAULT_EMBEDDING_MODEL, DEFAULT_EMBEDDING_DIMENSION, + DEFAULT_EMBEDDING_MODEL, + DEFAULT_RAG_TYPE, + DEFAULT_SCORE_MULTIPLIER, ) +from models.config import ByokRag def test_byok_rag_configuration_default_values() -> None: @@ -29,7 +28,8 @@ def test_byok_rag_configuration_default_values() -> None: assert byok_rag.embedding_model == DEFAULT_EMBEDDING_MODEL assert byok_rag.embedding_dimension == DEFAULT_EMBEDDING_DIMENSION assert byok_rag.vector_db_id == "vector_db_id" - assert byok_rag.db_path == "tests/configuration/rag.txt" + assert byok_rag.db_path == Path("tests/configuration/rag.txt") + assert byok_rag.score_multiplier == DEFAULT_SCORE_MULTIPLIER def test_byok_rag_configuration_nondefault_values() -> None: @@ -142,3 +142,27 @@ def test_byok_rag_configuration_empty_vector_db_id() -> None: vector_db_id="", db_path=Path("tests/configuration/rag.txt"), ) + + +def test_byok_rag_configuration_custom_score_multiplier() -> None: + """Test ByokRag with custom score_multiplier.""" + + byok_rag = ByokRag( + rag_id="rag_id", + vector_db_id="vector_db_id", + db_path="tests/configuration/rag.txt", + score_multiplier=2.5, + ) + assert byok_rag.score_multiplier == 2.5 + + +def test_byok_rag_configuration_score_multiplier_must_be_positive() -> None: + """Test that score_multiplier must be greater than 0.""" + + with pytest.raises(ValidationError, match="greater than 0"): + _ = ByokRag( + rag_id="rag_id", + vector_db_id="vector_db_id", + db_path="tests/configuration/rag.txt", + score_multiplier=0.0, + ) diff --git 
a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index 0867db6b1..a3c4ad3b4 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -206,7 +206,15 @@ def test_dump_configuration(tmp_path: Path) -> None: "postgres": None, }, "azure_entra_id": None, - "solr": None, + "rag": { + "always": { + "byok": {"enabled": False}, + "solr": {"enabled": False, "offline": True}, + }, + "tool": { + "byok": {"enabled": True}, + }, + }, "splunk": None, "deployment_environment": "development", } @@ -550,7 +558,15 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None: "postgres": None, }, "azure_entra_id": None, - "solr": None, + "rag": { + "always": { + "byok": {"enabled": False}, + "solr": {"enabled": False, "offline": True}, + }, + "tool": { + "byok": {"enabled": True}, + }, + }, "splunk": None, "deployment_environment": "development", } @@ -772,7 +788,15 @@ def test_dump_configuration_with_quota_limiters_different_values( "postgres": None, }, "azure_entra_id": None, - "solr": None, + "rag": { + "always": { + "byok": {"enabled": False}, + "solr": {"enabled": False, "offline": True}, + }, + "tool": { + "byok": {"enabled": True}, + }, + }, "splunk": None, "deployment_environment": "development", } @@ -950,6 +974,7 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: "rag_id": "rag_id", "rag_type": "inline::faiss", "vector_db_id": "vector_db_id", + "score_multiplier": 1.0, }, ], "quota_handlers": { @@ -968,7 +993,15 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: "postgres": None, }, "azure_entra_id": None, - "solr": None, + "rag": { + "always": { + "byok": {"enabled": False}, + "solr": {"enabled": False, "offline": True}, + }, + "tool": { + "byok": {"enabled": True}, + }, + }, "splunk": None, "deployment_environment": "development", } @@ -1150,7 +1183,15 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> 
None: "postgres": None, }, "azure_entra_id": None, - "solr": None, + "rag": { + "always": { + "byok": {"enabled": False}, + "solr": {"enabled": False, "offline": True}, + }, + "tool": { + "byok": {"enabled": True}, + }, + }, "splunk": None, "deployment_environment": "development", } diff --git a/tests/unit/models/responses/test_rag_chunk.py b/tests/unit/models/responses/test_rag_chunk.py index d9809eade..621e9a4ac 100644 --- a/tests/unit/models/responses/test_rag_chunk.py +++ b/tests/unit/models/responses/test_rag_chunk.py @@ -1,6 +1,7 @@ -"""Unit tests for RAGChunk model.""" +"""Unit tests for RAGChunk and RAGContext models.""" -from utils.types import RAGChunk +from utils.types import RAGChunk, RAGContext +from models.responses import ReferencedDocument class TestRAGChunk: @@ -110,3 +111,84 @@ def test_url_as_source(self) -> None: ) assert chunk.source == url_source assert chunk.score == 0.92 + + def test_attributes_field(self) -> None: + """Test RAGChunk with attributes field.""" + attributes = { + "doc_url": "https://example.com/doc", + "title": "Example Document", + "author": "John Doe", + } + chunk = RAGChunk( + content="Test content", source="test-source", attributes=attributes + ) + assert chunk.attributes == attributes + assert chunk.attributes["doc_url"] == "https://example.com/doc" + + def test_attributes_none(self) -> None: + """Test RAGChunk with attributes=None.""" + chunk = RAGChunk(content="Test content", attributes=None) + assert chunk.attributes is None + + +class TestRAGContext: + """Test cases for the RAGContext model.""" + + def test_default_values(self) -> None: + """Test RAGContext with default values.""" + context = RAGContext() + assert context.context_text == "" + assert context.rag_chunks == [] + assert context.referenced_documents == [] + + def test_with_context_text(self) -> None: + """Test RAGContext with context text.""" + context = RAGContext(context_text="Test context") + assert context.context_text == "Test context" + assert 
context.rag_chunks == [] + assert context.referenced_documents == [] + + def test_with_rag_chunks(self) -> None: + """Test RAGContext with RAG chunks.""" + chunks = [ + RAGChunk(content="Chunk 1", source="source1", score=0.9), + RAGChunk(content="Chunk 2", source="source2", score=0.8), + ] + context = RAGContext(rag_chunks=chunks) + assert len(context.rag_chunks) == 2 + assert context.rag_chunks[0].content == "Chunk 1" + assert context.rag_chunks[1].content == "Chunk 2" + + def test_with_referenced_documents(self) -> None: + """Test RAGContext with referenced documents.""" + docs = [ + ReferencedDocument( + doc_title="Doc 1", + doc_url="https://example.com/doc1", + source="source1", + ), + ReferencedDocument( + doc_title="Doc 2", + doc_url="https://example.com/doc2", + source="source2", + ), + ] + context = RAGContext(referenced_documents=docs) + assert len(context.referenced_documents) == 2 + assert context.referenced_documents[0].doc_title == "Doc 1" + assert context.referenced_documents[1].doc_title == "Doc 2" + + def test_fully_populated(self) -> None: + """Test RAGContext with all fields populated.""" + chunks = [RAGChunk(content="Test chunk", source="source1", score=0.95)] + docs = [ + ReferencedDocument(doc_title="Test Doc", doc_url="https://example.com/doc") + ] + context = RAGContext( + context_text="Formatted context", + rag_chunks=chunks, + referenced_documents=docs, + ) + assert context.context_text == "Formatted context" + assert len(context.rag_chunks) == 1 + assert len(context.referenced_documents) == 1 diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index b99f68e71..1cd11df86 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -1,5 +1,7 @@ """Unit tests for functions defined in src/configuration.py.""" +# pylint: disable=too-many-lines + from pathlib import Path from typing import Any, Generator from pydantic import ValidationError @@ -994,3 +996,81 @@ def 
test_rag_id_mapping_not_loaded() -> None: cfg._configuration = None with pytest.raises(LogicError): _ = cfg.rag_id_mapping + + +def test_score_multiplier_mapping_empty_when_no_byok(minimal_config: AppConfig) -> None: + """Test that score_multiplier_mapping returns empty dict when no BYOK RAG configured.""" + assert minimal_config.score_multiplier_mapping == {} + + +def test_score_multiplier_mapping_with_byok_defaults(tmp_path: Path) -> None: + """Test that score_multiplier_mapping uses default multiplier when not specified.""" + db_file = tmp_path / "test.db" + db_file.touch() + cfg = AppConfig() + cfg.init_from_dict( + { + "name": "test", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://test.com:1234", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": [ + { + "rag_id": "my-kb", + "vector_db_id": "vs-001", + "db_path": str(db_file), + }, + ], + } + ) + assert cfg.score_multiplier_mapping == {"vs-001": 1.0} + + +def test_score_multiplier_mapping_with_custom_values(tmp_path: Path) -> None: + """Test that score_multiplier_mapping builds correct mapping with custom values.""" + db_file1 = tmp_path / "test1.db" + db_file1.touch() + db_file2 = tmp_path / "test2.db" + db_file2.touch() + cfg = AppConfig() + cfg.init_from_dict( + { + "name": "test", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://test.com:1234", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": [ + { + "rag_id": "kb1", + "vector_db_id": "vs-001", + "db_path": str(db_file1), + "score_multiplier": 1.5, + }, + { + "rag_id": "kb2", + "vector_db_id": "vs-002", + "db_path": str(db_file2), + "score_multiplier": 0.75, + }, + ], + } + ) + assert cfg.score_multiplier_mapping == {"vs-001": 1.5, "vs-002": 0.75} + + +def test_score_multiplier_mapping_not_loaded() -> 
None: + """Test that score_multiplier_mapping raises when config not loaded.""" + cfg = AppConfig() + cfg._configuration = None + with pytest.raises(LogicError): + _ = cfg.score_multiplier_mapping diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py index aad0c1d72..d98674834 100644 --- a/tests/unit/test_llama_stack_configuration.py +++ b/tests/unit/test_llama_stack_configuration.py @@ -12,6 +12,7 @@ construct_vector_io_providers_section, construct_storage_backends_section, construct_models_section, + enrich_solr, ) from models.config import ( Configuration, @@ -63,7 +64,7 @@ def test_construct_vector_stores_section_adds_new() -> None: output = construct_vector_stores_section(ls_config, byok_rag) assert len(output) == 1 assert output[0]["vector_store_id"] == "store1" - assert output[0]["provider_id"] == "byok_store1" + assert output[0]["provider_id"] == "byok_rag1" assert output[0]["embedding_model"] == "test-model" assert output[0]["embedding_dimension"] == 512 @@ -142,19 +143,20 @@ def test_construct_vector_io_providers_section_preserves_existing() -> None: def test_construct_vector_io_providers_section_adds_new() -> None: - """Test adds new BYOK RAG entries.""" + """Test adds new BYOK RAG entries using rag_id for provider naming.""" ls_config: dict[str, Any] = {"providers": {}} byok_rag = [ { + "rag_id": "rag1", "vector_db_id": "store1", "rag_type": "inline::faiss", }, ] output = construct_vector_io_providers_section(ls_config, byok_rag) assert len(output) == 1 - assert output[0]["provider_id"] == "byok_store1" + assert output[0]["provider_id"] == "byok_rag1" assert output[0]["provider_type"] == "inline::faiss" - assert output[0]["config"]["persistence"]["backend"] == "byok_store1_storage" + assert output[0]["config"]["persistence"]["backend"] == "byok_rag1_storage" assert output[0]["config"]["persistence"]["namespace"] == "vector_io::faiss" @@ -187,19 +189,20 @@ def 
test_construct_storage_backends_section_preserves_existing() -> None: def test_construct_storage_backends_section_adds_new() -> None: - """Test adds new BYOK RAG backend entries.""" + """Test adds new BYOK RAG backend entries using rag_id for backend naming.""" ls_config: dict[str, Any] = {} byok_rag = [ { + "rag_id": "rag1", "vector_db_id": "store1", "db_path": "/path/to/store1.db", }, ] output = construct_storage_backends_section(ls_config, byok_rag) assert len(output) == 1 - assert "byok_store1_storage" in output - assert output["byok_store1_storage"]["type"] == "kv_sqlite" - assert output["byok_store1_storage"]["db_path"] == "/path/to/store1.db" + assert "byok_rag1_storage" in output + assert output["byok_rag1_storage"]["type"] == "kv_sqlite" + assert output["byok_rag1_storage"]["db_path"] == "/path/to/store1.db" # ============================================================================= @@ -229,10 +232,11 @@ def test_construct_models_section_preserves_existing() -> None: def test_construct_models_section_adds_embedding_model() -> None: - """Test adds embedding model from BYOK RAG.""" + """Test adds embedding model from BYOK RAG using rag_id for model naming.""" ls_config: dict[str, Any] = {} byok_rag = [ { + "rag_id": "rag1", "vector_db_id": "store1", "embedding_model": "sentence-transformers/all-mpnet-base-v2", "embedding_dimension": 768, @@ -240,7 +244,7 @@ def test_construct_models_section_adds_embedding_model() -> None: ] output = construct_models_section(ls_config, byok_rag) assert len(output) == 1 - assert output[0]["model_id"] == "byok_store1_embedding" + assert output[0]["model_id"] == "byok_rag1_embedding" assert output[0]["model_type"] == "embedding" assert output[0]["provider_id"] == "sentence-transformers" assert output[0]["provider_model_id"] == "all-mpnet-base-v2" @@ -338,13 +342,110 @@ def test_generate_configuration_with_byok(tmp_path: Path) -> None: ] assert "store1" in store_ids - # Check storage.backends - assert "byok_store1_storage" in 
result["storage"]["backends"] + # Check storage.backends - named after rag_id + assert "byok_rag1_storage" in result["storage"]["backends"] - # Check providers.vector_io + # Check providers.vector_io - named after rag_id provider_ids = [p["provider_id"] for p in result["providers"]["vector_io"]] - assert "byok_store1" in provider_ids + assert "byok_rag1" in provider_ids - # Check registered_resources.models for embedding model + # Check registered_resources.models for embedding model - named after rag_id model_ids = [m["model_id"] for m in result["registered_resources"]["models"]] - assert "byok_store1_embedding" in model_ids + assert "byok_rag1_embedding" in model_ids + + +# ============================================================================= +# Test enrich_solr +# ============================================================================= + + +def test_enrich_solr_skips_when_not_enabled() -> None: + """Test enrich_solr does nothing when Solr is not enabled.""" + ls_config: dict[str, Any] = {} + enrich_solr(ls_config, {"enabled": False}) + assert not ls_config + + +def test_enrich_solr_skips_when_empty_config() -> None: + """Test enrich_solr does nothing with empty config.""" + ls_config: dict[str, Any] = {} + enrich_solr(ls_config, {}) + assert not ls_config + + +def test_enrich_solr_adds_vector_io_provider() -> None: + """Test enrich_solr adds Solr provider to vector_io section.""" + ls_config: dict[str, Any] = {} + enrich_solr(ls_config, {"enabled": True}) + + assert "providers" in ls_config + assert "vector_io" in ls_config["providers"] + provider_ids = [p["provider_id"] for p in ls_config["providers"]["vector_io"]] + assert "okp_solr" in provider_ids + + +def test_enrich_solr_adds_vector_store_registration() -> None: + """Test enrich_solr registers the Solr vector store.""" + ls_config: dict[str, Any] = {} + enrich_solr(ls_config, {"enabled": True}) + + assert "registered_resources" in ls_config + store_ids = [ + s["vector_store_id"] for s in 
ls_config["registered_resources"]["vector_stores"] + ] + assert "portal-rag" in store_ids + + +def test_enrich_solr_adds_embedding_model() -> None: + """Test enrich_solr registers the Solr embedding model.""" + ls_config: dict[str, Any] = {} + enrich_solr(ls_config, {"enabled": True}) + + model_ids = [m["model_id"] for m in ls_config["registered_resources"]["models"]] + assert "solr_embedding" in model_ids + + +def test_enrich_solr_skips_duplicate_provider() -> None: + """Test enrich_solr does not add duplicate Solr provider.""" + ls_config: dict[str, Any] = { + "providers": {"vector_io": [{"provider_id": "okp_solr"}]} + } + enrich_solr(ls_config, {"enabled": True}) + + provider_ids = [p["provider_id"] for p in ls_config["providers"]["vector_io"]] + assert provider_ids.count("okp_solr") == 1 + + +def test_enrich_solr_skips_duplicate_vector_store() -> None: + """Test enrich_solr does not add duplicate vector store registration.""" + ls_config: dict[str, Any] = { + "registered_resources": {"vector_stores": [{"vector_store_id": "portal-rag"}]} + } + enrich_solr(ls_config, {"enabled": True}) + + store_ids = [ + s["vector_store_id"] for s in ls_config["registered_resources"]["vector_stores"] + ] + assert store_ids.count("portal-rag") == 1 + + +def test_enrich_solr_preserves_existing_config() -> None: + """Test enrich_solr preserves existing providers and resources.""" + ls_config: dict[str, Any] = { + "providers": {"vector_io": [{"provider_id": "existing_provider"}]}, + "registered_resources": { + "vector_stores": [{"vector_store_id": "existing_store"}], + "models": [{"model_id": "existing_model"}], + }, + } + enrich_solr(ls_config, {"enabled": True}) + + provider_ids = [p["provider_id"] for p in ls_config["providers"]["vector_io"]] + assert "existing_provider" in provider_ids + assert "okp_solr" in provider_ids + + store_ids = [ + s["vector_store_id"] for s in ls_config["registered_resources"]["vector_stores"] + ] + assert "existing_store" in store_ids + assert 
"portal-rag" in store_ids diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index 9f0a1d598..e31163f8d 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -11,21 +11,26 @@ from llama_stack_api.openai_responses import ( OpenAIResponseInputToolFileSearch as InputToolFileSearch, OpenAIResponseInputToolMCP as InputToolMCP, + OpenAIResponseMCPApprovalRequest as MCPApprovalRequest, + OpenAIResponseMCPApprovalResponse as MCPApprovalResponse, OpenAIResponseOutputMessageFileSearchToolCall as FileSearchCall, OpenAIResponseOutputMessageFunctionToolCall as FunctionCall, OpenAIResponseOutputMessageMCPCall as MCPCall, OpenAIResponseOutputMessageMCPListTools as MCPListTools, - OpenAIResponseMCPApprovalRequest as MCPApprovalRequest, - OpenAIResponseMCPApprovalResponse as MCPApprovalResponse, OpenAIResponseOutputMessageWebSearchToolCall as WebSearchCall, ) from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient from pydantic import AnyUrl from pytest_mock import MockerFixture +import constants +from configuration import AppConfig from models.config import ByokRag, ModelContextProtocolServer from models.requests import QueryRequest from utils.responses import ( + _build_chunk_attributes, + _increment_llm_call_metric, + _resolve_source_for_result, build_mcp_tool_call_from_arguments_done, build_tool_call_summary, build_tool_result_from_mcp_output_item_done, @@ -37,14 +42,12 @@ get_mcp_tools, get_rag_tools, get_topic_summary, + get_vector_store_ids, parse_arguments_string, parse_referenced_documents, prepare_responses_params, prepare_tools, resolve_vector_store_ids, - _build_chunk_attributes, - _increment_llm_call_metric, - _resolve_source_for_result, ) from utils.types import RAGChunk @@ -2354,3 +2357,87 @@ def test_multiple_stores_source_is_none(self, mocker: MockerFixture) -> None: assert len(docs) == 1 assert docs[0].source is None + + +class TestGetVectorStoreIds: + """Tests 
for get_vector_store_ids utility function.""" + + @pytest.mark.asyncio + async def test_returns_provided_ids_directly(self, mocker: MockerFixture) -> None: + """Test that provided vector_store_ids are returned without fetching.""" + client_mock = mocker.AsyncMock() + result = await get_vector_store_ids(client_mock, ["vs1", "vs2"]) + assert result == ["vs1", "vs2"] + client_mock.vector_stores.list.assert_not_called() + + @pytest.mark.asyncio + async def test_fetches_all_when_no_ids_provided( + self, mocker: MockerFixture + ) -> None: + """Test that all vector stores are fetched when no IDs provided.""" + mock_store1 = mocker.Mock() + mock_store1.id = "vs-fetched-1" + mock_store2 = mocker.Mock() + mock_store2.id = "vs-fetched-2" + + mock_list_result = mocker.Mock() + mock_list_result.data = [mock_store1, mock_store2] + + client_mock = mocker.AsyncMock() + client_mock.vector_stores.list.return_value = mock_list_result + + result = await get_vector_store_ids(client_mock, None) + assert result == ["vs-fetched-1", "vs-fetched-2"] + client_mock.vector_stores.list.assert_called_once() + + @pytest.mark.asyncio + async def test_raises_on_connection_error(self, mocker: MockerFixture) -> None: + """Test that APIConnectionError raises HTTPException 503.""" + client_mock = mocker.AsyncMock() + client_mock.vector_stores.list.side_effect = APIConnectionError.__new__( + APIConnectionError + ) + + with pytest.raises(HTTPException) as exc_info: + await get_vector_store_ids(client_mock, None) + assert exc_info.value.status_code == 503 + + @pytest.mark.asyncio + async def test_raises_on_api_status_error(self, mocker: MockerFixture) -> None: + """Test that APIStatusError raises HTTPException 500.""" + mock_response = mocker.Mock() + mock_response.status_code = 500 + mock_response.headers = {} + mock_response.text = "error" + + client_mock = mocker.AsyncMock() + client_mock.vector_stores.list.side_effect = APIStatusError( + "error", response=mock_response, body=None + ) + + with 
pytest.raises(HTTPException) as exc_info: + await get_vector_store_ids(client_mock, None) + assert exc_info.value.status_code == 500 + + +class TestGetRAGToolsWithConfig: + """Tests for get_rag_tools with configuration checks.""" + + def test_returns_none_when_tool_rag_disabled(self, mocker: MockerFixture) -> None: + """Test get_rag_tools returns None when Tool RAG is disabled in config.""" + mock_config = mocker.Mock(spec=AppConfig) + mock_config.rag.tool.byok.enabled = False + mocker.patch("utils.responses.configuration", mock_config) + + assert get_rag_tools(["vs1", "vs2"]) is None + + def test_returns_tools_when_enabled(self, mocker: MockerFixture) -> None: + """Test get_rag_tools returns tools when Tool RAG is enabled in config.""" + mock_config = mocker.Mock(spec=AppConfig) + mock_config.rag.tool.byok.enabled = True + mocker.patch("utils.responses.configuration", mock_config) + + tools = get_rag_tools(["vs1"]) + assert tools is not None + assert tools[0].type == constants.DEFAULT_RAG_TOOL + assert tools[0].vector_store_ids == ["vs1"] From 661a03e0029f9393a5a828ef8ffab4584d5e848d Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Mon, 2 Mar 2026 14:06:54 +0100 Subject: [PATCH 2/5] Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation --- docs/byok_guide.md | 24 ++-- docs/config.md | 20 +-- docs/openapi.json | 129 +++++++++++++----- docs/rag_guide.md | 12 +- examples/lightspeed-stack-byok-rag.yaml | 6 +- lightspeed-stack.yaml | 8 +- src/app/endpoints/query.py | 17 +-- src/app/endpoints/streaming_query.py | 49 ++++--- src/constants.py | 6 +- src/llama_stack_configuration.py | 22 ++- src/models/config.py | 40 +++--- src/utils/query.py | 46 ++++--- src/utils/responses.py | 15 +- src/utils/types.py | 4 +- src/utils/vector_search.py | 23 ++-- tests/unit/app/endpoints/test_query.py | 2 +- .../models/config/test_dump_configuration.py | 20 +-- tests/unit/test_llama_stack_configuration.py | 46 
++++++- tests/unit/utils/test_responses.py | 6 +- 19 files changed, 314 insertions(+), 181 deletions(-) diff --git a/docs/byok_guide.md b/docs/byok_guide.md index a5b95a359..475732354 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -36,15 +36,15 @@ BYOK (Bring Your Own Knowledge) is Lightspeed Core's implementation of Retrieval BYOK knowledge sources can be queried in two complementary modes, configured independently: -### Always RAG (pre-query injection) +### Inline RAG (pre-query injection) -Context is fetched from your BYOK vector stores and/or Solr **before** the LLM generates a response, and injected into every query automatically. No tool calls are required. +Context is fetched from your BYOK vector stores and/or OKP **before** the LLM generates a response, and injected into every query automatically. No tool calls are required. ```mermaid graph TD A[User Query] --> B[Fetch Context] B --> C[BYOK Vector Stores] - B --> D[Solr OKP] + B --> D[OKP Vector Stores] C --> E[Retrieved Chunks] D --> E E --> F[Inject Context into Prompt Context] @@ -58,9 +58,9 @@ The LLM can call the `file_search` tool during generation when it decides extern ```mermaid graph TD - A[User Query] --> P{Always RAG enabled?} + A[User Query] --> P{Inline RAG enabled?} P -->|Yes| Q[Fetch Context] - Q --> R[BYOK Vector Stores / Solr OKP] + Q --> R[BYOK / OKP Vector Stores] R --> S[Inject Context into Prompt Context] S --> B[LLM] P -->|No| B @@ -77,7 +77,7 @@ Both modes rely on: - **Vector Database**: Your indexed knowledge sources stored as vector embeddings - **Embedding Model**: Converts queries and documents into vector representations for similarity matching -Always RAG additionally supports: +Inline RAG additionally supports: - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. 
--- @@ -286,7 +286,7 @@ registered_resources: > ``` > > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of -> each store's results during Always RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it. +> each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it. ### Step 5: Configure RAG Strategy @@ -294,12 +294,12 @@ Add a `rag` section to your `lightspeed-stack.yaml` to choose how BYOK knowledge ```yaml rag: - # Always RAG: inject context before every LLM response (no tool calls needed) - always: + # Inline RAG: inject context before every LLM response (no tool calls needed) + inline: byok: enabled: true # fetch and inject BYOK vector store context pre-query - solr: - enabled: true # fetch and inject Solr OKP context pre-query + okp: + enabled: true # fetch and inject OKP context pre-query # Tool RAG: the LLM can call file_search to retrieve context on demand tool: @@ -311,7 +311,7 @@ Both modes can be enabled simultaneously. Choose based on your latency and contr | Mode | When context is fetched | Tool call needed | Supported sources | score_multiplier | |------|------------------------|------------------|-------------------|-----------------| -| Always RAG | Before every query | No | BYOK + Solr | Yes (BYOK only) | +| Inline RAG | Before every query | No | BYOK + OKP | Yes (BYOK only) | | Tool RAG | On LLM demand | Yes | BYOK only | No | --- diff --git a/docs/config.md b/docs/config.md index 0dc14d6fd..955c68f00 100644 --- a/docs/config.md +++ b/docs/config.md @@ -171,7 +171,7 @@ Global service configuration. | azure_entra_id | | | | splunk | | Splunk HEC configuration for sending telemetry events. | | deployment_environment | string | Deployment environment name (e.g., 'development', 'staging', 'production'). Used in telemetry events. | -| rag | | RAG strategy configuration (Solr and BYOK). Controls pre-query (Always RAG) and tool-based (Tool RAG) retrieval. 
| +| rag | | RAG strategy configuration (OKP and BYOK). Controls pre-query (Inline RAG) and tool-based (Tool RAG) retrieval. | ## ConversationHistoryConfiguration @@ -526,7 +526,7 @@ the service can handle requests concurrently. Top-level RAG strategy configuration. Controls two complementary retrieval modes: -- **Always RAG**: context is fetched from Solr and/or BYOK vector stores and injected +- **Inline RAG**: context is fetched from OKP and/or BYOK vector stores and injected into every query before the LLM responds. - **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve context on demand from BYOK vector stores. @@ -534,37 +534,37 @@ Top-level RAG strategy configuration. Controls two complementary retrieval modes | Field | Type | Description | |-------|------|-------------| -| always | | Pre-query RAG from Solr and BYOK. See AlwaysRagConfiguration. | +| inline | | Pre-query RAG from OKP and BYOK. See InlineRagConfiguration. | | tool | | Tool-based RAG that the LLM can invoke. See ToolRagConfiguration. | -## AlwaysRagConfiguration +## InlineRagConfiguration Pre-query RAG configuration that injects context before the LLM generates a response. -Both Solr and BYOK sources can be enabled independently. When enabled, retrieved +Both OKP and BYOK sources can be enabled independently. When enabled, retrieved chunks are added as context on every query. | Field | Type | Description | |-------|------|-------------| -| solr | | Solr RAG configuration for pre-query context injection. | +| okp | | OKP RAG configuration for pre-query context injection. | | byok | | BYOK RAG configuration for pre-query context injection. | -## SolrRagConfiguration +## OkpRagConfiguration -Solr configuration for Always RAG (pre-query context injection). +OKP configuration for Inline RAG (pre-query context injection). 
Controls whether to use offline or online mode when building document URLs -from vector search results, and enables/disables Solr vector IO functionality. +from vector search results, and enables/disables OKP vector IO functionality. | Field | Type | Description | |-------|------|-------------| -| enabled | boolean | When True, enables Solr vector IO functionality for vector search queries. When False, disables Solr vector search processing. | +| enabled | boolean | When True, enables OKP vector IO functionality for vector search queries. When False, disables OKP vector search processing. | | offline | boolean | When True, use parent_id for chunk source URLs. When False, use reference_url for chunk source URLs. | diff --git a/docs/openapi.json b/docs/openapi.json index 23bac9b99..6fe13ef00 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -4447,7 +4447,7 @@ ], "summary": "Handle A2A Jsonrpc", "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response", - "operationId": "handle_a2a_jsonrpc_a2a_get", + "operationId": "handle_a2a_jsonrpc_a2a_post", "responses": { "200": { "description": "Successful Response", @@ -4465,7 +4465,7 @@ ], "summary": "Handle A2A Jsonrpc", "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall 
JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response", - "operationId": "handle_a2a_jsonrpc_a2a_get", + "operationId": "handle_a2a_jsonrpc_a2a_post", "responses": { "200": { "description": "Successful Response", @@ -5503,6 +5503,13 @@ "format": "file-path", "title": "DB path", "description": "Path to RAG database." + }, + "score_multiplier": { + "type": "number", + "exclusiveMinimum": 0.0, + "title": "Score multiplier", + "description": "Multiplier applied to relevance scores from this vector store. Used to weight results when querying multiple knowledge sources. Values > 1 boost this store's results; values < 1 reduce them.", + "default": 1.0 } }, "additionalProperties": false, @@ -5515,6 +5522,20 @@ "title": "ByokRag", "description": "BYOK (Bring Your Own Knowledge) RAG configuration." }, + "ByokRagConfiguration": { + "properties": { + "enabled": { + "type": "boolean", + "title": "BYOK RAG enabled", + "description": "When True, queries BYOK vector stores for RAG context.", + "default": false + } + }, + "additionalProperties": false, + "type": "object", + "title": "ByokRagConfiguration", + "description": "BYOK RAG configuration." + }, "CORSConfiguration": { "properties": { "allow_origins": { @@ -5714,17 +5735,10 @@ "description": "Deployment environment name (e.g., 'development', 'staging', 'production'). 
Used in telemetry events.", "default": "development" }, - "solr": { - "anyOf": [ - { - "$ref": "#/components/schemas/SolrConfiguration" - }, - { - "type": "null" - } - ], - "title": "Solr configuration", - "description": "Configuration for Solr vector search operations." + "rag": { + "$ref": "#/components/schemas/RagConfiguration", + "title": "RAG configuration", + "description": "Configuration for all RAG strategies (inline and tool-based)." } }, "additionalProperties": false, @@ -6966,6 +6980,24 @@ } ] }, + "InlineRagConfiguration": { + "properties": { + "okp": { + "$ref": "#/components/schemas/OkpRagConfiguration", + "title": "OKP RAG configuration", + "description": "Configuration for OKP RAG (inline)." + }, + "byok": { + "$ref": "#/components/schemas/ByokRagConfiguration", + "title": "BYOK RAG configuration", + "description": "Configuration for BYOK RAG (inline)." + } + }, + "additionalProperties": false, + "type": "object", + "title": "InlineRagConfiguration", + "description": "Inline RAG configuration.\n\nControls inline RAG from OKP and BYOK vector stores." + }, "InternalServerErrorResponse": { "properties": { "status_code": { @@ -7575,6 +7607,26 @@ "title": "OAuthFlows", "description": "Defines the configuration for the supported OAuth 2.0 flows." }, + "OkpRagConfiguration": { + "properties": { + "enabled": { + "type": "boolean", + "title": "OKP RAG enabled", + "description": "When True, queries OKP for RAG context.", + "default": false + }, + "offline": { + "type": "boolean", + "title": "Offline mode", + "description": "When True, use parent_id for chunk source URLs. When False, use reference_url for chunk source URLs.", + "default": true + } + }, + "additionalProperties": false, + "type": "object", + "title": "OkpRagConfiguration", + "description": "OKP RAG configuration." 
+ }, "OpenIdConnectSecurityScheme": { "properties": { "description": { @@ -8749,6 +8801,24 @@ "title": "RHIdentityConfiguration", "description": "Red Hat Identity authentication configuration." }, + "RagConfiguration": { + "properties": { + "inline": { + "$ref": "#/components/schemas/InlineRagConfiguration", + "title": "Inline RAG configuration", + "description": "Configuration for inline RAG from OKP and BYOK vector stores." + }, + "tool": { + "$ref": "#/components/schemas/ToolRagConfiguration", + "title": "Tool RAG configuration", + "description": "Configuration for exposing RAG as a tool that the LLM can call." + } + }, + "additionalProperties": false, + "type": "object", + "title": "RagConfiguration", + "description": "RAG strategy configuration.\n\nControls different RAG strategies: inline and tool-based." + }, "ReadinessResponse": { "properties": { "ready": { @@ -9260,26 +9330,6 @@ } ] }, - "SolrConfiguration": { - "properties": { - "enabled": { - "type": "boolean", - "title": "Solr enabled", - "description": "When True, enables Solr vector IO functionality for vector search queries. When False, disables Solr vector search processing.", - "default": false - }, - "offline": { - "type": "boolean", - "title": "Offline mode", - "description": "When True, use parent_id for chunk source URLs. When False, use reference_url for chunk source URLs.", - "default": true - } - }, - "additionalProperties": false, - "type": "object", - "title": "SolrConfiguration", - "description": "Solr configuration for vector search queries.\n\nControls whether to use offline or online mode when building document URLs\nfrom vector search results, and enables/disables Solr vector IO functionality." - }, "SplunkConfiguration": { "properties": { "enabled": { @@ -9535,6 +9585,19 @@ "title": "ToolCallSummary", "description": "Model representing a tool call made during response generation (for tool_calls list)." 
}, + "ToolRagConfiguration": { + "properties": { + "byok": { + "$ref": "#/components/schemas/ByokRagConfiguration", + "title": "BYOK RAG configuration", + "description": "Configuration for BYOK RAG as a tool." + } + }, + "additionalProperties": false, + "type": "object", + "title": "ToolRagConfiguration", + "description": "Tool RAG configuration.\n\nControls whether RAG functionality is exposed as a tool that the LLM can call." + }, "ToolResultSummary": { "properties": { "id": { diff --git a/docs/rag_guide.md b/docs/rag_guide.md index 9d968a952..4319d868b 100644 --- a/docs/rag_guide.md +++ b/docs/rag_guide.md @@ -5,7 +5,7 @@ This document explains how to configure and customize your RAG pipeline using th * Initialize a vector store * Download and point to a local embedding model * Configure an inference provider (LLM) -* Choose a RAG strategy (Always RAG or Tool RAG) +* Choose a RAG strategy (Inline RAG or Tool RAG) --- @@ -28,7 +28,7 @@ This document explains how to configure and customize your RAG pipeline using th Lightspeed Core Stack (LCS) supports two complementary RAG strategies: -- **Always RAG**: context is fetched from Solr and/or BYOK vector stores and injected into every query before the LLM responds. No tool calls are required. +- **Inline RAG**: context is fetched from Solr and/or BYOK vector stores and injected into every query before the LLM responds. No tool calls are required. - **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve context on demand from BYOK vector stores. Both strategies can be enabled independently via the `rag` section of `lightspeed-stack.yaml`. See [BYOK Feature Documentation](byok_guide.md) for configuration details. 
@@ -324,9 +324,9 @@ Note: if the vector database (portal-rag) is not in the persistent data store wi ```yaml rag: - always: - solr: - enabled: true # Enable Solr vector IO (Always RAG - pre-query injection) + inline: + okp: + enabled: true # Enable OKP vector IO (Inline RAG - pre-query injection) offline: true # Use parent_id for document URLs (offline mode) # Set to false to use reference_url (online mode) ``` @@ -341,7 +341,7 @@ curl -sX POST http://localhost:8080/v1/query \ **Query Processing:** -1. When Solr is enabled, queries use the `portal-rag` vector store +1. When OKP is enabled, queries use the `portal-rag` vector store 2. Vector search is performed with configurable parameters: - `k`: Number of results (default: 5) - `score_threshold`: Minimum similarity score (default: 0.0) diff --git a/examples/lightspeed-stack-byok-rag.yaml b/examples/lightspeed-stack-byok-rag.yaml index 575b3fcf4..b5081a7a9 100644 --- a/examples/lightspeed-stack-byok-rag.yaml +++ b/examples/lightspeed-stack-byok-rag.yaml @@ -50,10 +50,10 @@ byok_rag: # RAG configuration rag: - # Always RAG: context injected before every LLM response (no tool calls needed) + # Inline RAG: context injected before every LLM request (no tool calls needed) # Supports both Solr and BYOK sources. Score multipliers apply here only. 
- always: - solr: + inline: + okp: enabled: false # Enable Solr OKP pre-query context injection offline: false # Controls how document URLs are built from Solr results byok: diff --git a/lightspeed-stack.yaml b/lightspeed-stack.yaml index 5ed176d8c..c9c40aa23 100644 --- a/lightspeed-stack.yaml +++ b/lightspeed-stack.yaml @@ -34,9 +34,9 @@ authentication: # RAG configuration rag: - # Always RAG (inject context pre-query with RAG from Solr and BYOK vector stores) - always: - solr: + # Inline RAG (inject context pre-query with RAG from Solr and BYOK vector stores) + inline: + okp: enabled: false offline: false # Supports weighted scoring @@ -46,4 +46,4 @@ rag: tool: byok: # Default is true for backward compatibility - enabled: false + enabled: true diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index 865d790a2..f9b329d92 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -154,14 +154,8 @@ async def query_endpoint_handler( client = AsyncLlamaStackClientHolder().get_client() - # Build RAG context from BYOK and Solr sources - rag_context = await build_rag_context(client, query_request, configuration) - - # Inject RAG context into query - if rag_context.context_text: - # Mutate a local copy to avoid surprising other logic - query_request = query_request.model_copy(deep=True) - query_request.query = query_request.query + rag_context.context_text + # Build RAG context from Inline RAG sources + inline_rag_context = await build_rag_context(client, query_request, configuration) # Prepare API request parameters responses_params = await prepare_responses_params( @@ -173,6 +167,7 @@ async def query_endpoint_handler( stream=False, store=True, request_headers=request.headers, + inline_rag_context=inline_rag_context.context_text or None, ) # Handle Azure token refresh if needed @@ -197,14 +192,14 @@ async def query_endpoint_handler( rag_id_mapping, ) - # Merge RAG chunks (BYOK + Solr) with tool-based RAG chunks - rag_chunks = 
rag_context.rag_chunks + # Combine inline RAG results (BYOK + Solr) with tool-based RAG results for the transcript + rag_chunks = inline_rag_context.rag_chunks tool_rag_chunks = turn_summary.rag_chunks or [] logger.info("RAG as a tool retrieved %d chunks", len(tool_rag_chunks)) turn_summary.rag_chunks = rag_chunks + tool_rag_chunks # Add tool-based RAG documents and chunks - rag_documents = rag_context.referenced_documents + rag_documents = inline_rag_context.referenced_documents tool_rag_documents = turn_summary.referenced_documents or [] turn_summary.referenced_documents = deduplicate_referenced_documents( rag_documents + tool_rag_documents diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index 4f3a3d84c..98bae1e61 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -10,29 +10,15 @@ from llama_stack_api.openai_responses import ( OpenAIResponseObject, OpenAIResponseObjectStream, -) -from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseMcpCallArgumentsDone as MCPArgsDoneChunk, -) -from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputItemAdded as OutputItemAddedChunk, -) -from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputItemDone as OutputItemDoneChunk, -) -from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputTextDelta as TextDeltaChunk, -) -from llama_stack_api.openai_responses import ( OpenAIResponseObjectStreamResponseOutputTextDone as TextDoneChunk, -) -from llama_stack_api.openai_responses import ( OpenAIResponseOutputMessageMCPCall as MCPCall, ) from llama_stack_client import ( APIConnectionError, -) -from llama_stack_client import ( APIStatusError as LLSApiStatusError, ) from openai._exceptions import APIStatusError as OpenAIAPIStatusError @@ -70,10 +56,7 @@ UnauthorizedResponse, UnprocessableEntityResponse, ) -<<<<<<< HEAD from utils.types 
import ReferencedDocument -======= ->>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) from utils.endpoints import ( check_configuration_loaded, validate_and_retrieve_conversation, @@ -202,6 +185,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals client = AsyncLlamaStackClientHolder().get_client() +<<<<<<< HEAD <<<<<<< HEAD _, _, doc_ids_from_chunks, pre_rag_chunks = await perform_vector_search( client, query_request.query, query_request.solr @@ -219,6 +203,10 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals >>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) query_request = query_request.model_copy(deep=True) query_request.query = query_request.query + rag_context.context_text +======= + # Build RAG context from Inline RAG sources + inline_rag_context = await build_rag_context(client, query_request, configuration) +>>>>>>> a4075c6d (Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation) # Prepare API request parameters responses_params = await prepare_responses_params( @@ -230,6 +218,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals stream=True, store=True, request_headers=request.headers, + inline_rag_context=inline_rag_context.context_text or None, ) # Handle Azure token refresh if needed @@ -266,7 +255,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals generator, turn_summary = await retrieve_response_generator( responses_params=responses_params, context=context, - pre_rag_documents=rag_context.referenced_documents, + inline_rag_documents=inline_rag_context.referenced_documents, ) response_media_type = ( @@ -289,7 +278,7 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals async def retrieve_response_generator( responses_params: ResponsesApiParams, context: ResponseGeneratorContext, - pre_rag_documents: list[ReferencedDocument], + 
inline_rag_documents: list[ReferencedDocument], ) -> tuple[AsyncIterator[str], TurnSummary]: """ Retrieve the appropriate response generator. @@ -301,11 +290,15 @@ async def retrieve_response_generator( Args: responses_params: The Responses API parameters context: The response generator context +<<<<<<< HEAD <<<<<<< HEAD doc_ids_from_chunks: List of ReferencedDocument objects extracted from static RAG ======= pre_rag_documents: Referenced documents from pre-query RAG (BYOK + Solr) >>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) +======= + inline_rag_documents: Referenced documents from pre-query RAG (BYOK + Solr) +>>>>>>> a4075c6d (Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation) Returns: tuple[AsyncIterator[str], TurnSummary]: The response generator and turn summary @@ -336,7 +329,7 @@ async def retrieve_response_generator( **responses_params.model_dump(exclude_none=True) ) # Store pre-RAG documents for later merging with tool-based RAG - turn_summary.pre_rag_documents = pre_rag_documents + turn_summary.inline_rag_documents = inline_rag_documents return response_generator(response, context, turn_summary), turn_summary # Handle know LLS client errors only at stream creation time and shield execution @@ -777,9 +770,25 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat rag_id_mapping=context.rag_id_mapping, ) +<<<<<<< HEAD turn_summary.referenced_documents = deduplicate_referenced_documents( tool_based_documents + turn_summary.pre_rag_documents ) +======= + # Merge pre-RAG documents with tool-based documents (similar to query.py) + if turn_summary.inline_rag_documents: + all_documents = turn_summary.inline_rag_documents + tool_based_documents + seen = set() + deduplicated_documents = [] + for doc in all_documents: + key = (doc.doc_url, doc.doc_title) + if key not in seen: + seen.add(key) + deduplicated_documents.append(doc) + turn_summary.referenced_documents = 
deduplicated_documents + else: + turn_summary.referenced_documents = tool_based_documents +>>>>>>> a4075c6d (Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation) def stream_http_error_event( diff --git a/src/constants.py b/src/constants.py index f12377bb3..06da328a3 100644 --- a/src/constants.py +++ b/src/constants.py @@ -175,11 +175,11 @@ DEFAULT_RAG_TOOL = "file_search" TOOL_RAG_MAX_CHUNKS = 10 # retrieved from RAG as a tool -# BYOK RAG constants -BYOK_RAG_MAX_CHUNKS = 10 # retrieved from BYOK RAG (Always RAG strategy) +# Inline RAG constants +BYOK_RAG_MAX_CHUNKS = 10 # retrieved from BYOK RAG +OKP_RAG_MAX_CHUNKS = 5 # retrieved from OKP RAG # Solr OKP constants -SOLR_RAG_MAX_CHUNKS = 5 # retrieved from the Solr OKP RAG (Always RAG strategy) SOLR_VECTOR_SEARCH_DEFAULT_K = 5 SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.0 SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid" diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index aba46262b..f2ea7373d 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -139,7 +139,9 @@ def construct_storage_backends_section( # add new backends for each BYOK RAG for brag in byok_rag: - rag_id = brag.get("rag_id", "") + if not brag.get("rag_id"): + raise ValueError(f"BYOK RAG entry is missing required 'rag_id': {brag}") + rag_id = brag["rag_id"] backend_name = f"byok_{rag_id}_storage" output[backend_name] = { "type": "kv_sqlite", @@ -185,8 +187,12 @@ def construct_vector_stores_section( existing_store_ids = {vs.get("vector_store_id") for vs in output} added = 0 for brag in byok_rag: - rag_id = brag.get("rag_id", "") - vector_db_id = brag.get("vector_db_id", "") + if not brag.get("rag_id"): + raise ValueError(f"BYOK RAG entry is missing required 'rag_id': {brag}") + if not brag.get("vector_db_id"): + raise ValueError(f"BYOK RAG entry is missing required 'vector_db_id': {brag}") + rag_id = brag["rag_id"] + vector_db_id = brag["vector_db_id"] if 
vector_db_id in existing_store_ids: continue existing_store_ids.add(vector_db_id) @@ -231,8 +237,10 @@ def construct_models_section( # add embedding models for each BYOK RAG for brag in byok_rag: + if not brag.get("rag_id"): + raise ValueError(f"BYOK RAG entry is missing required 'rag_id': {brag}") + rag_id = brag["rag_id"] embedding_model = brag.get("embedding_model", constants.DEFAULT_EMBEDDING_MODEL) - rag_id = brag.get("rag_id", "") embedding_dimension = brag.get("embedding_dimension") # Skip if no embedding model specified @@ -298,7 +306,9 @@ def construct_vector_io_providers_section( # append new vector_io entries for brag in byok_rag: - rag_id = brag.get("rag_id", "") + if not brag.get("rag_id"): + raise ValueError(f"BYOK RAG entry is missing required 'rag_id': {brag}") + rag_id = brag["rag_id"] backend_name = f"byok_{rag_id}_storage" provider_id = f"byok_{rag_id}" output.append( @@ -519,7 +529,7 @@ def generate_configuration( enrich_byok_rag(ls_config, config.get("byok_rag", [])) # Enrichment: Solr - solr_config = config.get("rag", {}).get("always", {}).get("solr", {}) + solr_config = config.get("rag", {}).get("inline", {}).get("okp", {}) enrich_solr(ls_config, solr_config) logger.info("Writing Llama Stack configuration into file %s", output_file) diff --git a/src/models/config.py b/src/models/config.py index cc2301975..086090efc 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -1704,13 +1704,13 @@ class ByokRagConfiguration(ConfigurationBase): ) -class SolrRagConfiguration(ConfigurationBase): - """Solr RAG configuration.""" +class OkpRagConfiguration(ConfigurationBase): + """OKP RAG configuration.""" enabled: bool = Field( default=False, - title="Solr RAG enabled", - description="When True, queries Solr OKP for RAG context.", + title="OKP RAG enabled", + description="When True, queries OKP for RAG context.", ) offline: bool = Field( @@ -1721,22 +1721,22 @@ class SolrRagConfiguration(ConfigurationBase): ) -class 
AlwaysRagConfiguration(ConfigurationBase): - """Always RAG configuration. +class InlineRagConfiguration(ConfigurationBase): + """Inline RAG configuration. - Controls pre-query RAG from Solr and BYOK vector stores. + Controls inline RAG from OKP and BYOK vector stores. """ - solr: SolrRagConfiguration = Field( - default_factory=lambda: SolrRagConfiguration(), # pylint: disable=unnecessary-lambda - title="Solr RAG configuration", - description="Configuration for Solr RAG (pre-query).", + okp: OkpRagConfiguration = Field( + default_factory=OkpRagConfiguration, + title="OKP RAG configuration", + description="Configuration for OKP RAG (inline).", ) byok: ByokRagConfiguration = Field( - default_factory=lambda: ByokRagConfiguration(), # pylint: disable=unnecessary-lambda + default_factory=ByokRagConfiguration, title="BYOK RAG configuration", - description="Configuration for BYOK RAG (pre-query).", + description="Configuration for BYOK RAG (inline).", ) @@ -1758,17 +1758,17 @@ class ToolRagConfiguration(ConfigurationBase): class RagConfiguration(ConfigurationBase): """RAG strategy configuration. - Controls different RAG strategies: pre-query (always) and tool-based. + Controls different RAG strategies: inline and tool-based. 
""" - always: AlwaysRagConfiguration = Field( - default_factory=lambda: AlwaysRagConfiguration(), # pylint: disable=unnecessary-lambda - title="Always RAG configuration", - description="Configuration for pre-query RAG from Solr and BYOK vector stores.", + inline: InlineRagConfiguration = Field( + default_factory=InlineRagConfiguration, + title="Inline RAG configuration", + description="Configuration for inline RAG from OKP and BYOK vector stores.", ) tool: ToolRagConfiguration = Field( - default_factory=lambda: ToolRagConfiguration(), # pylint: disable=unnecessary-lambda + default_factory=ToolRagConfiguration, title="Tool RAG configuration", description="Configuration for exposing RAG as a tool that the LLM can call.", ) @@ -1915,7 +1915,7 @@ class Configuration(ConfigurationBase): rag: RagConfiguration = Field( default_factory=RagConfiguration, title="RAG configuration", - description="Configuration for all RAG strategies (pre-query and tool-based).", + description="Configuration for all RAG strategies (inline and tool-based).", ) @model_validator(mode="after") diff --git a/src/utils/query.py b/src/utils/query.py index 9d447ff9c..91e562a44 100644 --- a/src/utils/query.py +++ b/src/utils/query.py @@ -1,23 +1,31 @@ """Utility functions for working with queries.""" +import sqlite3 from datetime import UTC, datetime from typing import Optional +import psycopg2 +from fastapi import HTTPException from llama_stack_client import ( APIConnectionError, APIStatusError as LLSApiStatusError, AsyncLlamaStackClient, ) -from openai._exceptions import APIStatusError as OpenAIAPIStatusError from llama_stack_client.types import Shield - -from fastapi import HTTPException +from openai._exceptions import APIStatusError as OpenAIAPIStatusError from sqlalchemy import func +from sqlalchemy.exc import SQLAlchemyError + +import constants +from app.database import get_session +from authorization.azure_token_manager import AzureEntraIDManager +from cache.cache_error import CacheError +from 
client import AsyncLlamaStackClientHolder from configuration import configuration +from log import get_logger from models.cache_entry import CacheEntry from models.config import Action from models.database.conversations import UserConversation, UserTurn -import constants from models.requests import Attachment, QueryRequest from models.responses import ( AbstractErrorResponse, @@ -28,23 +36,15 @@ ServiceUnavailableResponse, UnprocessableEntityResponse, ) -from authorization.azure_token_manager import AzureEntraIDManager -from cache.cache_error import CacheError -import psycopg2 -import sqlite3 -from sqlalchemy.exc import SQLAlchemyError -from app.database import get_session -from client import AsyncLlamaStackClientHolder +from utils.quota import consume_tokens +from utils.suid import normalize_conversation_id +from utils.token_counter import TokenCounter from utils.transcripts import ( create_transcript, create_transcript_metadata, store_transcript, ) -from utils.quota import consume_tokens -from utils.suid import normalize_conversation_id -from utils.token_counter import TokenCounter from utils.types import TurnSummary -from log import get_logger logger = get_logger(__name__) @@ -192,19 +192,27 @@ async def update_azure_token( ) -def prepare_input(query_request: QueryRequest) -> str: +def prepare_input( + query_request: QueryRequest, inline_rag_context: Optional[str] = None +) -> str: """ - Prepare input text for Responses API by appending attachments. + Prepare input text for Responses API by appending RAG context and attachments. - Takes the query text and appends any attachment content with type labels. + Takes the query text, appends any inline RAG context for the LLM call, then + appends any attachment content with type labels. Args: query_request: The query request containing the query and optional attachments + inline_rag_context: Optional RAG context to inject into the query before + sending to the LLM. 
Passed separately to keep QueryRequest a pure public + API model. Returns: - str: The input text with attachments appended (if any) + str: The input text with RAG context and attachments appended (if any) """ input_text = query_request.query + if inline_rag_context: + input_text += inline_rag_context if query_request.attachments: for attachment in query_request.attachments: # Append attachment content with type label diff --git a/src/utils/responses.py b/src/utils/responses.py index dce076d41..59a8a4c9b 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -87,7 +87,7 @@ async def get_vector_store_ids( HTTPException: With ServiceUnavailableResponse if connection fails, or InternalServerErrorResponse if API returns an error status """ - if vector_store_ids: + if vector_store_ids is not None: return vector_store_ids try: @@ -231,6 +231,7 @@ async def prepare_responses_params( # pylint: disable=too-many-arguments,too-ma stream: bool = False, store: bool = True, request_headers: Optional[Mapping[str, str]] = None, + inline_rag_context: Optional[str] = None, ) -> ResponsesApiParams: """Prepare API request parameters for Responses API. @@ -243,6 +244,9 @@ async def prepare_responses_params( # pylint: disable=too-many-arguments,too-ma stream: Whether to stream the response store: Whether to store the response request_headers: Incoming HTTP request headers for allowlist propagation + inline_rag_context: Optional RAG context to inject into the query before + sending to the LLM. Passed separately to keep QueryRequest a pure public + API model. 
Returns: ResponsesApiParams containing all prepared parameters for the API request @@ -272,7 +276,8 @@ async def prepare_responses_params( # pylint: disable=too-many-arguments,too-ma ) # Prepare input for Responses API - input_text = prepare_input(query_request) + # Adds inline RAG context and attachments + input_text = prepare_input(query_request, inline_rag_context) # Handle conversation ID for Responses API conversation_id = query_request.conversation_id @@ -369,10 +374,10 @@ def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSea """ # Check if Tool RAG is enabled in configuration if not (configuration and configuration.rag.tool.byok.enabled): - return None + return [] - if not vector_store_ids: - return None + if vector_store_ids == []: + return [] return [ InputToolFileSearch( diff --git a/src/utils/types.py b/src/utils/types.py index 6134f42b8..c3a0c71d3 100644 --- a/src/utils/types.py +++ b/src/utils/types.py @@ -21,8 +21,6 @@ from llama_stack_client.lib.agents.tool_parser import ToolParser from llama_stack_client.lib.agents.types import ( CompletionMessage as AgentCompletionMessage, -) -from llama_stack_client.lib.agents.types import ( ToolCall as AgentToolCall, ) from pydantic import AnyUrl, BaseModel, Field @@ -313,7 +311,7 @@ class TurnSummary(BaseModel): tool_results: list[ToolResultSummary] = Field(default_factory=list) rag_chunks: list[RAGChunk] = Field(default_factory=list) referenced_documents: list[ReferencedDocument] = Field(default_factory=list) - pre_rag_documents: list[ReferencedDocument] = Field(default_factory=list) + inline_rag_documents: list[ReferencedDocument] = Field(default_factory=list) token_usage: TokenCounter = Field(default_factory=TokenCounter) diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index 737e05a7c..c44c747ae 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -24,7 +24,7 @@ def _is_solr_enabled() -> bool: """Check if Solr is enabled in 
configuration.""" - return bool(configuration.rag.always.solr.enabled) + return bool(configuration.rag.inline.okp.enabled) def _get_solr_vector_store_ids() -> list[str]: @@ -336,8 +336,8 @@ async def _fetch_byok_rag( rag_chunks: list[RAGChunk] = [] referenced_documents: list[ReferencedDocument] = [] - if not configuration.rag.always.byok.enabled: - logger.info("Always RAG (BYOK) disabled, skipping BYOK RAG search") + if not configuration.rag.inline.byok.enabled: + logger.info("Inline RAG (BYOK) disabled, skipping BYOK RAG search") return rag_chunks, referenced_documents try: @@ -426,7 +426,7 @@ async def _fetch_solr_rag( return rag_chunks, referenced_documents # Get offline setting from configuration - offline = configuration.rag.always.solr.offline + offline = configuration.rag.inline.okp.offline try: vector_store_ids = _get_solr_vector_store_ids() @@ -450,8 +450,8 @@ async def _fetch_solr_rag( ) # Limit to top N chunks - top_chunks = query_response.chunks[: constants.SOLR_RAG_MAX_CHUNKS] - top_scores = retrieved_scores[: constants.SOLR_RAG_MAX_CHUNKS] + top_chunks = query_response.chunks[: constants.OKP_RAG_MAX_CHUNKS] + top_scores = retrieved_scores[: constants.OKP_RAG_MAX_CHUNKS] # Extract referenced documents from Solr chunks referenced_documents = _process_solr_chunks_for_documents( @@ -464,7 +464,7 @@ async def _fetch_solr_rag( ) logger.info( "Filtered top %d chunks from Solr OKP RAG (%d were retrieved)", - constants.SOLR_RAG_MAX_CHUNKS, + constants.OKP_RAG_MAX_CHUNKS, len(rag_chunks), ) @@ -507,10 +507,11 @@ async def build_rag_context( context_text = _format_rag_context(context_chunks, query_request.query) - logger.debug("=" * 80) - logger.debug("RAG context built for pre-query injection:") - logger.debug(context_text) - logger.debug("=" * 80) + logger.debug( + "Inline RAG context built: %d chunks, %d characters", + len(context_chunks), + len(context_text), + ) # Merge referenced documents from all sources (BYOK + Solr) top_documents = byok_docs + 
solr_docs diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index 1599c78f2..044fb5bf2 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -712,4 +712,4 @@ async def test_retrieve_response_with_tool_calls( assert result.token_usage.output_tokens == 5 assert result.rag_chunks == [] assert result.referenced_documents == [] - assert result.pre_rag_documents == [] + assert result.inline_rag_documents == [] diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index a3c4ad3b4..22d7a5a6a 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -207,9 +207,9 @@ def test_dump_configuration(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "always": { + "inline": { "byok": {"enabled": False}, - "solr": {"enabled": False, "offline": True}, + "okp": {"enabled": False, "offline": True}, }, "tool": { "byok": {"enabled": True}, @@ -559,9 +559,9 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "always": { + "inline": { "byok": {"enabled": False}, - "solr": {"enabled": False, "offline": True}, + "okp": {"enabled": False, "offline": True}, }, "tool": { "byok": {"enabled": True}, @@ -789,9 +789,9 @@ def test_dump_configuration_with_quota_limiters_different_values( }, "azure_entra_id": None, "rag": { - "always": { + "inline": { "byok": {"enabled": False}, - "solr": {"enabled": False, "offline": True}, + "okp": {"enabled": False, "offline": True}, }, "tool": { "byok": {"enabled": True}, @@ -994,9 +994,9 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "always": { + "inline": { "byok": {"enabled": False}, - "solr": {"enabled": False, "offline": True}, + "okp": {"enabled": False, "offline": True}, }, "tool": { "byok": {"enabled": True}, @@ 
-1184,9 +1184,9 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "always": { + "inline": { "byok": {"enabled": False}, - "solr": {"enabled": False, "offline": True}, + "okp": {"enabled": False, "offline": True}, }, "tool": { "byok": {"enabled": True}, diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py index d98674834..8d17a96c7 100644 --- a/tests/unit/test_llama_stack_configuration.py +++ b/tests/unit/test_llama_stack_configuration.py @@ -74,7 +74,7 @@ def test_construct_vector_stores_section_merge() -> None: ls_config = { "registered_resources": {"vector_stores": [{"vector_store_id": "existing"}]} } - byok_rag = [{"vector_db_id": "new_store"}] + byok_rag = [{"rag_id": "rag1", "vector_db_id": "new_store"}] output = construct_vector_stores_section(ls_config, byok_rag) assert len(output) == 2 @@ -90,6 +90,7 @@ def test_construct_vector_stores_section_skips_duplicate_from_existing() -> None } byok_rag = [ { + "rag_id": "rag1", "vector_db_id": "store1", "embedding_model": "test-model", "embedding_dimension": 512, @@ -105,11 +106,13 @@ def test_construct_vector_stores_section_skips_duplicate_within_byok() -> None: ls_config: dict[str, Any] = {} byok_rag = [ { + "rag_id": "rag1", "vector_db_id": "store1", "embedding_model": "model-a", "embedding_dimension": 512, }, { + "rag_id": "rag2", "vector_db_id": "store1", "embedding_model": "model-b", "embedding_dimension": 768, @@ -256,6 +259,7 @@ def test_construct_models_section_strips_prefix() -> None: ls_config: dict[str, Any] = {} byok_rag = [ { + "rag_id": "rag1", "vector_db_id": "store1", "embedding_model": "sentence-transformers//usr/path/model", "embedding_dimension": 768, @@ -266,6 +270,46 @@ def test_construct_models_section_strips_prefix() -> None: assert output[0]["provider_model_id"] == "/usr/path/model" +def test_construct_storage_backends_section_raises_on_missing_rag_id() -> None: + """Test raises 
ValueError when rag_id is missing from a BYOK RAG entry.""" + ls_config: dict[str, Any] = {} + byok_rag = [{"vector_db_id": "store1"}] + with pytest.raises(ValueError, match="missing required 'rag_id'"): + construct_storage_backends_section(ls_config, byok_rag) + + +def test_construct_vector_stores_section_raises_on_missing_rag_id() -> None: + """Test raises ValueError when rag_id is missing from a BYOK RAG entry.""" + ls_config: dict[str, Any] = {} + byok_rag = [{"vector_db_id": "store1"}] + with pytest.raises(ValueError, match="missing required 'rag_id'"): + construct_vector_stores_section(ls_config, byok_rag) + + +def test_construct_vector_stores_section_raises_on_missing_vector_db_id() -> None: + """Test raises ValueError when vector_db_id is missing from a BYOK RAG entry.""" + ls_config: dict[str, Any] = {} + byok_rag = [{"rag_id": "rag1"}] + with pytest.raises(ValueError, match="missing required 'vector_db_id'"): + construct_vector_stores_section(ls_config, byok_rag) + + +def test_construct_vector_io_section_raises_on_missing_rag_id() -> None: + """Test raises ValueError when rag_id is missing from a BYOK RAG entry.""" + ls_config: dict[str, Any] = {} + byok_rag = [{"vector_db_id": "store1"}] + with pytest.raises(ValueError, match="missing required 'rag_id'"): + construct_vector_io_providers_section(ls_config, byok_rag) + + +def test_construct_models_section_raises_on_missing_rag_id() -> None: + """Test raises ValueError when rag_id is missing from a BYOK RAG entry.""" + ls_config: dict[str, Any] = {} + byok_rag = [{"vector_db_id": "store1", "embedding_model": "some-model"}] + with pytest.raises(ValueError, match="missing required 'rag_id'"): + construct_models_section(ls_config, byok_rag) + + # ============================================================================= # Test generate_configuration # ============================================================================= diff --git a/tests/unit/utils/test_responses.py 
b/tests/unit/utils/test_responses.py index e31163f8d..cc66951d2 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -337,8 +337,8 @@ class TestGetRAGTools: """Test cases for get_rag_tools utility function.""" def test_get_rag_tools_empty_list(self) -> None: - """Test get_rag_tools returns None for empty list.""" - assert get_rag_tools([]) is None + """Test get_rag_tools returns empty list for empty vector store IDs.""" + assert get_rag_tools([]) == [] def test_get_rag_tools_with_vector_stores(self) -> None: """Test get_rag_tools returns correct tool format for vector stores.""" @@ -2429,7 +2429,7 @@ def test_returns_none_when_tool_rag_disabled(self, mocker: MockerFixture) -> Non mock_config.rag.tool.byok.enabled = False mocker.patch("utils.responses.configuration", mock_config) - assert get_rag_tools(["vs1", "vs2"]) is None + assert get_rag_tools(["vs1", "vs2"]) == [] def test_returns_tools_when_enabled(self, mocker: MockerFixture) -> None: """Test get_rag_tools returns tools when Tool RAG is enabled in config.""" From ac963999587d65155b9653e816eb6053da6c61e4 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:43:19 +0100 Subject: [PATCH 3/5] - Abstracted Solr away from user - Config changed to list rag_id / okp stores per RAG mode: inline / tool --- docs/byok_guide.md | 44 +- docs/config.md | 91 ++-- docs/openapi.json | 103 ++-- docs/rag_guide.md | 97 ++-- examples/lightspeed-stack-byok-okp-rag.yaml | 71 +++ examples/lightspeed-stack-byok-rag.yaml | 65 --- lightspeed-stack.yaml | 16 - run.yaml | 41 +- src/app/endpoints/query.py | 18 +- src/app/endpoints/streaming_query.py | 3 +- src/configuration.py | 80 +++ src/constants.py | 3 + src/llama_stack_configuration.py | 40 +- src/models/config.py | 105 ++-- src/utils/query.py | 2 +- src/utils/responses.py | 18 +- src/utils/vector_search.py | 65 ++- .../models/config/test_dump_configuration.py | 65 ++- 
.../models/config/test_rag_configuration.py | 92 ++++ tests/unit/utils/test_responses.py | 21 +- tests/unit/utils/test_vector_search.py | 494 ++++++++++++++++++ 21 files changed, 1073 insertions(+), 461 deletions(-) create mode 100644 examples/lightspeed-stack-byok-okp-rag.yaml delete mode 100644 examples/lightspeed-stack-byok-rag.yaml create mode 100644 tests/unit/models/config/test_rag_configuration.py create mode 100644 tests/unit/utils/test_vector_search.py diff --git a/docs/byok_guide.md b/docs/byok_guide.md index 475732354..e9390fd62 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -36,9 +36,9 @@ BYOK (Bring Your Own Knowledge) is Lightspeed Core's implementation of Retrieval BYOK knowledge sources can be queried in two complementary modes, configured independently: -### Inline RAG (pre-query injection) +### Inline RAG -Context is fetched from your BYOK vector stores and/or OKP **before** the LLM generates a response, and injected into every query automatically. No tool calls are required. +Context is fetched from your BYOK vector stores and/or OKP and injected before the LLM request. No tool calls are required. ```mermaid graph TD @@ -54,7 +54,7 @@ graph TD ### Tool RAG (on-demand retrieval) -The LLM can call the `file_search` tool during generation when it decides external knowledge is needed. Only BYOK vector stores are supported in Tool RAG mode. +The LLM can call the `file_search` tool during generation when it decides external knowledge is needed. Both BYOK vector stores and OKP are supported in Tool RAG mode. 
```mermaid graph TD @@ -67,7 +67,7 @@ graph TD B --> C{Need External Knowledge?} C -->|Yes| D[file_search Tool] C -->|No| E[Generate Response] - D --> F[BYOK Vector Stores] + D --> F[BYOK / OKP Vector Stores] F --> G[Retrieve Relevant Context] G --> B E --> H[Response to User] @@ -78,7 +78,13 @@ Both modes rely on: - **Embedding Model**: Converts queries and documents into vector representations for similarity matching Inline RAG additionally supports: -- **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. +- **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. + +> [!NOTE] +> OKP and BYOK scores are not directly comparable (different scoring systems), so +> `score_multiplier` does not apply to OKP results. To control the amount of retrieved +> context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py` +> (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10). --- @@ -290,29 +296,37 @@ registered_resources: ### Step 5: Configure RAG Strategy -Add a `rag` section to your `lightspeed-stack.yaml` to choose how BYOK knowledge is used: +Add a `rag` section to your `lightspeed-stack.yaml` to choose how BYOK knowledge is used. +Each list entry is a `rag_id` from `byok_rag`, or the special value `okp-rag` for OKP. 
```yaml rag: - # Inline RAG: inject context before every LLM response (no tool calls needed) + # Inline RAG: inject context before the LLM request (no tool calls needed) inline: - byok: - enabled: true # fetch and inject BYOK vector store context pre-query - okp: - enabled: true # fetch and inject OKP context pre-query + - my-docs # rag_id from byok_rag + - okp-rag # include OKP context inline # Tool RAG: the LLM can call file_search to retrieve context on demand + # Omit to use all registered BYOK stores (backward compatibility) tool: - byok: - enabled: true # expose BYOK vector stores as the file_search tool + - my-docs # expose this BYOK store as the file_search tool + - okp-rag # expose OKP as the file_search tool + +# OKP provider settings (only relevant when okp-rag is listed above) +okp: + offline: true # true = use parent_id for source URLs, false = use reference_url ``` Both modes can be enabled simultaneously. Choose based on your latency and control preferences: | Mode | When context is fetched | Tool call needed | Supported sources | score_multiplier | |------|------------------------|------------------|-------------------|-----------------| -| Inline RAG | Before every query | No | BYOK + OKP | Yes (BYOK only) | -| Tool RAG | On LLM demand | Yes | BYOK only | No | +| Inline RAG | With every query | No | BYOK + OKP | Yes (BYOK only) | +| Tool RAG | On LLM demand | Yes | BYOK + OKP | No | + +> [!TIP] +> A ready-to-use example combining BYOK and OKP is available at +> [`examples/lightspeed-stack-byok-okp-rag.yaml`](../examples/lightspeed-stack-byok-okp-rag.yaml). --- diff --git a/docs/config.md b/docs/config.md index 955c68f00..4ecb3b635 100644 --- a/docs/config.md +++ b/docs/config.md @@ -110,6 +110,22 @@ Microsoft Entra ID authentication attributes for Azure. BYOK (Bring Your Own Knowledge) RAG configuration. +Each entry registers a local vector store with the service. 
The `rag_id` is the +identifier used in `rag.inline` and `rag.tool` to select which stores to use. + +Example: + +```yaml +byok_rag: + - rag_id: my-docs # referenced in rag.inline / rag.tool + rag_type: inline::faiss + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + embedding_dimension: 384 + vector_db_id: vs_abc123 + db_path: /path/to/faiss_store.db + score_multiplier: 1.0 +``` + | Field | Type | Description | |-------|------|-------------| @@ -526,68 +542,55 @@ the service can handle requests concurrently. Top-level RAG strategy configuration. Controls two complementary retrieval modes: -- **Inline RAG**: context is fetched from OKP and/or BYOK vector stores and injected - into every query before the LLM responds. +- **Inline RAG**: context is fetched from the listed sources and injected before the + LLM request. - **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve - context on demand from BYOK vector stores. - - -| Field | Type | Description | -|-------|------|-------------| -| inline | | Pre-query RAG from OKP and BYOK. See InlineRagConfiguration. | -| tool | | Tool-based RAG that the LLM can invoke. See ToolRagConfiguration. | - - -## InlineRagConfiguration - - -Pre-query RAG configuration that injects context before the LLM generates a response. - -Both OKP and BYOK sources can be enabled independently. When enabled, retrieved -chunks are added as context on every query. + context on demand from the listed vector stores. Supports both BYOK and OKP. +Each strategy is configured as a list of RAG IDs referencing entries in `byok_rag`. +The special ID `okp-rag` activates the OKP provider (no `byok_rag` entry needed). -| Field | Type | Description | -|-------|------|-------------| -| okp | | OKP RAG configuration for pre-query context injection. | -| byok | | BYOK RAG configuration for pre-query context injection. | - - -## OkpRagConfiguration - - -OKP configuration for Inline RAG (pre-query context injection). 
- -Controls whether to use offline or online mode when building document URLs -from vector search results, and enables/disables OKP vector IO functionality. - - -| Field | Type | Description | -|-------|------|-------------| -| enabled | boolean | When True, enables OKP vector IO functionality for vector search queries. When False, disables OKP vector search processing. | -| offline | boolean | When True, use parent_id for chunk source URLs. When False, use reference_url for chunk source URLs. | - +**Backward compatibility**: omitting `tool` uses all registered BYOK vector stores +(equivalent to the old `tool.byok.enabled = True`). Omitting `inline` means no +context is injected before the LLM request. -## ByokRagConfiguration +Example: +```yaml +rag: + inline: + - my-docs # inject context from my-docs before the LLM request + tool: + - okp-rag # LLM can search OKP as a tool + - my-docs # LLM can also search my-docs as a tool -Configuration to enable or disable BYOK RAG retrieval. +okp: + offline: true # use parent_id for OKP URL construction +``` | Field | Type | Description | |-------|------|-------------| -| enabled | boolean | When True, queries BYOK vector stores for RAG context. Default: False. | +| inline | list[string] | RAG IDs whose content is injected before the LLM request. Use `okp-rag` for OKP. Empty by default (no inline RAG). | +| tool | list[string] or null | RAG IDs exposed as a `file_search` tool the LLM can invoke. Use `okp-rag` to include OKP. When omitted, all registered BYOK vector stores are used (backward compatibility). | -## ToolRagConfiguration +## OkpConfiguration +OKP (Offline Knowledge Portal) provider settings. Only used when `okp-rag` is listed in `rag.inline` or `rag.tool`. -Configuration for exposing RAG as a tool the LLM can call during generation. 
+Example: +```yaml +okp: + offline: true # use parent_id for OKP URL construction + chunk_filter_query: "is_chunk:true" +``` | Field | Type | Description | |-------|------|-------------| -| byok | | BYOK RAG configuration for tool-based retrieval. Default: enabled. | +| offline | boolean | When `true` (default), use `parent_id` for OKP chunk source URLs. When `false`, use `reference_url`. | +| chunk_filter_query | string | OKP filter query (`fq`) applied to every OKP search request. Defaults to `"is_chunk:true"`. Extend with `AND` for extra constraints. | ## SplunkConfiguration diff --git a/docs/openapi.json b/docs/openapi.json index 6fe13ef00..571b67546 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -4447,7 +4447,7 @@ ], "summary": "Handle A2A Jsonrpc", "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response", - "operationId": "handle_a2a_jsonrpc_a2a_post", + "operationId": "handle_a2a_jsonrpc_a2a_get", "responses": { "200": { "description": "Successful Response", @@ -4465,7 +4465,7 @@ ], "summary": "Handle A2A Jsonrpc", "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include 
authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response", - "operationId": "handle_a2a_jsonrpc_a2a_post", + "operationId": "handle_a2a_jsonrpc_a2a_get", "responses": { "200": { "description": "Successful Response", @@ -5522,20 +5522,6 @@ "title": "ByokRag", "description": "BYOK (Bring Your Own Knowledge) RAG configuration." }, - "ByokRagConfiguration": { - "properties": { - "enabled": { - "type": "boolean", - "title": "BYOK RAG enabled", - "description": "When True, queries BYOK vector stores for RAG context.", - "default": false - } - }, - "additionalProperties": false, - "type": "object", - "title": "ByokRagConfiguration", - "description": "BYOK RAG configuration." - }, "CORSConfiguration": { "properties": { "allow_origins": { @@ -5739,6 +5725,11 @@ "$ref": "#/components/schemas/RagConfiguration", "title": "RAG configuration", "description": "Configuration for all RAG strategies (inline and tool-based)." + }, + "okp": { + "$ref": "#/components/schemas/OkpConfiguration", + "title": "OKP configuration", + "description": "OKP provider settings. Only used when 'okp-rag' is listed in rag.inline or rag.tool." } }, "additionalProperties": false, @@ -6980,24 +6971,6 @@ } ] }, - "InlineRagConfiguration": { - "properties": { - "okp": { - "$ref": "#/components/schemas/OkpRagConfiguration", - "title": "OKP RAG configuration", - "description": "Configuration for OKP RAG (inline)." - }, - "byok": { - "$ref": "#/components/schemas/ByokRagConfiguration", - "title": "BYOK RAG configuration", - "description": "Configuration for BYOK RAG (inline)." 
- } - }, - "additionalProperties": false, - "type": "object", - "title": "InlineRagConfiguration", - "description": "Inline RAG configuration.\n\nControls inline RAG from OKP and BYOK vector stores." - }, "InternalServerErrorResponse": { "properties": { "status_code": { @@ -7607,25 +7580,25 @@ "title": "OAuthFlows", "description": "Defines the configuration for the supported OAuth 2.0 flows." }, - "OkpRagConfiguration": { + "OkpConfiguration": { "properties": { - "enabled": { - "type": "boolean", - "title": "OKP RAG enabled", - "description": "When True, queries OKP for RAG context.", - "default": false - }, "offline": { "type": "boolean", - "title": "Offline mode", - "description": "When True, use parent_id for chunk source URLs. When False, use reference_url for chunk source URLs.", + "title": "OKP offline mode", + "description": "When True, use parent_id for OKP chunk source URLs. When False, use reference_url for chunk source URLs.", "default": true + }, + "chunk_filter_query": { + "type": "string", + "title": "OKP chunk filter query", + "description": "OKP filter query applied to every OKP search request. Defaults to 'is_chunk:true' to restrict results to chunk documents. To add extra constraints, extend the expression using boolean syntax, e.g. 'is_chunk:true AND product:*openshift*'.", + "default": "is_chunk:true" } }, "additionalProperties": false, "type": "object", - "title": "OkpRagConfiguration", - "description": "OKP RAG configuration." + "title": "OkpConfiguration", + "description": "OKP (Offline Knowledge Portal) provider configuration.\n\nControls provider-specific behaviour for the OKP vector store.\nOnly relevant when ``\"okp-rag\"`` is listed in ``rag.inline`` or ``rag.tool``." 
}, "OpenIdConnectSecurityScheme": { "properties": { @@ -8804,20 +8777,33 @@ "RagConfiguration": { "properties": { "inline": { - "$ref": "#/components/schemas/InlineRagConfiguration", - "title": "Inline RAG configuration", - "description": "Configuration for inline RAG from OKP and BYOK vector stores." + "items": { + "type": "string" + }, + "type": "array", + "title": "Inline RAG IDs", + "description": "RAG IDs whose sources are injected as context before the LLM call. Use 'okp-rag' to enable OKP inline RAG. Empty by default (no inline RAG)." }, "tool": { - "$ref": "#/components/schemas/ToolRagConfiguration", - "title": "Tool RAG configuration", - "description": "Configuration for exposing RAG as a tool that the LLM can call." + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool RAG IDs", + "description": "RAG IDs made available to the LLM as a file_search tool. Use 'okp-rag' to include the OKP vector store. When omitted, all registered BYOK vector stores are used (backward compatibility)." } }, "additionalProperties": false, "type": "object", "title": "RagConfiguration", - "description": "RAG strategy configuration.\n\nControls different RAG strategies: inline and tool-based." + "description": "RAG strategy configuration.\n\nControls which RAG sources are used for inline and tool-based retrieval.\n\nEach strategy lists RAG IDs to include. The special ID ``\"okp-rag\"`` defined in constants,\nactivates the OKP provider; all other IDs refer to entries in ``byok_rag``.\n\nBackward compatibility:\n - ``inline`` defaults to ``[]`` (no inline RAG).\n - ``tool`` defaults to ``None`` which means all registered vector stores\n are used (identical to the previous ``tool.byok.enabled = True`` default)." }, "ReadinessResponse": { "properties": { @@ -9585,19 +9571,6 @@ "title": "ToolCallSummary", "description": "Model representing a tool call made during response generation (for tool_calls list)." 
}, - "ToolRagConfiguration": { - "properties": { - "byok": { - "$ref": "#/components/schemas/ByokRagConfiguration", - "title": "BYOK RAG configuration", - "description": "Configuration for BYOK RAG as a tool." - } - }, - "additionalProperties": false, - "type": "object", - "title": "ToolRagConfiguration", - "description": "Tool RAG configuration.\n\nControls whether RAG functionality is exposed as a tool that the LLM can call." - }, "ToolResultSummary": { "properties": { "id": { diff --git a/docs/rag_guide.md b/docs/rag_guide.md index 4319d868b..dd7cc9e87 100644 --- a/docs/rag_guide.md +++ b/docs/rag_guide.md @@ -28,8 +28,8 @@ This document explains how to configure and customize your RAG pipeline using th Lightspeed Core Stack (LCS) supports two complementary RAG strategies: -- **Inline RAG**: context is fetched from Solr and/or BYOK vector stores and injected into every query before the LLM responds. No tool calls are required. -- **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve context on demand from BYOK vector stores. +- **Inline RAG**: context is fetched from BYOK vector stores and/or OKP and injected before the LLM request. No tool calls are required. +- **Tool RAG**: the LLM can call the `file_search` tool during generation to retrieve context on demand from BYOK vector stores and/or OKP. Both strategies can be enabled independently via the `rag` section of `lightspeed-stack.yaml`. See [BYOK Feature Documentation](byok_guide.md) for configuration details. @@ -273,64 +273,25 @@ The OKP (Offline Knowledge Portal) Solr Vector IO is a read-only vector search p #### How to Enable Solr Vector IO -**1. 
Configure Llama Stack (`run.yaml`):** - -```yaml -providers: - vector_io: - - provider_id: solr-vector - provider_type: remote::solr_vector_io - config: - solr_url: http://localhost:8983/solr - collection_name: portal-rag - vector_field: chunk_vector - content_field: chunk - embedding_dimension: 384 - embedding_model: ${env.EMBEDDING_MODEL_DIR} - chunk_window_config: - chunk_parent_id_field: "parent_id" - chunk_content_field: "chunk_field" - chunk_index_field: "chunk_index" - chunk_token_count_field: "num_tokens" - parent_total_chunks_field: "total_chunks" - parent_total_tokens_field: "total_tokens" - chunk_filter_query: "is_chunk:true" - persistence: - namespace: portal-rag - backend: kv_default - -registered_resources: - vector_stores: - - vector_store_id: portal-rag - provider_id: solr-vector - embedding_model: granite-embedding-30m - embedding_dimension: 384 -``` - -Note: if the vector database (portal-rag) is not in the persistent data store within the vector_io provider -(e.g. after deleting the llama stack cache) you will need to register the vector database under registered resources: - - -```yaml - vector_stores: - - embedding_dimension: 384 - embedding_model: sentence-transformers/${env.EMBEDDING_MODEL_DIR} - provider_id: solr-vector - vector_store_id: portal-rag -``` - - -**2. Configure Lightspeed Stack (`lightspeed-stack.yaml`):** +**1. 
Configure Lightspeed Stack (`lightspeed-stack.yaml`):** ```yaml rag: inline: - okp: - enabled: true # Enable OKP vector IO (Inline RAG - pre-query injection) - offline: true # Use parent_id for document URLs (offline mode) - # Set to false to use reference_url (online mode) + - okp-rag # inject OKP context before the LLM request + tool: + - okp-rag # expose OKP as the file_search tool + +okp: + offline: true # true = use parent_id for source URLs (offline mode) + # false = use reference_url (online mode) ``` +> [!NOTE] +> When `okp-rag` is listed in `rag.inline` or `rag.tool`, Lightspeed Stack automatically enriches +> the Llama Stack `run.yaml` at startup with the required `vector_io` provider and `registered_resources` +> entries for the OKP vector store. No manual registration is needed. + **Query Request Example:** ``` curl -sX POST http://localhost:8080/v1/query \ @@ -353,11 +314,19 @@ curl -sX POST http://localhost:8080/v1/query \ **Query Filtering:** -To filter the Solr context edit the *chunk_filter_query* field in the -Solr **vector_io** provider in the `run.yaml`. Filters should follow the key:value format: -ex. `"product:*openshift*"` +To filter the Solr context, set the `chunk_filter_query` field in the `okp` section of +`lightspeed-stack.yaml`. Filters follow the Solr key:value format and are applied as a static +`fq` parameter on every OKP search request. The default value `"is_chunk:true"` restricts +results to chunk documents. To add extra constraints, extend the expression using Solr boolean +syntax: -Note: This static filter is a temporary work-around. +```yaml +okp: + chunk_filter_query: "is_chunk:true AND product:*openshift*" +``` + +> [!NOTE] +> This static filter is a temporary work-around until dynamic per-request filtering is supported. **Prerequisites:** @@ -365,6 +334,18 @@ Note: This static filter is a temporary work-around. 
for instructions on how to pull and run the OKP Solr image visit: https://github.com/lightspeed-core/lightspeed-providers/lightspeed_stack_providers/providers/remote/solr_vector_io/solr_vector_io/README.md +**Chunk volume:** + +OKP and BYOK scores are not directly comparable (different scoring systems), so +`score_multiplier` (a BYOK-only concept) does not apply to OKP results. To control +the number of retrieved chunks, set the constants in `src/constants.py`: + +| Constant | Default | Description | +|----------|---------|-------------| +| `OKP_RAG_MAX_CHUNKS` | 5 | Max chunks retrieved from OKP (Inline RAG) | +| `BYOK_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved from BYOK stores (Inline RAG) | +| `TOOL_RAG_MAX_CHUNKS` | 10 | Max chunks retrieved via Tool RAG (`file_search`) | + **Limitations:** - This is a **read-only** provider - no insert/delete operations diff --git a/examples/lightspeed-stack-byok-okp-rag.yaml b/examples/lightspeed-stack-byok-okp-rag.yaml new file mode 100644 index 000000000..09a0cb85c --- /dev/null +++ b/examples/lightspeed-stack-byok-okp-rag.yaml @@ -0,0 +1,71 @@ +name: Lightspeed Core Service (LCS) +service: + host: localhost + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: false + url: http://localhost:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" +authentication: + module: "noop" +quota_handlers: + sqlite: + db_path: quota.sqlite + limiters: + - name: user_monthly_limits + type: user_limiter + initial_quota: 50 + quota_increase: 50 + period: "30 seconds" + - name: cluster_monthly_limits + type: cluster_limiter + initial_quota: 100 + quota_increase: 100 + period: "30 seconds" + scheduler: + # scheduler ticks in seconds + period: 10 +byok_rag: + - rag_id: ocp-docs # referenced in rag.inline / rag.tool + rag_type: inline::faiss + 
embedding_dimension: 1024 + vector_db_id: vs_123 # Llama-stack vector_store_id + db_path: /tmp/ocp.faiss + score_multiplier: 1.0 # Weight for this vector store's results (Inline RAG only) + - rag_id: knowledge-base # referenced in rag.inline / rag.tool + rag_type: inline::faiss + embedding_dimension: 384 + vector_db_id: vs_456 # Llama-stack vector_store_id + db_path: /tmp/kb.faiss + score_multiplier: 1.2 # Weight for this vector store's results (Inline RAG only) + +# RAG configuration +rag: + # Inline RAG: context injected before the LLM request from the listed sources + # List rag_ids from byok_rag, or 'okp-rag' to include OKP + inline: + - ocp-docs + - knowledge-base + - okp-rag + # Tool RAG: LLM can call file_search on demand to retrieve context + # List rag_ids from byok_rag, or 'okp-rag' to include OKP + # Omit to use all registered BYOK stores (backward compatibility) + tool: + - ocp-docs + - knowledge-base + +# OKP provider settings (only used when 'okp-rag' is listed in rag.inline or rag.tool) +okp: + offline: true # true = use parent_id for source URLs, false = use reference_url + # Solr fq applied to every OKP search request. 
Combine with AND for extra constraints: + # chunk_filter_query: "is_chunk:true AND product:*openshift*" + chunk_filter_query: "is_chunk:true" diff --git a/examples/lightspeed-stack-byok-rag.yaml b/examples/lightspeed-stack-byok-rag.yaml deleted file mode 100644 index b5081a7a9..000000000 --- a/examples/lightspeed-stack-byok-rag.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Lightspeed Core Service (LCS) -service: - host: localhost - port: 8080 - auth_enabled: false - workers: 1 - color_log: true - access_log: true -llama_stack: - use_as_library_client: false - url: http://localhost:8321 - api_key: xyzzy -user_data_collection: - feedback_enabled: true - feedback_storage: "/tmp/data/feedback" - transcripts_enabled: true - transcripts_storage: "/tmp/data/transcripts" -authentication: - module: "noop" -quota_handlers: - sqlite: - db_path: quota.sqlite - limiters: - - name: user_monthly_limits - type: user_limiter - initial_quota: 50 - quota_increase: 50 - period: "30 seconds" - - name: cluster_monthly_limits - type: cluster_limiter - initial_quota: 100 - quota_increase: 100 - period: "30 seconds" - scheduler: - # scheduler ticks in seconds - period: 10 -byok_rag: - - rag_id: ocp_docs - rag_type: inline::faiss - embedding_dimension: 1024 - vector_db_id: vs_123 # Llama-stack vector_store_id - db_path: /tmp/ocp.faiss - score_multiplier: 1.0 # Weight for this vector store's results - - rag_id: knowledge_base - rag_type: inline::faiss - embedding_dimension: 384 - vector_db_id: vs_456 # Llama-stack vector_store_id - db_path: /tmp/kb.faiss - score_multiplier: 1.2 # Weight for this vector store's results - -# RAG configuration -rag: - # Inline RAG: context injected before every LLM request (no tool calls needed) - # Supports both Solr and BYOK sources. Score multipliers apply here only. 
- inline: - okp: - enabled: false # Enable Solr OKP pre-query context injection - offline: false # Controls how document URLs are built from Solr results - byok: - enabled: false # Enable BYOK pre-query context injection (weighted by score_multiplier) - # Tool RAG: LLM calls file_search on demand to retrieve BYOK context - # Note: Solr is not available in Tool RAG; score_multiplier does not apply here - tool: - byok: - enabled: true # Enable BYOK vector stores as the file_search tool (default: true) \ No newline at end of file diff --git a/lightspeed-stack.yaml b/lightspeed-stack.yaml index c9c40aa23..fe655a810 100644 --- a/lightspeed-stack.yaml +++ b/lightspeed-stack.yaml @@ -31,19 +31,3 @@ conversation_cache: authentication: module: "noop" - -# RAG configuration -rag: - # Inline RAG (inject context pre-query with RAG from Solr and BYOK vector stores) - inline: - okp: - enabled: false - offline: false - # Supports weighted scoring - byok: - enabled: false - # Tool RAG (LLM can call file_search tool during generation) - tool: - byok: - # Default is true for backward compatibility - enabled: true diff --git a/run.yaml b/run.yaml index 79d4609f2..b7e56d249 100644 --- a/run.yaml +++ b/run.yaml @@ -24,10 +24,7 @@ providers: config: api_key: ${env.OPENAI_API_KEY} allowed_models: ["${env.E2E_OPENAI_MODEL:=gpt-4o-mini}"] - - config: - allowed_models: - - ${env.EMBEDDING_MODEL_DIR} - provider_id: sentence-transformers + - provider_id: sentence-transformers provider_type: inline::sentence-transformers files: - config: @@ -58,27 +55,7 @@ providers: provider_id: rag-runtime provider_type: inline::rag-runtime vector_io: - - provider_id: solr-vector - provider_type: remote::solr_vector_io - config: - solr_url: http://localhost:8983/solr - collection_name: portal-rag - vector_field: chunk_vector - content_field: chunk - embedding_dimension: 384 - embedding_model: ${env.EMBEDDING_MODEL_DIR} - chunk_window_config: - chunk_parent_id_field: "parent_id" - chunk_content_field: 
"chunk_field" - chunk_index_field: "chunk_index" - chunk_token_count_field: "num_tokens" - parent_total_chunks_field: "total_chunks" - parent_total_tokens_field: "total_tokens" - chunk_filter_query: "is_chunk:true" - persistence: - namespace: portal-rag - backend: kv_default - - config: # Define the storage backend for RAG + - config: persistence: namespace: vector_io::faiss backend: kv_default @@ -149,22 +126,12 @@ storage: namespace: prompts backend: kv_default registered_resources: - models: - - model_id: granite-embedding-30m - model_type: embedding - provider_id: sentence-transformers - provider_model_id: ${env.EMBEDDING_MODEL_DIR} - metadata: - embedding_dimension: 384 + models: [] shields: - shield_id: llama-guard provider_id: llama-guard provider_shield_id: openai/gpt-4o-mini - vector_stores: - - embedding_dimension: 384 - embedding_model: sentence-transformers/${env.EMBEDDING_MODEL_DIR} - provider_id: solr-vector - vector_store_id: portal-rag + vector_stores: [] datasets: [] scoring_fns: [] benchmarks: [] diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index f9b329d92..659c55f3a 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -44,6 +44,7 @@ from utils.query import ( consume_query_tokens, handle_known_apistatus_errors, + prepare_input, store_query_results, update_azure_token, validate_attachments_metadata, @@ -155,7 +156,13 @@ async def query_endpoint_handler( client = AsyncLlamaStackClientHolder().get_client() # Build RAG context from Inline RAG sources - inline_rag_context = await build_rag_context(client, query_request, configuration) + inline_rag_context = await build_rag_context( + client, query_request.query, query_request.vector_store_ids, query_request.solr + ) + + # Moderation input is the raw user content (query + attachments) without injected RAG + # context, to avoid false positives from retrieved document content. 
+ moderation_input = prepare_input(query_request) # Prepare API request parameters responses_params = await prepare_responses_params( @@ -190,6 +197,7 @@ async def query_endpoint_handler( query_request.shield_ids, vector_store_ids, rag_id_mapping, + moderation_input=moderation_input, ) # Combine inline RAG results (BYOK + Solr) with tool-based RAG results for the transcript @@ -266,6 +274,7 @@ async def retrieve_response( # pylint: disable=too-many-locals shield_ids: Optional[list[str]] = None, vector_store_ids: Optional[list[str]] = None, rag_id_mapping: Optional[dict[str, str]] = None, + moderation_input: Optional[str] = None, ) -> TurnSummary: """ Retrieve response from LLMs and agents. @@ -279,6 +288,9 @@ async def retrieve_response( # pylint: disable=too-many-locals shield_ids: Optional list of shield IDs for moderation. vector_store_ids: Vector store IDs used in the query for source resolution. rag_id_mapping: Mapping from vector_db_id to user-facing rag_id. + moderation_input: Text to moderate. Should be the raw user content (query + + attachments) without injected RAG context to avoid false positives. + Falls back to responses_params.input if not provided. 
Returns: TurnSummary: Summary of the LLM response content @@ -286,7 +298,9 @@ async def retrieve_response( # pylint: disable=too-many-locals response: Optional[OpenAIResponseObject] = None try: moderation_result = await run_shield_moderation( - client, cast(str, responses_params.input), shield_ids + client, + moderation_input or cast(str, responses_params.input), + shield_ids, ) if moderation_result.decision == "blocked": # Handle shield moderation blocking diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index 98bae1e61..b2f68cd3b 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -66,6 +66,7 @@ consume_query_tokens, extract_provider_and_model_from_model_id, handle_known_apistatus_errors, + prepare_input, store_query_results, update_azure_token, validate_attachments_metadata, @@ -308,7 +309,7 @@ async def retrieve_response_generator( try: moderation_result = await run_shield_moderation( context.client, - cast(str, responses_params.input), + prepare_input(context.query_request), context.query_request.shield_ids, ) if moderation_result.decision == "blocked": diff --git a/src/configuration.py b/src/configuration.py index c231515f7..130761709 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -7,6 +7,7 @@ from llama_stack.core.stack import replace_env_vars import yaml +import constants from models.config import ( A2AStateConfiguration, AuthorizationConfiguration, @@ -14,6 +15,7 @@ Configuration, Customization, LlamaStackConfiguration, + OkpConfiguration, RagConfiguration, UserDataCollection, ServiceConfiguration, @@ -371,6 +373,13 @@ def rag(self) -> "RagConfiguration": raise LogicError("logic error: configuration is not loaded") return self._configuration.rag + @property + def okp(self) -> "OkpConfiguration": + """Return OKP configuration.""" + if self._configuration is None: + raise LogicError("logic error: configuration is not loaded") + return self._configuration.okp 
+ @property def rag_id_mapping(self) -> dict[str, str]: """Return mapping from vector_db_id to rag_id from BYOK RAG config. @@ -404,6 +413,77 @@ def score_multiplier_mapping(self) -> dict[str, float]: for brag in self._configuration.byok_rag } + @property + def inline_solr_enabled(self) -> bool: + """Return whether OKP is included in the inline RAG list. + + Returns: + bool: True if 'okp-rag' appears in rag.inline, False otherwise. + + Raises: + LogicError: If the configuration has not been loaded. + """ + if self._configuration is None: + raise LogicError("logic error: configuration is not loaded") + return constants.OKP_RAG_ID in self._configuration.rag.inline + + @property + def inline_byok_vector_store_ids(self) -> list[str]: + """Return vector store IDs for the BYOK sources listed in rag.inline. + + Maps non-okp rag_ids in rag.inline to their corresponding vector_db_ids + from the byok_rag configuration. IDs that are not found in byok_rag are + silently skipped. + + Returns: + list[str]: Ordered list of vector_db_ids for inline BYOK RAG. + + Raises: + LogicError: If the configuration has not been loaded. + """ + if self._configuration is None: + raise LogicError("logic error: configuration is not loaded") + inline_ids = [ + rid for rid in self._configuration.rag.inline if rid != constants.OKP_RAG_ID + ] + rag_to_vdb = { + brag.rag_id: brag.vector_db_id for brag in self._configuration.byok_rag + } + return [rag_to_vdb[rid] for rid in inline_ids if rid in rag_to_vdb] + + @property + def tool_vector_store_ids(self) -> Optional[list[str]]: + """Return vector store IDs for tool RAG, or None to use all registered stores. + + When rag.tool is None (default), returns None to signal that all + registered vector stores should be used (backward compatibility). + + When rag.tool is an explicit list, maps rag_ids to vector_db_ids and + includes the OKP vector store ID for the special 'okp-rag' entry. 
+ + Returns: + Optional[list[str]]: List of vector_db_ids for tool RAG, or None + when all registered stores should be used. + + Raises: + LogicError: If the configuration has not been loaded. + """ + if self._configuration is None: + raise LogicError("logic error: configuration is not loaded") + tool_ids = self._configuration.rag.tool + if tool_ids is None: + return None + rag_to_vdb = { + brag.rag_id: brag.vector_db_id for brag in self._configuration.byok_rag + } + result = [] + for rid in tool_ids: + if rid == constants.OKP_RAG_ID: + result.append(constants.SOLR_DEFAULT_VECTOR_STORE_ID) + elif rid in rag_to_vdb: + result.append(rag_to_vdb[rid]) + return result + def resolve_index_name( self, vector_store_id: str, rag_id_mapping: Optional[dict[str, str]] = None ) -> str: diff --git a/src/constants.py b/src/constants.py index 06da328a3..3d1f64973 100644 --- a/src/constants.py +++ b/src/constants.py @@ -201,6 +201,9 @@ # Default score multiplier for BYOK RAG vector stores DEFAULT_SCORE_MULTIPLIER = 1.0 +# Special RAG ID that activates the OKP provider when listed in rag.inline or rag.tool +OKP_RAG_ID = "okp-rag" + # Logging configuration constants # Environment variable name for configurable log level LIGHTSPEED_STACK_LOG_LEVEL_ENV_VAR = "LIGHTSPEED_STACK_LOG_LEVEL" diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index f2ea7373d..9314c49d5 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -190,7 +190,9 @@ def construct_vector_stores_section( if not brag.get("rag_id"): raise ValueError(f"BYOK RAG entry is missing required 'rag_id': {brag}") if not brag.get("vector_db_id"): - raise ValueError(f"BYOK RAG entry is missing required 'vector_db_id': {brag}") + raise ValueError( + f"BYOK RAG entry is missing required 'vector_db_id': {brag}" + ) rag_id = brag["rag_id"] vector_db_id = brag["vector_db_id"] if vector_db_id in existing_store_ids: @@ -381,13 +383,15 @@ def enrich_solr(ls_config: dict[str, 
Any], solr_config: dict[str, Any]) -> None: Args: ls_config: Llama Stack configuration dict (modified in place) - solr_config: Solr configuration dict + solr_config: Solr configuration dict. Expected keys: + - enabled (bool): whether Solr enrichment should run + - chunk_filter_query (str): Solr filter query for chunk retrieval """ if not solr_config or not solr_config.get("enabled"): - logger.info("Solr is not enabled: skipping") + logger.info("OKP is not enabled: skipping") return - logger.info("Enriching Llama Stack config with Solr") + logger.info("Enriching Llama Stack config with OKP") # Add vector_io provider for Solr if "providers" not in ls_config: @@ -418,6 +422,8 @@ def enrich_solr(ls_config: dict[str, Any], solr_config: dict[str, Any]) -> None: f"${{env.SOLR_EMBEDDING_DIM:={constants.SOLR_DEFAULT_EMBEDDING_DIMENSION}}}" ) + chunk_filter_query = solr_config.get("chunk_filter_query", "is_chunk:true") + ls_config["providers"]["vector_io"].append( { "provider_id": constants.SOLR_PROVIDER_ID, @@ -429,6 +435,15 @@ def enrich_solr(ls_config: dict[str, Any], solr_config: dict[str, Any]) -> None: "content_field": content_field_env, "embedding_model": embedding_model_env, "embedding_dimension": embedding_dim_env, + "chunk_window_config": { + "chunk_parent_id_field": "parent_id", + "chunk_content_field": "chunk_field", + "chunk_index_field": "chunk_index", + "chunk_token_count_field": "num_tokens", + "parent_total_chunks_field": "total_chunks", + "parent_total_tokens_field": "total_tokens", + "chunk_filter_query": chunk_filter_query, + }, "persistence": { "namespace": constants.SOLR_DEFAULT_VECTOR_STORE_ID, "backend": "kv_default", @@ -436,7 +451,7 @@ def enrich_solr(ls_config: dict[str, Any], solr_config: dict[str, Any]) -> None: }, } ) - logger.info("Added Solr provider to providers/vector_io") + logger.info("Added OKP provider to providers/vector_io") # Add vector store registration for Solr if "registered_resources" not in ls_config: @@ -495,7 +510,7 @@ def 
enrich_solr(ls_config: dict[str, Any], solr_config: dict[str, Any]) -> None: }, } ) - logger.info("Added Solr embedding model to registered_resources.models") + logger.info("Added OKP embedding model to registered_resources.models") # ============================================================================= @@ -528,9 +543,16 @@ def generate_configuration( # Enrichment: BYOK RAG enrich_byok_rag(ls_config, config.get("byok_rag", [])) - # Enrichment: Solr - solr_config = config.get("rag", {}).get("inline", {}).get("okp", {}) - enrich_solr(ls_config, solr_config) + # Enrichment: Solr - enabled when "okp-rag" appears in either inline or tool list + rag_config = config.get("rag", {}) + inline_ids = rag_config.get("inline") or [] + tool_ids = rag_config.get("tool") or [] + okp_enabled = constants.OKP_RAG_ID in inline_ids or constants.OKP_RAG_ID in tool_ids + okp_config = config.get("okp", {}) + chunk_filter_query = okp_config.get("chunk_filter_query", "is_chunk:true") + enrich_solr( + ls_config, {"enabled": okp_enabled, "chunk_filter_query": chunk_filter_query} + ) logger.info("Writing Llama Stack configuration into file %s", output_file) diff --git a/src/models/config.py b/src/models/config.py index 086090efc..e4954bf97 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -1694,83 +1694,57 @@ class QuotaHandlersConfiguration(ConfigurationBase): ) -class ByokRagConfiguration(ConfigurationBase): - """BYOK RAG configuration.""" - - enabled: bool = Field( - default=False, - title="BYOK RAG enabled", - description="When True, queries BYOK vector stores for RAG context.", - ) - - -class OkpRagConfiguration(ConfigurationBase): - """OKP RAG configuration.""" - - enabled: bool = Field( - default=False, - title="OKP RAG enabled", - description="When True, queries OKP for RAG context.", - ) - - offline: bool = Field( - default=True, - title="Offline mode", - description="When True, use parent_id for chunk source URLs. 
" - "When False, use reference_url for chunk source URLs.", - ) +class RagConfiguration(ConfigurationBase): + """RAG strategy configuration. + Controls which RAG sources are used for inline and tool-based retrieval. -class InlineRagConfiguration(ConfigurationBase): - """Inline RAG configuration. + Each strategy lists RAG IDs to include. The special ID ``"okp-rag"`` defined in constants, + activates the OKP provider; all other IDs refer to entries in ``byok_rag``. - Controls inline RAG from OKP and BYOK vector stores. + Backward compatibility: + - ``inline`` defaults to ``[]`` (no inline RAG). + - ``tool`` defaults to ``None`` which means all registered vector stores + are used (identical to the previous ``tool.byok.enabled = True`` default). """ - okp: OkpRagConfiguration = Field( - default_factory=OkpRagConfiguration, - title="OKP RAG configuration", - description="Configuration for OKP RAG (inline).", - ) - - byok: ByokRagConfiguration = Field( - default_factory=ByokRagConfiguration, - title="BYOK RAG configuration", - description="Configuration for BYOK RAG (inline).", + inline: list[str] = Field( + default_factory=list, + title="Inline RAG IDs", + description="RAG IDs whose sources are injected as context before the LLM call. " + "Use 'okp-rag' to enable OKP inline RAG. Empty by default (no inline RAG).", ) - -class ToolRagConfiguration(ConfigurationBase): - """Tool RAG configuration. - - Controls whether RAG functionality is exposed as a tool that the LLM can call. - """ - - byok: ByokRagConfiguration = Field( - default_factory=lambda: ByokRagConfiguration( - enabled=True - ), # defaults True for backward compatibility - title="BYOK RAG configuration", - description="Configuration for BYOK RAG as a tool.", + tool: Optional[list[str]] = Field( + default=None, + title="Tool RAG IDs", + description="RAG IDs made available to the LLM as a file_search tool. " + "Use 'okp-rag' to include the OKP vector store. 
" + "When omitted, all registered BYOK vector stores are used (backward compatibility).", ) -class RagConfiguration(ConfigurationBase): - """RAG strategy configuration. +class OkpConfiguration(ConfigurationBase): + """OKP (Offline Knowledge Portal) provider configuration. - Controls different RAG strategies: inline and tool-based. + Controls provider-specific behaviour for the OKP vector store. + Only relevant when ``"okp-rag"`` is listed in ``rag.inline`` or ``rag.tool``. """ - inline: InlineRagConfiguration = Field( - default_factory=InlineRagConfiguration, - title="Inline RAG configuration", - description="Configuration for inline RAG from OKP and BYOK vector stores.", + offline: bool = Field( + default=True, + title="OKP offline mode", + description="When True, use parent_id for OKP chunk source URLs. " + "When False, use reference_url for chunk source URLs.", ) - tool: ToolRagConfiguration = Field( - default_factory=ToolRagConfiguration, - title="Tool RAG configuration", - description="Configuration for exposing RAG as a tool that the LLM can call.", + chunk_filter_query: str = Field( + default="is_chunk:true", + title="OKP chunk filter query", + description="OKP filter query applied to every OKP search request. " + "Defaults to 'is_chunk:true' to restrict results to chunk documents. " + "To add extra constraints, extend the expression using boolean syntax, " + "e.g. 'is_chunk:true AND product:*openshift*'.", ) @@ -1918,6 +1892,13 @@ class Configuration(ConfigurationBase): description="Configuration for all RAG strategies (inline and tool-based).", ) + okp: OkpConfiguration = Field( + default_factory=OkpConfiguration, + title="OKP configuration", + description="OKP provider settings. 
Only used when 'okp-rag' is listed " + "in rag.inline or rag.tool.", + ) + @model_validator(mode="after") def validate_mcp_auth_headers(self) -> Self: """ diff --git a/src/utils/query.py b/src/utils/query.py index 91e562a44..8d96b5eb6 100644 --- a/src/utils/query.py +++ b/src/utils/query.py @@ -212,7 +212,7 @@ def prepare_input( """ input_text = query_request.query if inline_rag_context: - input_text += inline_rag_context + input_text += f"\n\n{inline_rag_context}" if query_request.attachments: for attachment in query_request.attachments: # Append attachment content with type label diff --git a/src/utils/responses.py b/src/utils/responses.py index 59a8a4c9b..7a502ecc3 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -167,11 +167,17 @@ async def prepare_tools( # pylint: disable=too-many-arguments,too-many-position return None toolgroups: list[InputTool] = [] - # Get vector stores for RAG tools - use specified ones or fetch all - vector_store_ids = await get_vector_store_ids(client, vector_store_ids) + # Per-request vector_store_ids override takes priority. + # When not provided, use config-based tool list (or None = all stores). 
+ effective_ids = ( + vector_store_ids + if vector_store_ids is not None + else configuration.tool_vector_store_ids + ) + effective_ids = await get_vector_store_ids(client, effective_ids) # Add RAG tools if vector stores are available - rag_tools = get_rag_tools(vector_store_ids) + rag_tools = get_rag_tools(effective_ids) if rag_tools: toolgroups.extend(rag_tools) @@ -370,12 +376,8 @@ def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSea vector_store_ids: List of vector store identifiers Returns: - List containing file_search tool configuration, or None if RAG as tool is disabled + List containing file_search tool configuration, or empty list if no stores available """ - # Check if Tool RAG is enabled in configuration - if not (configuration and configuration.rag.tool.byok.enabled): - return [] - if vector_store_ids == []: return [] diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index c44c747ae..f507e2ed8 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -16,22 +16,21 @@ from configuration import configuration from log import get_logger from models.responses import ReferencedDocument -from utils.responses import get_vector_store_ids from utils.types import RAGChunk, RAGContext logger = get_logger(__name__) def _is_solr_enabled() -> bool: - """Check if Solr is enabled in configuration.""" - return bool(configuration.rag.inline.okp.enabled) + """Check if Solr is enabled for inline RAG in configuration.""" + return configuration.inline_solr_enabled def _get_solr_vector_store_ids() -> list[str]: """Get vector store IDs based on Solr configuration.""" vector_store_ids = [constants.SOLR_DEFAULT_VECTOR_STORE_ID] logger.info( - "Using %s vector store for Solr query: %s", + "Using %s vector store for OKP query: %s", constants.SOLR_DEFAULT_VECTOR_STORE_ID, vector_store_ids, ) @@ -277,7 +276,9 @@ def _process_solr_chunks_for_documents( metadata_doc_ids = set() for chunk in chunks: - 
logger.debug("Extract doc ids from chunk: %s", chunk) + logger.debug( + "Extracting doc ids from chunk id: %s", getattr(chunk, "chunk_id", None) + ) doc_id, title, reference_url = _extract_solr_document_metadata(chunk) @@ -301,12 +302,12 @@ def _process_solr_chunks_for_documents( ReferencedDocument( doc_title=title, doc_url=parsed_url, - source="OKP Solr", + source=constants.OKP_RAG_ID, ) ) logger.debug( - "Extracted %d unique document IDs from Solr chunks", + "Extracted %d unique document IDs from OKP chunks", len(doc_ids_from_chunks), ) return doc_ids_from_chunks @@ -336,8 +337,21 @@ async def _fetch_byok_rag( rag_chunks: list[RAGChunk] = [] referenced_documents: list[ReferencedDocument] = [] - if not configuration.rag.inline.byok.enabled: - logger.info("Inline RAG (BYOK) disabled, skipping BYOK RAG search") + # Determine which BYOK vector stores to query for inline RAG. + # Per-request override takes precedence; otherwise use config-based inline list. + if vector_store_ids is not None: + # Request-level override: filter out Solr store, use the rest + vector_store_ids_to_query = [ + vs_id + for vs_id in vector_store_ids + if vs_id != constants.SOLR_DEFAULT_VECTOR_STORE_ID + ] + else: + vector_store_ids_to_query = configuration.inline_byok_vector_store_ids + + # If inline byok stores are not defined, we disable the inline RAG for backward compatibility + if not vector_store_ids_to_query: + logger.info("No inline BYOK RAG sources configured, skipping BYOK RAG search") return rag_chunks, referenced_documents try: @@ -345,13 +359,6 @@ async def _fetch_byok_rag( score_multiplier_mapping = configuration.score_multiplier_mapping rag_id_mapping = configuration.rag_id_mapping - # Filter out Solr vector stores from available stores - vector_store_ids_to_query = [ - vs_id - for vs_id in await get_vector_store_ids(client, vector_store_ids) - if vs_id != constants.SOLR_DEFAULT_VECTOR_STORE_ID - ] - # Query all vector stores in parallel results_per_store = await 
asyncio.gather( *[ @@ -421,12 +428,12 @@ async def _fetch_solr_rag( rag_chunks: list[RAGChunk] = [] referenced_documents: list[ReferencedDocument] = [] - if not _is_solr_enabled(configuration): - logger.info("Solr vector IO is disabled, skipping Solr search") + if not _is_solr_enabled(): + logger.info("OKP vector IO is disabled, skipping OKP search") return rag_chunks, referenced_documents # Get offline setting from configuration - offline = configuration.rag.inline.okp.offline + offline = configuration.okp.offline try: vector_store_ids = _get_solr_vector_store_ids() @@ -442,7 +449,9 @@ async def _fetch_solr_rag( params=params, ) - logger.debug("Solr query response: %s", query_response) + logger.debug( + "OKP query returned %d chunks", len(query_response.chunks or []) + ) if query_response.chunks: retrieved_scores = ( @@ -462,15 +471,15 @@ async def _fetch_solr_rag( rag_chunks = _convert_solr_chunks_to_rag_format( top_chunks, top_scores, offline ) - logger.info( - "Filtered top %d chunks from Solr OKP RAG (%d were retrieved)", + logger.debug( + "Filtered top %d chunks from OKP RAG (%d were retrieved)", constants.OKP_RAG_MAX_CHUNKS, len(rag_chunks), ) except Exception as e: # pylint: disable=broad-exception-caught - logger.warning("Failed to query Solr for chunks: %s", e) - logger.debug("Solr query error details: %s", traceback.format_exc()) + logger.warning("Failed to query OKP for chunks: %s", e) + logger.debug("OKP query error details: %s", traceback.format_exc()) return rag_chunks, referenced_documents @@ -493,10 +502,8 @@ async def build_rag_context( RAGContext containing formatted context text and referenced documents """ # Fetch from all enabled RAG sources in parallel - byok_chunks_task = _fetch_byok_rag( - client, query_request.query, configuration, query_request.vector_store_ids - ) - solr_chunks_task = _fetch_solr_rag(client, query_request, configuration) + byok_chunks_task = _fetch_byok_rag(client, query, vector_store_ids) + solr_chunks_task = 
_fetch_solr_rag(client, query, solr) (byok_chunks, byok_docs), (solr_chunks, solr_docs) = await asyncio.gather( byok_chunks_task, solr_chunks_task @@ -603,7 +610,7 @@ def _convert_solr_chunks_to_rag_format( rag_chunks.append( RAGChunk( content=chunk.content, - source="OKP Solr", # Hardcoded source for Solr chunks + source=constants.OKP_RAG_ID, score=score, attributes=attributes if attributes else None, ) diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index 22d7a5a6a..29df175cb 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -207,13 +207,12 @@ def test_dump_configuration(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "inline": { - "byok": {"enabled": False}, - "okp": {"enabled": False, "offline": True}, - }, - "tool": { - "byok": {"enabled": True}, - }, + "inline": [], + "tool": None, + }, + "okp": { + "offline": True, + "chunk_filter_query": "is_chunk:true", }, "splunk": None, "deployment_environment": "development", @@ -559,13 +558,12 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "inline": { - "byok": {"enabled": False}, - "okp": {"enabled": False, "offline": True}, - }, - "tool": { - "byok": {"enabled": True}, - }, + "inline": [], + "tool": None, + }, + "okp": { + "offline": True, + "chunk_filter_query": "is_chunk:true", }, "splunk": None, "deployment_environment": "development", @@ -789,13 +787,12 @@ def test_dump_configuration_with_quota_limiters_different_values( }, "azure_entra_id": None, "rag": { - "inline": { - "byok": {"enabled": False}, - "okp": {"enabled": False, "offline": True}, - }, - "tool": { - "byok": {"enabled": True}, - }, + "inline": [], + "tool": None, + }, + "okp": { + "offline": True, + "chunk_filter_query": "is_chunk:true", }, "splunk": None, "deployment_environment": "development", @@ -994,13 +991,12 @@ def 
test_dump_configuration_byok(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "inline": { - "byok": {"enabled": False}, - "okp": {"enabled": False, "offline": True}, - }, - "tool": { - "byok": {"enabled": True}, - }, + "inline": [], + "tool": None, + }, + "okp": { + "offline": True, + "chunk_filter_query": "is_chunk:true", }, "splunk": None, "deployment_environment": "development", @@ -1184,13 +1180,12 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None: }, "azure_entra_id": None, "rag": { - "inline": { - "byok": {"enabled": False}, - "okp": {"enabled": False, "offline": True}, - }, - "tool": { - "byok": {"enabled": True}, - }, + "inline": [], + "tool": None, + }, + "okp": { + "offline": True, + "chunk_filter_query": "is_chunk:true", }, "splunk": None, "deployment_environment": "development", diff --git a/tests/unit/models/config/test_rag_configuration.py b/tests/unit/models/config/test_rag_configuration.py new file mode 100644 index 000000000..a29f195b2 --- /dev/null +++ b/tests/unit/models/config/test_rag_configuration.py @@ -0,0 +1,92 @@ +"""Unit tests for RAG and OKP configuration models.""" + +# pylint: disable=no-member +# Pydantic Field(default_factory=...) 
pattern confuses pylint's static analysis + +import pytest +from pydantic import ValidationError + +from models.config import OkpConfiguration, RagConfiguration + + +class TestRagConfiguration: + """Tests for RagConfiguration model.""" + + def test_default_values(self) -> None: + """Test that RagConfiguration has correct default values.""" + config = RagConfiguration() + assert config.inline == [] + assert config.tool is None + + def test_inline_with_byok_ids(self) -> None: + """Test inline list with BYOK rag IDs.""" + config = RagConfiguration(inline=["store-1", "store-2"]) + assert config.inline == ["store-1", "store-2"] + assert config.tool is None + + def test_inline_with_okp_rag(self) -> None: + """Test inline list including the special okp-rag ID.""" + config = RagConfiguration(inline=["okp-rag", "store-1"]) + assert "okp-rag" in config.inline + assert "store-1" in config.inline + + def test_tool_with_okp_rag_and_byok(self) -> None: + """Test tool list with okp-rag and BYOK IDs.""" + config = RagConfiguration( + inline=["store-1"], + tool=["okp-rag", "store-1"], + ) + assert config.inline == ["store-1"] + assert config.tool == ["okp-rag", "store-1"] + + def test_tool_empty_list(self) -> None: + """Test that an explicit empty tool list disables tool RAG.""" + config = RagConfiguration(tool=[]) + assert config.tool == [] + + def test_tool_none_means_all_stores(self) -> None: + """Test that tool=None (default) means all registered stores are used.""" + config = RagConfiguration() + assert config.tool is None + + def test_no_unknown_fields_allowed(self) -> None: + """Test that RagConfiguration rejects unknown fields.""" + with pytest.raises(ValidationError, match="Extra inputs are not permitted"): + RagConfiguration(unknown_field="value") # type: ignore[call-arg] + + def test_fully_custom_config(self) -> None: + """Test RagConfiguration with all fields set.""" + config = RagConfiguration( + inline=["okp-rag", "store-1"], + tool=["store-1"], + ) + assert "okp-rag" 
in config.inline + assert "store-1" in config.inline + assert config.tool == ["store-1"] + + +class TestOkpConfiguration: + """Tests for OkpConfiguration model.""" + + def test_default_values(self) -> None: + """Test that OkpConfiguration has correct default values.""" + config = OkpConfiguration() + assert config.offline is True + assert config.chunk_filter_query == "is_chunk:true" + + def test_offline_false(self) -> None: + """Test offline can be set to False (online mode).""" + config = OkpConfiguration(offline=False) + assert config.offline is False + + def test_custom_chunk_filter_query(self) -> None: + """Test that chunk_filter_query can be customised.""" + config = OkpConfiguration( + chunk_filter_query="is_chunk:true AND product:*openshift*" + ) + assert config.chunk_filter_query == "is_chunk:true AND product:*openshift*" + + def test_no_unknown_fields_allowed(self) -> None: + """Test that OkpConfiguration rejects unknown fields.""" + with pytest.raises(ValidationError, match="Extra inputs are not permitted"): + OkpConfiguration(unknown_field="value") # type: ignore[call-arg] diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index cc66951d2..ba1e4bf94 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -338,7 +338,7 @@ class TestGetRAGTools: def test_get_rag_tools_empty_list(self) -> None: """Test get_rag_tools returns empty list for empty vector store IDs.""" - assert get_rag_tools([]) == [] + assert not get_rag_tools([]) def test_get_rag_tools_with_vector_stores(self) -> None: """Test get_rag_tools returns correct tool format for vector stores.""" @@ -2423,20 +2423,13 @@ async def test_raises_on_api_status_error(self, mocker: MockerFixture) -> None: class TestGetRAGToolsWithConfig: """Tests for get_rag_tools with configuration checks.""" - def test_returns_none_when_tool_rag_disabled(self, mocker: MockerFixture) -> None: - """Test get_rag_tools returns None when Tool RAG is 
disabled in config.""" - mock_config = mocker.Mock(spec=AppConfig) - mock_config.rag.tool.byok.enabled = False - mocker.patch("utils.responses.configuration", mock_config) - - assert get_rag_tools(["vs1", "vs2"]) == [] - - def test_returns_tools_when_enabled(self, mocker: MockerFixture) -> None: - """Test get_rag_tools returns tools when Tool RAG is enabled in config.""" - mock_config = mocker.Mock(spec=AppConfig) - mock_config.rag.tool.byok.enabled = True - mocker.patch("utils.responses.configuration", mock_config) + def test_returns_empty_when_no_vector_store_ids(self) -> None: + """Test get_rag_tools returns empty list when no vector store IDs are provided.""" + # pylint: disable-next=use-implicit-booleaness-not-comparison + assert get_rag_tools([]) == [] + def test_returns_tools_when_stores_provided(self) -> None: + """Test get_rag_tools returns tools when vector store IDs are provided.""" tools = get_rag_tools(["vs1"]) assert tools is not None assert tools[0].type == constants.DEFAULT_RAG_TOOL diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py new file mode 100644 index 000000000..683a733c3 --- /dev/null +++ b/tests/unit/utils/test_vector_search.py @@ -0,0 +1,494 @@ +"""Unit tests for vector search utilities.""" + +import pytest + +import constants +from configuration import AppConfig +from utils.types import RAGChunk +from utils.vector_search import ( + _build_document_url, + _build_query_params, + _convert_solr_chunks_to_rag_format, + _extract_byok_rag_chunks, + _extract_solr_document_metadata, + _fetch_byok_rag, + _fetch_solr_rag, + _format_rag_context, + _get_solr_vector_store_ids, + _is_solr_enabled, + build_rag_context, +) + + +class TestIsSolrEnabled: + """Tests for _is_solr_enabled function.""" + + def test_solr_enabled_true(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test when Solr is enabled in configuration.""" + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_solr_enabled = 
True + mocker.patch("utils.vector_search.configuration", config_mock) + assert _is_solr_enabled() is True + + def test_solr_enabled_false(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test when Solr is disabled in configuration.""" + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_solr_enabled = False + mocker.patch("utils.vector_search.configuration", config_mock) + assert _is_solr_enabled() is False + + +class TestGetSolrVectorStoreIds: # pylint: disable=too-few-public-methods + """Tests for _get_solr_vector_store_ids function.""" + + def test_returns_default_vector_store_id(self) -> None: + """Test that function returns the default Solr vector store ID.""" + result = _get_solr_vector_store_ids() + assert result == [constants.SOLR_DEFAULT_VECTOR_STORE_ID] + assert len(result) == 1 + + +class TestBuildQueryParams: + """Tests for _build_query_params function.""" + + def test_default_params(self) -> None: + """Test default parameters when no solr filters provided.""" + params = _build_query_params() + + assert params["k"] == constants.SOLR_VECTOR_SEARCH_DEFAULT_K + assert ( + params["score_threshold"] + == constants.SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD + ) + assert params["mode"] == constants.SOLR_VECTOR_SEARCH_DEFAULT_MODE + assert "solr" not in params + + def test_with_solr_filters(self) -> None: + """Test parameters when solr filters are provided.""" + solr_filters = {"filter": "value"} + params = _build_query_params(solr=solr_filters) + + assert params["solr"] == solr_filters + assert params["k"] == constants.SOLR_VECTOR_SEARCH_DEFAULT_K + + +class TestExtractByokRagChunks: + """Tests for _extract_byok_rag_chunks function.""" + + def test_extract_chunks_with_metadata(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test extraction of chunks with metadata.""" + # Create mock chunks + chunk1 = mocker.Mock() + chunk1.content = "Content 1" + chunk1.chunk_id = "chunk_1" + chunk1.metadata = {"document_id": "doc_1", "title": 
"Document 1"} + + chunk2 = mocker.Mock() + chunk2.content = "Content 2" + chunk2.chunk_id = "chunk_2" + chunk2.metadata = {"document_id": "doc_2", "title": "Document 2"} + + # Create mock search response + search_response = mocker.Mock() + search_response.chunks = [chunk1, chunk2] + search_response.scores = [0.9, 0.8] + + result = _extract_byok_rag_chunks( + search_response, vector_store_id="test_store", weight=1.5 + ) + + assert len(result) == 2 + assert result[0]["content"] == "Content 1" + assert result[0]["score"] == 0.9 + assert result[0]["weighted_score"] == 0.9 * 1.5 + assert result[0]["source"] == "test_store" + assert result[0]["doc_id"] == "doc_1" + + def test_extract_chunks_without_metadata(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test extraction of chunks without metadata.""" + chunk = mocker.Mock() + chunk.content = "Test content" + chunk.chunk_id = "chunk_id" + chunk.metadata = None + + search_response = mocker.Mock() + search_response.chunks = [chunk] + search_response.scores = [0.75] + + result = _extract_byok_rag_chunks( + search_response, vector_store_id="test_store", weight=1.0 + ) + + assert len(result) == 1 + assert result[0]["doc_id"] == "chunk_id" + assert result[0]["metadata"] == {} + + +class TestFormatRagContext: + """Tests for _format_rag_context function.""" + + def test_empty_chunks(self) -> None: + """Test formatting with empty chunks list.""" + result = _format_rag_context([], "test query") + assert result == "" + + def test_format_single_chunk(self) -> None: + """Test formatting with a single chunk.""" + chunks = [RAGChunk(content="Test content", source="test_source", score=0.95)] + result = _format_rag_context(chunks, "test query") + + assert "file_search found 1 chunks:" in result + assert "BEGIN of file_search results." in result + assert "Test content" in result + assert "document_id: test_source" in result + assert "score: 0.9500" in result + assert "END of file_search results." 
in result + assert 'answer the user\'s query: "test query"' in result + + def test_format_multiple_chunks(self) -> None: + """Test formatting with multiple chunks.""" + chunks = [ + RAGChunk(content="Content 1", source="source_1", score=0.9), + RAGChunk(content="Content 2", source="source_2", score=0.8), + RAGChunk( + content="Content 3", + source="source_3", + score=0.7, + attributes={"url": "http://example.com"}, + ), + ] + result = _format_rag_context(chunks, "test query") + + assert "file_search found 3 chunks:" in result + assert "Content 1" in result + assert "Content 2" in result + assert "Content 3" in result + assert "document_id: source_1" in result + assert "[1]" in result + assert "[2]" in result + assert "[3]" in result + + def test_format_chunk_with_attributes(self) -> None: + """Test formatting chunk with additional attributes.""" + chunks = [ + RAGChunk( + content="Test content", + source="test_source", + score=0.85, + attributes={"title": "Test Doc", "author": "John Doe"}, + ) + ] + result = _format_rag_context(chunks, "test query") + + assert "attributes:" in result + assert "title" in result or "author" in result + + +class TestExtractSolrDocumentMetadata: + """Tests for _extract_solr_document_metadata function.""" + + def test_extract_from_dict_metadata(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test extraction from dict-based metadata.""" + chunk = mocker.Mock() + chunk.metadata = { + "doc_id": "doc_123", + "title": "Test Document", + "reference_url": "https://example.com/doc", + } + + doc_id, title, reference_url = _extract_solr_document_metadata(chunk) + + assert doc_id == "doc_123" + assert title == "Test Document" + assert reference_url == "https://example.com/doc" + + def test_extract_from_chunk_metadata_object( # type: ignore[no-untyped-def] + self, mocker + ) -> None: + """Test extraction from typed chunk_metadata object.""" + chunk_meta = mocker.Mock() + chunk_meta.doc_id = "doc_456" + chunk_meta.title = "Another 
Document" + chunk_meta.reference_url = "https://example.com/another" + + chunk = mocker.Mock() + chunk.metadata = {} + chunk.chunk_metadata = chunk_meta + + doc_id, title, reference_url = _extract_solr_document_metadata(chunk) + + assert doc_id == "doc_456" + assert title == "Another Document" + assert reference_url == "https://example.com/another" + + def test_extract_with_missing_fields(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test extraction when some fields are missing.""" + chunk = mocker.Mock() + chunk.metadata = {"doc_id": "doc_789"} + + doc_id, title, reference_url = _extract_solr_document_metadata(chunk) + + assert doc_id == "doc_789" + assert title is None + assert reference_url is None + + +class TestBuildDocumentUrl: + """Tests for _build_document_url function.""" + + def test_offline_mode_with_doc_id(self) -> None: + """Test URL building in offline mode with doc_id.""" + doc_url, reference_doc = _build_document_url( + offline=True, doc_id="doc_123", reference_url=None + ) + + assert doc_url == constants.MIMIR_DOC_URL + "doc_123" + assert reference_doc == "doc_123" + + def test_online_mode_with_reference_url(self) -> None: + """Test URL building in online mode with reference_url.""" + doc_url, reference_doc = _build_document_url( + offline=False, + doc_id="doc_123", + reference_url="https://docs.example.com/page", + ) + + assert doc_url == "https://docs.example.com/page" + assert reference_doc == "https://docs.example.com/page" + + def test_online_mode_without_http(self) -> None: + """Test online mode when reference_url doesn't start with http.""" + doc_url, reference_doc = _build_document_url( + offline=False, doc_id="doc_123", reference_url="relative/path" + ) + + assert doc_url == constants.MIMIR_DOC_URL + "relative/path" + assert reference_doc == "relative/path" + + def test_offline_mode_without_doc_id(self) -> None: + """Test offline mode when doc_id is None.""" + doc_url, reference_doc = _build_document_url( + offline=True, 
doc_id=None, reference_url="https://example.com" + ) + + assert doc_url == "" + assert reference_doc is None + + +class TestConvertSolrChunksToRagFormat: + """Tests for _convert_solr_chunks_to_rag_format function.""" + + def test_convert_with_metadata_offline(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test conversion with metadata in offline mode.""" + chunk = mocker.Mock() + chunk.content = "Test content" + chunk.metadata = {"parent_id": "parent_123"} + chunk.chunk_metadata = None + + result = _convert_solr_chunks_to_rag_format([chunk], [0.85], offline=True) + + assert len(result) == 1 + assert result[0].content == "Test content" + assert result[0].source == constants.OKP_RAG_ID + assert result[0].score == 0.85 + assert "doc_url" in result[0].attributes + assert "parent_123" in result[0].attributes["doc_url"] + + def test_convert_with_metadata_online(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test conversion with metadata in online mode.""" + chunk = mocker.Mock() + chunk.content = "Test content" + chunk.metadata = {"reference_url": "https://example.com/doc"} + chunk.chunk_metadata = None + + result = _convert_solr_chunks_to_rag_format([chunk], [0.75], offline=False) + + assert len(result) == 1 + assert result[0].attributes["doc_url"] == "https://example.com/doc" + + def test_convert_with_chunk_metadata(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test conversion with chunk_metadata object.""" + chunk_meta = mocker.Mock() + chunk_meta.document_id = "doc_456" + + chunk = mocker.Mock() + chunk.content = "Test content" + chunk.metadata = {} + chunk.chunk_metadata = chunk_meta + + result = _convert_solr_chunks_to_rag_format([chunk], [0.9], offline=True) + + assert len(result) == 1 + assert result[0].attributes["document_id"] == "doc_456" + + def test_convert_multiple_chunks(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test conversion of multiple chunks.""" + chunk1 = mocker.Mock() + chunk1.content = "Content 
1" + chunk1.metadata = {"parent_id": "parent_1"} + chunk1.chunk_metadata = None + + chunk2 = mocker.Mock() + chunk2.content = "Content 2" + chunk2.metadata = {"parent_id": "parent_2"} + chunk2.chunk_metadata = None + + result = _convert_solr_chunks_to_rag_format( + [chunk1, chunk2], [0.9, 0.8], offline=True + ) + + assert len(result) == 2 + assert result[0].content == "Content 1" + assert result[1].content == "Content 2" + assert result[0].score == 0.9 + assert result[1].score == 0.8 + + +class TestFetchByokRag: + """Tests for _fetch_byok_rag async function.""" + + @pytest.mark.asyncio + async def test_byok_no_inline_ids(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test when no inline BYOK sources are configured.""" + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_byok_vector_store_ids = [] + mocker.patch("utils.vector_search.configuration", config_mock) + + client_mock = mocker.AsyncMock() + rag_chunks, referenced_docs = await _fetch_byok_rag(client_mock, "test query") + + assert rag_chunks == [] + assert referenced_docs == [] + client_mock.vector_io.query.assert_not_called() + + @pytest.mark.asyncio + async def test_byok_enabled_success(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test successful BYOK RAG fetch when inline IDs are configured.""" + # Mock configuration + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_byok_vector_store_ids = ["vs_1"] + config_mock.score_multiplier_mapping = {"vs_1": 1.5} + config_mock.rag_id_mapping = {"vs_1": "rag_1"} + mocker.patch("utils.vector_search.configuration", config_mock) + + # Mock search response + chunk_mock = mocker.Mock() + chunk_mock.content = "Test content" + chunk_mock.chunk_id = "chunk_1" + chunk_mock.metadata = { + "document_id": "doc_1", + "title": "Test Doc", + "reference_url": "https://example.com/doc", + } + + search_response = mocker.Mock() + search_response.chunks = [chunk_mock] + search_response.scores = [0.9] + + # Mock client + client_mock = 
mocker.AsyncMock() + client_mock.vector_io.query.return_value = search_response + + rag_chunks, referenced_docs = await _fetch_byok_rag(client_mock, "test query") + + assert len(rag_chunks) > 0 + assert rag_chunks[0].content == "Test content" + assert len(referenced_docs) > 0 + + +class TestFetchSolrRag: + """Tests for _fetch_solr_rag async function.""" + + @pytest.mark.asyncio + async def test_solr_disabled(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test when Solr is disabled.""" + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_solr_enabled = False + mocker.patch("utils.vector_search.configuration", config_mock) + + client_mock = mocker.AsyncMock() + rag_chunks, referenced_docs = await _fetch_solr_rag(client_mock, "test query") + + assert rag_chunks == [] + assert referenced_docs == [] + client_mock.vector_io.query.assert_not_called() + + @pytest.mark.asyncio + async def test_solr_enabled_success(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test successful Solr RAG fetch.""" + # Mock configuration + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_solr_enabled = True + config_mock.okp.offline = True + mocker.patch("utils.vector_search.configuration", config_mock) + + # Mock chunk + chunk_mock = mocker.Mock() + chunk_mock.content = "Solr content" + chunk_mock.metadata = {"parent_id": "parent_1", "title": "Solr Doc"} + chunk_mock.chunk_metadata = None + + # Mock query response + query_response = mocker.Mock() + query_response.chunks = [chunk_mock] + query_response.scores = [0.85] + + # Mock client + client_mock = mocker.AsyncMock() + client_mock.vector_io.query.return_value = query_response + + rag_chunks, _referenced_docs = await _fetch_solr_rag(client_mock, "test query") + + assert len(rag_chunks) > 0 + assert rag_chunks[0].content == "Solr content" + assert rag_chunks[0].source == constants.OKP_RAG_ID + + +class TestBuildRagContext: + """Tests for build_rag_context async function.""" + + 
@pytest.mark.asyncio + async def test_both_sources_disabled(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test when both BYOK inline and Solr inline are not configured.""" + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_byok_vector_store_ids = [] + config_mock.inline_solr_enabled = False + mocker.patch("utils.vector_search.configuration", config_mock) + + client_mock = mocker.AsyncMock() + context = await build_rag_context(client_mock, "test query", None) + + assert context.context_text == "" + assert context.rag_chunks == [] + assert context.referenced_documents == [] + + @pytest.mark.asyncio + async def test_byok_enabled_only(self, mocker) -> None: # type: ignore[no-untyped-def] + """Test when only inline BYOK is configured.""" + # Mock configuration + config_mock = mocker.Mock(spec=AppConfig) + config_mock.inline_byok_vector_store_ids = ["vs_1"] + config_mock.inline_solr_enabled = False + config_mock.score_multiplier_mapping = {"vs_1": 1.0} + config_mock.rag_id_mapping = {"vs_1": "rag_1"} + mocker.patch("utils.vector_search.configuration", config_mock) + + # Mock chunk + chunk_mock = mocker.Mock() + chunk_mock.content = "BYOK content" + chunk_mock.chunk_id = "chunk_1" + chunk_mock.metadata = {"document_id": "doc_1"} + + search_response = mocker.Mock() + search_response.chunks = [chunk_mock] + search_response.scores = [0.9] + + # Mock client + client_mock = mocker.AsyncMock() + client_mock.vector_io.query.return_value = search_response + + context = await build_rag_context(client_mock, "test query", None) + + assert len(context.rag_chunks) > 0 + assert "BYOK content" in context.context_text + assert "file_search found" in context.context_text From fec6fc87f3c9faf8a345017b19d83d7fcf1f6063 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:50:41 +0100 Subject: [PATCH 4/5] Rebased, solved conflicts --- src/app/endpoints/streaming_query.py | 55 +++------------------- 
src/configuration.py | 57 ----------------------- src/utils/responses.py | 27 +++++++---- src/utils/vector_search.py | 28 +++++++---- tests/unit/models/config/test_byok_rag.py | 2 +- tests/unit/utils/test_responses.py | 2 +- tests/unit/utils/test_vector_search.py | 18 +++++-- 7 files changed, 58 insertions(+), 131 deletions(-) diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index b2f68cd3b..6c9fe639d 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -56,7 +56,6 @@ UnauthorizedResponse, UnprocessableEntityResponse, ) -from utils.types import ReferencedDocument from utils.endpoints import ( check_configuration_loaded, validate_and_retrieve_conversation, @@ -186,29 +185,10 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals client = AsyncLlamaStackClientHolder().get_client() -<<<<<<< HEAD -<<<<<<< HEAD - _, _, doc_ids_from_chunks, pre_rag_chunks = await perform_vector_search( - client, query_request.query, query_request.solr - ) - - rag_context = format_rag_context_for_injection(pre_rag_chunks) - if rag_context: -======= - # Build RAG context from BYOK and Solr sources - rag_context = await build_rag_context(client, query_request, configuration) - - # Inject RAG context into query - if rag_context.context_text: - # Mutate a local copy to avoid surprising other logic ->>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) - query_request = query_request.model_copy(deep=True) - query_request.query = query_request.query + rag_context.context_text -======= # Build RAG context from Inline RAG sources - inline_rag_context = await build_rag_context(client, query_request, configuration) ->>>>>>> a4075c6d (Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation) - + inline_rag_context = await build_rag_context( + client, query_request.query, query_request.vector_store_ids, query_request.solr + ) # Prepare API request 
parameters responses_params = await prepare_responses_params( client=client, @@ -291,15 +271,7 @@ async def retrieve_response_generator( Args: responses_params: The Responses API parameters context: The response generator context -<<<<<<< HEAD -<<<<<<< HEAD - doc_ids_from_chunks: List of ReferencedDocument objects extracted from static RAG -======= - pre_rag_documents: Referenced documents from pre-query RAG (BYOK + Solr) ->>>>>>> 2ace88f7 (Add chunk prioritization and always RAG support) -======= - inline_rag_documents: Referenced documents from pre-query RAG (BYOK + Solr) ->>>>>>> a4075c6d (Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation) + inline_rag_documents: Referenced documents from inline RAG (BYOK + Solr) Returns: tuple[AsyncIterator[str], TurnSummary]: The response generator and turn summary @@ -771,25 +743,10 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat rag_id_mapping=context.rag_id_mapping, ) -<<<<<<< HEAD + # Merge pre-RAG documents with tool-based documents and deduplicate turn_summary.referenced_documents = deduplicate_referenced_documents( - tool_based_documents + turn_summary.pre_rag_documents + turn_summary.inline_rag_documents + tool_based_documents ) -======= - # Merge pre-RAG documents with tool-based documents (similar to query.py) - if turn_summary.inline_rag_documents: - all_documents = turn_summary.inline_rag_documents + tool_based_documents - seen = set() - deduplicated_documents = [] - for doc in all_documents: - key = (doc.doc_url, doc.doc_title) - if key not in seen: - seen.add(key) - deduplicated_documents.append(doc) - turn_summary.referenced_documents = deduplicated_documents - else: - turn_summary.referenced_documents = tool_based_documents ->>>>>>> a4075c6d (Address review: rename always RAG to inline RAG, Solr config to OKP, fix query mutation) def stream_http_error_event( diff --git a/src/configuration.py b/src/configuration.py index 
130761709..609c22bba 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -427,63 +427,6 @@ def inline_solr_enabled(self) -> bool: raise LogicError("logic error: configuration is not loaded") return constants.OKP_RAG_ID in self._configuration.rag.inline - @property - def inline_byok_vector_store_ids(self) -> list[str]: - """Return vector store IDs for the BYOK sources listed in rag.inline. - - Maps non-okp rag_ids in rag.inline to their corresponding vector_db_ids - from the byok_rag configuration. IDs that are not found in byok_rag are - silently skipped. - - Returns: - list[str]: Ordered list of vector_db_ids for inline BYOK RAG. - - Raises: - LogicError: If the configuration has not been loaded. - """ - if self._configuration is None: - raise LogicError("logic error: configuration is not loaded") - inline_ids = [ - rid for rid in self._configuration.rag.inline if rid != constants.OKP_RAG_ID - ] - rag_to_vdb = { - brag.rag_id: brag.vector_db_id for brag in self._configuration.byok_rag - } - return [rag_to_vdb[rid] for rid in inline_ids if rid in rag_to_vdb] - - @property - def tool_vector_store_ids(self) -> Optional[list[str]]: - """Return vector store IDs for tool RAG, or None to use all registered stores. - - When rag.tool is None (default), returns None to signal that all - registered vector stores should be used (backward compatibility). - - When rag.tool is an explicit list, maps rag_ids to vector_db_ids and - includes the OKP vector store ID for the special 'okp-rag' entry. - - Returns: - Optional[list[str]]: List of vector_db_ids for tool RAG, or None - when all registered stores should be used. - - Raises: - LogicError: If the configuration has not been loaded. 
- """ - if self._configuration is None: - raise LogicError("logic error: configuration is not loaded") - tool_ids = self._configuration.rag.tool - if tool_ids is None: - return None - rag_to_vdb = { - brag.rag_id: brag.vector_db_id for brag in self._configuration.byok_rag - } - result = [] - for rid in tool_ids: - if rid == constants.OKP_RAG_ID: - result.append(constants.SOLR_DEFAULT_VECTOR_STORE_ID) - elif rid in rag_to_vdb: - result.append(rag_to_vdb[rid]) - return result - def resolve_index_name( self, vector_store_id: str, rag_id_mapping: Optional[dict[str, str]] = None ) -> str: diff --git a/src/utils/responses.py b/src/utils/responses.py index 7a502ecc3..034ca0b04 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -167,14 +167,19 @@ async def prepare_tools( # pylint: disable=too-many-arguments,too-many-position return None toolgroups: list[InputTool] = [] - # Per-request vector_store_ids override takes priority. - # When not provided, use config-based tool list (or None = all stores). - effective_ids = ( - vector_store_ids - if vector_store_ids is not None - else configuration.tool_vector_store_ids - ) - effective_ids = await get_vector_store_ids(client, effective_ids) + + # Priority: per-request IDs > rag.tool config > all registered stores. + # In all cases, customer-facing rag_ids are translated to internal vector_db_ids. + # IDs fetched from llama-stack are already internal and need no translation. 
+ byok_rags = configuration.configuration.byok_rag + if vector_store_ids is not None: + effective_ids: list[str] = resolve_vector_store_ids(vector_store_ids, byok_rags) + elif configuration.configuration.rag.tool is not None: + effective_ids = resolve_vector_store_ids( + configuration.configuration.rag.tool, byok_rags + ) + else: + effective_ids = await get_vector_store_ids(client, None) # Add RAG tools if vector stores are available rag_tools = get_rag_tools(effective_ids) @@ -350,10 +355,11 @@ def extract_vector_store_ids_from_tools( def resolve_vector_store_ids( vector_store_ids: list[str], byok_rags: list[ByokRag] ) -> list[str]: - """Translate customer-facing BYOK rag_ids to llama-stack vector_db_ids. + """Translate customer-facing rag_ids to llama-stack vector_db_ids. Each ID is looked up against the BYOK RAG configuration. If a matching ``rag_id`` is found, the corresponding ``vector_db_id`` is returned. + The special ``okp-rag`` ID is mapped to the Solr vector store ID. Otherwise the ID is passed through unchanged (assumed to already be a llama-stack vector store ID). @@ -366,6 +372,9 @@ def resolve_vector_store_ids( List of llama-stack vector_db_ids ready for the Llama Stack API. 
""" rag_id_to_vector_db_id = {brag.rag_id: brag.vector_db_id for brag in byok_rags} + rag_id_to_vector_db_id[constants.OKP_RAG_ID] = ( + constants.SOLR_DEFAULT_VECTOR_STORE_ID + ) return [rag_id_to_vector_db_id.get(vs_id, vs_id) for vs_id in vector_store_ids] diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index f507e2ed8..485914e0b 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -16,6 +16,7 @@ from configuration import configuration from log import get_logger from models.responses import ReferencedDocument +from utils.responses import resolve_vector_store_ids from utils.types import RAGChunk, RAGContext logger = get_logger(__name__) @@ -45,10 +46,10 @@ def _build_query_params(solr: Optional[dict[str, Any]] = None) -> dict[str, Any] "mode": constants.SOLR_VECTOR_SEARCH_DEFAULT_MODE, } logger.debug("Initial params: %s", params) - logger.debug("query_request.solr: %s", query_request.solr) + logger.debug("query_request.solr: %s", solr) - if query_request.solr: - params["solr"] = query_request.solr + if solr: + params["solr"] = solr logger.debug("Final params with solr filters: %s", params) else: logger.debug("No solr filters provided") @@ -316,7 +317,6 @@ def _process_solr_chunks_for_documents( async def _fetch_byok_rag( client: AsyncLlamaStackClient, query: str, - configuration: AppConfig, vector_store_ids: Optional[list[str]] = None, ) -> tuple[list[RAGChunk], list[ReferencedDocument]]: """Fetch chunks and documents from BYOK RAG sources. 
@@ -347,7 +347,14 @@ async def _fetch_byok_rag( if vs_id != constants.SOLR_DEFAULT_VECTOR_STORE_ID ] else: - vector_store_ids_to_query = configuration.inline_byok_vector_store_ids + inline_rag_ids = [ + rid + for rid in configuration.configuration.rag.inline + if rid != constants.OKP_RAG_ID + ] + vector_store_ids_to_query = resolve_vector_store_ids( + inline_rag_ids, configuration.configuration.byok_rag + ) # If inline byok stores are not defined, we disable the inline RAG for backward compatibility if not vector_store_ids_to_query: @@ -410,8 +417,8 @@ async def _fetch_byok_rag( async def _fetch_solr_rag( client: AsyncLlamaStackClient, - query_request: QueryRequest, - configuration: AppConfig, + query: str, + solr: Optional[dict[str, Any]] = None, ) -> tuple[list[RAGChunk], list[ReferencedDocument]]: """Fetch chunks and documents from Solr RAG source. @@ -486,8 +493,9 @@ async def _fetch_solr_rag( async def build_rag_context( client: AsyncLlamaStackClient, - query_request: QueryRequest, - configuration: AppConfig, + query: str, + vector_store_ids: Optional[list[str]], + solr: Optional[dict[str, Any]] = None, ) -> RAGContext: """Build RAG context by fetching and merging chunks from all enabled sources. 
@@ -512,7 +520,7 @@ async def build_rag_context( # Merge chunks from all sources (BYOK + Solr) context_chunks = byok_chunks + solr_chunks - context_text = _format_rag_context(context_chunks, query_request.query) + context_text = _format_rag_context(context_chunks, query) logger.debug( "Inline RAG context built: %d chunks, %d characters", diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py index 832d99a42..8cf71ede5 100644 --- a/tests/unit/models/config/test_byok_rag.py +++ b/tests/unit/models/config/test_byok_rag.py @@ -28,7 +28,7 @@ def test_byok_rag_configuration_default_values() -> None: assert byok_rag.embedding_model == DEFAULT_EMBEDDING_MODEL assert byok_rag.embedding_dimension == DEFAULT_EMBEDDING_DIMENSION assert byok_rag.vector_db_id == "vector_db_id" - assert byok_rag.db_path == Path("tests/configuration/rag.txt") + assert byok_rag.db_path == "tests/configuration/rag.txt" assert byok_rag.score_multiplier == DEFAULT_SCORE_MULTIPLIER diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index ba1e4bf94..7d84f515c 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -24,7 +24,6 @@ from pytest_mock import MockerFixture import constants -from configuration import AppConfig from models.config import ByokRag, ModelContextProtocolServer from models.requests import QueryRequest from utils.responses import ( @@ -1122,6 +1121,7 @@ async def test_does_not_translate_when_ids_fetched_from_llama_stack( mock_byok_rag.vector_db_id = "vs-translated" mock_config = mocker.Mock() mock_config.configuration.byok_rag = [mock_byok_rag] + mock_config.configuration.rag.tool = None mocker.patch("utils.responses.configuration", mock_config) result = await prepare_tools(mock_client, None, False, "token") diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py index 683a733c3..4930cb846 100644 --- 
a/tests/unit/utils/test_vector_search.py +++ b/tests/unit/utils/test_vector_search.py @@ -351,7 +351,8 @@ class TestFetchByokRag: async def test_byok_no_inline_ids(self, mocker) -> None: # type: ignore[no-untyped-def] """Test when no inline BYOK sources are configured.""" config_mock = mocker.Mock(spec=AppConfig) - config_mock.inline_byok_vector_store_ids = [] + config_mock.configuration.rag.inline = [] + config_mock.configuration.byok_rag = [] mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -366,7 +367,11 @@ async def test_byok_enabled_success(self, mocker) -> None: # type: ignore[no-un """Test successful BYOK RAG fetch when inline IDs are configured.""" # Mock configuration config_mock = mocker.Mock(spec=AppConfig) - config_mock.inline_byok_vector_store_ids = ["vs_1"] + byok_rag_mock = mocker.Mock() + byok_rag_mock.rag_id = "rag_1" + byok_rag_mock.vector_db_id = "vs_1" + config_mock.configuration.rag.inline = ["rag_1"] + config_mock.configuration.byok_rag = [byok_rag_mock] config_mock.score_multiplier_mapping = {"vs_1": 1.5} config_mock.rag_id_mapping = {"vs_1": "rag_1"} mocker.patch("utils.vector_search.configuration", config_mock) @@ -451,7 +456,8 @@ class TestBuildRagContext: async def test_both_sources_disabled(self, mocker) -> None: # type: ignore[no-untyped-def] """Test when both BYOK inline and Solr inline are not configured.""" config_mock = mocker.Mock(spec=AppConfig) - config_mock.inline_byok_vector_store_ids = [] + config_mock.configuration.rag.inline = [] + config_mock.configuration.byok_rag = [] config_mock.inline_solr_enabled = False mocker.patch("utils.vector_search.configuration", config_mock) @@ -467,7 +473,11 @@ async def test_byok_enabled_only(self, mocker) -> None: # type: ignore[no-untyp """Test when only inline BYOK is configured.""" # Mock configuration config_mock = mocker.Mock(spec=AppConfig) - config_mock.inline_byok_vector_store_ids = ["vs_1"] + byok_rag_mock = mocker.Mock() + 
byok_rag_mock.rag_id = "rag_1" + byok_rag_mock.vector_db_id = "vs_1" + config_mock.configuration.rag.inline = ["rag_1"] + config_mock.configuration.byok_rag = [byok_rag_mock] config_mock.inline_solr_enabled = False config_mock.score_multiplier_mapping = {"vs_1": 1.0} config_mock.rag_id_mapping = {"vs_1": "rag_1"} From a722e04b6dfe697416eba09f7a927796d0ccb80a Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Thu, 5 Mar 2026 13:41:46 +0100 Subject: [PATCH 5/5] Addressing further comments --- docs/byok_guide.md | 18 +++++++++--------- docs/config.md | 12 ++++++------ docs/openapi.json | 10 +++++----- docs/rag_guide.md | 6 +++--- examples/lightspeed-stack-byok-okp-rag.yaml | 8 ++++---- src/configuration.py | 2 +- src/constants.py | 4 ++-- src/llama_stack_configuration.py | 2 +- src/models/config.py | 10 +++++----- src/utils/responses.py | 2 +- .../models/config/test_rag_configuration.py | 17 +++++++++-------- 11 files changed, 46 insertions(+), 45 deletions(-) diff --git a/docs/byok_guide.md b/docs/byok_guide.md index e9390fd62..4adac9fd7 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -277,7 +277,7 @@ registered_resources: > [!TIP] > Instead of manually editing `run.yaml`, you can declare your knowledge sources in the `byok_rag` -> section of `lightspeed-stack.yaml`. The service automatically generates the required configuration +> section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration > at startup. > > ```yaml @@ -297,32 +297,32 @@ registered_resources: ### Step 5: Configure RAG Strategy Add a `rag` section to your `lightspeed-stack.yaml` to choose how BYOK knowledge is used. -Each list entry is a `rag_id` from `byok_rag`, or the special value `okp-rag` for OKP. +Each list entry is a `rag_id` from `byok_rag`, or the special value `okp` for OKP. 
```yaml rag: # Inline RAG: inject context before the LLM request (no tool calls needed) inline: - my-docs # rag_id from byok_rag - - okp-rag # include OKP context inline + - okp # include OKP context inline # Tool RAG: the LLM can call file_search to retrieve context on demand # Omit to use all registered BYOK stores (backward compatibility) tool: - my-docs # expose this BYOK store as the file_search tool - - okp-rag # expose OKP as the file_search tool + - okp # expose OKP as the file_search tool -# OKP provider settings (only relevant when okp-rag is listed above) +# OKP provider settings (only relevant when okp is listed above) okp: offline: true # true = use parent_id for source URLs, false = use reference_url ``` Both modes can be enabled simultaneously. Choose based on your latency and control preferences: -| Mode | When context is fetched | Tool call needed | Supported sources | score_multiplier | -|------|------------------------|------------------|-------------------|-----------------| -| Inline RAG | With every query | No | BYOK + OKP | Yes (BYOK only) | -| Tool RAG | On LLM demand | Yes | BYOK + OKP | No | +| Mode | When context is fetched | Tool call needed | score_multiplier | +|------|------------------------|------------------|-----------------| +| Inline RAG | With every query | No | Yes (BYOK only) | +| Tool RAG | On LLM demand | Yes | No | > [!TIP] > A ready-to-use example combining BYOK and OKP is available at diff --git a/docs/config.md b/docs/config.md index 4ecb3b635..8ba10ad7e 100644 --- a/docs/config.md +++ b/docs/config.md @@ -110,7 +110,7 @@ Microsoft Entra ID authentication attributes for Azure. BYOK (Bring Your Own Knowledge) RAG configuration. -Each entry registers a local vector store with the service. The `rag_id` is the +Each entry registers a local vector store. The `rag_id` is the identifier used in `rag.inline` and `rag.tool` to select which stores to use. Example: @@ -548,7 +548,7 @@ Top-level RAG strategy configuration. 
Controls two complementary retrieval modes context on demand from the listed vector stores. Supports both BYOK and OKP. Each strategy is configured as a list of RAG IDs referencing entries in `byok_rag`. -The special ID `okp-rag` activates the OKP provider (no `byok_rag` entry needed). +The special ID `okp` activates the OKP provider (no `byok_rag` entry needed). **Backward compatibility**: omitting `tool` uses all registered BYOK vector stores (equivalent to the old `tool.byok.enabled = True`). Omitting `inline` means no @@ -561,7 +561,7 @@ rag: inline: - my-docs # inject context from my-docs before the LLM request tool: - - okp-rag # LLM can search OKP as a tool + - okp # LLM can search OKP as a tool - my-docs # LLM can also search my-docs as a tool okp: @@ -571,13 +571,13 @@ okp: | Field | Type | Description | |-------|------|-------------| -| inline | list[string] | RAG IDs whose content is injected before the LLM request. Use `okp-rag` for OKP. Empty by default (no inline RAG). | -| tool | list[string] or null | RAG IDs exposed as a `file_search` tool the LLM can invoke. Use `okp-rag` to include OKP. When omitted, all registered BYOK vector stores are used (backward compatibility). | +| inline | list[string] | RAG IDs whose content is injected before the LLM request. Use `okp` for OKP. Empty by default (no inline RAG). | +| tool | list[string] or null | RAG IDs exposed as a `file_search` tool the LLM can invoke. Use `okp` to include OKP. When omitted, all registered BYOK vector stores are used (backward compatibility). | ## OkpConfiguration -OKP (Offline Knowledge Portal) provider settings. Only used when `okp-rag` is listed in `rag.inline` or `rag.tool`. +OKP (Offline Knowledge Portal) provider settings. Only used when `okp` is listed in `rag.inline` or `rag.tool`. 
Example: diff --git a/docs/openapi.json b/docs/openapi.json index 571b67546..1f855a6d1 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -5729,7 +5729,7 @@ "okp": { "$ref": "#/components/schemas/OkpConfiguration", "title": "OKP configuration", - "description": "OKP provider settings. Only used when 'okp-rag' is listed in rag.inline or rag.tool." + "description": "OKP provider settings. Only used when 'okp' is listed in rag.inline or rag.tool." } }, "additionalProperties": false, @@ -7598,7 +7598,7 @@ "additionalProperties": false, "type": "object", "title": "OkpConfiguration", - "description": "OKP (Offline Knowledge Portal) provider configuration.\n\nControls provider-specific behaviour for the OKP vector store.\nOnly relevant when ``\"okp-rag\"`` is listed in ``rag.inline`` or ``rag.tool``." + "description": "OKP (Offline Knowledge Portal) provider configuration.\n\nControls provider-specific behaviour for the OKP vector store.\nOnly relevant when ``\"okp\"`` is listed in ``rag.inline`` or ``rag.tool``." }, "OpenIdConnectSecurityScheme": { "properties": { @@ -8782,7 +8782,7 @@ }, "type": "array", "title": "Inline RAG IDs", - "description": "RAG IDs whose sources are injected as context before the LLM call. Use 'okp-rag' to enable OKP inline RAG. Empty by default (no inline RAG)." + "description": "RAG IDs whose sources are injected as context before the LLM call. Use 'okp' to enable OKP inline RAG. Empty by default (no inline RAG)." }, "tool": { "anyOf": [ @@ -8797,13 +8797,13 @@ } ], "title": "Tool RAG IDs", - "description": "RAG IDs made available to the LLM as a file_search tool. Use 'okp-rag' to include the OKP vector store. When omitted, all registered BYOK vector stores are used (backward compatibility)." + "description": "RAG IDs made available to the LLM as a file_search tool. Use 'okp' to include the OKP vector store. When omitted, all registered BYOK vector stores are used (backward compatibility)." 
} }, "additionalProperties": false, "type": "object", "title": "RagConfiguration", - "description": "RAG strategy configuration.\n\nControls which RAG sources are used for inline and tool-based retrieval.\n\nEach strategy lists RAG IDs to include. The special ID ``\"okp-rag\"`` defined in constants,\nactivates the OKP provider; all other IDs refer to entries in ``byok_rag``.\n\nBackward compatibility:\n - ``inline`` defaults to ``[]`` (no inline RAG).\n - ``tool`` defaults to ``None`` which means all registered vector stores\n are used (identical to the previous ``tool.byok.enabled = True`` default)." + "description": "RAG strategy configuration.\n\nControls which RAG sources are used for inline and tool-based retrieval.\n\nEach strategy lists RAG IDs to include. The special ID ``\"okp\"`` defined in constants,\nactivates the OKP provider; all other IDs refer to entries in ``byok_rag``.\n\nBackward compatibility:\n - ``inline`` defaults to ``[]`` (no inline RAG).\n - ``tool`` defaults to ``None`` which means all registered vector stores\n are used (identical to the previous ``tool.byok.enabled = True`` default)." 
}, "ReadinessResponse": { "properties": { diff --git a/docs/rag_guide.md b/docs/rag_guide.md index dd7cc9e87..fbf2e1eb2 100644 --- a/docs/rag_guide.md +++ b/docs/rag_guide.md @@ -278,9 +278,9 @@ The OKP (Offline Knowledge Portal) Solr Vector IO is a read-only vector search p ```yaml rag: inline: - - okp-rag # inject OKP context before the LLM request + - okp # inject OKP context before the LLM request tool: - - okp-rag # expose OKP as the file_search tool + - okp # expose OKP as the file_search tool okp: offline: true # true = use parent_id for source URLs (offline mode) @@ -288,7 +288,7 @@ okp: ``` > [!NOTE] -> When `okp-rag` is listed in `rag.inline` or `rag.tool`, Lightspeed Stack automatically enriches +> When `okp` is listed in `rag.inline` or `rag.tool`, Lightspeed Stack automatically enriches > the Llama Stack `run.yaml` at startup with the required `vector_io` provider and `registered_resources` > entries for the OKP vector store. No manual registration is needed. diff --git a/examples/lightspeed-stack-byok-okp-rag.yaml b/examples/lightspeed-stack-byok-okp-rag.yaml index 09a0cb85c..3cd358ff1 100644 --- a/examples/lightspeed-stack-byok-okp-rag.yaml +++ b/examples/lightspeed-stack-byok-okp-rag.yaml @@ -51,19 +51,19 @@ byok_rag: # RAG configuration rag: # Inline RAG: context injected before the LLM request from the listed sources - # List rag_ids from byok_rag, or 'okp-rag' to include OKP + # List rag_ids from byok_rag, or 'okp' to include OKP inline: - ocp-docs - knowledge-base - - okp-rag + - okp # Tool RAG: LLM can call file_search on demand to retrieve context - # List rag_ids from byok_rag, or 'okp-rag' to include OKP + # List rag_ids from byok_rag, or 'okp' to include OKP # Omit to use all registered BYOK stores (backward compatibility) tool: - ocp-docs - knowledge-base -# OKP provider settings (only used when 'okp-rag' is listed in rag.inline or rag.tool) +# OKP provider settings (only used when 'okp' is listed in rag.inline or rag.tool) okp: offline: 
true # true = use parent_id for source URLs, false = use reference_url # Solr fq applied to every OKP search request. Combine with AND for extra constraints: diff --git a/src/configuration.py b/src/configuration.py index 609c22bba..c9ea8e4af 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -418,7 +418,7 @@ def inline_solr_enabled(self) -> bool: """Return whether OKP is included in the inline RAG list. Returns: - bool: True if 'okp-rag' appears in rag.inline, False otherwise. + bool: True if 'okp' appears in rag.inline, False otherwise. Raises: LogicError: If the configuration has not been loaded. diff --git a/src/constants.py b/src/constants.py index 3d1f64973..0c5437fb2 100644 --- a/src/constants.py +++ b/src/constants.py @@ -181,7 +181,7 @@ # Solr OKP constants SOLR_VECTOR_SEARCH_DEFAULT_K = 5 -SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.0 +SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.3 SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid" # SOLR OKP RAG @@ -202,7 +202,7 @@ DEFAULT_SCORE_MULTIPLIER = 1.0 # Special RAG ID that activates the OKP provider when listed in rag.inline or rag.tool -OKP_RAG_ID = "okp-rag" +OKP_RAG_ID = "okp" # Logging configuration constants # Environment variable name for configurable log level diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index 9314c49d5..fba64ee8b 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -543,7 +543,7 @@ def generate_configuration( # Enrichment: BYOK RAG enrich_byok_rag(ls_config, config.get("byok_rag", [])) - # Enrichment: Solr - enabled when "okp-rag" appears in either inline or tool list + # Enrichment: Solr - enabled when "okp" appears in either inline or tool list rag_config = config.get("rag", {}) inline_ids = rag_config.get("inline") or [] tool_ids = rag_config.get("tool") or [] diff --git a/src/models/config.py b/src/models/config.py index e4954bf97..af29553ce 100644 --- a/src/models/config.py +++ b/src/models/config.py 
@@ -1699,7 +1699,7 @@ class RagConfiguration(ConfigurationBase): Controls which RAG sources are used for inline and tool-based retrieval. - Each strategy lists RAG IDs to include. The special ID ``"okp-rag"`` defined in constants, + Each strategy lists RAG IDs to include. The special ID ``"okp"`` defined in constants, activates the OKP provider; all other IDs refer to entries in ``byok_rag``. Backward compatibility: @@ -1712,14 +1712,14 @@ class RagConfiguration(ConfigurationBase): default_factory=list, title="Inline RAG IDs", description="RAG IDs whose sources are injected as context before the LLM call. " - "Use 'okp-rag' to enable OKP inline RAG. Empty by default (no inline RAG).", + f"Use '{constants.OKP_RAG_ID}' to enable OKP inline RAG. Empty by default (no inline RAG).", ) tool: Optional[list[str]] = Field( default=None, title="Tool RAG IDs", description="RAG IDs made available to the LLM as a file_search tool. " - "Use 'okp-rag' to include the OKP vector store. " + f"Use '{constants.OKP_RAG_ID}' to include the OKP vector store. " "When omitted, all registered BYOK vector stores are used (backward compatibility).", ) @@ -1728,7 +1728,7 @@ class OkpConfiguration(ConfigurationBase): """OKP (Offline Knowledge Portal) provider configuration. Controls provider-specific behaviour for the OKP vector store. - Only relevant when ``"okp-rag"`` is listed in ``rag.inline`` or ``rag.tool``. + Only relevant when ``"okp"`` is listed in ``rag.inline`` or ``rag.tool``. """ offline: bool = Field( @@ -1895,7 +1895,7 @@ class Configuration(ConfigurationBase): okp: OkpConfiguration = Field( default_factory=OkpConfiguration, title="OKP configuration", - description="OKP provider settings. Only used when 'okp-rag' is listed " + description=f"OKP provider settings. 
Only used when '{constants.OKP_RAG_ID}' is listed " "in rag.inline or rag.tool.", ) diff --git a/src/utils/responses.py b/src/utils/responses.py index 034ca0b04..b44fb8d28 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -359,7 +359,7 @@ def resolve_vector_store_ids( Each ID is looked up against the BYOK RAG configuration. If a matching ``rag_id`` is found, the corresponding ``vector_db_id`` is returned. - The special ``okp-rag`` ID is mapped to the Solr vector store ID. + The special ``okp`` ID is mapped to the Solr vector store ID. Otherwise the ID is passed through unchanged (assumed to already be a llama-stack vector store ID). diff --git a/tests/unit/models/config/test_rag_configuration.py b/tests/unit/models/config/test_rag_configuration.py index a29f195b2..f13539189 100644 --- a/tests/unit/models/config/test_rag_configuration.py +++ b/tests/unit/models/config/test_rag_configuration.py @@ -6,6 +6,7 @@ import pytest from pydantic import ValidationError +import constants from models.config import OkpConfiguration, RagConfiguration @@ -25,19 +26,19 @@ def test_inline_with_byok_ids(self) -> None: assert config.tool is None def test_inline_with_okp_rag(self) -> None: - """Test inline list including the special okp-rag ID.""" - config = RagConfiguration(inline=["okp-rag", "store-1"]) - assert "okp-rag" in config.inline + """Test inline list including the special OKP ID.""" + config = RagConfiguration(inline=[constants.OKP_RAG_ID, "store-1"]) + assert constants.OKP_RAG_ID in config.inline assert "store-1" in config.inline def test_tool_with_okp_rag_and_byok(self) -> None: - """Test tool list with okp-rag and BYOK IDs.""" + """Test tool list with OKP and BYOK IDs.""" config = RagConfiguration( inline=["store-1"], - tool=["okp-rag", "store-1"], + tool=[constants.OKP_RAG_ID, "store-1"], ) assert config.inline == ["store-1"] - assert config.tool == ["okp-rag", "store-1"] + assert config.tool == [constants.OKP_RAG_ID, "store-1"] def 
test_tool_empty_list(self) -> None: """Test that an explicit empty tool list disables tool RAG.""" @@ -57,10 +58,10 @@ def test_no_unknown_fields_allowed(self) -> None: def test_fully_custom_config(self) -> None: """Test RagConfiguration with all fields set.""" config = RagConfiguration( - inline=["okp-rag", "store-1"], + inline=[constants.OKP_RAG_ID, "store-1"], tool=["store-1"], ) - assert "okp-rag" in config.inline + assert constants.OKP_RAG_ID in config.inline assert "store-1" in config.inline assert config.tool == ["store-1"]