diff --git a/EdgeCraftRAG/Dockerfile.server b/EdgeCraftRAG/Dockerfile.server index 32701f6a32..fe245073e0 100644 --- a/EdgeCraftRAG/Dockerfile.server +++ b/EdgeCraftRAG/Dockerfile.server @@ -4,11 +4,18 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN apt-get update && apt-get install -y gnupg2 wget git RUN apt-get remove -y libze-intel-gpu1 libigc1 libigdfcl1 libze-dev || true; \ apt-get update; \ - apt-get install -y curl + apt-get install -y curl jq RUN curl -sL 'https://keyserver.ubuntu.com/pks/lookup?fingerprint=on&op=get&search=0x0C0E6AF955CE463C03FC51574D098D70AFBE5E1F' | tee /etc/apt/trusted.gpg.d/driver.asc RUN echo -e "Types: deb\nURIs: https://ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/\nSuites: questing\nComponents: main\nSigned-By: /etc/apt/trusted.gpg.d/driver.asc" > /etc/apt/sources.list.d/driver.sources -RUN apt-get update && apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc && apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc && apt-get install -y libze-dev intel-ocloc libze-intel-gpu-raytracing - +RUN apt-get update && apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc && apt-get install -y libze-intel-gpu1 libze1 intel-metrics-discovery intel-opencl-icd clinfo intel-gsc && apt-get install -y libze-dev intel-ocloc libze-intel-gpu-raytracing libglib2.0-0 libgl1 libmagic-dev poppler-utils tesseract-ocr pandoc +RUN url=$(curl -s https://api.github.com/repos/intel/linux-npu-driver/releases/latest \ + | jq -r '.assets[] | select(.name | test("ubuntu2404\\.tar\\.gz$")) | .browser_download_url' \ + | head -n 1) \ + && wget -O /tmp/linux-npu-driver.tar.gz "$url" \ + && tar -xf /tmp/linux-npu-driver.tar.gz -C /tmp \ + && dpkg -i /tmp/*.deb || apt-get install -f -y \ + && rm -rf /tmp/linux-npu-driver* + RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ chown -R user /home/user/ diff --git a/EdgeCraftRAG/README.md b/EdgeCraftRAG/README.md index cef19d1749..0f225c9b91 100644 --- a/EdgeCraftRAG/README.md +++ b/EdgeCraftRAG/README.md @@ -7,9 +7,10 @@ quality and performance. ## What's New -1. Support decouple operation for pipeline and knowledge base -2. Optimize Agentic workflow user experience -3. User Guide enhancement +1. Support Agent component and enable deep_search agent +2. Optimize pipeline execution performance with asynchronous api +3. Support session list display in UI +4. Support vllm-based embedding service ## Table of contents diff --git a/EdgeCraftRAG/assets/img/Explore_Edge_Craft_RAG_02.jpg b/EdgeCraftRAG/assets/img/Explore_Edge_Craft_RAG_02.jpg index e60cea1158..2e857af72e 100644 Binary files a/EdgeCraftRAG/assets/img/Explore_Edge_Craft_RAG_02.jpg and b/EdgeCraftRAG/assets/img/Explore_Edge_Craft_RAG_02.jpg differ diff --git a/EdgeCraftRAG/cli/README.md b/EdgeCraftRAG/cli/README.md new file mode 100644 index 0000000000..98dfaf5eb6 --- /dev/null +++ b/EdgeCraftRAG/cli/README.md @@ -0,0 +1,376 @@ +# EdgeCraft RAG CLI + +A command-line interface tool for managing EdgeCraft RAG system components including pipelines, models, knowledge bases, agents, and more. 
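+
+For a quick first run, a typical session looks like this (a minimal sketch, assuming the EdgeCraftRAG server is already running on its default ports):
+
+```bash
+# Install the CLI from EdgeCraftRAG/cli/, verify connectivity, then ask a question
+pip install -e .
+ecrag system info            # prints system information as JSON if the server is reachable
+ecrag query "What is RAG?"   # default ChatQnA shortcut (/v1/chatqna)
+```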
+ +## Implementation Notes + +The CLI package is organized as follows: + +- `main.py`: Click command groups and subcommands +- `client.py`: `EcragApiClient` REST wrapper +- `config.py`: environment-based connection config +- `quickstart.py`: quick start and connectivity check helper +- `setup.py`: package metadata and `ecrag` console entry point + +## Installation + +Requires Python 3.8+. + +### Recommended + +```bash +# From EdgeCraftRAG/cli/ +pip install -e . + +# Verify +ecrag --help +``` + +If your OS uses externally-managed Python (PEP 668), use: + +```bash +pip install --break-system-packages -e . +ecrag --help +``` + +### Optional (non-editable install) + +```bash +pip install . +ecrag --help +``` + +## Usage + +The CLI is organized into logical command groups for different system components. + +### Basic Syntax + +```bash +ecrag [--host HOST] [--port PORT] [--mega-port MEGA_PORT] COMMAND [OPTIONS] +``` + +```bash +# Default top-level query uses ChatQnA (/v1/chatqna) +ecrag query "Your question" +``` + +### Global Options + +- `--host`: Server host URL (default: `http://localhost`) +- `--port`: Server port (default: `16010`) +- `--mega-port`: Mega service port (default: `16011`) + +## Commands + +### Pipeline Management + +```bash +# List all pipelines +ecrag pipeline list + +# Create a pipeline +ecrag pipeline create --name my_pipeline --file pipeline.json + +# Get pipeline details +ecrag pipeline get --name my_pipeline + +# Get pipeline JSON +ecrag pipeline get-json --name my_pipeline + +# Activate/Deactivate pipeline +ecrag pipeline activate --name my_pipeline +ecrag pipeline deactivate --name my_pipeline + +# Delete pipeline +ecrag pipeline delete --name my_pipeline + +# Get benchmark data +ecrag pipeline benchmark --name my_pipeline + +# Import pipeline from file +ecrag pipeline import-pipeline --file pipeline.json +``` + +### Model Management + +```bash +# Load a model +ecrag model load --type LLM --id model-id --path /path/to/model --device cpu + +# List all models +ecrag model list + +# Get model info +ecrag model get --id model-id + +# Update model +ecrag model update --id model-id --device gpu + +# Delete model +ecrag model delete --id model-id + +# Get available weights +ecrag model weights --id model-id + +# List available models by type +ecrag model available --type LLM +ecrag model available --type embedding +``` + +### Knowledge Base Management + +```bash +# Create knowledge base +ecrag kb create --name my_kb --description "My knowledge base" + +# List knowledge bases +ecrag kb list + +# Get knowledge base details +ecrag kb get --name my_kb + +# Get knowledge base JSON +ecrag kb get-json --name my_kb + +# Get file map (pagination) +ecrag kb filemap --name my_kb --page 1 --size 20 + +# Update knowledge base +ecrag kb update --name my_kb --description "Updated" --active true + +# Delete knowledge base +ecrag kb delete --name my_kb + +# Add files to knowledge base +ecrag kb add-files --name my_kb --paths /path/to/file1 /path/to/file2 + +# Delete files from knowledge base +ecrag kb delete-files --name my_kb --paths /path/to/file1 +``` + +### Experience Management + +```bash +# List all experiences +ecrag experience list + +# Get experience +ecrag experience get --id exp-id + +# Create/Update experience +ecrag experience create --id exp-id --question "What is X?" 
--content "Answer" + +# Delete experience +ecrag experience delete --id exp-id + +# Load experiences from file +ecrag experience load-file --file experiences.json +``` + +### Agent Management + +```bash +# List all agents +ecrag agent list + +# Get agent details +ecrag agent get --name my_agent + +# Get agent type configs +ecrag agent configs --type react_llm + +# Create agent +ecrag agent create --name my_agent --type react_llm --pipeline pipeline-idx + +# Update agent +ecrag agent update --name my_agent --active true + +# Delete agent +ecrag agent delete --name my_agent +``` + +### Prompt Management + +```bash +# Get current prompt +ecrag prompt get + +# Get tagged prompt +ecrag prompt get-tagged + +# Get default prompt +ecrag prompt get-default + +# Set prompt from text +ecrag prompt set --text "Your prompt text here" + +# Set prompt from file +ecrag prompt set --file prompt.txt + +# Reset to default prompt +ecrag prompt reset +``` + +### Data Management + +```bash +# Get all nodes +ecrag data nodes + +# Get nodes by document +ecrag data nodes-by-doc --name document_name + +# List all documents +ecrag data documents + +# List all files +ecrag data files + +# Get specific file +ecrag data get-file --name filename + +# Upload file +ecrag data upload --name filename --path /path/to/file +``` + +### Session Management + +```bash +# List all sessions +ecrag session list + +# Get session details +ecrag session get --id session-id +``` + +### System Information + +```bash +# Get system info (CPU, memory, disk, etc.) +ecrag system info + +# Get available devices +ecrag system devices +``` + +### Chat and Query + +```bash +# Shortcut for ChatQnA +ecrag query "Your question" + +# Retrieve relevant chunks +ecrag chat retrieve --query "Your question" --top-n 5 + +# Run RAG pipeline +ecrag chat rag --query "Your question" --top-n 5 + +# Run mega service (full pipeline) +ecrag chat mega --query "Your question" --top-n 5 + +# Check vLLM connection +ecrag chat check-vllm --server http://localhost:8086 --model "Qwen/Qwen3-8B" +``` + +## Examples + +### Create a Knowledge Base and Add Files + +```bash +# Create KB +ecrag kb create --name research_kb --description "Research papers" + +# Add files +ecrag kb add-files --name research_kb --paths /data/paper1.pdf /data/paper2.pdf +``` + +### Load a Model and Create Pipeline + +```bash +# Load embedding model +ecrag model load --type embedding --id BAAI/bge-base-en --device cpu + +# Create pipeline with the model +ecrag pipeline create --name my_pipeline --file pipeline_config.json + +# Activate pipeline +ecrag pipeline activate --name my_pipeline +``` + +### Query the System + +```bash +# Run the default ChatQnA shortcut +ecrag query "What is RAG?" + +# Equivalent explicit subcommand +ecrag chat mega --query "What is RAG?" --top-n 5 + +# RAG pipeline only +ecrag chat rag --query "What is RAG?" --top-n 5 + +# Retrieval only +ecrag chat retrieve --query "What is RAG?" --top-n 5 +``` + +## Configuration + +The CLI reads the following environment variables (optional): + +- `ECRAG_HOST`: Server host (default: `http://localhost`) +- `ECRAG_PORT`: Server port (default: `16010`) +- `ECRAG_MEGA_PORT`: Mega service port (default: `16011`) + +## Error Handling + +The CLI will display error messages from the API in JSON format. Network errors and other issues will be reported with descriptive error messages. 
+ +## Tips + +- Use `--help` with any command to see detailed help: + ```bash + ecrag pipeline --help + ecrag pipeline create --help + ``` + +- Pipe JSON output to other tools: + ```bash + ecrag kb list | jq '.[]' | head -n 20 + ``` + +- Use confirmation prompts for destructive operations: + ```bash + # Will ask before deleting + ecrag pipeline delete --name old_pipeline + ``` + +## API Reference + +The CLI wraps the following API endpoints: + +- **Pipelines**: `/v1/settings/pipelines` +- **Models**: `/v1/settings/models` +- **Knowledge Bases**: `/v1/knowledge` +- **Experiences**: `/v1/experiences` +- **Agents**: `/v1/agents` +- **Prompts**: `/v1/chatqna/prompt` +- **Data**: `/v1/data/*` +- **Sessions**: `/v1/sessions` +- **System**: `/v1/system/*` +- **Chat/Query**: `/v1/retrieval`, `/v1/ragqna`, `/v1/chatqna` + +For more details, see the main [API_Guide.md](../docs/API_Guide.md). + +## Endpoint Mapping Details + +The CLI maps command groups to REST endpoints as follows: + +- `pipeline`: `GET/POST /v1/settings/pipelines`, `GET/PATCH/DELETE /v1/settings/pipelines/{name}` +- `model`: `GET/POST /v1/settings/models`, `GET/PATCH/DELETE /v1/settings/models/{id}` +- `kb`: `GET/POST /v1/knowledge`, `GET/DELETE /v1/knowledge/{name}`, `PATCH /v1/knowledge/patch`, `POST/DELETE /v1/knowledge/{name}/files` +- `experience`: `GET /v1/experiences`, `POST /v1/experience`, `PATCH/DELETE /v1/experiences` +- `agent`: `GET/POST /v1/agents`, `GET/PATCH/DELETE /v1/agents/{name}` +- `prompt`: `GET/POST /v1/chatqna/prompt`, `POST /v1/chatqna/prompt/reset` +- `data`: `GET /v1/data/nodes`, `GET /v1/data/documents`, `GET /v1/data/files`, `POST /v1/data/file/{name}` +- `chat`: `POST /v1/retrieval`, `POST /v1/ragqna`, `POST /v1/chatqna` diff --git a/EdgeCraftRAG/cli/__init__.py b/EdgeCraftRAG/cli/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/EdgeCraftRAG/cli/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/EdgeCraftRAG/cli/client.py b/EdgeCraftRAG/cli/client.py new file mode 100644 index 0000000000..b96c2d7ffb --- /dev/null +++ b/EdgeCraftRAG/cli/client.py @@ -0,0 +1,376 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import requests +from typing import Optional, Dict, Any +from urllib.parse import urljoin + + +class EcragApiClient: + """API client for Edge Craft RAG.""" + + def __init__(self, host: str = "http://localhost", server_port: int = 16010, mega_port: int = 16011): + """Initialize the API client. 
+ + Args: + host: The host URL (default: http://localhost) + server_port: The server port (default: 16010) + mega_port: The mega service port (default: 16011) + """ + # Normalize host URL + if not host.startswith(("http://", "https://")): + host = f"http://{host}" + + # Remove trailing slash if present + host = host.rstrip("/") + + self.server_url = f"{host}:{server_port}" + self.mega_url = f"{host}:{mega_port}" + + def _request(self, method: str, url: str, **kwargs) -> Dict[str, Any]: + """Make an HTTP request.""" + try: + response = requests.request(method, url, **kwargs) + response.raise_for_status() + return response.json() if response.text else {} + except requests.exceptions.RequestException as e: + return {"error": str(e)} + + # Pipeline Management + def create_pipeline(self, pipeline_data: Dict[str, Any]) -> Dict[str, Any]: + """Create a pipeline.""" + url = urljoin(self.server_url, "/v1/settings/pipelines") + return self._request("POST", url, json=pipeline_data, headers={"Content-Type": "application/json"}) + + def get_pipelines(self, gen_type: Optional[str] = None) -> Dict[str, Any]: + """Get all pipelines.""" + url = urljoin(self.server_url, "/v1/settings/pipelines") + params = {"gen_type": gen_type} if gen_type else {} + return self._request("GET", url, params=params, headers={"Content-Type": "application/json"}) + + def get_pipeline(self, name: str) -> Dict[str, Any]: + """Get a specific pipeline.""" + url = urljoin(self.server_url, f"/v1/settings/pipelines/{name}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_pipeline_json(self, name: str) -> Dict[str, Any]: + """Get pipeline JSON data.""" + url = urljoin(self.server_url, f"/v1/settings/pipelines/{name}/json") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def update_pipeline(self, name: str, pipeline_data: Dict[str, Any]) -> Dict[str, Any]: + """Update a pipeline.""" + url = urljoin(self.server_url, f"/v1/settings/pipelines/{name}") + return self._request("PATCH", url, json=pipeline_data, headers={"Content-Type": "application/json"}) + + def activate_pipeline(self, name: str) -> Dict[str, Any]: + """Activate a pipeline.""" + return self.update_pipeline(name, {"active": True}) + + def deactivate_pipeline(self, name: str) -> Dict[str, Any]: + """Deactivate a pipeline.""" + return self.update_pipeline(name, {"active": False}) + + def delete_pipeline(self, name: str) -> Dict[str, Any]: + """Delete a pipeline.""" + url = urljoin(self.server_url, f"/v1/settings/pipelines/{name}") + return self._request("DELETE", url, headers={"Content-Type": "application/json"}) + + def get_pipeline_benchmark(self) -> Dict[str, Any]: + """Get the active pipeline's benchmark.""" + url = urljoin(self.server_url, "/v1/settings/pipeline/benchmark") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_pipeline_benchmarks(self, name: str) -> Dict[str, Any]: + """Get a specific pipeline's benchmark.""" + url = urljoin(self.server_url, f"/v1/settings/pipelines/{name}/benchmarks") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def import_pipeline(self, file_path: str) -> Dict[str, Any]: + """Import pipeline from a JSON file.""" + url = urljoin(self.server_url, "/v1/settings/pipelines/import") + with open(file_path, "rb") as f: + return self._request("POST", url, files={"file": f}) + + # Model Management + def load_model(self, model_data: Dict[str, Any]) -> Dict[str, Any]: + """Load a model.""" + 
url = urljoin(self.server_url, "/v1/settings/models") + return self._request("POST", url, json=model_data, headers={"Content-Type": "application/json"}) + + def get_models(self) -> Dict[str, Any]: + """Get all models.""" + url = urljoin(self.server_url, "/v1/settings/models") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_model(self, model_id: str) -> Dict[str, Any]: + """Get a specific model.""" + url = urljoin(self.server_url, f"/v1/settings/models/{model_id}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def update_model(self, model_id: str, model_data: Dict[str, Any]) -> Dict[str, Any]: + """Update a model.""" + url = urljoin(self.server_url, f"/v1/settings/models/{model_id}") + return self._request("PATCH", url, json=model_data, headers={"Content-Type": "application/json"}) + + def delete_model(self, model_id: str) -> Dict[str, Any]: + """Delete a model.""" + url = urljoin(self.server_url, f"/v1/settings/models/{model_id}") + return self._request("DELETE", url, headers={"Content-Type": "application/json"}) + + def get_model_weights(self, model_id: str) -> Dict[str, Any]: + """Get available model weights.""" + url = urljoin(self.server_url, f"/v1/settings/weight/{model_id}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_available_models(self, model_type: str, server_address: Optional[str] = None) -> Dict[str, Any]: + """Get available models by type.""" + url = urljoin(self.server_url, f"/v1/settings/avail-models/{model_type}") + params = {"server_address": server_address} if server_address else {} + return self._request("GET", url, params=params, headers={"Content-Type": "application/json"}) + + # Knowledge Base Management + def create_knowledge_base(self, kb_data: Dict[str, Any]) -> Dict[str, Any]: + """Create a knowledge base.""" + url = urljoin(self.server_url, "/v1/knowledge") + return self._request("POST", url, json=kb_data, headers={"Content-Type": "application/json"}) + + def get_knowledge_bases(self) -> Dict[str, Any]: + """Get all knowledge bases.""" + url = urljoin(self.server_url, "/v1/knowledge") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_knowledge_base(self, kb_name: str) -> Dict[str, Any]: + """Get a specific knowledge base.""" + url = urljoin(self.server_url, f"/v1/knowledge/{kb_name}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_knowledge_base_json(self, kb_name: str) -> Dict[str, Any]: + """Get knowledge base JSON data.""" + url = urljoin(self.server_url, f"/v1/knowledge/{kb_name}/json") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_knowledge_base_filemap(self, kb_name: str, page_num: int = 1, page_size: int = 20) -> Dict[str, Any]: + """Get knowledge base file map.""" + url = urljoin(self.server_url, f"/v1/knowledge/{kb_name}/filemap") + params = {"page_num": page_num, "page_size": page_size} + return self._request("GET", url, params=params, headers={"Content-Type": "application/json"}) + + def update_knowledge_base(self, kb_data: Dict[str, Any]) -> Dict[str, Any]: + """Update a knowledge base.""" + url = urljoin(self.server_url, "/v1/knowledge/patch") + return self._request("PATCH", url, json=kb_data, headers={"Content-Type": "application/json"}) + + def delete_knowledge_base(self, kb_name: str) -> Dict[str, Any]: + """Delete a knowledge base.""" + url = urljoin(self.server_url, 
f"/v1/knowledge/{kb_name}") + return self._request("DELETE", url, headers={"Content-Type": "application/json"}) + + def add_files_to_kb(self, kb_name: str, local_paths: list) -> Dict[str, Any]: + """Add files to a knowledge base.""" + url = urljoin(self.server_url, f"/v1/knowledge/{kb_name}/files") + return self._request("POST", url, json={"local_paths": local_paths}, headers={"Content-Type": "application/json"}) + + def delete_files_from_kb(self, kb_name: str, local_paths: list) -> Dict[str, Any]: + """Delete files from a knowledge base.""" + url = urljoin(self.server_url, f"/v1/knowledge/{kb_name}/files") + return self._request("DELETE", url, json={"local_paths": local_paths}, headers={"Content-Type": "application/json"}) + + # Experience Management + def get_experiences(self) -> Dict[str, Any]: + """Get all experiences.""" + url = urljoin(self.server_url, "/v1/experiences") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_experience(self, exp_id: str) -> Dict[str, Any]: + """Get a specific experience.""" + url = urljoin(self.server_url, "/v1/experience") + return self._request("POST", url, json={"idx": exp_id}, headers={"Content-Type": "application/json"}) + + def create_experience(self, exp_data: Dict[str, Any]) -> Dict[str, Any]: + """Create an experience.""" + return self.update_experience(exp_data) + + def update_experience(self, exp_data: Dict[str, Any]) -> Dict[str, Any]: + """Update an experience.""" + url = urljoin(self.server_url, "/v1/experiences") + return self._request("PATCH", url, json=exp_data, headers={"Content-Type": "application/json"}) + + def delete_experience(self, exp_id: str) -> Dict[str, Any]: + """Delete an experience.""" + url = urljoin(self.server_url, "/v1/experiences") + return self._request("DELETE", url, json={"idx": exp_id}, headers={"Content-Type": "application/json"}) + + def add_experiences_from_file(self, file_path: str) -> Dict[str, Any]: + """Add experiences from a file.""" + url = urljoin(self.server_url, "/v1/experiences/files") + return self._request("POST", url, json={"local_path": file_path}, headers={"Content-Type": "application/json"}) + + def check_multiple_experiences(self, experiences: list) -> Dict[str, Any]: + """Check multiple experiences for duplicates.""" + url = urljoin(self.server_url, "/v1/multiple_experiences/check") + return self._request("POST", url, json=experiences, headers={"Content-Type": "application/json"}) + + def confirm_multiple_experiences(self, experiences: list, flag: bool = True) -> Dict[str, Any]: + """Confirm multiple experiences.""" + url = urljoin(self.server_url, f"/v1/multiple_experiences/confirm?flag={flag}") + return self._request("POST", url, json=experiences, headers={"Content-Type": "application/json"}) + + # Agent Management + def get_agents(self) -> Dict[str, Any]: + """Get all agents.""" + url = urljoin(self.server_url, "/v1/agents") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_agent(self, agent_name: str) -> Dict[str, Any]: + """Get a specific agent.""" + url = urljoin(self.server_url, f"/v1/agents/{agent_name}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_agent_configs(self, agent_type: str) -> Dict[str, Any]: + """Get default configs for an agent type.""" + url = urljoin(self.server_url, f"/v1/agents/configs/{agent_type}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def create_agent(self, agent_data: Dict[str, Any]) -> 
Dict[str, Any]: + """Create an agent.""" + url = urljoin(self.server_url, "/v1/agents") + return self._request("POST", url, json=agent_data, headers={"Content-Type": "application/json"}) + + def update_agent(self, agent_name: str, agent_data: Dict[str, Any]) -> Dict[str, Any]: + """Update an agent.""" + url = urljoin(self.server_url, f"/v1/agents/{agent_name}") + return self._request("PATCH", url, json=agent_data, headers={"Content-Type": "application/json"}) + + def delete_agent(self, agent_name: str) -> Dict[str, Any]: + """Delete an agent.""" + url = urljoin(self.server_url, f"/v1/agents/{agent_name}") + return self._request("DELETE", url, headers={"Content-Type": "application/json"}) + + # Prompt Management + def get_prompt(self) -> Dict[str, Any]: + """Get the current system prompt.""" + url = urljoin(self.server_url, "/v1/chatqna/prompt") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_tagged_prompt(self) -> Dict[str, Any]: + """Get the tagged system prompt.""" + url = urljoin(self.server_url, "/v1/chatqna/prompt/tagged") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_default_prompt(self) -> Dict[str, Any]: + """Get the default system prompt.""" + url = urljoin(self.server_url, "/v1/chatqna/prompt/default") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def update_prompt(self, prompt: str) -> Dict[str, Any]: + """Update the system prompt.""" + url = urljoin(self.server_url, "/v1/chatqna/prompt") + return self._request("POST", url, json={"prompt": prompt}, headers={"Content-Type": "application/json"}) + + def upload_prompt_from_file(self, file_path: str) -> Dict[str, Any]: + """Upload system prompt from a file.""" + url = urljoin(self.server_url, "/v1/chatqna/prompt-file") + with open(file_path, "rb") as f: + return self._request("POST", url, files={"file": f}) + + def reset_prompt(self) -> Dict[str, Any]: + """Reset system prompt to default.""" + url = urljoin(self.server_url, "/v1/chatqna/prompt/reset") + return self._request("POST", url, headers={"Content-Type": "application/json"}) + + # Data Management + def get_nodes(self) -> Dict[str, Any]: + """Get all nodes in the active knowledge base.""" + url = urljoin(self.server_url, "/v1/data/nodes") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_nodes_by_document(self, document_name: str) -> Dict[str, Any]: + """Get nodes by document name.""" + url = urljoin(self.server_url, f"/v1/data/{document_name}/nodes") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_documents(self) -> Dict[str, Any]: + """Get all document names in the active knowledge base.""" + url = urljoin(self.server_url, "/v1/data/documents") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_files(self) -> Dict[str, Any]: + """Get all files.""" + url = urljoin(self.server_url, "/v1/data/files") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_file(self, file_name: str) -> Dict[str, Any]: + """Get a specific file.""" + url = urljoin(self.server_url, f"/v1/data/files/{file_name}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def upload_file(self, file_name: str, file_path: str) -> Dict[str, Any]: + """Upload a file.""" + url = urljoin(self.server_url, f"/v1/data/file/{file_name}") + with open(file_path, "rb") as f: + return 
self._request("POST", url, files={"file": f}) + + # Session Management + def get_sessions(self) -> Dict[str, Any]: + """Get all sessions.""" + url = urljoin(self.server_url, "/v1/sessions") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_session(self, session_id: str) -> Dict[str, Any]: + """Get a specific session.""" + url = urljoin(self.server_url, f"/v1/session/{session_id}") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + # System Information + def get_system_info(self) -> Dict[str, Any]: + """Get system information.""" + url = urljoin(self.server_url, "/v1/system/info") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + def get_available_devices(self) -> Dict[str, Any]: + """Get available inference devices.""" + url = urljoin(self.server_url, "/v1/system/device") + return self._request("GET", url, headers={"Content-Type": "application/json"}) + + # Chat/Query APIs + def retrieval(self, messages: str, top_n: int = 5, max_tokens: int = 512) -> Dict[str, Any]: + """Retrieve relevant context chunks.""" + url = urljoin(self.server_url, "/v1/retrieval") + return self._request( + "POST", + url, + json={"messages": messages, "top_n": top_n, "max_tokens": max_tokens}, + headers={"Content-Type": "application/json"}, + ) + + def chatqna(self, messages: str, top_n: int = 5, max_tokens: int = 512) -> Dict[str, Any]: + """Run full RAG pipeline through mega service.""" + url = urljoin(self.mega_url, "/v1/chatqna") + return self._request( + "POST", + url, + json={"messages": messages, "top_n": top_n, "max_tokens": max_tokens}, + headers={"Content-Type": "application/json"}, + ) + + def ragqna(self, messages: str, top_n: int = 5, max_tokens: int = 512, stream: bool = False) -> Dict[str, Any]: + """Run RAG pipeline with contexts in response.""" + url = urljoin(self.server_url, "/v1/ragqna") + return self._request( + "POST", + url, + json={"messages": messages, "top_n": top_n, "max_tokens": max_tokens, "stream": stream}, + headers={"Content-Type": "application/json"}, + ) + + def check_vllm_connection(self, server_address: str, model_name: str) -> Dict[str, Any]: + """Check vLLM server connection.""" + url = urljoin(self.server_url, "/v1/check/vllm") + return self._request( + "POST", + url, + json={"server_address": server_address, "model_name": model_name}, + headers={"Content-Type": "application/json"}, + ) diff --git a/EdgeCraftRAG/cli/config.py b/EdgeCraftRAG/cli/config.py new file mode 100644 index 0000000000..4308d629db --- /dev/null +++ b/EdgeCraftRAG/cli/config.py @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Configuration for EdgeCraft RAG CLI.""" + +import os +from typing import Optional + + +class CLIConfig: + """CLI configuration from environment variables.""" + + def __init__(self): + """Initialize configuration from environment variables.""" + self.host = os.getenv("ECRAG_HOST", "http://localhost") + self.port = int(os.getenv("ECRAG_PORT", "16010")) + self.mega_port = int(os.getenv("ECRAG_MEGA_PORT", "16011")) + + def get_server_url(self) -> str: + """Get the server URL.""" + return f"{self.host}:{self.port}" + + def get_mega_url(self) -> str: + """Get the mega service URL.""" + return f"{self.host}:{self.mega_port}" + + +def get_config() -> CLIConfig: + """Get CLI configuration.""" + return CLIConfig() diff --git a/EdgeCraftRAG/cli/main.py b/EdgeCraftRAG/cli/main.py new file mode 100644 index 0000000000..33d7df33f3 --- 
/dev/null +++ b/EdgeCraftRAG/cli/main.py @@ -0,0 +1,788 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import click +import os +from pathlib import Path +from typing import Optional +from cli.client import EcragApiClient +from cli.config import get_config + + +def pretty_print(data): + """Pretty print JSON data.""" + click.echo(json.dumps(data, indent=2)) + + +def run_rag_query(client: EcragApiClient, query: str, top_n: int, max_tokens: int): + """Run the standard RAG query flow used by both query and chat rag.""" + result = client.ragqna(query, top_n, max_tokens) + pretty_print(result) + + +def run_chatqna_query(client: EcragApiClient, query: str, top_n: int, max_tokens: int): + """Run the default ChatQnA query flow used by the top-level query command.""" + result = client.chatqna(query, top_n, max_tokens) + pretty_print(result) + + +@click.group() +@click.option("--host", default=None, help="Server host URL (env: ECRAG_HOST)") +@click.option("--port", default=None, type=int, help="Server port (env: ECRAG_PORT)") +@click.option("--mega-port", default=None, type=int, help="Mega service port (env: ECRAG_MEGA_PORT)") +@click.pass_context +def cli(ctx, host: Optional[str], port: Optional[int], mega_port: Optional[int]): + """EdgeCraft RAG CLI Tool. + + Configure server connection via command-line options or environment variables: + - ECRAG_HOST: Server host (default: http://localhost) + - ECRAG_PORT: Server port (default: 16010) + - ECRAG_MEGA_PORT: Mega service port (default: 16011) + """ + ctx.ensure_object(dict) + + # Get defaults from config + config = get_config() + + # Use provided options or environment/defaults + final_host = host or config.host + final_port = port or config.port + final_mega_port = mega_port or config.mega_port + + # Normalize host URL + if not final_host.startswith(("http://", "https://")): + final_host = f"http://{final_host}" + + ctx.obj["client"] = EcragApiClient(host=final_host, server_port=final_port, mega_port=final_mega_port) + + +# ============== Pipeline Commands ============== + + +@cli.group() +def pipeline(): + """Manage pipelines.""" + pass + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.option("-f", "--file", type=click.Path(exists=True), help="Pipeline JSON file") +@click.option("-d", "--data", help="Pipeline data as JSON string") +@click.pass_context +def create(ctx, name: str, file: Optional[str], data: Optional[str]): + """Create a new pipeline.""" + client = ctx.obj["client"] + + if file: + with open(file, "r") as f: + pipeline_data = json.load(f) + elif data: + pipeline_data = json.loads(data) + else: + click.echo("Error: either --file or --data must be provided") + return + + pipeline_data["name"] = name + result = client.create_pipeline(pipeline_data) + pretty_print(result) + + +@pipeline.command() +@click.pass_context +def list(ctx): + """List all pipelines.""" + client = ctx.obj["client"] + result = client.get_pipelines() + pretty_print(result) + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.pass_context +def get(ctx, name: str): + """Get a specific pipeline.""" + client = ctx.obj["client"] + result = client.get_pipeline(name) + pretty_print(result) + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.pass_context +def get_json(ctx, name: str): + """Get pipeline JSON data.""" + client = ctx.obj["client"] + result = 
client.get_pipeline_json(name) + pretty_print(result) + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.pass_context +def activate(ctx, name: str): + """Activate a pipeline.""" + client = ctx.obj["client"] + result = client.activate_pipeline(name) + pretty_print(result) + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.pass_context +def deactivate(ctx, name: str): + """Deactivate a pipeline.""" + client = ctx.obj["client"] + result = client.deactivate_pipeline(name) + pretty_print(result) + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.pass_context +def delete(ctx, name: str): + """Delete a pipeline.""" + client = ctx.obj["client"] + if click.confirm(f"Are you sure you want to delete pipeline '{name}'?"): + result = client.delete_pipeline(name) + pretty_print(result) + + +@pipeline.command() +@click.option("-n", "--name", required=True, help="Pipeline name") +@click.pass_context +def benchmark(ctx, name: Optional[str]): + """Get pipeline benchmark data.""" + client = ctx.obj["client"] + if name: + result = client.get_pipeline_benchmarks(name) + else: + result = client.get_pipeline_benchmark() + pretty_print(result) + + +@pipeline.command() +@click.option("-f", "--file", type=click.Path(exists=True), required=True, help="Pipeline JSON file to import") +@click.pass_context +def import_pipeline(ctx, file: str): + """Import a pipeline from JSON file.""" + client = ctx.obj["client"] + result = client.import_pipeline(file) + pretty_print(result) + + +# ============== Model Commands ============== + + +@cli.group() +def model(): + """Manage models.""" + pass + + +@model.command() +@click.option("--type", "model_type", default="LLM", help="Model type (LLM, vLLM, reranker, embedding, etc.)") +@click.option("--id", "model_id", required=True, help="Model ID") +@click.option("--path", "model_path", default="./", help="Model path") +@click.option("--device", default="cpu", help="Device (cpu, gpu)") +@click.option("--weight", default="INT4", help="Weight type (INT4, INT8, FP16)") +@click.pass_context +def load(ctx, model_type: str, model_id: str, model_path: str, device: str, weight: str): + """Load a model.""" + client = ctx.obj["client"] + model_data = { + "model_type": model_type, + "model_id": model_id, + "model_path": model_path, + "device": device, + "weight": weight, + } + result = client.load_model(model_data) + pretty_print(result) + + +@model.command() +@click.pass_context +def list(ctx): + """List all models.""" + client = ctx.obj["client"] + result = client.get_models() + pretty_print(result) + + +@model.command() +@click.option("--id", "model_id", required=True, help="Model ID") +@click.pass_context +def get(ctx, model_id: str): + """Get a specific model.""" + client = ctx.obj["client"] + result = client.get_model(model_id) + pretty_print(result) + + +@model.command() +@click.option("--id", "model_id", required=True, help="Model ID") +@click.option("--device", help="New device") +@click.option("--weight", help="New weight") +@click.pass_context +def update(ctx, model_id: str, device: Optional[str], weight: Optional[str]): + """Update a model.""" + client = ctx.obj["client"] + model_data = {} + if device: + model_data["device"] = device + if weight: + model_data["weight"] = weight + result = client.update_model(model_id, model_data) + pretty_print(result) + + +@model.command() +@click.option("--id", "model_id", required=True, help="Model 
ID") +@click.pass_context +def delete(ctx, model_id: str): + """Delete a model.""" + client = ctx.obj["client"] + if click.confirm(f"Are you sure you want to delete model '{model_id}'?"): + result = client.delete_model(model_id) + pretty_print(result) + + +@model.command() +@click.option("--id", "model_id", required=True, help="Model ID") +@click.pass_context +def weights(ctx, model_id: str): + """Get available weights for a model.""" + client = ctx.obj["client"] + result = client.get_model_weights(model_id) + pretty_print(result) + + +@model.command() +@click.option("--type", "model_type", required=True, help="Model type (LLM, vLLM, reranker, embedding, etc.)") +@click.option("--server", help="vLLM server address (optional)") +@click.pass_context +def available(ctx, model_type: str, server: Optional[str]): + """List available models by type.""" + client = ctx.obj["client"] + result = client.get_available_models(model_type, server) + pretty_print(result) + + +# ============== Knowledge Base Commands ============== + + +@cli.group() +def kb(): + """Manage knowledge bases.""" + pass + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.option("--description", help="Knowledge base description") +@click.option("-f", "--file", type=click.Path(exists=True), help="KB config JSON file") +@click.pass_context +def create(ctx, name: str, description: Optional[str], file: Optional[str]): + """Create a knowledge base.""" + client = ctx.obj["client"] + + if file: + with open(file, "r") as f: + kb_data = json.load(f) + else: + kb_data = {"name": name} + if description: + kb_data["description"] = description + + result = client.create_knowledge_base(kb_data) + pretty_print(result) + + +@kb.command() +@click.pass_context +def list(ctx): + """List all knowledge bases.""" + client = ctx.obj["client"] + result = client.get_knowledge_bases() + pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.pass_context +def get(ctx, name: str): + """Get a specific knowledge base.""" + client = ctx.obj["client"] + result = client.get_knowledge_base(name) + pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.pass_context +def get_json(ctx, name: str): + """Get knowledge base JSON data.""" + client = ctx.obj["client"] + result = client.get_knowledge_base_json(name) + pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.option("--page", "page_num", default=1, type=int, help="Page number") +@click.option("--size", "page_size", default=20, type=int, help="Page size") +@click.pass_context +def filemap(ctx, name: str, page_num: int, page_size: int): + """Get knowledge base file map.""" + client = ctx.obj["client"] + result = client.get_knowledge_base_filemap(name, page_num, page_size) + pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.option("--active", type=bool, help="Set active status") +@click.option("--description", help="Update description") +@click.pass_context +def update(ctx, name: str, active: Optional[bool], description: Optional[str]): + """Update a knowledge base.""" + client = ctx.obj["client"] + kb_data = {"name": name} + if active is not None: + kb_data["active"] = active + if description: + kb_data["description"] = description + result = client.update_knowledge_base(kb_data) + 
pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.pass_context +def delete(ctx, name: str): + """Delete a knowledge base.""" + client = ctx.obj["client"] + if click.confirm(f"Are you sure you want to delete knowledge base '{name}'?"): + result = client.delete_knowledge_base(name) + pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.option("--paths", multiple=True, required=True, help="File paths to add") +@click.pass_context +def add_files(ctx, name: str, paths: tuple): + """Add files to a knowledge base.""" + client = ctx.obj["client"] + result = client.add_files_to_kb(name, list(paths)) + pretty_print(result) + + +@kb.command() +@click.option("-n", "--name", required=True, help="Knowledge base name") +@click.option("--paths", multiple=True, required=True, help="File paths to delete") +@click.pass_context +def delete_files(ctx, name: str, paths: tuple): + """Delete files from a knowledge base.""" + client = ctx.obj["client"] + result = client.delete_files_from_kb(name, list(paths)) + pretty_print(result) + + +# ============== Experience Commands ============== + + +@cli.group() +def experience(): + """Manage experiences (Q&A pairs).""" + pass + + +@experience.command() +@click.pass_context +def list(ctx): + """List all experiences.""" + client = ctx.obj["client"] + result = client.get_experiences() + pretty_print(result) + + +@experience.command() +@click.option("--id", required=True, help="Experience ID") +@click.pass_context +def get(ctx, id: str): + """Get a specific experience.""" + client = ctx.obj["client"] + result = client.get_experience(id) + pretty_print(result) + + +@experience.command() +@click.option("--id", required=True, help="Experience ID") +@click.option("--question", required=True, help="Question") +@click.option("--content", multiple=True, required=True, help="Answer content") +@click.pass_context +def create(ctx, id: str, question: str, content: tuple): + """Create or update an experience.""" + client = ctx.obj["client"] + exp_data = {"idx": id, "question": question, "content": list(content)} + result = client.update_experience(exp_data) + pretty_print(result) + + +@experience.command() +@click.option("--id", required=True, help="Experience ID") +@click.pass_context +def delete(ctx, id: str): + """Delete an experience.""" + client = ctx.obj["client"] + if click.confirm(f"Are you sure you want to delete experience '{id}'?"): + result = client.delete_experience(id) + pretty_print(result) + + +@experience.command() +@click.option("-f", "--file", type=click.Path(exists=True), required=True, help="Experiences JSON file") +@click.pass_context +def load_file(ctx, file: str): + """Load experiences from a file.""" + client = ctx.obj["client"] + result = client.add_experiences_from_file(file) + pretty_print(result) + + +# ============== Agent Commands ============== + + +@cli.group() +def agent(): + """Manage agents.""" + pass + + +@agent.command() +@click.pass_context +def list(ctx): + """List all agents.""" + client = ctx.obj["client"] + result = client.get_agents() + pretty_print(result) + + +@agent.command() +@click.option("-n", "--name", required=True, help="Agent name") +@click.pass_context +def get(ctx, name: str): + """Get a specific agent.""" + client = ctx.obj["client"] + result = client.get_agent(name) + pretty_print(result) + + +@agent.command() +@click.option("--type", required=True, help="Agent type (react_llm, etc.)") 
+@click.pass_context +def configs(ctx, type: str): + """Get default configs for an agent type.""" + client = ctx.obj["client"] + result = client.get_agent_configs(type) + pretty_print(result) + + +@agent.command() +@click.option("-n", "--name", required=True, help="Agent name") +@click.option("--type", required=True, help="Agent type") +@click.option("--pipeline", required=True, help="Pipeline index or name") +@click.pass_context +def create(ctx, name: str, type: str, pipeline: str): + """Create an agent.""" + client = ctx.obj["client"] + agent_data = {"name": name, "type": type, "pipeline_idx": pipeline} + result = client.create_agent(agent_data) + pretty_print(result) + + +@agent.command() +@click.option("-n", "--name", required=True, help="Agent name") +@click.option("--active", type=bool, help="Active status") +@click.pass_context +def update(ctx, name: str, active: Optional[bool]): + """Update an agent.""" + client = ctx.obj["client"] + agent_data = {} + if active is not None: + agent_data["active"] = active + result = client.update_agent(name, agent_data) + pretty_print(result) + + +@agent.command() +@click.option("-n", "--name", required=True, help="Agent name") +@click.pass_context +def delete(ctx, name: str): + """Delete an agent.""" + client = ctx.obj["client"] + if click.confirm(f"Are you sure you want to delete agent '{name}'?"): + result = client.delete_agent(name) + pretty_print(result) + + +# ============== Prompt Commands ============== + + +@cli.group() +def prompt(): + """Manage system prompts.""" + pass + + +@prompt.command() +@click.pass_context +def get(ctx): + """Get the current system prompt.""" + client = ctx.obj["client"] + result = client.get_prompt() + pretty_print(result) + + +@prompt.command() +@click.pass_context +def get_tagged(ctx): + """Get the tagged system prompt.""" + client = ctx.obj["client"] + result = client.get_tagged_prompt() + pretty_print(result) + + +@prompt.command() +@click.pass_context +def get_default(ctx): + """Get the default system prompt.""" + client = ctx.obj["client"] + result = client.get_default_prompt() + pretty_print(result) + + +@prompt.command() +@click.option("--text", help="Prompt text") +@click.option("-f", "--file", type=click.Path(exists=True), help="Prompt file") +@click.pass_context +def set(ctx, text: Optional[str], file: Optional[str]): + """Update the system prompt.""" + client = ctx.obj["client"] + + if file: + with open(file, "r") as f: + prompt_text = f.read() + elif text: + prompt_text = text + else: + click.echo("Error: either --text or --file must be provided") + return + + result = client.update_prompt(prompt_text) + pretty_print(result) + + +@prompt.command() +@click.pass_context +def reset(ctx): + """Reset the system prompt to default.""" + client = ctx.obj["client"] + if click.confirm("Are you sure you want to reset prompt to default?"): + result = client.reset_prompt() + pretty_print(result) + + +# ============== Data Commands ============== + + +@cli.group() +def data(): + """Manage data (nodes, documents, files).""" + pass + + +@data.command() +@click.pass_context +def nodes(ctx): + """Get all nodes in active knowledge base.""" + client = ctx.obj["client"] + result = client.get_nodes() + pretty_print(result) + + +@data.command() +@click.option("-n", "--name", required=True, help="Document name") +@click.pass_context +def nodes_by_doc(ctx, name: str): + """Get nodes by document name.""" + client = ctx.obj["client"] + result = client.get_nodes_by_document(name) + pretty_print(result) + + +@data.command() 
+@click.pass_context +def documents(ctx): + """Get all document names in active knowledge base.""" + client = ctx.obj["client"] + result = client.get_documents() + pretty_print(result) + + +@data.command() +@click.pass_context +def files(ctx): + """Get all files.""" + client = ctx.obj["client"] + result = client.get_files() + pretty_print(result) + + +@data.command() +@click.option("-n", "--name", required=True, help="File name") +@click.pass_context +def get_file(ctx, name: str): + """Get a specific file.""" + client = ctx.obj["client"] + result = client.get_file(name) + pretty_print(result) + + +@data.command() +@click.option("-n", "--name", required=True, help="File name") +@click.option("--path", required=True, type=click.Path(exists=True), help="File path") +@click.pass_context +def upload(ctx, name: str, path: str): + """Upload a file.""" + client = ctx.obj["client"] + result = client.upload_file(name, path) + pretty_print(result) + + +# ============== Session Commands ============== + + +@cli.group() +def session(): + """Manage sessions.""" + pass + + +@session.command() +@click.pass_context +def list(ctx): + """List all sessions.""" + client = ctx.obj["client"] + result = client.get_sessions() + pretty_print(result) + + +@session.command() +@click.option("--id", required=True, help="Session ID") +@click.pass_context +def get(ctx, id: str): + """Get a specific session.""" + client = ctx.obj["client"] + result = client.get_session(id) + pretty_print(result) + + +# ============== System Commands ============== + + +@cli.group() +def system(): + """Get system information.""" + pass + + +@system.command() +@click.pass_context +def info(ctx): + """Get system information.""" + client = ctx.obj["client"] + result = client.get_system_info() + pretty_print(result) + + +@system.command() +@click.pass_context +def devices(ctx): + """Get available inference devices.""" + client = ctx.obj["client"] + result = client.get_available_devices() + pretty_print(result) + + +# ============== Chat/Query Commands ============== + + +@cli.group() +def chat(): + """Chat and query operations.""" + pass + + +@chat.command() +@click.option("--query", required=True, help="Query string") +@click.option("--top-n", default=5, type=int, help="Number of results to retrieve") +@click.option("--max-tokens", default=512, type=int, help="Max tokens in response") +@click.pass_context +def retrieve(ctx, query: str, top_n: int, max_tokens: int): + """Retrieve relevant context chunks.""" + client = ctx.obj["client"] + result = client.retrieval(query, top_n, max_tokens) + pretty_print(result) + + +@chat.command() +@click.option("--query", required=True, help="Query string") +@click.option("--top-n", default=5, type=int, help="Number of results to retrieve") +@click.option("--max-tokens", default=512, type=int, help="Max tokens in response") +@click.pass_context +def rag(ctx, query: str, top_n: int, max_tokens: int): + """Run RAG pipeline (retrieval + generation).""" + client = ctx.obj["client"] + run_rag_query(client, query, top_n, max_tokens) + + +@cli.command() +@click.argument("query") +@click.option("--top-n", default=5, type=int, help="Number of results to retrieve") +@click.option("--max-tokens", default=512, type=int, help="Max tokens in response") +@click.pass_context +def query(ctx, query: str, top_n: int, max_tokens: int): + """Shortcut for chat mega using a positional query argument.""" + client = ctx.obj["client"] + run_chatqna_query(client, query, top_n, max_tokens) + + +@chat.command() +@click.option("--query", 
required=True, help="Query string") +@click.option("--top-n", default=5, type=int, help="Number of results to retrieve") +@click.option("--max-tokens", default=512, type=int, help="Max tokens in response") +@click.pass_context +def mega(ctx, query: str, top_n: int, max_tokens: int): + """Run full ChatQnA (mega service).""" + client = ctx.obj["client"] + result = client.chatqna(query, top_n, max_tokens) + pretty_print(result) + + +@chat.command() +@click.option("--server", required=True, help="vLLM server address") +@click.option("--model", required=True, help="Model name") +@click.pass_context +def check_vllm(ctx, server: str, model: str): + """Check vLLM server connection.""" + client = ctx.obj["client"] + result = client.check_vllm_connection(server, model) + pretty_print(result) + + +if __name__ == "__main__": + cli(obj={}) diff --git a/EdgeCraftRAG/cli/pyproject.toml b/EdgeCraftRAG/cli/pyproject.toml new file mode 100644 index 0000000000..737e278f1a --- /dev/null +++ b/EdgeCraftRAG/cli/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/EdgeCraftRAG/cli/quickstart.py b/EdgeCraftRAG/cli/quickstart.py new file mode 100644 index 0000000000..d37188763f --- /dev/null +++ b/EdgeCraftRAG/cli/quickstart.py @@ -0,0 +1,119 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Quick start guide for EdgeCraft RAG CLI.""" + +import json +import sys +from cli.client import EcragApiClient + + +def test_connection(host: str = "http://localhost", port: int = 16010): + """Test connection to EdgeCraft RAG server.""" + client = EcragApiClient(host=host, server_port=port) + + try: + print(f"Testing connection to {client.server_url}...") + result = client.get_system_info() + + if "error" in result: + print(f"❌ Connection failed: {result['error']}") + return False + + print("✓ Connection successful!") + print(f" System Info: {json.dumps(result, indent=2)}") + return True + except Exception as e: + print(f"❌ Connection error: {str(e)}") + return False + + +def quick_start_guide(): + """Print quick start guide.""" + guide = """ +╔══════════════════════════════════════════════════════════════╗ +║ EdgeCraft RAG CLI - Quick Start Guide ║ +╚══════════════════════════════════════════════════════════════╝ + +INSTALLATION: + Requires Python 3.8+ + + Recommended (from EdgeCraftRAG/cli/): + pip install -e . + ecrag --help + + If pip reports externally-managed environment (PEP 668): + pip install --break-system-packages -e . + ecrag --help + + Optional (non-editable install): + pip install . + ecrag --help + +BASIC USAGE: + # Check help + ecrag --help + ecrag pipeline --help + +CONFIGURATION: + Set environment variables for connection: + export ECRAG_HOST=http://your-host + export ECRAG_PORT=16010 + export ECRAG_MEGA_PORT=16011 + +COMMON COMMANDS: + + Pipeline Management: + ecrag pipeline list + ecrag pipeline get --name + ecrag pipeline create --name --file pipeline.json + ecrag pipeline activate --name + + Knowledge Base: + ecrag kb list + ecrag kb create --name + ecrag kb add-files --name --paths /path/to/file + + Models: + ecrag model list + ecrag model load --type LLM --id + + Query & Chat: + ecrag query "Your question" + ecrag chat retrieve --query "Your question" + ecrag chat rag --query "Your question" + ecrag chat mega --query "Your question" + + System: + ecrag system info + ecrag system devices + +TROUBLESHOOTING: + Connection refused? 
+ - Make sure EdgeCraft RAG server is running + - Check HOST (default: http://localhost) and PORT (default: 16010) + - Try: ecrag system info + + Command not found? + - Make sure you're in the EdgeCraftRAG directory + - Try: ecrag + + Module not found? + - Verify cli/ directory exists with __init__.py + - Check PYTHONPATH includes the EdgeCraftRAG directory + +For more information, see cli/README.md +""" + print(guide) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--test": + host = sys.argv[2] if len(sys.argv) > 2 else "http://localhost" + port = int(sys.argv[3]) if len(sys.argv) > 3 else 16010 + test_connection(host, port) + elif len(sys.argv) > 1 and sys.argv[1] == "--guide": + quick_start_guide() + else: + quick_start_guide() + print("\nRun with --test to check connection to server") diff --git a/EdgeCraftRAG/cli/requirements.txt b/EdgeCraftRAG/cli/requirements.txt new file mode 100644 index 0000000000..f1839c34a3 --- /dev/null +++ b/EdgeCraftRAG/cli/requirements.txt @@ -0,0 +1,2 @@ +click>=8.0 +requests>=2.28 diff --git a/EdgeCraftRAG/cli/setup.py b/EdgeCraftRAG/cli/setup.py new file mode 100644 index 0000000000..ca5255ba0a --- /dev/null +++ b/EdgeCraftRAG/cli/setup.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Canonical setup script for EdgeCraft RAG CLI.""" + +from setuptools import setup + + +setup( + name="ecrag-cli", + version="0.1.0", + description="Command-line interface for EdgeCraft RAG", + author="Intel Corporation", + license="Apache-2.0", + packages=["cli"], + package_dir={"": ".."}, + install_requires=[ + "click>=8.0", + "requests>=2.28", + ], + entry_points={ + "console_scripts": [ + "ecrag=cli.main:cli", + ], + }, + python_requires=">=3.8", + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], +) \ No newline at end of file diff --git a/EdgeCraftRAG/docker_compose/intel/gpu/arc/README.md b/EdgeCraftRAG/docker_compose/intel/gpu/arc/README.md index a0c71dba87..ec1b737195 100755 --- a/EdgeCraftRAG/docker_compose/intel/gpu/arc/README.md +++ b/EdgeCraftRAG/docker_compose/intel/gpu/arc/README.md @@ -20,22 +20,19 @@ This section describes how to quickly deploy and test the EdgeCraftRAG service m ### 1. Prerequisites -EC-RAG supports vLLM deployment(default method) and local OpenVINO deployment for Intel Arc GPU and Core Ultra Platform. Prerequisites are shown as below: +EC-RAG supports vLLM deployment(default method) and local OpenVINO deployment for Intel Arc GPU and Core Ultra Platform. Prerequisites are shown as below: #### Core Ultra - **OS**: Ubuntu 24.04 or newer **Driver & libraries**: Please refer to [Installing Client GPUs on Ubuntu Desktop](https://dgpu-docs.intel.com/driver/client/overview.html#installing-client-gpus-on-ubuntu-desktop) **Available Inferencing Framework**: openVINO #### Intel Arc B60 - -**OS**: Ubuntu 25.04 Desktop (for Core Ultra and Xeon-W), Ubuntu 25.04 Server (for Xeon-SP). +**OS**: Ubuntu 25.04 Desktop (for Core Ultra and Xeon-W), Ubuntu 25.04 Server (for Xeon-SP). 
**Driver & libraries**: Please refer to [Install Bare Metal Environment](https://github.com/intel/llm-scaler/tree/main/vllm#11-install-bare-metal-environment) for detailed setup **Available Inferencing Framework**: openVINO, vLLM #### Intel Arc A770 - **OS**: Ubuntu Server 22.04.1 or newer (at least 6.2 LTS kernel) **Driver & libraries**: Please refer to [Installing GPUs Drivers](https://dgpu-docs.intel.com/driver/installation-rolling.html#installing-gpu-drivers) for detailed driver & libraries setup **Available Inferencing Framework**: openVINO, vLLM @@ -51,9 +48,9 @@ cd GenAIExamples/EdgeCraftRAG > **NOTE**: If you want to checkout a released version, such as v1.5: > -> ``` -> git checkout v1.5 -> ``` +>``` +>git checkout v1.5 +>``` ### 3. Run quick_start.sh diff --git a/EdgeCraftRAG/docker_compose/intel/gpu/arc/README_zh.md b/EdgeCraftRAG/docker_compose/intel/gpu/arc/README_zh.md index f46ff6255d..469b6ea2de 100644 --- a/EdgeCraftRAG/docker_compose/intel/gpu/arc/README_zh.md +++ b/EdgeCraftRAG/docker_compose/intel/gpu/arc/README_zh.md @@ -23,19 +23,16 @@ EC-RAG 支持 vLLM 部署(默认方式)以及面向 Intel Arc GPU 和 Core Ultra 平台的本地 OpenVINO 部署。前置条件如下: #### Core Ultra - **操作系统**:Ubuntu 24.04 或更高版本 **驱动与库**:请参考 [Installing Client GPUs on Ubuntu Desktop](https://dgpu-docs.intel.com/driver/client/overview.html#installing-client-gpus-on-ubuntu-desktop) **可用推理框架**:openVINO #### Intel Arc B60 - **操作系统**:Ubuntu 25.04 Desktop(适用于 Core Ultra 和 Xeon-W),Ubuntu 25.04 Server(适用于 Xeon-SP)。 **驱动与库**:详细安装请参考 [Install Bare Metal Environment](https://github.com/intel/llm-scaler/tree/main/vllm#11-install-bare-metal-environment) **可用推理框架**:openVINO、vLLM #### Intel Arc A770 - **操作系统**:Ubuntu Server 22.04.1 或更高版本(至少 6.2 LTS 内核) **驱动与库**:详细驱动与库安装请参考 [Installing GPUs Drivers](https://dgpu-docs.intel.com/driver/installation-rolling.html#installing-gpu-drivers) **可用推理框架**:openVINO、vLLM @@ -51,9 +48,9 @@ cd GenAIExamples/EdgeCraftRAG > **注意**:如果你想切换到某个发布版本,例如 v1.5: > -> ``` -> git checkout v1.5 -> ``` +>``` +>git checkout v1.5 +>``` ### 3. 
运行 quick_start.sh @@ -115,11 +112,11 @@ If you are accessing from another machine, replace ${HOST_IP} with your server's 下表全面概述了示例 Docker Compose 文件中各类部署所使用的 EdgeCraftRAG 服务。表中每一行代表一个独立服务,详细说明了可用镜像及其在部署架构中的功能描述。 -| 服务名称 | 可选镜像名称 | 可选 | 描述 | -| ------------------- | ---------------------------------------- | ---- | ---------------------------------------------------------- | -| etcd | quay.io/coreos/etcd:v3.5.5 | 否 | 提供分布式键值存储,用于服务发现和配置管理。 | -| minio | minio/minio:RELEASE.2023-03-20T20-16-18Z | 否 | 提供对象存储服务,用于存储文档和模型文件。 | -| milvus-standalone | milvusdb/milvus:v2.4.6 | 否 | 提供向量数据库能力,用于管理 embedding 和相似度检索。 | -| edgecraftrag-server | opea/edgecraftrag-server:latest | 否 | 作为 EdgeCraftRAG 服务后端,具体形态随部署方式不同而变化。 | -| edgecraftrag-ui | opea/edgecraftrag-ui:latest | 否 | 提供 EdgeCraftRAG 服务的用户界面。 | -| ecrag | opea/edgecraftrag:latest | 否 | 作为反向代理,管理 UI 与后端服务之间的流量。 | +| 服务名称 | 可选镜像名称 | 可选 | 描述 | +| ------------------- | ---------------------------------------- | ---- | ------------------------------------------------------------------------------------------------ | +| etcd | quay.io/coreos/etcd:v3.5.5 | 否 | 提供分布式键值存储,用于服务发现和配置管理。 | +| minio | minio/minio:RELEASE.2023-03-20T20-16-18Z | 否 | 提供对象存储服务,用于存储文档和模型文件。 | +| milvus-standalone | milvusdb/milvus:v2.4.6 | 否 | 提供向量数据库能力,用于管理 embedding 和相似度检索。 | +| edgecraftrag-server | opea/edgecraftrag-server:latest | 否 | 作为 EdgeCraftRAG 服务后端,具体形态随部署方式不同而变化。 | +| edgecraftrag-ui | opea/edgecraftrag-ui:latest | 否 | 提供 EdgeCraftRAG 服务的用户界面。 | +| ecrag | opea/edgecraftrag:latest | 否 | 作为反向代理,管理 UI 与后端服务之间的流量。 | diff --git a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml index 1a169cc364..f71c80609e 100644 --- a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml +++ b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml @@ -11,6 +11,8 @@ services: - ETCD_AUTO_COMPACTION_RETENTION=1000 - ETCD_QUOTA_BACKEND_BYTES=4294967296 - ETCD_SNAPSHOT_COUNT=50000 + - no_proxy=${no_proxy},milvus-standalone,milvus-minio,milvus-etcd + - NO_PROXY=${NO_PROXY},milvus-standalone,milvus-minio,milvus-etcd volumes: - ${DOCKER_VOLUME_DIRECTORY:-${PWD}}/volumes/etcd:/etcd command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd @@ -28,6 +30,8 @@ services: environment: MINIO_ACCESS_KEY: minioadmin MINIO_SECRET_KEY: minioadmin + no_proxy: ${no_proxy},milvus-standalone,milvus-minio,milvus-etcd + NO_PROXY: ${NO_PROXY},milvus-standalone,milvus-minio,milvus-etcd ports: - "${MINIO_PORT1:-5044}:9001" - "${MINIO_PORT2:-5043}:9000" @@ -51,6 +55,12 @@ services: environment: ETCD_ENDPOINTS: etcd:2379 MINIO_ADDRESS: minio:9000 + no_proxy: ${no_proxy},milvus-standalone,milvus-minio,milvus-etcd + NO_PROXY: ${NO_PROXY},milvus-standalone,milvus-minio,milvus-etcd + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} volumes: - ./milvus-config.yaml:/milvus/configs/milvus.yaml - ${DOCKER_VOLUME_DIRECTORY:-${PWD}}/volumes/milvus:/var/lib/milvus @@ -76,11 +86,16 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} vLLM_ENDPOINT: ${vLLM_ENDPOINT:-http://${HOST_IP}:${VLLM_SERVICE_PORT_B60:-8086}} + OVMS_ENDPOINT: ${OVMS_ENDPOINT:-http://${HOST_IP}:${OVMS_SERVICE_PORT:-8000}} LLM_MODEL: ${LLM_MODEL} ENABLE_BENCHMARK: ${ENABLE_BENCHMARK:-false} MAX_MODEL_LEN: ${MAX_MODEL_LEN:-49152} CHAT_HISTORY_ROUND: ${CHAT_HISTORY_ROUND:-0} METADATA_DATABASE_URL: ${METADATA_DATABASE_URL:-""} + GPU_MEMORY_UTIL: 
${GPU_MEMORY_UTIL:-0.8} + VLLM_AFFINITY_MASK: ${ZE_AFFINITY_MASK:-0} + ENABLE_GENAI: ${ENABLE_GENAI:-false} + TP: ${TP:-1} volumes: - ${MODEL_PATH:-${PWD}}:/home/user/models - ${DOC_PATH:-${PWD}}:/home/user/docs @@ -91,6 +106,7 @@ services: - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010} devices: - /dev/dri:/dev/dri + - ${ACCEL_DEV:-/dev/dri:/dev/dri} group_add: - ${VIDEOGROUPID:-44} - ${RENDERGROUPID:-109} @@ -133,7 +149,7 @@ services: - ecrag llm-serving-xpu-b60: container_name: ipex-serving-xpu-container - image: intel/llm-scaler-vllm:1.1-preview + image: intel/llm-scaler-vllm:0.11.1-b7 privileged: true restart: always ports: @@ -156,6 +172,7 @@ services: MAX_MODEL_LEN: ${MAX_MODEL_LEN:-49152} BLOCK_SIZE: ${BLOCK_SIZE:-64} QUANTIZATION: ${QUANTIZATION:-fp8} + TOOL_PARSER: ${TOOL_PARSER:-qwen3_coder} LLM_MODEL: ${LLM_MODEL} TP: ${TP:-1} DP: ${DP:-1} @@ -183,9 +200,54 @@ services: --block-size $${BLOCK_SIZE} \ --quantization $${QUANTIZATION} \ -tp=$${TP} \ - -dp=$${DP}" + -dp=$${DP} \ + --enable_prefix_caching True \ + --enable-auto-tool-choice \ + --tool-call-parser $${TOOL_PARSER}" profiles: - b60 + ovms-serving: + container_name: ovms-serving + image: openvino/model_server:latest-gpu + restart: always + user: ${OVMS_UID:-1000}:${OVMS_GID:-1000} + ports: + - ${OVMS_SERVICE_PORT:-8000}:8000 + devices: + - /dev/dri:/dev/dri + group_add: + - ${RENDERGROUPID:-109} + volumes: + - ${MODEL_PATH:-${PWD}}:/models + environment: + OVMS_REST_PORT: ${OVMS_SERVICE_PORT:-8000} + OVMS_SOURCE_MODEL: ${OVMS_SOURCE_MODEL:-OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov} + OVMS_MODEL_NAME: ${OVMS_MODEL_NAME:-OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov} + OVMS_TARGET_DEVICE: ${OVMS_TARGET_DEVICE:-GPU.0} + OVMS_TASK: ${OVMS_TASK:-text_generation} + OVMS_CACHE_DIR: ${OVMS_CACHE_DIR:-/models/.ov_cache} + OVMS_ENABLE_PREFIX_CACHING: ${OVMS_ENABLE_PREFIX_CACHING:-true} + OVMS_TOOL_PARSER: ${OVMS_TOOL_PARSER:-qwen3coder} + OVMS_ENABLE_TOOL_GUIDED_GENERATION: ${OVMS_ENABLE_TOOL_GUIDED_GENERATION:-true} + OVMS_MAX_NUM_BATCHED_TOKENS: ${OVMS_MAX_NUM_BATCHED_TOKENS:-8192} + entrypoint: + - /bin/bash + - -c + - >- + /ovms/bin/ovms + --rest_port $${OVMS_REST_PORT} + --source_model $${OVMS_SOURCE_MODEL} + --model_repository_path /models + --model_name $${OVMS_MODEL_NAME} + --target_device $${OVMS_TARGET_DEVICE} + --task $${OVMS_TASK} + --cache_dir $${OVMS_CACHE_DIR} + --enable_prefix_caching $${OVMS_ENABLE_PREFIX_CACHING} + --tool_parser $${OVMS_TOOL_PARSER} + --enable_tool_guided_generation $${OVMS_ENABLE_TOOL_GUIDED_GENERATION} + --max_num_batched_tokens $${OVMS_MAX_NUM_BATCHED_TOKENS} + profiles: + - ovms llm-serving-xpu-770: container_name: ipex-llm-serving-xpu-770 image: intelanalytics/ipex-llm-serving-xpu:0.8.3-b20 diff --git a/EdgeCraftRAG/docs/API_Guide.md b/EdgeCraftRAG/docs/API_Guide.md index 0fbb9fd50f..2b0c318bc6 100644 --- a/EdgeCraftRAG/docs/API_Guide.md +++ b/EdgeCraftRAG/docs/API_Guide.md @@ -1,7 +1,6 @@ # Edge Craft Retrieval-Augmented Generation API Guide > **Base URLs** -> > - EC-RAG Server: `http://${HOST_IP}:16010` > - EC-RAG Mega Service: `http://${HOST_IP}:16011` diff --git a/EdgeCraftRAG/docs/Advanced_Setup.md b/EdgeCraftRAG/docs/Advanced_Setup.md index 3ac7fa5f91..3c57e6cc1f 100644 --- a/EdgeCraftRAG/docs/Advanced_Setup.md +++ b/EdgeCraftRAG/docs/Advanced_Setup.md @@ -32,7 +32,7 @@ Embedding and reranking are usually servered by local OpenVINO inferencing, to p ```bash # Prepare models for embedding, reranking: -export MODEL_PATH="${PWD}/models" # Your model path for 
embedding, reranking and LLM models +export MODEL_PATH="${PWD}/workspace/models" # Your model path for embedding, reranking and LLM models mkdir -p $MODEL_PATH pip install --upgrade --upgrade-strategy eager "optimum[openvino]" optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity @@ -42,17 +42,15 @@ optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-re #### LLM ##### openVINO - If you have Core Ultra platform only, please prepare openVINO models: You can also run openVINO models on discrete GPU. ```bash # Prepare LLM model for openVINO -optimum-cli export openvino --model Qwen/Qwen3-8B ${MODEL_PATH}/Qwen/Qwen3-8B/INT4_compressed_weights --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 +optimum-cli export openvino --model Qwen/Qwen3-8B ${MODEL_PATH}/OpenVINO/Qwen3-8B-int4-ov --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 ``` ##### vLLM - Alternatively, if you have discrete GPU and want to use vLLM, please prepare models for vLLM: ```bash @@ -83,11 +81,16 @@ export NO_PROXY=${NO_PROXY},${HOST_IP},edgecraftrag,edgecraftrag-server # export HF_ENDPOINT=https://hf-mirror.com # your HF mirror endpoint" # Make sure all 3 folders have 1000:1000 permission, otherwise -export DOC_PATH=${PWD}/tests -export TMPFILE_PATH=${PWD}/tests +export DOC_PATH=${PWD}/workspace +export TMPFILE_PATH=${PWD}/workspace chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} # In addition, also make sure the .cache folder has 1000:1000 permission, otherwise chown 1000:1000 -R $HOME/.cache + +# Check whether system support NPU +if [ -e /dev/accel ]; then + export ACCEL_DEV="/dev/accel:/dev/accel" +fi ``` Set Milvus DB and chat history round for inference: @@ -122,6 +125,18 @@ docker compose --profile b60 -f docker_compose/intel/gpu/arc/compose.yaml up -d docker compose --profile a770 -f docker_compose/intel/gpu/arc/compose.yaml up -d ``` +#### Option c.3. Deploy OVMS based EC-RAG + +OVMS uses the OpenVINO LLM model prepared above, for example `${MODEL_PATH}/OpenVINO/Qwen3-8B-int4-ov`. + +```bash +export OVMS_SERVICE_PORT=8000 +export OVMS_SOURCE_MODEL=OpenVINO/Qwen3-8B-int4-ov +export OVMS_MODEL_NAME=OpenVINO/Qwen3-8B-int4-ov + +docker compose --profile ovms -f docker_compose/intel/gpu/arc/compose.yaml up -d +``` + ### 6. Cleanup the Deployment (Manual) To stop the containers associated with the deployment, execute the following command: @@ -153,7 +168,7 @@ In this sample, we will use Qwen3-30B-A3B deployment on 4 Arc B60 GPUs as an exa Before started, please prepare models into MODEL_PATH and prepare docker images ```bash -export MODEL_PATH="${PWD}/models" # Your model path +export MODEL_PATH="${PWD}/workspace/models" # Same default model path used by quick_start.sh export LLM_MODEL="Qwen/Qwen3-30B-A3B" pip install modelscope modelscope download --model $LLM_MODEL --local_dir "${MODEL_PATH}/${LLM_MODEL}" @@ -163,3 +178,5 @@ export TP=4 # for multi GPU, you can change TP value export ZE_AFFINITY_MASK=0,1,2,3 # for multi GPU, you can export ZE_AFFINITY_MASK=0,1,2... 
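# Illustrative variant (an assumption, not from the original guide): the same settings for a two-GPU box
# export TP=2
# export ZE_AFFINITY_MASK=0,1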
docker compose --profile b60 -f docker_compose/intel/gpu/arc/compose.yaml up -d ``` + + diff --git a/EdgeCraftRAG/docs/Advanced_Setup_zh.md b/EdgeCraftRAG/docs/Advanced_Setup_zh.md index a1a870355f..f8b16dc0d7 100644 --- a/EdgeCraftRAG/docs/Advanced_Setup_zh.md +++ b/EdgeCraftRAG/docs/Advanced_Setup_zh.md @@ -32,7 +32,7 @@ Embedding 与 Reranking 通常由本地 OpenVINO 推理提供,可按如下方 ```bash # 准备 embedding、reranking 模型: -export MODEL_PATH="${PWD}/models" # embedding、reranking、LLM 模型目录 +export MODEL_PATH="${PWD}/workspace/models" # embedding、reranking、LLM 模型目录 mkdir -p $MODEL_PATH pip install --upgrade --upgrade-strategy eager "optimum[openvino]" optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity @@ -42,17 +42,15 @@ optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-re #### LLM ##### openVINO - 如果仅使用 Core Ultra 平台,请准备 openVINO 模型: 你也可以在独立 GPU 上运行 openVINO 模型。 ```bash # 准备 openVINO 的 LLM 模型 -optimum-cli export openvino --model Qwen/Qwen3-8B ${MODEL_PATH}/Qwen/Qwen3-8B/INT4_compressed_weights --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 +optimum-cli export openvino --model Qwen/Qwen3-8B ${MODEL_PATH}/OpenVINO/Qwen3-8B-int4-ov --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 ``` ##### vLLM - 如果你有独立 GPU 并希望使用 vLLM,可按如下方式准备模型: ```bash @@ -83,11 +81,16 @@ export NO_PROXY=${NO_PROXY},${HOST_IP},edgecraftrag,edgecraftrag-server # export HF_ENDPOINT=https://hf-mirror.com # 你的 HF 镜像地址 # 确保以下 3 个目录权限为 1000:1000 -export DOC_PATH=${PWD}/tests -export TMPFILE_PATH=${PWD}/tests +export DOC_PATH=${PWD}/workspace +export TMPFILE_PATH=${PWD}/workspace chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} # 同时确保 .cache 目录权限为 1000:1000 chown 1000:1000 -R $HOME/.cache + +# 判断当前系统是否支持NPU +if [ -e /dev/accel ]; then + export ACCEL_DEV="/dev/accel:/dev/accel" +fi ``` 设置 Milvus 和聊天历史轮数: @@ -102,6 +105,7 @@ export MILVUS_ENABLED=0 # export CHAT_HISTORY_ROUND= # 按需修改 ``` + ### 使用 Docker Compose 在 Intel GPU 上部署服务 #### 选项 a:为 Core Ultra / Arc B60 / Arc A770 部署基于 openVINO LLM 的 EC-RAG @@ -122,6 +126,18 @@ docker compose --profile b60 -f docker_compose/intel/gpu/arc/compose.yaml up -d docker compose --profile a770 -f docker_compose/intel/gpu/arc/compose.yaml up -d ``` +#### 选项 c:部署基于 OVMS 的 EC-RAG + +OVMS 使用前文准备好的 OpenVINO LLM 模型,例如 `${MODEL_PATH}/OpenVINO/Qwen3-8B-int4-ov`。 + +```bash +export OVMS_SERVICE_PORT=8000 +export OVMS_SOURCE_MODEL=OpenVINO/Qwen3-8B-int4-ov +export OVMS_MODEL_NAME=OpenVINO/Qwen3-8B-int4-ov + +docker compose --profile ovms -f docker_compose/intel/gpu/arc/compose.yaml up -d +``` + ### 6. 清理部署(手动) 若要停止本次部署相关容器,请执行: @@ -153,7 +169,7 @@ EC-RAG 镜像准备与本地推理章节一致,请参考 [自行构建镜像]( 开始前请先准备模型到 `MODEL_PATH` 并准备好 Docker 镜像。 ```bash -export MODEL_PATH="${PWD}/models" # 模型路径 +export MODEL_PATH="${PWD}/workspace/models" # 与 quick_start.sh 一致的默认模型路径 export LLM_MODEL="Qwen/Qwen3-30B-A3B" pip install modelscope modelscope download --model $LLM_MODEL --local_dir "${MODEL_PATH}/${LLM_MODEL}" diff --git a/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG.md b/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG.md index 9cebfcb5b2..ee091a5fc8 100644 --- a/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG.md +++ b/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG.md @@ -37,17 +37,18 @@ Then, you can submit messages in the chat box in `Chat` page. 
![alt text](../assets/img/Explore_Edge_Craft_RAG_08.jpg) ## ChatQnA with Kbadmin in UI - + ### Kbadmin Knowledge Base - + Go to `Knowledge Base` page and click `Create Knowledge Base` button to create your knowledge base. Please select 'kbadmin' in `Type`and select kb name from the kbs you created in kbadmin UI page. Loading kb name might be slow ,please wait with patient - + ![alt text](../assets/img/Explore_Edge_Craft_RAG_09.png) - -Ten you can select embedding information in 'Indexer' page - + +Then you can select embedding information in 'Indexer' page + ![alt text](../assets/img/Explore_Edge_Craft_RAG_10.png) - + After creation, you can see kbadmin tag in knowledge base then you can submit messages in the chat box in `Chat` page. ![alt text](../assets/img/Explore_Edge_Craft_RAG_11.png) + diff --git a/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG_zh.md b/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG_zh.md index f01bb4ef47..10534cd127 100644 --- a/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG_zh.md +++ b/EdgeCraftRAG/docs/Explore_Edge_Craft_RAG_zh.md @@ -42,12 +42,12 @@ 流水线创建完成后,前往 `Knowledge Base` 页面,点击 `Create Knowledge Base` 按钮创建知识库。 请在 `Type` 中选择 `kbadmin`,并从 kbadmin UI 页面中已创建的知识库列表中选择 kb 名称。加载kb名称可能比较耗时,请耐心等待。 - + ![alt text](../assets/img/Explore_Edge_Craft_RAG_09.png) - + 在 `Indexer` 页面,填写 Embedding 服务和向量数据库信息,注意 Embedding 服务端口为 13020,向量数据库端口为 29530。 - + ![alt text](../assets/img/Explore_Edge_Craft_RAG_10.png) - + 然后,在 `Chat` 页面的聊天框中提交您的问题。 ![alt text](../assets/img/Explore_Edge_Craft_RAG_11.png) diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/agent.py b/EdgeCraftRAG/edgecraftrag/api/v1/agent.py index a7bf33ed4a..ac18604471 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/agent.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/agent.py @@ -11,6 +11,7 @@ from edgecraftrag.context import ctx from edgecraftrag.env import AGENT_FILE from fastapi import FastAPI, HTTPException, status +from pydantic import BaseModel agent_app = FastAPI() @@ -79,10 +80,6 @@ async def update_agent(name, request: AgentCreateIn): ret = agentmgr.update_agent(name, request) if ret: await save_agent_configurations("update", ctx.get_agent_mgr().get_agents()) - # manage agent bound pipeline status, trigger kb indexing if needed - # can be removed once kb indexing is decoupled from pipeline - pl_idx = agent.pipeline_idx - await manage_agent_bound_pipeline(pl_idx, request) return ret else: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND) @@ -107,6 +104,33 @@ async def delete_agent(name): raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) +class AgentActiveIn(BaseModel): + active: bool + + +# PATCH Agent active status +@agent_app.patch(path="/v1/agents/{name}/active") +async def set_agent_active(name, request: AgentActiveIn): + try: + agentmgr = ctx.get_agent_mgr() + agent = agentmgr.get_agent_by_name(name) + if not agent: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND) + if request.active: + result = agentmgr.activate_agent(agent.idx) + else: + result = agentmgr.deactivate_agent(agent.idx) + if result: + await save_agent_configurations("update", agentmgr.get_agents()) + return {"name": name, "active": request.active} + else: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + except HTTPException: + raise + except (ValueError, Exception) as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + + # GET Agent Type default configs @agent_app.get(path="/v1/agents/configs/{agent_type}") async def
get_agent_default_configs(agent_type): @@ -155,32 +179,3 @@ async def load_agent(request: AgentCreateIn): agentmgr.remove_agent_by_name(request.name) raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) return agent - - -async def manage_agent_bound_pipeline(bound_pl_idx, request): - # case1: activate agent, while bound pipeline is not active -> activate it, cache previous active pipeline if exists - # case2: activate agent, while bound pipeline is already active -> still call activate, for caching current pipeline - # case3: deactivate agent, while bound pipeline **was** active -> do NOT deactivate bound pipeline, do nothing - # case4: deactivate agent, while bound pipeline **was NOT** active -> deactivate bound pipeline, activate previous active pipeline if exists - pl_manager = ctx.get_pipeline_mgr() - - active_kbs = ctx.knowledgemgr.get_active_knowledge_base() - # TODO: update single kb with kbs - # kb_name = active_kbs.name if active_kb else "default" - - if request.active: - pl_manager.activate_pipeline(bound_pl_idx, request.active, active_kbs, cache_prev=True) - else: - # at deactivate, prev_active_pl can be 1.other pl/2.None/3.current bound_pl - prev_active_pl = pl_manager.get_prev_active_pipeline_name() - if prev_active_pl and prev_active_pl != bound_pl_idx: - # 1, restore to the other pipeline activated - pl_manager.activate_pipeline(prev_active_pl, True, active_kbs) - elif not prev_active_pl: - # 2, deactivate current bound pipeline, leave no active pipeline as before - pl_manager.activate_pipeline(bound_pl_idx, False, active_kbs) - else: - # 3, do nothing - pass - # when agent is deactivated, clear cached previous active pipeline - pl_manager.clear_prev_active_pipeline_name() diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py index 67357f2183..01ddaabc65 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py @@ -10,7 +10,7 @@ from edgecraftrag.api_schema import RagOut from edgecraftrag.base import GeneratorType from edgecraftrag.context import ctx -from edgecraftrag.utils import chain_async_generators, serialize_contexts, stream_generator +from edgecraftrag.utils import chain_async_generators, serialize_contexts, serialize_node_with_score, stream_generator from fastapi import Body, FastAPI, HTTPException, status from fastapi.responses import StreamingResponse @@ -59,7 +59,7 @@ async def chatqna(request: ChatCompletionRequest): # Run agent if activated, otherwise, run pipeline if ctx.get_agent_mgr().get_active_agent(): - run_agent_gen = await ctx.get_agent_mgr().run_agent(chat_request=request) + run_agent_gen, _ = await ctx.get_agent_mgr().run_agent(chat_request=request) return StreamingResponse(save_session(sessionid, run_agent_gen), media_type="text/plain") else: @@ -95,6 +95,48 @@ async def ragqna(request: ChatCompletionRequest): request.user = active_kb if active_kb else None if experience_kb: request.tool_choice = "auto" if experience_kb.experience_active else "none" + + def serialize_retrievals(retrievals): + return { + "retrievals": [ + { + "step": retrieval.step, + "query": retrieval.query, + "retrieved": [serialize_node_with_score(node) for node in retrieval.retrieved], + "reranked": [serialize_node_with_score(node) for node in retrieval.reranked], + } + for retrieval in retrievals + ] + } + + if ctx.get_agent_mgr().get_active_agent(): + # Save original query string before agent mutates request.messages + original_query = 
request.messages + run_agent_gen, retrievals = await ctx.get_agent_mgr().run_agent(chat_request=request) + + if request.stream: + + async def res_gen_json(): + async for token in run_agent_gen: + yield json.dumps(token, ensure_ascii=False)[1:-1] + + # Lazily serialize retrievals so it runs after res_gen_json() exhausts + async def context_suffix_gen(): + yield '","contexts":' + json.dumps(serialize_retrievals(retrievals)) + "}" + + query_gen = stream_generator('{"query":' + json.dumps(original_query, ensure_ascii=False) + ',"response":"') + output_gen = chain_async_generators([query_gen, res_gen_json(), context_suffix_gen()]) + + return StreamingResponse(output_gen, media_type="text/plain") + else: + response_tokens = [] + async for token in run_agent_gen: + response_tokens.append(token) + await asyncio.sleep(0) + serialized_contexts = serialize_retrievals(retrievals) + ragout = RagOut(query=original_query, contexts=serialized_contexts, response="".join(response_tokens)) + return ragout + generator = ctx.get_pipeline_mgr().get_active_pipeline().get_generator(GeneratorType.CHATQNA) if generator: request.model = generator.model_id @@ -104,10 +146,10 @@ async def ragqna(request: ChatCompletionRequest): # Escape newlines for json format as value async def res_gen_json(): async for token in res_gen: - yield token.replace("\n", "\\n") + yield json.dumps(token, ensure_ascii=False)[1:-1] # Reconstruct RagOut in stream response - query_gen = stream_generator('{"query":"' + request.messages + '",') + query_gen = stream_generator('{"query":' + json.dumps(request.messages, ensure_ascii=False) + ',') s_contexts = json.dumps(serialize_contexts(contexts)) context_gen = stream_generator('"contexts":' + s_contexts + ',"response":"') @@ -144,6 +186,29 @@ def check_vllm(request_data: dict = Body(...)): return {"status": "500", "message": f"connection failed: {str(e)}"} +# Detecting if ovms is connected +@chatqna_app.post(path="/v1/check/ovms") +def check_ovms(request_data: dict = Body(...)): + try: + server = request_data.get("server_address", "http://localhost:8000").rstrip("/") + model = request_data.get("model_name", "Qwen/Qwen3-8B") + url = f"{server}/v3/chat/completions" + payload = { + "model": model, + "messages": [{"role": "user", "content": "Hi"}], + "max_tokens": 16, + "temperature": 0, + } + + response = requests.post(url, json=payload, timeout=60) + if response.status_code == 200: + return {"status": "200"} + else: + raise HTTPException(status_code=500) + except Exception as e: + return {"status": "500", "message": f"connection failed: {str(e)}"} + + async def save_session(sessionid, run_agent_gen): collected_data = [] session_mgr = ctx.get_session_mgr() diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/data.py b/EdgeCraftRAG/edgecraftrag/api/v1/data.py index a72d727584..ec7cba6359 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/data.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/data.py @@ -5,13 +5,12 @@ import os from typing import List -from edgecraftrag.api.v1.knowledge_base import add_file_to_knowledge_base from edgecraftrag.api_schema import DataIn, FilesIn from edgecraftrag.config_repository import MilvusConfigRepository from edgecraftrag.context import ctx from edgecraftrag.env import UI_DIRECTORY from fastapi import FastAPI, File, HTTPException, UploadFile, status - +from edgecraftrag.api.v1.knowledge_base import add_file_to_knowledge_base data_app = FastAPI() @@ -25,7 +24,7 @@ async def get_nodes_with_kb(kb_name=None): kb = ctx.get_knowledge_mgr().get_active_knowledge_base() if 
kb.indexer.comp_subtype == "faiss_vector": return kb.indexer.docstore.docs - elif kb.indexer.comp_subtype == "milvus_vector": + elif kb.indexer.comp_subtype == "milvus_vector": collection_name = kb.name Milvus_node_list = MilvusConfigRepository.create_connection(collection_name, 1, kb.indexer.vector_url) results = Milvus_node_list.get_configs(output_fields=["text", "_node_content", "doc_id"]) diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/knowledge_base.py b/EdgeCraftRAG/edgecraftrag/api/v1/knowledge_base.py index f06c76616a..c17298c138 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/knowledge_base.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/knowledge_base.py @@ -8,13 +8,26 @@ from typing import Dict, List, Union from edgecraftrag.api_schema import DataIn, ExperienceIn, KnowledgeBaseCreateIn +from edgecraftrag.components.query_preprocess import query_search +from edgecraftrag.components.indexer import get_kbs_info +from edgecraftrag.config_repository import ( + MilvusConfigRepository, + save_knowledge_configurations, +) +from edgecraftrag.context import ctx +from edgecraftrag.env import ( + KNOWLEDGEBASE_FILE, + SEARCH_CONFIG_PATH, + SEARCH_DIR, + UI_DIRECTORY, +) from edgecraftrag.base import ( IndexerType, ModelType, NodeParserType, ) from edgecraftrag.components.benchmark import Benchmark -from edgecraftrag.components.indexer import KBADMINIndexer, VectorIndexer, get_kbs_info +from edgecraftrag.components.indexer import KBADMINIndexer, VectorIndexer from edgecraftrag.components.node_parser import ( HierarchyNodeParser, KBADMINParser, @@ -22,19 +35,7 @@ SWindowNodeParser, UnstructedNodeParser, ) -from edgecraftrag.components.query_preprocess import query_search -from edgecraftrag.config_repository import ( - MilvusConfigRepository, - save_knowledge_configurations, -) -from edgecraftrag.context import ctx -from edgecraftrag.env import ( - KNOWLEDGEBASE_FILE, - SEARCH_CONFIG_PATH, - SEARCH_DIR, - UI_DIRECTORY, -) -from fastapi import FastAPI, HTTPException, Query, status +from fastapi import FastAPI, HTTPException, status, Query kb_app = FastAPI() @@ -50,9 +51,7 @@ async def get_all_knowledge_bases(): # Get knowledge base files in a certain range. 
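# For example: GET /v1/knowledge/{knowledge_name}/filemap?page_num=1&page_size=20
# returns {"file_map": {...}, "total": ...}, or None when the requested page is out of range.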
@kb_app.get("/v1/knowledge/{knowledge_name}/filemap") -async def get_knowledge_base_filemap( - knowledge_name: str, page_num: int = Query(1, ge=1), page_size: int = Query(20, ge=1) -): +async def get_knowledge_base_filemap(knowledge_name: str, page_num: int = Query(1, ge=1), page_size: int = Query(20, ge=1)): kb = ctx.knowledgemgr.get_knowledge_base_by_name_or_id(knowledge_name) if kb and kb.file_map: file_map = kb.file_map @@ -62,7 +61,7 @@ async def get_knowledge_base_filemap( if start >= filemap_len: return None file_map_subset = itertools.islice(file_map.items(), start, end) - return {"file_map": dict(file_map_subset), "total": kb.calculate_totals()} + return {"file_map": dict(file_map_subset),"total": kb.calculate_totals()} else: return None @@ -102,7 +101,7 @@ async def create_knowledge_base(knowledge: KnowledgeBaseCreateIn): active_pl.update_retriever_list(ctx.knowledgemgr.get_active_knowledge_base()) except Exception as e: ctx.knowledgemgr.delete_knowledge_base(knowledge.name) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) await save_knowledge_configurations("add", kb) return "Create knowledge base successfully" except Exception as e: @@ -157,12 +156,17 @@ async def update_knowledge_base(knowledge: KnowledgeBaseCreateIn): raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) # reload data for knowledge base - node_parser_changed = kb_node_parser != kb.node_parser + node_parser_changed = (kb_node_parser != kb.node_parser) if node_parser_changed or kb_indexer != kb.indexer: await handle_reload_data(kb, node_parser_changed) elif kb.comp_subtype == "kbadmin_kb": kb.indexer.config_kbadmin_milvus(kb.name) active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + try: + kb.indexer.model._model.clear_requests() + kb.indexer.model._model.compile() + except Exception as e: + pass result = ctx.knowledgemgr.update_knowledge_base(knowledge, active_pl) # Update knowledge json knowledge_dict = knowledge.dict() @@ -177,10 +181,10 @@ async def update_knowledge_base(knowledge: KnowledgeBaseCreateIn): @kb_app.post(path="/v1/knowledge/{knowledge_name}/files") async def add_file_to_knowledge_base(knowledge_name, file_path: DataIn): """ - 1. Parse file into Llamaindex Document and add file to filemgr - 2. Add file path to knowledge base - 3. Update nodes and vector store for knowledge base - 4. Update pipeline retriever if active knowledge base's indexer changed + 1. Parse file into Llamaindex Document and add file to filemgr + 2. Add file path to knowledge base + 3. Update nodes and vector store for knowledge base + 4. 
Update pipeline retriever if active knowledge base's indexer changed """ try: kb = ctx.knowledgemgr.get_knowledge_base_by_name_or_id(knowledge_name) @@ -223,7 +227,7 @@ async def add_file_to_knowledge_base(knowledge_name, file_path: DataIn): raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Error uploading file.") # update retriever with indexer since indexer updated - if kb.active: + if kb.active: active_pl = ctx.get_pipeline_mgr().get_active_pipeline() if active_pl: active_pl.update_retriever(kb, prev_indexer) @@ -259,7 +263,7 @@ async def remove_file_from_knowledge_base(knowledge_name, file_path: DataIn): ) await remove_document_handler(document_list, kb) # update retriever with indexer since indexer updated - if kb.active: + if kb.active: active_pl = ctx.get_pipeline_mgr().get_active_pipeline() if active_pl: active_pl.update_retriever(kb, prev_indexer) @@ -413,8 +417,8 @@ async def restore_knowledge_configurations(): all_data = json.loads(all_Knowledgebases) for Knowledgebase_data in all_data: all_datas.append(Knowledgebase_data) - try: - for Knowledgebase_data in all_datas: + for Knowledgebase_data in all_datas: + try: Knoweldge_req = KnowledgeBaseCreateIn(**Knowledgebase_data) knowledge_json = Knoweldge_req.model_dump_json() kb = ctx.knowledgemgr.create_knowledge_base(Knoweldge_req, knowledge_json) @@ -422,9 +426,10 @@ async def restore_knowledge_configurations(): await update_kb_handler(kb, Knoweldge_req) except Exception as e: ctx.knowledgemgr.delete_knowledge_base(Knoweldge_req.name) - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + print(f"Error loading knowledge base '{Knoweldge_req.name}': {e}") + continue if kb.comp_type == "knowledge" and kb.comp_subtype == "origin_kb": - if Knowledgebase_data["file_paths"]: + if Knowledgebase_data.get("file_paths"): if kb.indexer.comp_subtype == "milvus_vector": kb.indexer.reinitialize_indexer(Knowledgebase_data["name"]) else: @@ -432,12 +437,12 @@ async def restore_knowledge_configurations(): await handle_reload_data(kb, node_parser_changed=True) elif kb.comp_subtype == "kbadmin_kb": kb.indexer.config_kbadmin_milvus(kb.name) - # connect retriever with active kb's indexers - active_pl = ctx.get_pipeline_mgr().get_active_pipeline() - if active_pl: - active_pl.update_retriever_list(ctx.knowledgemgr.get_active_knowledge_base()) - except Exception as e: - print(f"Error load Knowledge base: {e}") + except Exception as e: + print(f"Error loading knowledge base: {e}") + # connect retriever with active kb's indexers + active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + if active_pl: + active_pl.update_retriever_list(ctx.knowledgemgr.get_active_knowledge_base()) # reloading data that is not a milvus indexer @@ -460,7 +465,6 @@ async def handle_reload_data(kb, node_parser_changed: bool = False): # update indexer await kb.update_nodes_to_indexer() - async def update_kb_handler(kb, knowledge): if kb.enable_benchmark: kb.benchmark = Benchmark(True, "") @@ -495,9 +499,7 @@ async def update_kb_handler(kb, knowledge): ctx.get_node_parser_mgr().add(kb.node_parser) if knowledge.indexer is not None: ind = knowledge.indexer - found_indexer = ( - ctx.get_indexer_mgr().search_indexer(ind) if ind.indexer_type != IndexerType.MILVUS_VECTOR else None - ) + found_indexer = ctx.get_indexer_mgr().search_indexer(ind) if ind.indexer_type != IndexerType.MILVUS_VECTOR else None if found_indexer is not None: kb.indexer = found_indexer else: @@ -527,4 +529,7 @@ async def update_kb_handler(kb, knowledge): del 
kb.indexer kb.indexer = new_indexer ctx.get_indexer_mgr().add(kb.indexer) + active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + if active_pl: + active_pl.check_top_k(ctx.get_knowledge_mgr().get_all_knowledge_bases()) return kb diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/model.py b/EdgeCraftRAG/edgecraftrag/api/v1/model.py index 061917d6ad..7dbccaf284 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/model.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/model.py @@ -3,7 +3,8 @@ import gc import os -from typing import Optional +from typing import List, Optional +from urllib.parse import urlparse import requests from edgecraftrag.api_schema import ModelIn @@ -16,16 +17,52 @@ CONTAINER_MODEL_PATH = "/home/user/models/" +def _get_model_roots() -> List[str]: + roots = [] + env_model_path = os.getenv("MODEL_PATH") + candidates = [ + env_model_path, + CONTAINER_MODEL_PATH, + os.path.join(os.getcwd(), "models"), + os.path.join(os.getcwd(), "../models"), + os.path.join(os.path.dirname(__file__), "../../../models"), + os.path.join(os.path.dirname(__file__), "../../../../models"), + ] + + for candidate in candidates: + if not candidate: + continue + resolved = os.path.realpath(os.path.normpath(os.path.expanduser(candidate))) + if os.path.isdir(resolved) and resolved not in roots: + roots.append(resolved) + + return roots + + +def _resolve_model_path(model_id: str) -> str: + for root in _get_model_roots(): + requested_path = os.path.realpath(os.path.normpath(os.path.join(root, model_id))) + if requested_path.startswith(root + os.sep) and os.path.exists(requested_path): + weights = get_available_weights(requested_path) + if len(weights) > 0: + return requested_path + + for root in _get_model_roots(): + requested_path = os.path.realpath(os.path.normpath(os.path.join(root, model_id))) + if requested_path.startswith(root + os.sep): + return requested_path + + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid model path") + + # Search available model weight @model_app.get(path="/v1/settings/weight/{model_id:path}") async def get_model_weight(model_id): try: - # Normalize and validate the path - base_path = os.path.normpath(CONTAINER_MODEL_PATH) - requested_path = os.path.normpath(os.path.join(CONTAINER_MODEL_PATH, model_id)) - if not requested_path.startswith(base_path): - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid model path") + requested_path = _resolve_model_path(model_id) return get_available_weights(requested_path) + except HTTPException: + raise except Exception as e: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=" GET model weight failed") @@ -34,20 +71,22 @@ async def get_model_weight(model_id): @model_app.get(path="/v1/settings/avail-models/{model_type}") async def get_model_id( model_type: str, - server_address: Optional[str] = Query(default=None, description="vLLM server address (optional)"), + server_address: Optional[str] = Query(default=None, description="remote inference server address (optional)"), ): try: - if model_type == "vLLM": - if not server_address: - server_address = "http://localhost:8086" - return get_available_vllm_models(server_address) - elif model_type == "vLLM_embedding": - if not server_address: - server_address = "http://localhost:8087" - return get_available_vllm_models(server_address) + normalized_type = (model_type or "").strip().lower() + + if normalized_type == "vllm": + return get_available_vllm_models(_normalize_server_address(server_address, "http://localhost:8086")) + elif 
normalized_type == "ovms": + return get_available_ovms_models(_normalize_server_address(server_address, "http://localhost:8000")) + elif normalized_type == "vllm_embedding": + return get_available_vllm_models(_normalize_server_address(server_address, "http://localhost:8087")) else: return get_available_models(model_type) - except Exception as e: + except HTTPException: + raise + except Exception: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=" GET model failed", @@ -126,40 +165,126 @@ async def delete_model(model_id): def get_available_weights(model_path): - avail_weights_compression = [] + avail_weights_compression = set() + + model_name = os.path.basename(model_path).upper() + if "INT4" in model_name: + avail_weights_compression.add("INT4") + if "INT8" in model_name: + avail_weights_compression.add("INT8") + if "FP16" in model_name: + avail_weights_compression.add("FP16") + for _, dirs, _ in os.walk(model_path): for dir_name in dirs: - if "INT4" in dir_name: - avail_weights_compression.append("INT4") - if "INT8" in dir_name: - avail_weights_compression.append("INT8") - if "FP16" in dir_name: - avail_weights_compression.append("FP16") - return avail_weights_compression + upper_name = dir_name.upper() + if "INT4" in upper_name: + avail_weights_compression.add("INT4") + if "INT8" in upper_name: + avail_weights_compression.add("INT8") + if "FP16" in upper_name: + avail_weights_compression.add("FP16") + + return [weight for weight in ["INT4", "INT8", "FP16"] if weight in avail_weights_compression] def get_available_models(model_type): avail_models = [] - if model_type == "vLLM": - LLM_MODEL = os.getenv("LLM_MODEL", "Qwen/Qwen3-8B") - avail_models.append(LLM_MODEL) - elif model_type == "LLM": - items = os.listdir(CONTAINER_MODEL_PATH) - for item in items: - if item == "BAAI": + seen_models = set() + model_roots = _get_model_roots() + if not model_roots: + model_roots = [os.path.realpath(os.path.normpath(CONTAINER_MODEL_PATH))] + + normalized_model_type = (model_type or "").strip().lower() + + def _is_llm_model_dir(file_names: set) -> bool: + if "openvino_model.xml" in file_names and any( + name.endswith(".bin") for name in file_names + ): + return True + + if "config.json" in file_names and ( + "pytorch_model.bin" in file_names + or "model.safetensors" in file_names + or any(name.endswith(".safetensors") for name in file_names) + or "openvino_model.xml" in file_names + ): + return True + + if any(name.endswith(".gguf") for name in file_names): + return True + + return False + + def _discover_llm_model_ids(model_root: str, max_depth: int = 6) -> List[str]: + results: List[str] = [] + + root = os.path.realpath(os.path.normpath(os.path.expanduser(model_root))) + if not os.path.isdir(root): + return results + + stack: List[tuple[str, str, int]] = [(root, "", 0)] + while stack: + abs_dir, rel_dir, depth = stack.pop() + + try: + entries = list(os.scandir(abs_dir)) + except OSError: + continue + + file_names = {e.name for e in entries if e.is_file(follow_symlinks=False)} + if rel_dir and _is_llm_model_dir(file_names): + results.append(rel_dir) + continue + + if depth >= max_depth: continue - sub_paths = os.listdir(os.path.join(CONTAINER_MODEL_PATH, item)) - if sub_paths and "INT4" not in sub_paths[0] and "INT8" not in sub_paths[0] and "FP16" not in sub_paths[0]: - for sub_path in sub_paths: - avail_models.append(item + "/" + sub_path) - else: - avail_models.append(item) - elif model_type == "kbadmin_embedding_model": + + subdirs = [e for e in entries if 
e.is_dir(follow_symlinks=False)] + subdirs.sort(key=lambda e: e.name.lower(), reverse=True) + for entry in subdirs: + name = entry.name + if not name or name.startswith("."): + continue + if name == "BAAI" or (rel_dir and rel_dir.split("/", 1)[0] == "BAAI"): + continue + if name in {"__pycache__", ".ov_cache", "ov_cache", "cache", "tmp"}: + continue + + next_rel = f"{rel_dir}/{name}" if rel_dir else name + if next_rel.split("/", 1)[0] == "BAAI": + continue + stack.append((entry.path, next_rel, depth + 1)) + + return list(dict.fromkeys(results)) + + def add_model(model_name: str): + if model_name not in seen_models: + seen_models.add(model_name) + avail_models.append(model_name) + + if normalized_model_type == "vllm": + LLM_MODEL = os.getenv("LLM_MODEL", "Qwen/Qwen3-8B") + add_model(LLM_MODEL) + elif normalized_model_type == "llm": + llm_candidates: List[str] = [] + for model_root in model_roots: + llm_candidates.extend(_discover_llm_model_ids(model_root)) + + for model_id in sorted(dict.fromkeys(llm_candidates), key=lambda s: s.lower()): + add_model(model_id) + elif normalized_model_type == "kbadmin_embedding_model": return ["BAAI/bge-large-zh-v1.5"] else: - for item in os.listdir(CONTAINER_MODEL_PATH + "BAAI"): - if (model_type == "reranker" and "rerank" in item) or (model_type == "embedding" and "rerank" not in item): - avail_models.append("BAAI/" + item) + for model_root in model_roots: + baai_dir = os.path.join(model_root, "BAAI") + if not os.path.isdir(baai_dir): + continue + for item in os.listdir(baai_dir): + if (normalized_model_type == "reranker" and "rerank" in item) or ( + normalized_model_type == "embedding" and "rerank" not in item + ): + add_model("BAAI/" + item) return avail_models @@ -167,16 +292,124 @@ def get_available_models(model_type): @model_app.get(path="/v1/available_models") def get_available_vllm_models(server_address: str): try: - url = f"{server_address}/v1/models" + base_url = _normalize_server_address(server_address, "http://localhost:8086") + url = f"{base_url}/v1/models" response = requests.get(url, timeout=60) response.raise_for_status() - response_data = response.json() - model_entries = response_data.get("data", []) - models = [entry.get("id") for entry in model_entries if entry.get("id")] - - return models + return _extract_model_ids(response.json()) except requests.exceptions.RequestException as e: raise HTTPException(status_code=500, detail=f"Failed to connect to vLLM server: {str(e)}") except Exception as e: raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}") + + +def _normalize_server_address(server_address: Optional[str], default: str) -> str: + address = (server_address or "").strip() or default + if not address.startswith(("http://", "https://")): + address = f"http://{address}" + + parsed = urlparse(address) + base = f"{parsed.scheme}://{parsed.netloc}" if parsed.netloc else address + path = (parsed.path or "").rstrip("/") + + # Accept inputs like http://host:port/v1 and normalize back to base host. 
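+    # For example: "localhost:8000" -> "http://localhost:8000", "http://host:8086/v1" -> "http://host:8086",
+    # and "http://host:8000/ovms/" -> "http://host:8000/ovms".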
+ if path and path != "/v1": + base = f"{base}{path}" + + return base.rstrip("/") + + +def _extract_model_ids(response_data) -> List[str]: + models = [] + + if isinstance(response_data, dict): + data = response_data.get("data") + if isinstance(data, list): + for entry in data: + if isinstance(entry, dict) and entry.get("id"): + models.append(entry["id"]) + + items = response_data.get("models") + if isinstance(items, list): + for entry in items: + if isinstance(entry, dict): + model_name = entry.get("name") or entry.get("id") + if model_name: + models.append(model_name) + + config = response_data.get("config") + if isinstance(config, dict): + models.extend(config.keys()) + + if not models: + # Some OVMS APIs return a model-name keyed dict at top level. + for key, value in response_data.items(): + if isinstance(value, dict) and ("base_path" in value or "model_version_policy" in value): + models.append(key) + + # Keep original order while deduplicating. + return list(dict.fromkeys(models)) + + +def _extract_ovms_model_names(response_data) -> List[str]: + models: List[str] = [] + try: + models.extend(_extract_model_ids(response_data)) + except Exception: + pass + + if isinstance(response_data, dict): + model_config_list = response_data.get("model_config_list") + if isinstance(model_config_list, list): + for entry in model_config_list: + if isinstance(entry, dict) and entry.get("name"): + models.append(entry["name"]) + + for key, value in response_data.items(): + if not isinstance(key, str): + continue + if not isinstance(value, dict): + continue + if "model_version_status" in value and isinstance(value.get("model_version_status"), list): + models.append(key) + continue + + if any(field in value for field in ("base_path", "model_version_policy", "state")): + models.append(key) + if isinstance(response_data, list): + for entry in response_data: + if isinstance(entry, dict): + name = entry.get("name") or entry.get("id") + if name: + models.append(name) + elif isinstance(entry, str): + models.append(entry) + + return list(dict.fromkeys([m for m in models if isinstance(m, str) and m.strip()])) + + +def get_available_ovms_models(server_address: str) -> List[str]: + base_url = _normalize_server_address(server_address, "http://localhost:8000") + errors = [] + + for endpoint in ("/v1/models", "/v1/config", "/v2/models"): + url = f"{base_url}{endpoint}" + try: + response = requests.get(url, timeout=30) + response.raise_for_status() + models = _extract_ovms_model_names(response.json()) + if models: + return models + except Exception as exc: + errors.append(f"{endpoint}: {exc}") + + # Fall back to configured/default model name to keep generator setup usable. 
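+    # i.e. when /v1/models, /v1/config and /v2/models all fail, return [OVMS_MODEL_NAME] or [LLM_MODEL]
+    # (when set) instead of raising immediately.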
+ fallback_model = os.getenv("OVMS_MODEL_NAME") or os.getenv("LLM_MODEL") + if fallback_model: + return [fallback_model] + + detail = "Failed to discover OVMS models" + if errors: + detail += f" ({'; '.join(errors)})" + raise HTTPException(status_code=500, detail=detail) diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py index 89238f48c2..35cf91a875 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py @@ -6,7 +6,7 @@ import re import time import weakref - +from openvino import Core, Type from edgecraftrag.api_schema import MilvusConnectRequest, PipelineCreateIn from edgecraftrag.base import ( GeneratorType, @@ -17,6 +17,7 @@ from edgecraftrag.components.benchmark import Benchmark from edgecraftrag.components.generator import FreeChatGenerator, QnAGenerator from edgecraftrag.components.postprocessor import MetadataReplaceProcessor, RerankProcessor + from edgecraftrag.config_repository import MilvusConfigRepository, save_pipeline_configurations from edgecraftrag.context import ctx from edgecraftrag.env import PIPELINE_FILE @@ -95,6 +96,11 @@ async def update_pipeline(name, request: PipelineCreateIn): async with ctx.get_pipeline_mgr()._lock: try: await update_pipeline_handler(pl, request) + try: + pl.postprocessor[0].model._model.clear_requests() + pl.postprocessor[0].model._model.compile() + except Exception as e: + pass pipeline_dict = request.dict() pl.update_pipeline_json(pipeline_dict) except (ValueError, Exception) as e: @@ -152,9 +158,10 @@ async def load_pipeline(request): async def update_pipeline_handler(pl, req): - if req.retriever is not None: retr = req.retriever + if pl.max_retrieve_topk != 0: + retr.retrieve_topk = min(retr.retrieve_topk, pl.max_retrieve_topk) pl.update_retriever_config(retr.retriever_type, retr.retrieve_topk) if req.postprocessor is not None: @@ -170,6 +177,8 @@ async def update_pipeline_handler(pl, req): prm.model_type = ModelType.RERANKER reranker_model = ctx.get_model_mgr().load_model(prm) ctx.get_model_mgr().add(reranker_model) + if pl.max_retrieve_topk != 0: + processor.top_n = min(processor.top_n, pl.max_retrieve_topk) postprocessor = RerankProcessor(reranker_model, processor.top_n) pl.postprocessor.append(postprocessor) else: @@ -188,6 +197,8 @@ async def update_pipeline_handler(pl, req): if model is None: if gen.inference_type == InferenceType.VLLM: gen.model.model_type = ModelType.VLLM + elif gen.inference_type == InferenceType.OVMS: + gen.model.model_type = ModelType.OVMS else: gen.model.model_type = ModelType.LLM if pl.enable_benchmark: @@ -200,11 +211,18 @@ async def update_pipeline_handler(pl, req): if gen.generator_type == GeneratorType.CHATQNA: pl.generator.append( QnAGenerator( - model_ref, gen.prompt_path, gen.inference_type, gen.vllm_endpoint, gen.prompt_content + model_ref, + gen.prompt_path, + gen.inference_type, + gen.vllm_endpoint, + gen.prompt_content, + gen.ovms_endpoint, ) ) elif gen.generator_type == GeneratorType.FREECHAT: - pl.generator.append(FreeChatGenerator(model_ref, gen.inference_type, gen.vllm_endpoint)) + pl.generator.append( + FreeChatGenerator(model_ref, gen.inference_type, gen.vllm_endpoint, gen.ovms_endpoint) + ) if pl.enable_benchmark: if "tokenizer" not in locals() or tokenizer is None: @@ -214,11 +232,11 @@ async def update_pipeline_handler(pl, req): pl.benchmark = Benchmark(pl.enable_benchmark, gen.inference_type) else: raise Exception("Inference Type Not Supported") - + flag = 
pl.check_top_k(ctx.get_knowledge_mgr().get_all_knowledge_bases()) + if flag == True: + await save_pipeline_configurations("update", pl) if pl.status.active != req.active: - ctx.get_pipeline_mgr().activate_pipeline( - pl.name, req.active, ctx.get_knowledge_mgr().get_active_knowledge_base() - ) + ctx.get_pipeline_mgr().activate_pipeline(pl.name, req.active, ctx.get_knowledge_mgr().get_active_knowledge_base()) return pl @@ -237,12 +255,13 @@ async def restore_pipeline_configurations(): all_pipelines = f.read() if all_pipelines: all_pipelines = json.loads(all_pipelines) - try: - for pipeline_data in all_pipelines: + for pipeline_data in all_pipelines: + try: pipeline_req = PipelineCreateIn(**pipeline_data) await load_pipeline(pipeline_req) - except Exception as e: - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + except Exception as e: + print(f"Error loading pipeline: {e}") + continue # Detecting if milvus is connected diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/system.py b/EdgeCraftRAG/edgecraftrag/api/v1/system.py index 7a0fd62751..9c746151e3 100644 --- a/EdgeCraftRAG/edgecraftrag/api/v1/system.py +++ b/EdgeCraftRAG/edgecraftrag/api/v1/system.py @@ -14,8 +14,6 @@ def get_available_devices(): core = ov.Core() avail_devices = core.available_devices + ["AUTO"] - if "NPU" in avail_devices: - avail_devices.remove("NPU") return avail_devices diff --git a/EdgeCraftRAG/edgecraftrag/api_schema.py b/EdgeCraftRAG/edgecraftrag/api_schema.py index 4b43b378f9..5c98df05a5 100644 --- a/EdgeCraftRAG/edgecraftrag/api_schema.py +++ b/EdgeCraftRAG/edgecraftrag/api_schema.py @@ -50,6 +50,7 @@ class GeneratorIn(BaseModel): model: Optional[ModelIn] = None inference_type: Optional[str] = "local" vllm_endpoint: Optional[str] = None + ovms_endpoint: Optional[str] = None class PipelineCreateIn(BaseModel): diff --git a/EdgeCraftRAG/edgecraftrag/base.py b/EdgeCraftRAG/edgecraftrag/base.py index 53209ca043..c0a431ad50 100644 --- a/EdgeCraftRAG/edgecraftrag/base.py +++ b/EdgeCraftRAG/edgecraftrag/base.py @@ -37,6 +37,7 @@ class ModelType(str, Enum): RERANKER = "reranker" LLM = "llm" VLLM = "vllm" + OVMS = "ovms" VLLM_EMBEDDING = "vllm_embedding" @@ -89,6 +90,7 @@ class InferenceType(str, Enum): LOCAL = "local" VLLM = "vllm" + OVMS = "ovms" class CallbackType(str, Enum): diff --git a/EdgeCraftRAG/edgecraftrag/components/agent.py b/EdgeCraftRAG/edgecraftrag/components/agent.py index 525c10b8e3..914edb6229 100644 --- a/EdgeCraftRAG/edgecraftrag/components/agent.py +++ b/EdgeCraftRAG/edgecraftrag/components/agent.py @@ -3,13 +3,21 @@ import os from abc import abstractmethod +from typing import Any, List from comps.cores.proto.api_protocol import ChatCompletionRequest from edgecraftrag.base import BaseComponent, CallbackType, CompType, GeneratorType from edgecraftrag.components.agents.utils import remove_think_tags from edgecraftrag.utils import stream_generator from langgraph.config import get_stream_writer -from pydantic import model_serializer +from pydantic import BaseModel, Field, model_serializer + + +class Retrieval(BaseModel): + step: Any + query: str + retrieved: List[Any] = Field(...) + reranked: List[Any] = Field(...) 
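+# Example record (illustrative): Retrieval(step="Step 1: rephrase the question", query="<sub-question>",
+#     retrieved=[...], reranked=[...]), where retrieved/reranked hold the NodeWithScore-like objects
+#     later serialized by serialize_node_with_score() in chatqna.py.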
class Agent(BaseComponent): @@ -22,12 +30,17 @@ def __init__(self, name, agent_type, pipeline_idx, configs): self.pipeline_idx = pipeline_idx self.manager = None self.configs = configs + self.retrievals: List[Retrieval] = [] @classmethod @abstractmethod def get_default_configs(cls): pass + @abstractmethod + async def run(self, **kwargs) -> Any: + pass + def get_bound_pipeline(self): if self.manager is not None: pl = self.manager.get_pipeline_by_name_or_id(self.pipeline_idx) diff --git a/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/cfgs/default.json b/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/cfgs/default.json deleted file mode 100644 index 7872c89016..0000000000 --- a/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/cfgs/default.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "system_instruction": "As an expert AI assistant, your goal is to provide accurate solutions. Analyze the user's question, create a retrieval plan, gather information, and synthesize a step-by-step answer. Follow all instructions.", - "plan_instruction": "To maximize retrieval recall, create a multi-step query plan. First, deconstruct the user's question into its core components and symptoms. Then, generate hypotheses about the potential root causes. Finally, create a numbered list of 2-5 queries to investigate these hypotheses.\n\n* **Step 1 (Rephrase and Broaden):** Start with a comprehensive query that rephrases the user's question, including synonyms and alternative phrasings to ensure broad initial coverage.\n* **Subsequent Steps (Hypothesis Testing):** Each following query should be a targeted, self-contained question designed to confirm or deny a specific hypothesis. These queries must include precise technical terms, component names, and potential error codes to retrieve the most relevant documents.\n\nYour final output must be only the numbered list of queries.", - "query_instruction": "After each retrieval, evaluate if you have enough information to solve the problem. If not, and if your plan has more steps, formulate the next query. This query must be a concise, targeted sub-question with precise keywords to fill a specific knowledge gap. Do not use prefixes like 'Query:'./no_think", - "answer_instruction": "Synthesize the retrieved information into a final, actionable answer for the user.\n\n**User's Question:**\n{question}\n\n**Retrieved Information:**\n{plan_with_information}\n\n**Your Task:**\n1. **Synthesize and Filter:** Review all retrieved context, using only the most relevant information to address the user's problem.\n2. **Structure and Format:** Organize the solution into a clear, step-by-step guide. Present it as a numbered or bulleted list, highlighting any warnings at the beginning./no_think", - "domain_knowledge": "", - "prompt_templates": { - "system": "{system_instruction}\n\n{query_instruction}\n\n{domain_knowledge}\n\n{experiences}\n", - "generate_query": "Now generate a query for the next retrieval./no_think", - "make_plan": "Now generate a plan based on the user's question above. \n\n{plan_instruction}\n\nFormat the plan as a (Python) list containing the ordered steps, each step is a string./no_think", - "plan": "The following is the plan to step by step retrieve knowledge needed and work out an answer to user's question:\n{plan_steps}\n", - "plan_step": "Step {num}: {step}.", - "context": "\n{context}\n\n", - "contexts": "The following are the retrieved contexts for current query.\n{contexts}\n", - "continue_decision": "Is more information needed? 
Answer Yes or No. Then explain why or why not.", - "experiences": "The following are question-plan examples by human experts. Refer to them to better make your plan. If you find that there is a question that is highly similar or exactly match the input question, then strictly follow the subquestions to make the plan.\n\n{experiences}\n" - }, - - "retrieve_top_k": 60, - "rerank_top_k": 3, - "mece_retrieval": true, - "max_retrievals": 3, - "max_plan_steps": 3 -} diff --git a/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/config.py b/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/config.py index 1c40ed023c..e7066c767d 100644 --- a/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/config.py +++ b/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/config.py @@ -4,6 +4,7 @@ from __future__ import annotations +from copy import deepcopy import json from pathlib import Path from typing import Any, Dict @@ -44,6 +45,36 @@ class Config(BaseModel): prompt_templates: PromptTemplates +DEFAULT_CONFIG_DICT: Dict[str, Any] = { + "system_instruction": "As an expert AI assistant, your goal is to provide accurate solutions. Analyze the user's question, create a retrieval plan, gather information, and synthesize a step-by-step answer. Follow all instructions.", + "plan_instruction": "To maximize retrieval recall, create a multi-step query plan. First, deconstruct the user's question into its core components and symptoms. Then, generate hypotheses about the potential root causes. Finally, create a numbered list of 2-5 queries to investigate these hypotheses.\n\n* **Step 1 (Rephrase and Broaden):** Start with a comprehensive query that rephrases the user's question, including synonyms and alternative phrasings to ensure broad initial coverage.\n* **Subsequent Steps (Hypothesis Testing):** Each following query should be a targeted, self-contained question designed to confirm or deny a specific hypothesis. These queries must include precise technical terms, component names, and potential error codes to retrieve the most relevant documents.\n\nYour final output must be only the numbered list of queries.", + "query_instruction": "After each retrieval, evaluate if you have enough information to solve the problem. If not, and if your plan has more steps, formulate the next query. This query must be a concise, targeted sub-question with precise keywords to fill a specific knowledge gap. Do not use prefixes like 'Query:'./no_think", + "answer_instruction": "Synthesize the retrieved information into a final, actionable answer for the user.\n\n**User's Question:**\n{question}\n\n**Retrieved Information:**\n{plan_with_information}\n\n**Your Task:**\n1. **Synthesize and Filter:** Review all retrieved context, using only the most relevant information to address the user's problem.\n2. **Structure and Format:** Organize the solution into a clear, step-by-step guide. Present it as a numbered or bulleted list, highlighting any warnings at the beginning.\n\n**Citation Rules (MUST follow):**\n- Use only provided DOCUMENT_NODE evidence.\n- For each claim based on DOCUMENT_NODE_CONTEXT, append citation after the current paragraph:\n - Chinese answer: (来自 [DOCUMENT_NODE_SOURCE](DOCUMENT_NODE_FILE_PATH))\n - Non-Chinese answer: (from [DOCUMENT_NODE_SOURCE](DOCUMENT_NODE_FILE_PATH))\n- At the end of the answer, output:\n\n --- \n\n### Document Source:\n- DOCUMENT_NODE_SOURCE\n\nOnly include unique DOCUMENT_NODE_SOURCE values (deduplicated). 
Do NOT include links/URLs/paths in this final Document Source block./no_think", + "domain_knowledge": "", + "prompt_templates": { + "system": "{system_instruction}\n\n{query_instruction}\n\n{domain_knowledge}\n\n{experiences}\n", + "generate_query": "Now generate a query for the next retrieval./no_think", + "make_plan": "Now generate a plan based on the user's question above. \n\n{plan_instruction}\n\nFormat the plan as a (Python) list containing the ordered steps, each step is a string./no_think", + "plan": "The following is the plan to step by step retrieve knowledge needed and work out an answer to user's question:\n{plan_steps}\n", + "plan_step": "Step {num}: {step}.", + "context": "\n{context}\n\n", + "contexts": "The following are the retrieved contexts for current query.\n{contexts}\n", + "continue_decision": "Is more information needed? Answer Yes or No. Then explain why or why not.", + "experiences": "The following are question-plan examples by human experts. Refer to them to better make your plan. If you find that there is a question that is highly similar or exactly match the input question, then strictly follow the subquestions to make the plan.\n\n{experiences}\n", + }, + "retrieve_top_k": 60, + "rerank_top_k": 3, + "mece_retrieval": True, + "max_retrievals": 3, + "max_plan_steps": 3, +} + + +def get_default_config() -> Config: + """Return the built-in DeepSearch default configuration.""" + return Config(**deepcopy(DEFAULT_CONFIG_DICT)) + + def _resolve_path(value: str, base_path: Path) -> str: """Resolve value relative to ``base_path`` if it is an existing file.""" if not value: diff --git a/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/deep_search.py b/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/deep_search.py index ada6713be1..34fba173af 100644 --- a/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/deep_search.py +++ b/EdgeCraftRAG/edgecraftrag/components/agents/deep_search/deep_search.py @@ -10,27 +10,18 @@ from comps.cores.proto.api_protocol import ChatCompletionRequest from edgecraftrag.base import AgentType, CallbackType, CompType -from edgecraftrag.components.agent import Agent, stream_writer +from edgecraftrag.components.agent import Agent, Retrieval, stream_writer +from edgecraftrag.components.agents.utils import build_document_node_block from langgraph.graph import END, START, StateGraph from pydantic import BaseModel, Field -from .config import load_config +from .config import Config, PromptTemplates, get_default_config from .logging_utils import format_terminal_str, log_status from .postprocessing import postproc_answer as default_postproc_answer from .postprocessing import postproc_plan as default_postproc_plan from .postprocessing import postproc_query as default_postproc_query from .utils import Role, import_module_from_path -DEFAULT_CONFIG = "./edgecraftrag/components/agents/deep_search/cfgs/default.json" - - -class Retrieval(BaseModel): - step: str - query: str - retrieved: List[Any] = Field(...) - reranked: List[Any] = Field(...) 
- - class DeepSearchState(BaseModel): question: str query: str @@ -52,21 +43,17 @@ class DeepSearchAgent(Agent): def __init__(self, idx, name, pipeline_idx, cfg): super().__init__(name=name, agent_type=AgentType.DEEPSEARCH, pipeline_idx=pipeline_idx, configs=cfg) - # Load the configuration - # TODO: remove deep path - self.cfg = load_config(DEFAULT_CONFIG) + cfg = cfg or {} + default_cfg = get_default_config().model_dump() + merged_cfg = {**default_cfg, **cfg} + merged_cfg["prompt_templates"] = { + **default_cfg["prompt_templates"], + **cfg.get("prompt_templates", {}), + } + self.cfg = Config(**merged_cfg) + self.configs = merged_cfg if idx is not None: self.idx = idx - if "retrieve_top_k" in cfg: - self.cfg.retrieve_top_k = cfg["retrieve_top_k"] - if "rerank_top_k" in cfg: - self.cfg.rerank_top_k = cfg["rerank_top_k"] - if "mece_retrieval" in cfg: - self.cfg.mece_retrieval = cfg["mece_retrieval"] - if "max_retrievals" in cfg: - self.cfg.max_retrievals = cfg["max_retrievals"] - if "max_plan_steps" in cfg: - self.cfg.max_plan_steps = cfg["max_plan_steps"] self.graph = self._build_graph() self._messages: List[dict] = [] @@ -90,16 +77,30 @@ def __init__(self, idx, name, pipeline_idx, cfg): @classmethod def get_default_configs(cls): - cfg = load_config(DEFAULT_CONFIG) - return { - "retrieve_top_k": cfg.retrieve_top_k, - "rerank_top_k": cfg.rerank_top_k, - "mece_retrieval": cfg.mece_retrieval, - "max_retrievals": cfg.max_retrievals, - "max_plan_steps": cfg.max_plan_steps, - } + return get_default_config().model_dump() def update(self, cfg): + for key in [ + "system_instruction", + "plan_instruction", + "query_instruction", + "answer_instruction", + "recur_summarize_instruction", + ]: + value = cfg.get(key, None) + if value is not None and isinstance(value, str): + setattr(self.cfg, key, value) + self.configs[key] = value + + prompt_templates = cfg.get("prompt_templates", None) + if prompt_templates is not None and isinstance(prompt_templates, dict): + updated_templates = { + **self.cfg.prompt_templates.model_dump(), + **prompt_templates, + } + self.cfg.prompt_templates = PromptTemplates(**updated_templates) + self.configs["prompt_templates"] = updated_templates + retrieve = cfg.get("retrieve_top_k", None) if retrieve and isinstance(retrieve, int): self.cfg.retrieve_top_k = retrieve @@ -111,7 +112,7 @@ def update(self, cfg): self.configs["rerank_top_k"] = rerank mr = cfg.get("mece_retrieval", None) - if mr and isinstance(mr, int): + if mr is not None and isinstance(mr, bool): self.cfg.mece_retrieval = mr self.configs["mece_retrieval"] = mr @@ -195,8 +196,8 @@ async def _retrieve_and_rerank( if mece_retrieve: new_retrieved = [node for node in retrieved if node.node_id not in state.context_chunk_ids] - # TODO: Using top_k from request, need to change? 
- new_retrieved = new_retrieved[: request.k] + effective_top_k = request.k if request.k not in (None, 0) else self.cfg.retrieve_top_k + new_retrieved = new_retrieved[:effective_top_k] else: new_retrieved = retrieved @@ -427,7 +428,7 @@ async def generate_answer(self, state: DeepSearchState) -> dict: for doc in retrieval.reranked: node_id = doc.node_id if node_id not in presented_ids: - plan_with_information += f"{doc.text}\n\n" + plan_with_information += build_document_node_block(doc) + "\n" presented_ids.append(node_id) else: plan_with_information = "Plan with Retrieved Information:\n" @@ -439,7 +440,7 @@ async def generate_answer(self, state: DeepSearchState) -> dict: related_docs = retrieval.reranked break for doc in related_docs: - plan_with_information += f"- {doc.text}\n" + plan_with_information += build_document_node_block(doc) plan_with_information += "\n" self._messages = [ diff --git a/EdgeCraftRAG/edgecraftrag/components/agents/simple.py b/EdgeCraftRAG/edgecraftrag/components/agents/simple.py index 722bd3acda..131afe63e8 100644 --- a/EdgeCraftRAG/edgecraftrag/components/agents/simple.py +++ b/EdgeCraftRAG/edgecraftrag/components/agents/simple.py @@ -2,21 +2,48 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +from copy import deepcopy from typing import Any, List from comps.cores.proto.api_protocol import ChatCompletionRequest from edgecraftrag.base import AgentType, CallbackType, CompType -from edgecraftrag.components.agent import Agent, stream_writer -from edgecraftrag.components.agents.utils import ROLE, format_terminal_str +from edgecraftrag.components.agent import Agent, Retrieval, stream_writer +from edgecraftrag.components.agents.utils import ROLE, build_document_node_block, format_terminal_str from langgraph.graph import END, START, StateGraph from pydantic import BaseModel, Field -class Retrieval(BaseModel): - step: int - query: str - retrieved: List[Any] = Field(...) - reranked: List[Any] = Field(...) +class PromptTemplates(BaseModel): + system: str + generate_query: str + context: str + contexts: str + continue_decision: str + + +class Config(BaseModel): + system_instruction: str + query_instruction: str + answer_instruction: str + domain_knowledge: str = "" + max_retrievals: int = 3 + prompt_templates: PromptTemplates + + +DEFAULT_CONFIG = { + "system_instruction": "You will be provided with a question from a user, and you need to create queries and execute them based on the question for the final answer.\nYou should only use the information provided in the search results to answer the user's question. \nMake your response in the same language as the user's question./no_think", + "query_instruction": 'Every time when asked if more information is needed, check the retrieved contexts and try to identify new content that is related. Then based on what you get and all above, decide if a new query is needed to gather more potential useful information. The query should be a very concise and clear sub-question that is specific to the user\'s question. A good query should include all the related actions or keywords that can help to retrieve the most related context. Response with the query directly.\nDO NOT use any prefix, such as "Query:"/no_think', + "answer_instruction": "You have been provided with a question from user:\n{question}\n\nThe following are the plan steps you generated and the corresponding retrieved information:\n{plan_with_information}\n\nBased on the above, come up with a final answer for the user's question. 
Format the answer as a list of steps that can guide the user to solve the problem.\n\nCitation rules (MUST follow):\n1) Use only the provided DOCUMENT_NODE evidence.\n2) For each claim based on DOCUMENT_NODE_CONTEXT, append a citation after the current paragraph using:\n - Chinese answer: (来自 [DOCUMENT_NODE_SOURCE](DOCUMENT_NODE_FILE_PATH))\n - Non-Chinese answer: (from [DOCUMENT_NODE_SOURCE](DOCUMENT_NODE_FILE_PATH))\n3) At the end of your answer, output:\n\n --- \n\n### Document Source:\n- DOCUMENT_NODE_SOURCE\n\nOnly list unique DOCUMENT_NODE_SOURCE values (deduplicated). Do NOT output links/URLs/paths in this final Document Source block./no_think", + "domain_knowledge": "", + "max_retrievals": 3, + "prompt_templates": { + "system": """{system_instruction}\n\n{query_instruction}\n\n{domain_knowledge}\n\n""", + "generate_query": "Now generate a query for the next retrieval.", + "context": """\n{context}\n\n""", + "contexts": """The following are the retrieved contexts for current query.\n{contexts}\n""", + "continue_decision": "Is more information needed? Answer Yes or No. Then explain why or why not.", + }, +} class QnaState(BaseModel): @@ -34,30 +61,52 @@ class SimpleRAGAgent(Agent): def __init__(self, idx, name, pipeline_idx, cfg): super().__init__(name=name, agent_type=AgentType.SIMPLE, pipeline_idx=pipeline_idx, configs=cfg) + cfg = cfg or {} + merged_cfg = {**DEFAULT_CONFIG, **cfg} + merged_cfg["prompt_templates"] = { + **DEFAULT_CONFIG["prompt_templates"], + **cfg.get("prompt_templates", {}), + } + self.cfg = Config(**merged_cfg) + self.configs = merged_cfg + self.graph = self._build_graph() self._messages = [] self.conversation_history = [] + self.retrievals: List[Retrieval] = [] if idx is not None: self.idx = idx - if "max_retrievals" in cfg: - self.max_retrievals = int(cfg["max_retrievals"]) - else: - self.max_retrievals = 3 + self.max_retrievals = int(self.cfg.max_retrievals) self.postproc_query = postproc_query self.postproc_answer = postproc_answer @classmethod def get_default_configs(cls): - return {"max_retrievals": 3} + return deepcopy(DEFAULT_CONFIG) def update(self, cfg): max_r = cfg.get("max_retrievals", None) if max_r and isinstance(max_r, int): self.max_retrievals = int(max_r) self.configs["max_retrievals"] = self.max_retrievals - return True - else: - return False + self.cfg.max_retrievals = self.max_retrievals + + for key in ["system_instruction", "query_instruction", "answer_instruction", "domain_knowledge"]: + value = cfg.get(key, None) + if value and isinstance(value, str): + setattr(self.cfg, key, value) + self.configs[key] = value + + prompt_templates = cfg.get("prompt_templates", None) + if prompt_templates and isinstance(prompt_templates, dict): + updated_templates = { + **self.cfg.prompt_templates.model_dump(), + **prompt_templates, + } + self.cfg.prompt_templates = PromptTemplates(**updated_templates) + self.configs["prompt_templates"] = updated_templates + + return True def _build_graph(self): @@ -103,7 +152,7 @@ async def generate_query(self, state: QnaState) -> dict: messages = [ {"role": ROLE.USER, "content": state.question}, - {"role": ROLE.SYSTEM, "content": PROMPT_TEMPLATE.GENERATE_QUERY}, + {"role": ROLE.SYSTEM, "content": self.cfg.prompt_templates.generate_query}, ] self._messages.extend(messages) self.conversation_history.extend(messages) @@ -123,14 +172,14 @@ async def check_retrieved(self, state: QnaState) -> str: await stream_writer("🤔 **Evaluating if more information is needed...**\n\n") # Format context for the next decision - contexts = 
PROMPT_TEMPLATE.CONTEXTS.format( + contexts = self.cfg.prompt_templates.contexts.format( contexts="\n".join( - [PROMPT_TEMPLATE.CONTEXT.format(context=doc.text) for doc in state.retrievals[-1].reranked] + [self.cfg.prompt_templates.context.format(context=doc.text) for doc in state.retrievals[-1].reranked] ) ) messages = [ {"role": ROLE.SYSTEM, "content": contexts}, - {"role": ROLE.SYSTEM, "content": PROMPT_TEMPLATE.CONTINUE}, + {"role": ROLE.SYSTEM, "content": self.cfg.prompt_templates.continue_decision}, ] self._messages.extend(messages) self.conversation_history.extend(messages) @@ -169,14 +218,14 @@ async def generate_answer(self, state: QnaState) -> dict: if r.step != prev_step: plan_with_information += f"Step {i+1}\n\nRetrieved:\n" for doc in r.reranked: - plan_with_information += doc.text + "\n" + plan_with_information += build_document_node_block(doc) plan_with_information += "\n" prev_step = r.step self._messages = [ { "role": ROLE.SYSTEM, - "content": answer_instruction.format( + "content": self.cfg.answer_instruction.format( question=state.question, plan_with_information=plan_with_information ), } @@ -202,6 +251,7 @@ async def run(self, **kwargs) -> Any: if "cbtype" in kwargs: if kwargs["cbtype"] == CallbackType.RUNAGENT: request = kwargs["chat_request"] + self.retrievals.clear() print( "🤿", @@ -214,9 +264,16 @@ async def run(self, **kwargs) -> Any: self._messages = self._build_init_messages(request.messages) async def async_gen(): - async for chunk in self.graph.astream(state, stream_mode="custom"): - yield chunk + final_state = None + async for mode, chunk in self.graph.astream(state, stream_mode=["custom", "values"]): + if mode == "custom": + yield chunk + elif mode == "values": + final_state = chunk await asyncio.sleep(0) + if isinstance(final_state, dict): + self.retrievals.clear() + self.retrievals.extend(final_state.get("retrievals", [])) print("✅", format_terminal_str("RAG process completed", color="cyan", bold=True)) return async_gen() @@ -225,10 +282,10 @@ def _build_init_messages(self, question) -> List[dict]: return [ { "role": ROLE.SYSTEM, - "content": PROMPT_TEMPLATE.SYSTEM.format( - system_instruction=system_instruction, - query_instruction=query_instruction, - domain_knowledge="", + "content": self.cfg.prompt_templates.system.format( + system_instruction=self.cfg.system_instruction, + query_instruction=self.cfg.query_instruction, + domain_knowledge=self.cfg.domain_knowledge, ), } ] @@ -247,28 +304,3 @@ def postproc_query(text, state): def postproc_answer(text, state): return text - - -system_instruction = "You will be provided with a question from a user, and you need to create queries and execute them based on the question for the final answer.\nYou should only use the information provided in the search results to answer the user's question. \nMake your response in the same language as the user's question./no_think" -query_instruction = 'Every time when asked if more information is needed, check the retrieved contexts and try to identify new content that is related. Then based on what you get and all above, decide if a new query is needed to gather more potential useful information. The query should be a very concise and clear sub-question that is specific to the user\'s question. A good query should include all the related actions or keywords that can help to retrieve the most related context. 
Response with the query directly.\nDO NOT use any prefix, such as "Query:"/no_think' -answer_instruction = "You have been provided with a question from user:\n{question}\n\nThe following are the plan steps you generated and the corresponding retrieved information:{plan_with_information}\n\nBased on the above, come up with a final answer for the user's question. Format the answer as a list of steps that can guide the user to solve the problem./no_think" - - -class PROMPT_TEMPLATE: - # only contain formatting related instructions here - - SYSTEM = """{system_instruction} - -{query_instruction} - -{domain_knowledge} - -""" - GENERATE_QUERY = "Now generate a query for the next retrieval." - - CONTEXT = """\n{context}\n\n""" - CONTEXTS = """The following are the retrieved contexts for current query.\n{contexts}\n""" - - CONTINUE = "Is more information needed? Answer Yes or No. Then explain why or why not." - - EXPERIENCES = """The following are question-plan examples by human experts. Refer to them to better make your plan. If you find that there is a question that is highly similar or exactly match the input question, then strictly follow the subquestions to make the plan.\n\n{experiences}\n""" diff --git a/EdgeCraftRAG/edgecraftrag/components/agents/utils.py b/EdgeCraftRAG/edgecraftrag/components/agents/utils.py index a1aa79d531..b11df280ca 100644 --- a/EdgeCraftRAG/edgecraftrag/components/agents/utils.py +++ b/EdgeCraftRAG/edgecraftrag/components/agents/utils.py @@ -8,6 +8,7 @@ import re import sys from typing import Any, Dict, List, Optional, Tuple, Union +from urllib.parse import quote import numpy from pydantic import BaseModel @@ -126,6 +127,40 @@ def remove_tagged(text, tag="think"): return re.sub(pattern, "", text, flags=re.DOTALL).strip() +def encode_document_file_path(file_path: str) -> str: + if not isinstance(file_path, str) or not file_path: + return "" + return quote(file_path, safe="/:%") + + +def build_document_node_block(doc: Any) -> str: + metadata = {} + if hasattr(doc, "node") and hasattr(doc.node, "metadata") and isinstance(doc.node.metadata, dict): + metadata = doc.node.metadata + + source = metadata.get("file_name", "") if isinstance(metadata, dict) else "" + file_path = metadata.get("file_path", "") if isinstance(metadata, dict) else "" + page_num = metadata.get("page_label", "") if isinstance(metadata, dict) else "" + encoded_file_path = encode_document_file_path(file_path) + page_num_str = str(page_num).strip() if page_num is not None else "" + page_suffix = f"#page={page_num_str}" if page_num_str else "" + + if hasattr(doc, "text") and isinstance(doc.text, str): + node_context = doc.text.strip() + elif hasattr(doc, "node") and hasattr(doc.node, "text") and isinstance(doc.node.text, str): + node_context = doc.node.text.strip() + else: + node_context = "" + + return ( + "\n" + f"{source}\n" + f"{encoded_file_path}{page_suffix}\n" + f"{node_context}\n" + "\n" + ) + + def _extract_pattern_and_text(line: str) -> Optional[Tuple[str, int, str, str]]: """Checks if a line matches the pattern [prefix][digit][suffix][text]. 
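Reviewer note (not part of the patch): `build_document_node_block` can be exercised with a stand-in node object, assuming the `edgecraftrag` package is importable. The metadata keys (`file_name`, `file_path`, `page_label`), the percent-encoding of the path, and the `#page=` suffix follow the implementation above; the sample values below are invented.

```python
from types import SimpleNamespace

from edgecraftrag.components.agents.utils import build_document_node_block

# Stand-in for a retrieved node: .text plus .node.metadata as produced by the node parsers.
doc = SimpleNamespace(
    text="Install the kernel-mode GPU driver before the user-space runtime.",
    node=SimpleNamespace(
        metadata={
            "file_name": "setup guide.pdf",
            "file_path": "/home/user/docs/setup guide.pdf",  # spaces get percent-encoded
            "page_label": "12",
        }
    ),
)

# Returns a citation-ready block: source name, encoded path with "#page=12", and the chunk text,
# which the updated answer prompts turn into inline citations and a deduplicated Document Source list.
print(build_document_node_block(doc))
```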
diff --git a/EdgeCraftRAG/edgecraftrag/components/benchmark.py b/EdgeCraftRAG/edgecraftrag/components/benchmark.py index f91324a9d8..20872e59cc 100644 --- a/EdgeCraftRAG/edgecraftrag/components/benchmark.py +++ b/EdgeCraftRAG/edgecraftrag/components/benchmark.py @@ -19,7 +19,8 @@ def __init__(self, enable_benchmark, inference_type, tokenizer=None, bench_hook= self.vllm_metrics = {} if inference_type == InferenceType.VLLM: self.is_vllm = True - self.vllm_metrics = self.get_vllm_metrics() + if self.enabled: + self.vllm_metrics = self.get_vllm_metrics() else: self.is_vllm = False self.tokenizer = tokenizer @@ -78,6 +79,10 @@ def update_benchmark_data(self, idx, comp_type, data): if self.is_enabled() and idx in self.benchmark_data_list and comp_type in self.benchmark_data_list[idx]: self.benchmark_data_list[idx][comp_type] = data + def update_benchmark_data_genai(self, idx, comp_type, data, model): + if self.is_enabled() and idx in self.benchmark_data_list and comp_type in self.benchmark_data_list[idx]: + self.benchmark_data_list[idx][comp_type] = data + def get_benchmark_data(self, idx, comp_type): if self.is_enabled() and idx in self.benchmark_data_list and comp_type in self.benchmark_data_list[idx]: return self.benchmark_data_list[idx][comp_type] @@ -108,6 +113,17 @@ def insert_llm_data(self, idx, input_token_size=-1): self.llm_data_list[idx] = metrics + def insert_llm_data_genai(self, idx, input_token_size=-1, model=None): + if self.is_enabled(): + metrics = {} + metrics["input_token_size"] = input_token_size + metrics["output_token_size"] = model().perf_metrics.get_num_generated_tokens() + metrics["generation_time"] = model().perf_metrics.get_inference_duration().mean/1000 + metrics["first_token_latency"] = model().perf_metrics.get_ttft().mean/1000 + metrics["other_tokens_avg_latency"] = model().perf_metrics.get_tpot().mean/1000 + + self.llm_data_list[idx] = metrics + def get_vllm_metrics(self): # self.vllm_metrics is the previous vllm metric vllm_metrics = [ @@ -123,7 +139,14 @@ def get_vllm_metrics(self): metrics[key] = 0 llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8086") - response = requests.get(f"{llm_endpoint}/metrics", headers={"Content-Type": "application/json"}) + try: + response = requests.get( + f"{llm_endpoint}/metrics", + headers={"Content-Type": "application/json"}, + timeout=3, + ) + except requests.RequestException: + return {} if response.status_code == 200: metrics_data = text_string_to_metric_families(response.text) else: @@ -151,9 +174,9 @@ def ser_model(self): set = { "Benchmark enabled": self.enabled, "last_benchmark_data": ( - self.benchmark_data_list[self.dict_idx] if self.dict_idx in self.benchmark_data_list else None + self.benchmark_data_list[self.last_idx] if self.last_idx in self.benchmark_data_list else None ), - "llm_metrics": self.llm_data_list[self.dict_idx] if self.dict_idx in self.llm_data_list else None, + "llm_metrics": self.llm_data_list[self.last_idx] if self.last_idx in self.llm_data_list else None, } else: set = { diff --git a/EdgeCraftRAG/edgecraftrag/components/generator.py b/EdgeCraftRAG/edgecraftrag/components/generator.py index 50d1452457..fcad2433d8 100644 --- a/EdgeCraftRAG/edgecraftrag/components/generator.py +++ b/EdgeCraftRAG/edgecraftrag/components/generator.py @@ -5,13 +5,15 @@ import json import os import time +import weakref import urllib.request from concurrent.futures import ThreadPoolExecutor from urllib.parse import urlparse from comps.cores.proto.api_protocol import ChatCompletionRequest from edgecraftrag.base 
import BaseComponent, CompType, GeneratorType, InferenceType, NodeParserType -from edgecraftrag.utils import get_prompt_template +from edgecraftrag.utils import get_prompt_template, resolve_prompt_template_path +from edgecraftrag.components.agents.utils import build_document_node_block from fastapi.responses import StreamingResponse from llama_index.llms.openai_like import OpenAILike from pydantic import model_serializer @@ -86,21 +88,28 @@ def build_stream_response(status=None, content=None, error=None): return response -async def local_stream_generator(lock, llm, prompt_str, unstructured_str): +async def local_stream_generator(lock, llm, prompt_str, unstructured_str, benchmark=None, benchmark_index=None): + enable_benchmark = benchmark.is_enabled() if benchmark else False + start_time = time.perf_counter() if enable_benchmark else None async with lock: - response = await llm.astream_complete(prompt_str) + if enable_benchmark: + response = await llm.astream_complete_with_bench(prompt_str) + else: + response = await llm.astream_complete(prompt_str) try: async for r in response: yield r.delta or "" await asyncio.sleep(0) if unstructured_str: yield unstructured_str + if enable_benchmark: + benchmark.update_benchmark_data_genai(benchmark_index, CompType.GENERATOR, time.perf_counter() - start_time, weakref.ref(llm)) + benchmark.insert_llm_data_genai(benchmark_index, benchmark.cal_input_token_size(prompt_str), weakref.ref(llm)) except Exception as e: start_idx = str(e).find("message") + len("message") result_error = str(e)[start_idx:] yield f"code:0000{result_error}" - async def stream_generator(llm, prompt_str, unstructured_str, benchmark=None, benchmark_index=None): enable_benchmark = benchmark.is_enabled() if benchmark else False start_time = time.perf_counter() if enable_benchmark else None @@ -140,6 +149,7 @@ def clone_generator(src_generator: BaseComponent, dst_generator_cfg: dict = None "llm_model": src_generator.llm, "inference_type": src_generator.inference_type, "vllm_endpoint": src_generator.vllm_endpoint, + "ovms_endpoint": getattr(src_generator, "ovms_endpoint", ""), } if generator_type == GeneratorType.CHATQNA: @@ -164,7 +174,16 @@ def clone_generator(src_generator: BaseComponent, dst_generator_cfg: dict = None class QnAGenerator(BaseComponent): - def __init__(self, llm_model, prompt_template_file, inference_type, vllm_endpoint, prompt_content, **kwargs): + def __init__( + self, + llm_model, + prompt_template_file, + inference_type, + vllm_endpoint, + prompt_content, + ovms_endpoint="", + **kwargs, + ): BaseComponent.__init__( self, comp_type=CompType.GENERATOR, @@ -186,7 +205,12 @@ def __init__(self, llm_model, prompt_template_file, inference_type, vllm_endpoin llm_instance = llm_model() if llm_instance.model_path is None or llm_instance.model_path == "": self.model_id = llm_instance.model_id - self.model_path = os.path.join("/home/user/models", os.getenv("LLM_MODEL", "Qwen/Qwen3-8B")) + if self.inference_type in (InferenceType.VLLM, InferenceType.OVMS): + # Remote inference may not have local model files. Use model id directly + # to avoid invalid absolute-path repo id validation failures. 
+ self.model_path = self.model_id + else: + self.model_path = os.path.join("/home/user/models", os.getenv("LLM_MODEL", "Qwen/Qwen3-8B")) else: self.model_id = llm_instance.model_id self.model_path = llm_instance.model_path @@ -195,13 +219,22 @@ def __init__(self, llm_model, prompt_template_file, inference_type, vllm_endpoin ) self.llm = llm_model + self.vllm_name = llm_model().model_id if not isinstance(llm_model, str) else llm_model if self.inference_type == InferenceType.LOCAL: self.lock = asyncio.Lock() if self.inference_type == InferenceType.VLLM: - self.vllm_name = llm_model().model_id if vllm_endpoint == "": vllm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8086") + if self.inference_type == InferenceType.OVMS: + if ovms_endpoint == "": + ovms_endpoint = os.getenv("OVMS_ENDPOINT", "http://localhost:8000") self.vllm_endpoint = vllm_endpoint + self.ovms_endpoint = ovms_endpoint + + if self.inference_type == InferenceType.OVMS: + self.remote_endpoint = self.ovms_endpoint + else: + self.remote_endpoint = self.vllm_endpoint def prompt_handler( self, model_path, prompt_content=None, prompt_template_file=None, enable_think=False, enable_rag_retrieval=True @@ -214,16 +247,11 @@ def prompt_handler( return prompt_template else: if enable_rag_retrieval: - safe_root = "/templates" + resolve_prompt_template_path(prompt_template_file) else: prompt_content = "### User Guide ###You are a helpful assistant. Please respond to user inquiries with concise and professional answers.### Historical Content ###{chat_history}" return get_prompt_template(model_path, prompt_content, prompt_template_file, enable_think) - prompt_template_file = os.path.normpath(os.path.join(safe_root, prompt_template_file)) - if not prompt_template_file.startswith(safe_root): - raise ValueError("Invalid template path") - if not os.path.exists(prompt_template_file): - raise ValueError("Template file not exists") return get_prompt_template(model_path, prompt_content, prompt_template_file, enable_think) def set_prompt(self, prompt): @@ -256,8 +284,7 @@ def query_transform(self, chat_request, retrieved_nodes, sub_questions=None): :return: Generated text_gen_context and prompt_str.""" text_gen_context = "" for n in retrieved_nodes: - origin_text = n.node.text - text_gen_context += self.clean_string(origin_text.strip()) + text_gen_context += build_document_node_block(n) query = chat_request.messages chat_history = chat_request.input # Modify model think status @@ -292,25 +319,20 @@ async def run(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): # This could happen when User delete all LLMs through RESTful API raise ValueError("No LLM available, please load LLM") # query transformation + benchmark = kwargs.get("benchmark", None) + benchmark_index = kwargs.get("benchmark_index", None) sub_questions = kwargs.get("sub_questions", None) text_gen_context, prompt_str = self.query_transform(chat_request, retrieved_nodes, sub_questions=sub_questions) - generate_kwargs = dict( - temperature=chat_request.temperature, - do_sample=chat_request.temperature > 0.0, - top_p=chat_request.top_p, - top_k=chat_request.top_k, - typical_p=chat_request.typical_p, - repetition_penalty=chat_request.repetition_penalty, - ) - self.llm().generate_kwargs = generate_kwargs - self.llm().max_new_tokens = chat_request.max_tokens + # self.llm().config.update_generation_config(config) + self.llm().config.update_generation_config(temperature=chat_request.temperature,top_p=chat_request.top_p, top_k=chat_request.top_k, 
typical_p=chat_request.typical_p, repetition_penalty=chat_request.repetition_penalty, do_sample=chat_request.temperature > 0.0) + self.llm().config.max_new_tokens = chat_request.max_tokens unstructured_str = "" - if node_parser_type == NodeParserType.UNSTRUCTURED or node_parser_type == NodeParserType.SIMPLE: + if node_parser_type == NodeParserType.UNSTRUCTURED: unstructured_str = extract_unstructured_eles(retrieved_nodes, text_gen_context) if chat_request.stream: # Asynchronous generator async def generator(): - async for chunk in local_stream_generator(self.lock, self.llm(), prompt_str, unstructured_str): + async for chunk in local_stream_generator(self.lock, self.llm(), prompt_str, unstructured_str, benchmark, benchmark_index): yield chunk or "" await asyncio.sleep(0) @@ -319,15 +341,16 @@ async def generator(): result = self.llm().complete(prompt_str) return result - async def run_vllm(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): + async def run_remote(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): # query transformation sub_questions = kwargs.get("sub_questions", None) benchmark = kwargs.get("benchmark", None) benchmark_index = kwargs.get("benchmark_index", None) text_gen_context, prompt_str = self.query_transform(chat_request, retrieved_nodes, sub_questions=sub_questions) + api_base_suffix = "/v3" if self.inference_type == InferenceType.OVMS else "/v1" llm = OpenAILike( api_key="fake", - api_base=self.vllm_endpoint + "/v1", + api_base=self.remote_endpoint.rstrip("/") + api_base_suffix, max_tokens=chat_request.max_tokens, model=self.vllm_name, top_p=chat_request.top_p, @@ -337,7 +360,7 @@ async def run_vllm(self, chat_request, retrieved_nodes, node_parser_type, **kwar repetition_penalty=chat_request.repetition_penalty, ) unstructured_str = "" - if node_parser_type == NodeParserType.UNSTRUCTURED or node_parser_type == NodeParserType.SIMPLE: + if node_parser_type == NodeParserType.UNSTRUCTURED: unstructured_str = extract_unstructured_eles(retrieved_nodes, text_gen_context) if chat_request.stream: @@ -352,6 +375,9 @@ async def generator(): result = await llm.acomplete(prompt_str) return result + async def run_vllm(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): + return await self.run_remote(chat_request, retrieved_nodes, node_parser_type, **kwargs) + @model_serializer def ser_model(self): set = { @@ -360,13 +386,14 @@ def ser_model(self): "inference_type": self.inference_type, "model": self.llm(), "vllm_endpoint": self.vllm_endpoint, + "ovms_endpoint": self.ovms_endpoint, } return set class FreeChatGenerator(BaseComponent): - def __init__(self, llm_model, inference_type, vllm_endpoint, **kwargs): + def __init__(self, llm_model, inference_type, vllm_endpoint, ovms_endpoint="", **kwargs): BaseComponent.__init__( self, comp_type=CompType.GENERATOR, @@ -393,19 +420,28 @@ def __init__(self, llm_model, inference_type, vllm_endpoint, **kwargs): self.model_path = llm_instance.model_path self.llm = llm_model + self.vllm_name = llm_model().model_id if not isinstance(llm_model, str) else llm_model if self.inference_type == InferenceType.LOCAL: self.lock = asyncio.Lock() if self.inference_type == InferenceType.VLLM: - self.vllm_name = llm_model().model_id if vllm_endpoint == "": vllm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8086") + if self.inference_type == InferenceType.OVMS: + if ovms_endpoint == "": + ovms_endpoint = os.getenv("OVMS_ENDPOINT", "http://localhost:8000") self.vllm_endpoint = vllm_endpoint + self.ovms_endpoint 
= ovms_endpoint + + if self.inference_type == InferenceType.OVMS: + self.remote_endpoint = self.ovms_endpoint + else: + self.remote_endpoint = self.vllm_endpoint async def run(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): if self.inference_type == InferenceType.LOCAL: response = await self.run_local(chat_request, retrieved_nodes, node_parser_type, **kwargs) - elif self.inference_type == InferenceType.VLLM: - response = await self.run_vllm(chat_request, retrieved_nodes, node_parser_type, **kwargs) + elif self.inference_type in (InferenceType.VLLM, InferenceType.OVMS): + response = await self.run_remote(chat_request, retrieved_nodes, node_parser_type, **kwargs) else: raise ValueError("LLM inference_type not supported") return response @@ -438,10 +474,11 @@ async def generator(): result = self.llm().complete(prompt_str) return result - async def run_vllm(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): + async def run_remote(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): + api_base_suffix = "/v3" if self.inference_type == InferenceType.OVMS else "/v1" llm = OpenAILike( api_key="fake", - api_base=self.vllm_endpoint + "/v1", + api_base=self.remote_endpoint.rstrip("/") + api_base_suffix, max_tokens=chat_request.max_tokens, model=self.vllm_name, top_p=chat_request.top_p, @@ -465,6 +502,9 @@ async def generator(): result = await llm.acomplete(prompt_str) return str(result) + async def run_vllm(self, chat_request, retrieved_nodes, node_parser_type, **kwargs): + return await self.run_remote(chat_request, retrieved_nodes, node_parser_type, **kwargs) + @model_serializer def ser_model(self): set = { @@ -473,6 +513,7 @@ def ser_model(self): "inference_type": self.inference_type, "model": self.llm(), "vllm_endpoint": self.vllm_endpoint, + "ovms_endpoint": self.ovms_endpoint, } return set diff --git a/EdgeCraftRAG/edgecraftrag/components/indexer.py b/EdgeCraftRAG/edgecraftrag/components/indexer.py index 6248c87db3..1e2349aa12 100644 --- a/EdgeCraftRAG/edgecraftrag/components/indexer.py +++ b/EdgeCraftRAG/edgecraftrag/components/indexer.py @@ -6,15 +6,14 @@ import faiss from edgecraftrag.base import BaseComponent, CompType, IndexerType from edgecraftrag.context import ctx -from langchain_milvus import Milvus from langchain_openai import OpenAIEmbeddings from llama_index.core import StorageContext, VectorStoreIndex from llama_index.vector_stores.faiss import FaissVectorStore from llama_index.vector_stores.milvus import MilvusVectorStore from pydantic import model_serializer +from langchain_milvus import Milvus from pymilvus import Collection, MilvusException, connections, utility - class VectorIndexer(BaseComponent, VectorStoreIndex): def __init__(self, embed_model, vector_type, vector_url="http://localhost:19530", kb_name="default_kb"): BaseComponent.__init__( diff --git a/EdgeCraftRAG/edgecraftrag/components/knowledge_base.py b/EdgeCraftRAG/edgecraftrag/components/knowledge_base.py index 8846161877..0a1071a344 100644 --- a/EdgeCraftRAG/edgecraftrag/components/knowledge_base.py +++ b/EdgeCraftRAG/edgecraftrag/components/knowledge_base.py @@ -3,8 +3,8 @@ import json import os -import time import uuid +import time from typing import Any, Dict, List, Optional, Union from edgecraftrag.base import BaseComponent, BenchType, CompType @@ -16,7 +16,6 @@ from llama_index.core.schema import Document from pydantic import Field, model_serializer - class Knowledge(BaseComponent): node_parser: Optional[BaseComponent] = Field(default=None) @@ -38,7 +37,6 @@ def 
__init__( **kwargs, ): super().__init__(name=name, comp_type=CompType.KNOWLEDGE, **kwargs) - self.description = description self.experience_active = experience_active self.active = active @@ -348,7 +346,7 @@ def calculate_totals(self): else: total = None return total - + def update_nodes(self, nodes: List[Document]): self.nodes = nodes @@ -371,7 +369,7 @@ async def run_node_parser(self, docs: List[Document]) -> Any: self.benchmark.update_benchmark_data(benchmark_index, BenchType.CHUNK_NUM, benchmark_data) self.add_nodes(nodes) return nodes - + async def update_nodes_to_indexer(self) -> Any: if self.indexer is not None: self.indexer.insert_nodes(self.nodes) diff --git a/EdgeCraftRAG/edgecraftrag/components/model.py b/EdgeCraftRAG/edgecraftrag/components/model.py index 252ea40241..3fec80ac82 100644 --- a/EdgeCraftRAG/edgecraftrag/components/model.py +++ b/EdgeCraftRAG/edgecraftrag/components/model.py @@ -1,18 +1,74 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import io +import os from pathlib import Path +import asyncio from typing import Any, Optional - +import openvino_genai +import openvino as ov +import numpy as np from edgecraftrag.base import BaseComponent, CompType, ModelType from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.llms.openvino import OpenVINOLLM from llama_index.postprocessor.openvino_rerank import OpenVINORerank +from edgecraftrag.components.ov_llamaindex_helper import OpenVINOGenAIEmbedding, OpenVINOGenAIReranking +from llama_index.llms.openvino_genai import OpenVINOGenAILLM from pydantic import Field, model_serializer +from llama_index.core.base.llms.types import CompletionResponse, CompletionResponseAsyncGen, CompletionResponseGen +from threading import Event, Thread + +def resolve_model_path(model_path: str) -> str: + if not model_path: + return model_path + + path_obj = Path(model_path) + if path_obj.is_absolute() and path_obj.exists(): + return str(path_obj) + + candidates = [ + Path.cwd() / path_obj, + Path(__file__).resolve().parents[2] / path_obj, + Path(__file__).resolve().parents[3] / path_obj, + ] + + model_env = os.getenv("MODEL_PATH") + container_model_root = Path("/home/user/models") + if model_env: + model_root = Path(model_env).expanduser().resolve() + model_parts = list(path_obj.parts) + if model_parts[:1] == ["."]: + model_parts = model_parts[1:] + if model_parts[:1] == ["models"]: + model_parts = model_parts[1:] + if model_parts: + candidates.append(model_root / Path(*model_parts)) + candidates.append(model_root / path_obj.name) + + model_parts = list(path_obj.parts) + if model_parts[:1] == ["."]: + model_parts = model_parts[1:] + if model_parts[:1] == ["models"]: + model_parts = model_parts[1:] + if model_parts: + candidates.append(container_model_root / Path(*model_parts)) + candidates.append(container_model_root / path_obj.name) + + for candidate in candidates: + try: + resolved = candidate.expanduser().resolve() + except Exception: + continue + if resolved.exists(): + return str(resolved) + + return model_path def model_exist(model_path): + model_path = resolve_model_path(model_path) model_dir = Path(model_path) return ( model_dir.is_dir() @@ -68,9 +124,25 @@ def __init__(self, model_id, api_base, **kwargs): class OpenVINOEmbeddingModel(BaseModelComponent, OpenVINOEmbedding): def __init__(self, model_id, model_path, device, weight): + model_path = resolve_model_path(model_path) if not model_exist(model_path): 
OpenVINOEmbedding.create_and_save_openvino_model(model_id, model_path) - OpenVINOEmbedding.__init__(self, model_id_or_path=model_path, device=device) + model_kwargs={ + "ov_config": { + "NUM_STREAMS": "1", + "PERFORMANCE_HINT": "LATENCY" + } + } + OpenVINOEmbedding.__init__(self, model_id_or_path=model_path, device=device, model_kwargs=model_kwargs) + if device == "AUTO": + real_device=self._model.request.get_property("EXECUTION_DEVICES")[0] + self._model.to(real_device) + self._model.compile() + device=real_device + buf = io.BytesIO() + self._model.request.export_model(buf) + self.size_mb = len(buf.getvalue()) / 1024 / 1024 + buf.seek(0) self.comp_type = CompType.MODEL self.comp_subtype = ModelType.EMBEDDING self.model_id = model_id @@ -78,17 +150,77 @@ def __init__(self, model_id, model_path, device, weight): self.device = device self.weight = "" +class OpenVINOGenAIEmbeddingModel(BaseModelComponent, OpenVINOGenAIEmbedding): + + def __init__(self, model_id, model_path, device, weight): + max_length=512 + model_path = resolve_model_path(model_path) + if not model_exist(model_path): + OpenVINOGenAIEmbedding.create_and_save_openvino_model(model_id, model_path) + if device == "NPU": + OpenVINOGenAIEmbedding.__init__(self, model_path=model_path, device=device, embed_batch_size=1, pad_to_max_length=True, max_length=512, normalize=True, pooling="mean", padding_side="right") + else: + OpenVINOGenAIEmbedding.__init__(self, model_path=model_path, device=device, pad_to_max_length=True, max_length=max_length, normalize=True, pooling="mean", padding_side="right") + self.size_mb = round(os.path.getsize(model_path+"/openvino_model.bin")/(1024*1024),3) + self.comp_type = CompType.MODEL + self.comp_subtype = ModelType.EMBEDDING + self.model_id = model_id + self.model_path = model_path + self.device = device + self.weight = "" + self.model_id_or_path = model_path class OpenVINORerankModel(BaseModelComponent, OpenVINORerank): def __init__(self, model_id, model_path, device, weight): + model_path = resolve_model_path(model_path) if not model_exist(model_path): OpenVINORerank.create_and_save_openvino_model(model_id, model_path) + model_kwargs={ + "ov_config": { + "NUM_STREAMS": "1", + "PERFORMANCE_HINT": "LATENCY" + } + } + OpenVINORerank.__init__( self, model_id_or_path=model_path, device=device, + model_kwargs=model_kwargs ) + if device == "AUTO": + real_device=self._model.request.get_property("EXECUTION_DEVICES")[0] + self._model.to(real_device) + self._model.compile() + device=real_device + buf = io.BytesIO() + self._model.request.export_model(buf) + self.size_mb = len(buf.getvalue()) / 1024 / 1024 + buf.seek(0) + self.comp_type = CompType.MODEL + self.comp_subtype = ModelType.RERANKER + self.model_id = model_id + self.model_path = model_path + self.device = device + self.weight = "" + +class OpenVINOGenAIRerankModel(BaseModelComponent, OpenVINOGenAIReranking): + + def __init__(self, model_id, model_path, device, weight): + max_length=512 + model_path = resolve_model_path(model_path) + if not model_exist(model_path): + OpenVINOGenAIReranking.create_and_save_openvino_model(model_id, model_path) + OpenVINOGenAIReranking.__init__( + self, + model_id_or_path=model_path, + device=device, + max_length=max_length, + pad_to_max_length=True, + padding_side="right" + ) + self.size_mb = round(os.path.getsize(model_path+"/openvino_model.bin")/(1024*1024),3) self.comp_type = CompType.MODEL self.comp_subtype = ModelType.RERANKER self.model_id = model_id @@ -100,6 +232,7 @@ def __init__(self, model_id, model_path, 
device, weight): class OpenVINOLLMModel(BaseModelComponent, OpenVINOLLM): def __init__(self, model_id, model_path, device, weight, model=None): + model_path = resolve_model_path(model_path) OpenVINOLLM.__init__( self, model_id_or_path=model_path, @@ -112,3 +245,133 @@ def __init__(self, model_id, model_path, device, weight, model=None): self.model_path = model_path self.device = device self.weight = weight + +class OpenVINOGenAILLMModel(BaseModelComponent, OpenVINOGenAILLM): + + def __init__(self, model_id, model_path, device, weight, model=None): + model_path = resolve_model_path(model_path) + OpenVINOGenAILLM.__init__( + self, + model_path=model_path, + device=device, + ) + self.comp_type = CompType.MODEL + self.comp_subtype = ModelType.LLM + self.model_id = model_id + self.model_path = model_path + self.perf_metrics = None + self.device = device + self.weight = weight + self.model_name = model_id + self.device_map = device + self._model = self._pipe + + + + + + async def astream_complete_with_bench( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseAsyncGen: + async def gen() -> CompletionResponseAsyncGen: + loop = asyncio.get_running_loop() + message_queue: asyncio.Queue[Optional[CompletionResponse]] = asyncio.Queue() + error_holder = {} + + def worker() -> None: + try: + for message in self.stream_complete_with_bench(prompt, formatted=formatted, **kwargs): + asyncio.run_coroutine_threadsafe(message_queue.put(message), loop).result() + except Exception as exc: + error_holder["error"] = exc + finally: + asyncio.run_coroutine_threadsafe(message_queue.put(None), loop).result() + + worker_thread = Thread(target=worker, daemon=True) + worker_thread.start() + + while True: + message = await message_queue.get() + if message is None: + break + yield message + + if "error" in error_holder: + raise error_holder["error"] + + return gen() + + def stream_complete_with_bench( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + """Streaming completion endpoint.""" + full_prompt = prompt + if not formatted: + if self.query_wrapper_prompt: + full_prompt = self.query_wrapper_prompt.format(query_str=prompt) + if self.system_prompt: + full_prompt = f"{self.system_prompt} {full_prompt}" + + input_data = self._tokenizer.encode(full_prompt) + input_ids = input_data.input_ids.data + attention_mask = input_data.attention_mask + full_prompt = openvino_genai.TokenizedInputs(ov.Tensor(input_ids), attention_mask) + generation_holder = {} + error_holder = {} + + def run_generation() -> None: + try: + generation_holder["result"] = self._pipe.generate( + full_prompt, + self.config, + streamer=self._streamer, + **kwargs + ) + except Exception as exc: + error_holder["error"] = exc + + def gen() -> CompletionResponseGen: + generation_thread = Thread(target=run_generation, daemon=True) + generation_thread.start() + + text = "" + for token in self._streamer: + text += token + yield CompletionResponse(text=text, delta=token) + + generation_thread.join() + + if "error" in error_holder: + raise error_holder["error"] + + generation_result = generation_holder.get("result") + if generation_result is not None: + self.perf_metrics = generation_result.perf_metrics + + return gen() + + + def complete_with_bench( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + """Completion endpoint.""" + full_prompt = prompt + if not formatted: + if self.query_wrapper_prompt: + full_prompt = 
self.query_wrapper_prompt.format(query_str=prompt) + if self.completion_to_prompt: + full_prompt = self.completion_to_prompt(full_prompt) + elif self.system_prompt: + full_prompt = f"{self.system_prompt} {full_prompt}" + + + input_data = self._tokenizer.encode(full_prompt) + input_ids = input_data.input_ids.data + attention_mask = input_data.attention_mask + full_prompt = openvino_genai.TokenizedInputs(ov.Tensor(input_ids), attention_mask) + generation_result = self._pipe.generate(full_prompt, self.config, **kwargs) + self.perf_metrics = generation_result.perf_metrics + generated_tokens = np.array(generation_result.tokens) + completion = self._tokenizer.decode(generated_tokens) + token = completion[0] + return CompletionResponse(text=token, raw={"model_output": token}) \ No newline at end of file diff --git a/EdgeCraftRAG/edgecraftrag/components/ov_llamaindex_helper.py b/EdgeCraftRAG/edgecraftrag/components/ov_llamaindex_helper.py new file mode 100644 index 0000000000..5abcd1a06e --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/components/ov_llamaindex_helper.py @@ -0,0 +1,192 @@ +from llama_index.core.base.embeddings.base import ( + DEFAULT_EMBED_BATCH_SIZE, + BaseEmbedding, +) +from llama_index.core.postprocessor.types import BaseNodePostprocessor +from typing import Any, List, Optional, Dict +from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.core.callbacks import CallbackManager +from llama_index.core.callbacks import CBEventType, EventPayload +from llama_index.core.instrumentation import get_dispatcher +from llama_index.core.instrumentation.events.rerank import ( + ReRankEndEvent, + ReRankStartEvent, +) +from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle +from llama_index.core.instrumentation import get_dispatcher + + +dispatcher = get_dispatcher(__name__) + + +class OpenVINOGenAIEmbedding(BaseEmbedding): + model_path: str = Field(description="Local path.") + max_length: Optional[int] = Field(description="Maximum length of input.") + pooling: str = Field(description="Pooling strategy. One of ['cls', 'mean'].") + normalize: bool = Field(default=True, description="Normalize embeddings or not.") + query_instruction: Optional[str] = Field(description="Instruction to prepend to query text.") + text_instruction: Optional[str] = Field(description="Instruction to prepend to text.") + + _ov_pipe: Any = PrivateAttr() + + def __init__( + self, + model_path: str, + pooling: str = "cls", + max_length: int = 2048, + pad_to_max_length: bool = False, + normalize: bool = False, + padding_side: Optional[str] = None, + query_instruction: Optional[str] = None, + text_instruction: Optional[str] = None, + embed_batch_size: int = DEFAULT_EMBED_BATCH_SIZE, + callback_manager: Optional[CallbackManager] = None, + model_kwargs: Dict[str, Any] = {}, + device: Optional[str] = "auto", + ): + try: + import openvino_genai + + except ImportError: + raise ImportError("Could not import OpenVINO GenAI package. 
" "Please install it with `pip install openvino-genai`.") + + if pooling not in ["cls", "mean"]: + raise ValueError(f"Pooling {pooling} not supported.") + + config = openvino_genai.TextEmbeddingPipeline.Config() + config.normalize = normalize + if device == "NPU": + config.batch_size = embed_batch_size + config.pad_to_max_length = pad_to_max_length + config.max_length = max_length + if padding_side: + config.padding_side = padding_side + config.pooling_type = ( + openvino_genai.TextEmbeddingPipeline.PoolingType.MEAN if pooling == "mean" else openvino_genai.TextEmbeddingPipeline.PoolingType.CLS + ) + config.query_instruction = query_instruction + try: + config.embed_instruction = text_instruction + except Exception as e: + pass + super().__init__( + embed_batch_size=embed_batch_size, + callback_manager=callback_manager or CallbackManager([]), + model_path=model_path, + max_length=max_length, + pooling=pooling, + normalize=normalize, + query_instruction=query_instruction, + text_instruction=text_instruction, + pad_to_max_length=pad_to_max_length + ) + self._ov_pipe = openvino_genai.TextEmbeddingPipeline(model_path, device, config, **model_kwargs) + self._device = device + self._model = self._ov_pipe + + @classmethod + def class_name(cls) -> str: + return "OpenVINOGenAIEmbedding" + + def _get_query_embedding(self, query: str) -> List[float]: + """Get query embedding.""" + return self._ov_pipe.embed_query(query) + + async def _aget_query_embedding(self, query: str) -> List[float]: + """Get query embedding async.""" + return self._ov_pipe.embed_query(query) + + async def _aget_text_embedding(self, text: str) -> List[float]: + """Get text embedding async.""" + return self._ov_pipe.embed_documents([text])[0] + + def _get_text_embedding(self, text: str) -> List[float]: + """Get text embedding.""" + return self._ov_pipe.embed_documents([text])[0] + + def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: + """Get text embeddings.""" + return self._ov_pipe.embed_documents(texts) + + +class OpenVINOGenAIReranking(BaseNodePostprocessor): + model_id_or_path: str = Field(description="Huggingface model id or local path.") + top_n: int = Field(description="Number of nodes to return sorted by score.") + keep_retrieval_score: bool = Field( + default=False, + description="Whether to keep the retrieval score in metadata.", + ) + + _ov_pipe: Any = PrivateAttr() + + def __init__( + self, + model_id_or_path: str, + top_n: Optional[int] = 3, + max_length: int = 2048, + pad_to_max_length: bool = False, + padding_side: Optional[str] = None, + device: Optional[str] = "auto", + model_kwargs: Dict[str, Any] = {}, + keep_retrieval_score: Optional[bool] = False, + ): + try: + import openvino_genai + except ImportError: + raise ImportError("Could not import OpenVINO GenAI package. 
" "Please install it with `pip install openvino-genai`.") + + super().__init__(top_n=top_n, max_length=max_length, model_id_or_path=model_id_or_path, device=device, keep_retrieval_score=keep_retrieval_score) + + config = openvino_genai.TextRerankPipeline.Config() + config.top_n = top_n + if max_length: + config.max_length = max_length + config.pad_to_max_length = pad_to_max_length + config.padding_side = padding_side + self._ov_pipe = openvino_genai.TextRerankPipeline(model_id_or_path, device, config, **model_kwargs) + self._model = self._ov_pipe + self._device = device + + @classmethod + def class_name(cls) -> str: + return "OpenVINOGenAIReranking" + + def _postprocess_nodes( + self, + nodes: List[NodeWithScore], + query_bundle: Optional[QueryBundle] = None, + ) -> List[NodeWithScore]: + dispatcher.event( + ReRankStartEvent( + query=query_bundle, + nodes=nodes, + top_n=self.top_n, + model_name=self.model_id_or_path, + ) + ) + + if query_bundle is None: + raise ValueError("Missing query bundle in extra info.") + if len(nodes) == 0: + return [] + + nodes_text_list = [str(node.node.get_content(metadata_mode=MetadataMode.EMBED)) for node in nodes] + with self.callback_manager.event( + CBEventType.RERANKING, + payload={ + EventPayload.NODES: nodes, + EventPayload.MODEL_NAME: self.model_id_or_path, + EventPayload.QUERY_STR: query_bundle.query_str, + EventPayload.TOP_K: self.top_n, + }, + ) as event: + outputs = self._ov_pipe.rerank(query_bundle.query_str, nodes_text_list) + for node, score in zip(nodes, outputs): + if self.keep_retrieval_score: + # keep the retrieval score in metadata + node.node.metadata["retrieval_score"] = node.score + node.score = score[1] + event.on_end(payload={EventPayload.NODES: nodes}) + + dispatcher.event(ReRankEndEvent(nodes=nodes)) + return nodes diff --git a/EdgeCraftRAG/edgecraftrag/components/pipeline.py b/EdgeCraftRAG/edgecraftrag/components/pipeline.py index abd9b8622b..f2017c7b37 100644 --- a/EdgeCraftRAG/edgecraftrag/components/pipeline.py +++ b/EdgeCraftRAG/edgecraftrag/components/pipeline.py @@ -5,9 +5,10 @@ import json import os import time +import gc from concurrent.futures import ThreadPoolExecutor from typing import Any, Callable, List, Optional - +from openvino import Core from comps.cores.proto.api_protocol import ChatCompletionRequest from edgecraftrag.base import ( BaseComponent, @@ -15,18 +16,14 @@ CompType, GeneratorType, InferenceType, - NodeParserType, RetrieverType, ) +from edgecraftrag.base import NodeParserType from edgecraftrag.components.generator import clone_generator from edgecraftrag.components.postprocessor import RerankProcessor from edgecraftrag.components.query_preprocess import query_search -from edgecraftrag.components.retriever import ( - AutoMergeRetriever, - KBadminRetriever, - SimpleBM25Retriever, - VectorSimRetriever, -) +from edgecraftrag.components.knowledge_base import Knowledge +from edgecraftrag.components.retriever import AutoMergeRetriever, SimpleBM25Retriever, VectorSimRetriever, KBadminRetriever from edgecraftrag.env import SEARCH_CONFIG_PATH, SEARCH_DIR from fastapi.responses import StreamingResponse from llama_index.core.schema import QueryBundle @@ -63,6 +60,11 @@ def __init__( self.idx = str(idx) self.enable_benchmark = os.getenv("ENABLE_BENCHMARK", "False").lower() == "true" + self.max_util = round(( + 0.95 - float(os.environ.get("GPU_MEMORY_UTIL", 0)) + if "LLM_MODEL" in os.environ + else 0.95 + ),3) self.run_pipeline_cb = run_pipeline self.run_retriever_postprocessor_cb = run_retrieve_postprocess 
self.run_retriever_cb = run_retrieve @@ -72,6 +74,7 @@ def __init__( self._origin_json = origin_json if origin_json is not None else "{}" self.retriever_type = "" self.retrieve_topk = 0 + self.max_retrieve_topk=0 self.retrievers = [] # TODO: consider race condition @@ -160,7 +163,6 @@ def get_generator(self, generator_type: str) -> Optional[BaseComponent]: if gen.comp_subtype == generator_type: return gen return None - def update_retriever_config(self, retriever_type: str, retrieve_topk: int): self.retriever_type = retriever_type self.retrieve_topk = retrieve_topk @@ -208,6 +210,7 @@ def update_retriever(self, kb, prev_indexer): raise ValueError(f"Retriever type {self.retriever_type} not supported") break + def clear_retrievers(self): self.retrievers = [] @@ -230,6 +233,163 @@ def create_freechat_gen_from_chatqna_gen(self) -> bool: return True return False + def _update_config_and_retrievers(self, changed: bool) -> None: + """Helper method to update JSON config and retriever settings.""" + origin_json = json.loads(self._origin_json) + origin_json["retriever"]["retrieve_topk"] = self.retrieve_topk + origin_json["retriever"]["max_retrieve_topk"] = self.max_retrieve_topk + + for retriever in self.retrievers: + retriever.topk = self.retrieve_topk + + if self.postprocessor: + for i, processor in enumerate(self.postprocessor): + processor.top_n = min(processor.top_n, self.max_retrieve_topk) + origin_json["postprocessor"][i]["top_n"] = processor.top_n + + self._origin_json = json.dumps(origin_json) + + def _resolve_max_util(self, reranker_device: str, core: Core) -> float: + """Resolve memory utilization rate based on device and inference type.""" + + if self.generator[0].inference_type == InferenceType.LOCAL: + if self.generator[0].llm().device == reranker_device: + return 0.5 + else: + return 0.95 + + if reranker_device == "CPU" or reranker_device == "NPU": + return 0.95 + + device_type_obj = self._safe_get_property(reranker_device, "DEVICE_TYPE", core) + reranker_card = 0 + if reranker_device == "CPU": + reranker_device_type = "CPU" + elif reranker_device == "NPU": + reranker_device_type = "NPU" + elif getattr(device_type_obj, "name", "") == "INTEGRATED": + reranker_device_type = "iGPU" + else: + reranker_device_type = "dGPU" + reranker_card = int(reranker_device.split(".")[1]) - 1 + + dgpu_number = 0 + for d in core.available_devices: + if d.startswith("GPU") and getattr(self._safe_get_property(d, "DEVICE_TYPE", core), "name", "") == "DISCRETE": + dgpu_number += 1 + mask = os.getenv("VLLM_AFFINITY_MASK", "") + allowed = set(int(x) for x in mask.split(",") if x.strip().isdigit()) + max_gpu = max(allowed) if allowed else None + + if max_gpu >= dgpu_number and int(os.getenv("TP", 1)) > 1: + vllm_device_type = "iGPU" + else: + vllm_device_type = "dGPU" + if vllm_device_type == "iGPU" and reranker_device_type == "iGPU": + return self.max_util + + if vllm_device_type == "dGPU" and reranker_device_type == "dGPU": + if reranker_card in allowed: + return self.max_util + return 0.95 + + def _parse_vllm_device_mask(self) -> Optional[int]: + """Parse VLLM device affinity mask and return device index.""" + ze_mask = os.environ.get("VLLM_AFFINITY_MASK", "") + devices = ze_mask.split(",") if ze_mask else [] + if devices and devices[0]: + try: + return int(devices[0]) + except (ValueError, IndexError): + pass + return None + + @staticmethod + def _safe_get_property(device_name: str, property_name: str, core: Core): + """Safely retrieve OpenVINO device property.""" + try: + return 
core.get_property(device_name, property_name) + except Exception: + return None + + def _calculate_max_retrieve_topk( + self, available_memory_mb: float, hidden_size: Optional[int], num_hidden_layers: Optional[int], embedding_length: int + ) -> int: + """Calculate maximum top-k based on available memory and model config.""" + # Constants for calculation + MEMORY_CALC_DIVISORS = 2 * 2 * 0.2 # From original formula + + if not hidden_size or not num_hidden_layers or embedding_length <= 0: + return self.retrieve_topk + + denominator = hidden_size * num_hidden_layers * MEMORY_CALC_DIVISORS * embedding_length + max_topk = int(available_memory_mb * 1024 * 1024 / denominator) + return max(1, max_topk) # Ensure at least 1 + + def _get_reranker_config(self) -> dict: + """Safely retrieve reranker model configuration.""" + if not self.postprocessor: + return {} + + try: + model = self.postprocessor[0].model + if hasattr(model, "_model") and hasattr(model._model, "config"): + return model._model.config + return {} + except Exception: + return {} + + def check_top_k(self, active_kbs: list[Knowledge]): + """Limit top_k based on available GPU memory and model configuration.""" + # Initialize device and core + reranker_model = self.postprocessor[0].model if self.postprocessor else None + reranker_device = reranker_model.device if reranker_model else "CPU" + core = Core() + + # Resolve memory utilization rate + max_util = self._resolve_max_util(reranker_device, core) + # Calculate model and memory sizes + reranker_size = reranker_model.size_mb if reranker_model else 0 + embedding_size = sum(index.indexer.model.size_mb for index in active_kbs) + embedding_length = max((getattr(index.indexer, "d", 0) for index in active_kbs), default=0) + + # Apply default minimums + embedding_size = embedding_size or 512 + embedding_length = embedding_length or 256 + + # Try to get GPU max allocation memory + gpu_max_alloc_mem_size = self._safe_get_property(reranker_device, "GPU_DEVICE_MAX_ALLOC_MEM_SIZE", core) + if gpu_max_alloc_mem_size is None: + # Fallback: keep current top-k if device property not available + self.max_retrieve_topk = self.retrieve_topk + self._update_config_and_retrievers(False) + return False + + # Calculate available GPU memory + available_memory_mb = gpu_max_alloc_mem_size / 1024 / 1024 * max_util - reranker_size - embedding_size + # Get model configuration and calculate max top-k + config = self._get_reranker_config() + if not isinstance(config, dict) : + if not hasattr(config, "to_dict"): + config = {} + else: + config = config.to_dict() + + num_hidden_layers = config.get("num_hidden_layers") if isinstance(config, dict) else getattr(config, "num_hidden_layers", None) + hidden_size = (config.get("hidden_size") or config.get("hidden_dim")) if isinstance(config, dict) else (getattr(config, "hidden_size", None) or getattr(config, "hidden_dim", None)) + self.max_retrieve_topk = self._calculate_max_retrieve_topk( + available_memory_mb, hidden_size, num_hidden_layers, embedding_length + ) + + # Determine if top-k changed and update accordingly + new_retrieve_topk = min(self.retrieve_topk, self.max_retrieve_topk) + changed = new_retrieve_topk != self.retrieve_topk + if changed: + self.retrieve_topk = new_retrieve_topk + + # Update configuration and return flag + self._update_config_and_retrievers(changed) + return changed async def run_retrieve(pl: Pipeline, chat_request: ChatCompletionRequest) -> Any: query = chat_request.messages @@ -258,11 +418,15 @@ async def run_postprocess(pl: Pipeline, 
chat_request: ChatCompletionRequest, con for processor in pl.postprocessor: if ( isinstance(processor, RerankProcessor) + and chat_request.top_n is not None + and chat_request.top_n != 0 and chat_request.top_n != ChatCompletionRequest.model_fields["top_n"].default ): processor.top_n = chat_request.top_n - retri_res = processor.run(retri_res=contexts.get(CompType.RETRIEVER), query_bundle=query_bundle) - contexts[CompType.POSTPROCESSOR] = retri_res + elif isinstance(processor, RerankProcessor) and chat_request.top_n == 0: + processor.top_n = processor.default_top_n + post_res = processor.run(retri_res=contexts.get(CompType.RETRIEVER), query_bundle=query_bundle) + contexts[CompType.POSTPROCESSOR] = post_res return contexts @@ -276,6 +440,7 @@ async def run_retrieve_postprocess(pl: Pipeline, chat_request: ChatCompletionReq benchmark_index = pl.benchmark.init_benchmark_data() start = time.perf_counter() retri_res = [] + post_res = [] for retriever in pl.retrievers: retri_res.extend(retriever.run(query=query, top_k=top_k)) if pl.enable_benchmark: @@ -286,11 +451,15 @@ async def run_retrieve_postprocess(pl: Pipeline, chat_request: ChatCompletionReq for processor in pl.postprocessor: if ( isinstance(processor, RerankProcessor) + and chat_request.top_n is not None + and chat_request.top_n != 0 and chat_request.top_n != ChatCompletionRequest.model_fields["top_n"].default ): processor.top_n = chat_request.top_n - retri_res = processor.run(retri_res=retri_res, query_bundle=query_bundle) - contexts[CompType.POSTPROCESSOR] = retri_res + elif isinstance(processor, RerankProcessor) and chat_request.top_n == 0: + processor.top_n = processor.default_top_n + post_res = processor.run(retri_res=retri_res, query_bundle=query_bundle) + contexts[CompType.POSTPROCESSOR] = post_res return contexts @@ -313,6 +482,14 @@ def run_async_query_search(): return query, sub_questionss_result +def cleanup_pipeline_resources(*resources) -> None: + for resource in resources: + if hasattr(resource, "clear"): + resource.clear() + del resource + gc.collect() + + async def run_pipeline( pl: Pipeline, chat_request: ChatCompletionRequest, generator_type: str = GeneratorType.CHATQNA ) -> Any: @@ -321,6 +498,8 @@ async def run_pipeline( benchmark_index = pl.benchmark.init_benchmark_data() contexts = {} retri_res = [] + post_res = [] + top_k = None active_kbs = chat_request.user if chat_request.user else [] enable_rag_retrieval = ( chat_request.chat_template_kwargs.get("enable_rag_retrieval", True) @@ -340,6 +519,7 @@ async def run_pipeline( raise ValueError("unstructured node parser cannot work with other types of node parser") np_type = next(iter(np_types), None) query = chat_request.messages + query_bundle = None sub_questionss_result = None experience_status = True if chat_request.tool_choice == "auto" else False target_generator = pl.get_generator(generator_type) @@ -349,17 +529,16 @@ async def run_pipeline( start = 0 if pl.enable_benchmark: start = time.perf_counter() - if target_generator.inference_type == InferenceType.VLLM and experience_status: + if target_generator.inference_type in (InferenceType.VLLM, InferenceType.OVMS) and experience_status: query, sub_questionss_result = await run_query_search(pl, chat_request) if pl.enable_benchmark: pl.benchmark.update_benchmark_data(benchmark_index, CompType.QUERYSEARCH, time.perf_counter() - start) start = time.perf_counter() top_k = ( None - if chat_request.k == pl.retrievers[0].topk or chat_request.k != 0 or chat_request.k is None - else chat_request.k + if chat_request.k == 
pl.retrievers[0].topk or chat_request.k == 0 or chat_request.k is None + else min(chat_request.k, pl.retrieve_topk) ) - retri_res = [] for retriever in pl.retrievers: retri_res.extend(retriever.run(query=query, top_k=top_k)) if pl.enable_benchmark: @@ -376,24 +555,26 @@ async def run_pipeline( and chat_request.top_n is not None and chat_request.top_n != ChatCompletionRequest.model_fields["top_n"].default ): - processor.top_n = chat_request.top_n - retri_res = processor.run(retri_res=retri_res, query_bundle=query_bundle) - contexts[CompType.POSTPROCESSOR] = retri_res + processor.top_n = min(chat_request.top_n, top_k) if top_k is not None else chat_request.top_n + elif isinstance(processor, RerankProcessor) and chat_request.top_n == 0: + processor.top_n = processor.default_top_n + post_res = processor.run(retri_res=retri_res, query_bundle=query_bundle) + contexts[CompType.POSTPROCESSOR] = post_res if pl.enable_benchmark: pl.benchmark.update_benchmark_data(benchmark_index, CompType.POSTPROCESSOR, time.perf_counter() - start) if pl.enable_benchmark: - _, prompt_str = target_generator.query_transform(chat_request, retri_res) + _, prompt_str = target_generator.query_transform(chat_request, post_res) input_token_size = pl.benchmark.cal_input_token_size(prompt_str) if pl.enable_benchmark: start = time.perf_counter() if target_generator.inference_type == InferenceType.LOCAL: - ret = await target_generator.run(chat_request, retri_res, np_type) - elif target_generator.inference_type == InferenceType.VLLM: - ret = await target_generator.run_vllm( + ret = await target_generator.run(chat_request, retri_res, np_type, enable_benchmark=pl.enable_benchmark, benchmark=pl.benchmark, benchmark_index=benchmark_index) + elif target_generator.inference_type in (InferenceType.VLLM, InferenceType.OVMS): + ret = await target_generator.run_remote( chat_request, - retri_res, + post_res, np_type, sub_questions=sub_questionss_result, benchmark=pl.benchmark, @@ -402,8 +583,16 @@ async def run_pipeline( else: raise ValueError("LLM inference_type not supported") if not isinstance(ret, StreamingResponse) and pl.enable_benchmark: + if ( target_generator.inference_type == InferenceType.LOCAL ): + if ( not chat_request.stream ): + pl.benchmark.update_benchmark_data_genai(benchmark_index, CompType.GENERATOR, time.perf_counter() - start, pl.generator[0].llm) + pl.benchmark.insert_llm_data_genai(benchmark_index, input_token_size, pl.generator[0].llm) + cleanup_pipeline_resources(retri_res, post_res, np_types, sub_questionss_result) + return ret, contexts pl.benchmark.update_benchmark_data(benchmark_index, CompType.GENERATOR, time.perf_counter() - start) pl.benchmark.insert_llm_data(benchmark_index, input_token_size) + + cleanup_pipeline_resources(retri_res, post_res, np_types, sub_questionss_result) return ret, contexts @@ -411,7 +600,7 @@ async def run_generator( pl: Pipeline, chat_request: ChatCompletionRequest, generator_type: str = GeneratorType.CHATQNA ) -> Any: active_kbs = chat_request.user if chat_request.user else [] - # If using multiple knowledge bases, unstructured node parser cannot work with other types of node parser + # If using multiple knowledge bases, unstructured node parser cannot work with other types of node parser np_types = {kb.node_parser.comp_subtype for kb in active_kbs} if len(np_types) > 1 and NodeParserType.UNSTRUCTURED in np_types: raise ValueError("unstructured node parser cannot work with other types of node parser") @@ -421,8 +610,8 @@ async def run_generator( raise ValueError(f"No Generator 
({generator_type}) Specified") if target_generator.inference_type == InferenceType.LOCAL: ret = await target_generator.run(chat_request, [], np_type) - elif target_generator.inference_type == InferenceType.VLLM: - ret = await target_generator.run_vllm(chat_request, [], np_type) + elif target_generator.inference_type in (InferenceType.VLLM, InferenceType.OVMS): + ret = await target_generator.run_remote(chat_request, [], np_type) else: raise ValueError("LLM inference_type not supported") return ret diff --git a/EdgeCraftRAG/edgecraftrag/components/postprocessor.py b/EdgeCraftRAG/edgecraftrag/components/postprocessor.py index cbd387f59e..47d78238fe 100644 --- a/EdgeCraftRAG/edgecraftrag/components/postprocessor.py +++ b/EdgeCraftRAG/edgecraftrag/components/postprocessor.py @@ -18,6 +18,7 @@ def __init__(self, rerank_model, top_n): ) self.model = rerank_model self.top_n = top_n + self.default_top_n = top_n def run(self, **kwargs) -> Any: self.model.top_n = self.top_n diff --git a/EdgeCraftRAG/edgecraftrag/components/query_preprocess.py b/EdgeCraftRAG/edgecraftrag/components/query_preprocess.py index 1d732e2a93..1f0d21c2e8 100644 --- a/EdgeCraftRAG/edgecraftrag/components/query_preprocess.py +++ b/EdgeCraftRAG/edgecraftrag/components/query_preprocess.py @@ -214,7 +214,7 @@ async def query_search(user_input, SEARCH_CONFIG_PATH, SEARCH_DIR, pl): generator = pl.get_generator(GeneratorType.CHATQNA) model_id = generator.model_id - vllm_endpoint = generator.vllm_endpoint + vllm_endpoint = getattr(generator, "remote_endpoint", generator.vllm_endpoint) maintenance_data = read_json_files(SEARCH_DIR) issues = [] diff --git a/EdgeCraftRAG/edgecraftrag/components/retriever.py b/EdgeCraftRAG/edgecraftrag/components/retriever.py index 62ee1b5630..5469c0f8a1 100644 --- a/EdgeCraftRAG/edgecraftrag/components/retriever.py +++ b/EdgeCraftRAG/edgecraftrag/components/retriever.py @@ -5,6 +5,7 @@ from typing import Any, List, Optional, cast import requests + from edgecraftrag.base import BaseComponent, CompType, RetrieverType from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever from llama_index.core.retrievers import AutoMergingRetriever @@ -13,6 +14,7 @@ from pydantic import model_serializer + class VectorSimRetriever(BaseComponent, VectorIndexRetriever): def __init__(self, indexer, **kwargs): @@ -114,7 +116,7 @@ def __init__(self, indexer, **kwargs): def run(self, **kwargs) -> Any: for k, v in kwargs.items(): if k == "query": - if self._index.comp_subtype == "milvus_vector": + if self._index.comp_subtype == 'milvus_vector': raise NotImplementedError("not support BM25 retriever for Milvus vector store") top_k = kwargs["top_k"] if kwargs["top_k"] else self.topk nodes = cast(List[BaseNode], list(self._docstore.docs.values())) diff --git a/EdgeCraftRAG/edgecraftrag/config_repository.py b/EdgeCraftRAG/edgecraftrag/config_repository.py index 6e7cb52a5d..c2f42f4900 100644 --- a/EdgeCraftRAG/edgecraftrag/config_repository.py +++ b/EdgeCraftRAG/edgecraftrag/config_repository.py @@ -291,7 +291,7 @@ async def save_pipeline_configurations(operation: str = None, pipeline=None): chatqna_gen = pipeline.get_generator(GeneratorType.CHATQNA) if chatqna_gen: if GeneratorType.CHATQNA in gens_data: - gens_data[GeneratorType.CHATQNA]["prompt_content"] = chatqna_gen.prompt_content + gens_data[GeneratorType.CHATQNA]["prompt_content"] = chatqna_gen.prompt_content target_data["active"] = pipeline.status.active if pipeline_milvus_repo: diff --git a/EdgeCraftRAG/edgecraftrag/controllers/agentmgr.py 
b/EdgeCraftRAG/edgecraftrag/controllers/agentmgr.py index 91a24385cb..1db38340c9 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/agentmgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/agentmgr.py @@ -51,8 +51,12 @@ def create_agent(self, cfgs: AgentCreateIn): return "Create Agent failed. Pipeline id not found." if cfgs.type == AgentType.SIMPLE: new_agent = SimpleRAGAgent(cfgs.idx, cfgs.name, cfgs.pipeline_idx, cfgs.configs) + new_agent.configs["max_retrievals"]=min(new_agent.configs["max_retrievals"], self.get_pipeline_by_name_or_id(cfgs.pipeline_idx).max_retrieve_topk) elif cfgs.type == AgentType.DEEPSEARCH: new_agent = DeepSearchAgent(cfgs.idx, cfgs.name, cfgs.pipeline_idx, cfgs.configs) + new_agent.configs["retrieve_top_k"]=min(new_agent.configs["retrieve_top_k"], self.get_pipeline_by_name_or_id(cfgs.pipeline_idx).max_retrieve_topk) + new_agent.configs["rerank_top_k"]=min(new_agent.configs["rerank_top_k"], self.get_pipeline_by_name_or_id(cfgs.pipeline_idx).max_retrieve_topk) + if new_agent is not None: self.set_manager(new_agent) self.agents[new_agent.idx] = new_agent @@ -118,4 +122,5 @@ def get_agent_default_configs(self, agent_type): async def run_agent(self, chat_request: ChatCompletionRequest) -> Any: active_agent = self.get_active_agent() if active_agent is not None: - return await active_agent.run(cbtype=CallbackType.RUNAGENT, chat_request=chat_request) + run_agent_gen = await active_agent.run(cbtype=CallbackType.RUNAGENT, chat_request=chat_request) + return run_agent_gen, active_agent.retrievals diff --git a/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py b/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py index 839ac23ef7..7777d148de 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py @@ -3,9 +3,9 @@ import asyncio import os -from pathlib import Path from typing import Any, Callable, List, Optional +from pathlib import Path from edgecraftrag.base import BaseMgr from edgecraftrag.components.data import File from llama_index.core.schema import Document diff --git a/EdgeCraftRAG/edgecraftrag/controllers/knowledge_basemgr.py b/EdgeCraftRAG/edgecraftrag/controllers/knowledge_basemgr.py index 9dc91533cd..e59d5d5beb 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/knowledge_basemgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/knowledge_basemgr.py @@ -1,9 +1,9 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import gc from typing import Any, Dict, List, Optional +import gc from edgecraftrag.api_schema import KnowledgeBaseCreateIn from edgecraftrag.base import BaseMgr from edgecraftrag.components.knowledge_base import Knowledge @@ -96,6 +96,22 @@ def create_knowledge_base(self, knowledge: KnowledgeBaseCreateIn, origin_json: s def delete_knowledge_base(self, name: str): kb = self.get_knowledge_base_by_name_or_id(name) kb.node_parser = None + if kb.idx in self.active_knowledge_idx: + self.active_knowledge_idx.remove(kb.idx) + if self.active_experience_idx == kb.idx: + self.active_experience_idx = None + if kb.indexer is not None and getattr(kb.indexer, "model", None) is not None: + if getattr(kb.indexer.model, "_model", None) is not None: + try: + kb.indexer.model._model.clear_requests() + kb.indexer.model._model = None + except Exception as e: + pass + try: + del kb.indexer.model._ov_pipe + except Exception as e: + pass + kb.indexer.model = None kb.indexer = None self.remove(kb.idx) del kb diff --git a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py 
b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py index 3685fdbc65..307108ff7b 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py @@ -2,15 +2,19 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio - +import os from edgecraftrag.api_schema import ModelIn from edgecraftrag.base import BaseComponent, BaseMgr, CompType, ModelType from edgecraftrag.components.model import ( BaseModelComponent, OpenAIEmbeddingModel, OpenVINOEmbeddingModel, + OpenVINOGenAIEmbeddingModel, OpenVINOLLMModel, + OpenVINOGenAILLMModel, OpenVINORerankModel, + OpenVINOGenAIRerankModel, + resolve_model_path, ) @@ -78,38 +82,66 @@ def del_model_by_name(self, name: str): @staticmethod def load_model(model_para: ModelIn): model = None + enable_genai = os.getenv("ENABLE_GENAI", "").lower() == "true" match model_para.model_type: case ModelType.EMBEDDING: - model = OpenVINOEmbeddingModel( - model_id=model_para.model_id, - model_path=model_para.model_path, - device=model_para.device, - weight=model_para.weight, - ) + if model_para.device == "NPU" or enable_genai== True: + model = OpenVINOGenAIEmbeddingModel( + model_id=model_para.model_id, + model_path=model_para.model_path, + device=model_para.device, + weight=model_para.weight, + ) + else: + model = OpenVINOEmbeddingModel( + model_id=model_para.model_id, + model_path=model_para.model_path, + device=model_para.device, + weight=model_para.weight, + ) case ModelType.VLLM_EMBEDDING: model = OpenAIEmbeddingModel( model_id=model_para.model_id, api_base=model_para.api_base, ) case ModelType.RERANKER: - model = OpenVINORerankModel( - model_id=model_para.model_id, - model_path=model_para.model_path, - device=model_para.device, - weight=model_para.weight, - ) + if enable_genai== True: + model = OpenVINOGenAIRerankModel( + model_id=model_para.model_id, + model_path=model_para.model_path, + device=model_para.device, + weight=model_para.weight, + ) + else: + model = OpenVINORerankModel( + model_id=model_para.model_id, + model_path=model_para.model_path, + device=model_para.device, + weight=model_para.weight, + ) case ModelType.LLM: - model = OpenVINOLLMModel( + model = OpenVINOGenAILLMModel( model_id=model_para.model_id, model_path=model_para.model_path, device=model_para.device, weight=model_para.weight, ) + # model = OpenVINOLLMModel( + # model_id=model_para.model_id, + # model_path=model_para.model_path, + # device=model_para.device, + # weight=model_para.weight, + # ) case ModelType.VLLM: model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="") model.comp_type = CompType.MODEL model.comp_subtype = ModelType.VLLM model.model_id_or_path = model_para.model_id + case ModelType.OVMS: + model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="") + model.comp_type = CompType.MODEL + model.comp_subtype = ModelType.OVMS + model.model_id_or_path = model_para.model_id return model @staticmethod @@ -121,28 +153,34 @@ def load_model_ben(model_para: ModelIn): case ModelType.LLM: from optimum.intel import OVModelForCausalLM - ov_model = OVModelForCausalLM.from_pretrained( - model_para.model_path, - device=model_para.device, - weight=model_para.weight, - ) + resolved_model_path = resolve_model_path(model_para.model_path) + + # ov_model = OVModelForCausalLM.from_pretrained( + # resolved_model_path, + # device=model_para.device, + # weight=model_para.weight, + # ) from llm_bench_utils.hook_common import get_bench_hook num_beams = 1 - bench_hook = 
get_bench_hook(num_beams, ov_model) - model = OpenVINOLLMModel( + bench_hook = None + model = OpenVINOGenAILLMModel( model_id=model_para.model_id, - model_path=model_para.model_path, + model_path=resolved_model_path, device=model_para.device, - weight=model_para.weight, - model=ov_model, + weight=model_para.weight ) from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_para.model_path, trust_remote_code=True) + tokenizer = None case ModelType.VLLM: model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="") model.comp_type = CompType.MODEL model.comp_subtype = ModelType.VLLM model.model_id_or_path = model_para.model_id + case ModelType.OVMS: + model = BaseModelComponent(model_id=model_para.model_id, model_path="", device="", weight="") + model.comp_type = CompType.MODEL + model.comp_subtype = ModelType.OVMS + model.model_id_or_path = model_para.model_id return model, tokenizer, bench_hook diff --git a/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py b/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py index 7eb00e386c..2f34f6c7fc 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py @@ -2,13 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +import json import gc from typing import Any - +from openvino import Core, Type from comps.cores.proto.api_protocol import ChatCompletionRequest -from edgecraftrag.base import BaseMgr, CallbackType -from edgecraftrag.components.knowledge_base import Knowledge +from edgecraftrag.base import BaseMgr, CallbackType, InferenceType from edgecraftrag.components.pipeline import Pipeline +from edgecraftrag.components.knowledge_base import Knowledge class PipelineMgr(BaseMgr): @@ -44,8 +45,44 @@ def remove_pipeline_by_name_or_id(self, name: str): raise Exception("Unable to remove an active pipeline...") if self._prev_active_pipeline_name and pl.name == self._prev_active_pipeline_name: raise Exception("Pipeline is currently cached, unable to remove...") - pl.retriever = None + pl.retrievers = None + if pl.postprocessor != None: + for post in pl.postprocessor: + try: + post.model._model.clear_requests() + except Exception as e: + pass + try: + del post.model._model + post.model._model=None + except Exception as e: + pass + try: + del post.model._ov_pipe + except Exception as e: + pass + post.model=None + post=None pl.postprocessor = None + for gen in pl.generator: + if gen.inference_type: + if gen.inference_type == InferenceType.VLLM: + continue + else: + llm_model = gen.llm() + if llm_model: + try: + llm_model._model.finish_chat() + except Exception as e: + pass + try: + del llm_model._model + del llm_model._pipe + except Exception as e: + pass + llm_model._model=None + del llm_model + del gen pl.generator = None pl.benchmark = None pl.status = None diff --git a/EdgeCraftRAG/edgecraftrag/controllers/sessionmgr.py b/EdgeCraftRAG/edgecraftrag/controllers/sessionmgr.py index 53dac2bc04..9c22066d49 100644 --- a/EdgeCraftRAG/edgecraftrag/controllers/sessionmgr.py +++ b/EdgeCraftRAG/edgecraftrag/controllers/sessionmgr.py @@ -89,7 +89,7 @@ def update_current_message(self, sessionid: str, role: str, content: str) -> str def concat_history(self, sessionid: str, inference_type: str, user_message: str) -> str: max_token = 6000 - if inference_type == InferenceType.VLLM: + if inference_type in (InferenceType.VLLM, InferenceType.OVMS): vllm_max_len = int(os.getenv("MAX_MODEL_LEN", "10240")) if vllm_max_len > 5000: 
max_token = vllm_max_len - 1024 diff --git a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt index 3be864f760..5d3895d521 100644 --- a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt +++ b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt @@ -2,9 +2,23 @@ Your task is to learn from following context and MUST make sure understand and follow Sub-questions session. Then answer the user's question based on what you learned from the context but not your own knowledge. You MUST STRICTLY follow the points listed in Sub-questions when providing answers. For example, if there're five points in Sub-questions, your answer must align with these five points. And each point MUST follow the summary provided in Sub-questions. +When mentioning evidence in the main body, cite the concrete DOCUMENT_NODE_SOURCE value. +Every time you use any information from a DOCUMENT_NODE_CONTEXT in the answer body, you MUST immediately include its citation using a page link: ([DOCUMENT_NODE_SOURCE](DOCUMENT_NODE_FILE_PATH)). +At the very end of your final answer, you MUST output referenced DOCUMENT_NODE_SOURCE in this exact plain-text format: +\n\n --- \n\n### Document Source: +- DOCUMENT_NODE_SOURCE +... +Only list unique DOCUMENT_NODE_SOURCE values in this final Document Source block; do NOT repeat the same source. +In the final Document Source block, output plain DOCUMENT_NODE_SOURCE only (no markdown links, no URLs, no file paths). ### Search Result ### The following contents are the search results related to the user's message: +Each context unit is formatted as STRICT tagged blocks: +- +- xxx +- xxx (already URL-encoded; spaces appear as %20) +- xxx +- {context} @@ -14,6 +28,13 @@ When responding and thinking, please keep the following points in mind: - Your thinking process should generally match the language of the main question - If the response is lengthy, structure it well and summarize it in paragraphs - Choose an appropriate and visually appealing format for your response based on the user's requirements and the content of the answer, ensuring strong readability +- You MUST only use the provided context as evidence, and only cite sources that you actually used in the answer +- You MUST parse evidence only from and map each used context to its paired and in the same +- In the answer body, each claim derived from DOCUMENT_NODE_CONTEXT MUST carry a citation after current paragraph: ([DOCUMENT_NODE_SOURCE](DOCUMENT_NODE_FILE_PATH)). 
+- If multiple claims in one paragraph come from different DOCUMENT_NODEs, include all corresponding inline citations in that paragraph +- Do NOT use DOCUMENT_NODE id as citation +- DOCUMENT_NODE_FILE_PATH is already URL-encoded; use it exactly as provided and do NOT replace %20 with spaces +- In the final "Document Source:" block, do NOT use hyperlink format; each line must be plain - DOCUMENT_NODE_SOURCE ### history content ### {chat_history} diff --git a/EdgeCraftRAG/edgecraftrag/requirements.txt b/EdgeCraftRAG/edgecraftrag/requirements.txt index 6c4b7e4451..74f2084355 100644 --- a/EdgeCraftRAG/edgecraftrag/requirements.txt +++ b/EdgeCraftRAG/edgecraftrag/requirements.txt @@ -1,33 +1,33 @@ -docx2txt -EbookLib>=0.18 -faiss-cpu>=1.8.0.post1 +docx2txt==0.9 +EbookLib==0.20 +faiss-cpu==1.13.2 html2text>=2025.4.15 json-repair==0.52.0 -langchain-core==0.3.80 -langchain-milvus -langchain-openai +langchain-core==0.3.81 +langchain-milvus==0.2.1 +langchain-openai==0.3.35 langgraph==0.6.10 -llama-index==0.12.36 -llama-index-core==0.12.37 -llama-index-embeddings-openvino==0.5.2 -llama-index-llms-openai==0.3.44 -llama-index-llms-openai-like==0.3.4 -llama-index-llms-openvino==0.4.0 -llama-index-postprocessor-openvino-rerank==0.4.1 -llama-index-readers-file==0.4.7 -llama-index-retrievers-bm25==0.5.2 -llama-index-vector-stores-faiss==0.4.0 -llama-index-vector-stores-milvus==0.8.3 -opea-comps>=1.2 -openai==1.95.1 +opea-comps==1.3 +openai==2.15.0 pillow>=10.4.0 py-cpuinfo>=9.0.0 -pymilvus==2.5.10 +pymilvus==2.6.6 python-docx==1.1.2 torch==2.8.0+cpu torchvision==0.23.0+cpu transformers==4.53.3 -unstructured -unstructured[all-docs] -unstructured[pdf] +unstructured[all-docs]==0.18.27 werkzeug==3.1.3 +llama-index==0.14.13 +pyarrow==22.0.0 +llama-index-embeddings-openvino==0.6.1 +llama-index-embeddings-openvino-genai==0.6.1 +llama-index-llms-openai==0.6.13 +llama-index-llms-openai-like==0.5.3 +llama-index-llms-openvino==0.5.1 +llama-index-llms-openvino-genai==0.3.1 +llama-index-postprocessor-openvino-rerank==0.5.1 +llama-index-readers-file==0.5.4 +llama-index-retrievers-bm25==0.6.5 +llama-index-vector-stores-faiss==0.5.2 +llama-index-vector-stores-milvus==0.9.6 \ No newline at end of file diff --git a/EdgeCraftRAG/edgecraftrag/server.py b/EdgeCraftRAG/edgecraftrag/server.py index 9046fef26a..4603cb206e 100644 --- a/EdgeCraftRAG/edgecraftrag/server.py +++ b/EdgeCraftRAG/edgecraftrag/server.py @@ -27,7 +27,7 @@ async def lifespan(app: FastAPI): await restore_knowledge_configurations() await restore_agent_configurations() except Exception as e: - pass + print(f"Error during restore: {e}") finally: yield diff --git a/EdgeCraftRAG/edgecraftrag/utils.py b/EdgeCraftRAG/edgecraftrag/utils.py index a4b06ebf97..d6c2cef822 100644 --- a/EdgeCraftRAG/edgecraftrag/utils.py +++ b/EdgeCraftRAG/edgecraftrag/utils.py @@ -25,6 +25,78 @@ """ +def resolve_prompt_template_path(template_path: str) -> Path: + if not template_path: + raise ValueError("Template path is empty.") + + # Support both container path and source-tree path. 
+ allowed_roots = [Path("/templates"), Path(__file__).resolve().parent / "prompt_template"] + requested = Path(template_path).expanduser() + + if requested.is_absolute(): + normalized = requested.resolve() + if not any(str(normalized).startswith(str(root.resolve())) for root in allowed_roots): + raise ValueError("Template path is outside of the allowed directory.") + if not normalized.exists(): + raise FileNotFoundError(f"Template file does not exist: {normalized}") + return normalized + + for root in allowed_roots: + candidate = (root / requested).resolve() + if str(candidate).startswith(str(root.resolve())) and candidate.exists(): + return candidate + + searched = [str((root / requested).resolve()) for root in allowed_roots] + raise FileNotFoundError(f"Template file does not exist. Tried: {searched}") + + +def _resolve_model_path(model_path: str) -> str: + if not model_path: + return model_path + + path_obj = Path(model_path) + if path_obj.is_absolute() and path_obj.exists(): + return str(path_obj) + + candidates = [ + Path.cwd() / path_obj, + Path(__file__).resolve().parents[1] / path_obj, + Path(__file__).resolve().parents[2] / path_obj, + ] + + model_env = os.getenv("MODEL_PATH") + container_model_root = Path("/home/user/models") + if model_env: + model_root = Path(model_env).expanduser().resolve() + model_parts = list(path_obj.parts) + if model_parts[:1] == ["."]: + model_parts = model_parts[1:] + if model_parts[:1] == ["models"]: + model_parts = model_parts[1:] + if model_parts: + candidates.append(model_root / Path(*model_parts)) + candidates.append(model_root / path_obj.name) + + model_parts = list(path_obj.parts) + if model_parts[:1] == ["."]: + model_parts = model_parts[1:] + if model_parts[:1] == ["models"]: + model_parts = model_parts[1:] + if model_parts: + candidates.append(container_model_root / Path(*model_parts)) + candidates.append(container_model_root / path_obj.name) + + for candidate in candidates: + try: + resolved = candidate.expanduser().resolve() + except Exception: + continue + if resolved.exists(): + return str(resolved) + + return model_path + + class DocxParagraphPicturePartitioner: @classmethod def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]: @@ -42,20 +114,15 @@ def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> It def get_prompt_template(model_path, prompt_content=None, template_path=None, enable_think=False): + model_path = _resolve_model_path(model_path) if prompt_content is not None: template = prompt_content elif template_path is not None: - # Safely load the template only if it is inside /templates (or other safe root) - safe_root = "/templates" - normalized_path = os.path.normpath(os.path.join(safe_root, template_path)) - if not normalized_path.startswith(safe_root): - raise ValueError("Template path is outside of the allowed directory.") - if not os.path.exists(normalized_path): - raise FileNotFoundError("Template file does not exist.") - template = Path(normalized_path).read_text(encoding=None) + normalized_path = resolve_prompt_template_path(template_path) + template = normalized_path.read_text(encoding=None) else: template = DEFAULT_TEMPLATE - tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=os.path.exists(model_path)) messages = [{"role": "system", "content": template}, {"role": "user", "content": "\n{input}\n"}] prompt_template = tokenizer.apply_chat_template( messages, diff --git 
a/EdgeCraftRAG/kubernetes/helm/Chart.yaml b/EdgeCraftRAG/kubernetes/helm/Chart.yaml
new file mode 100644
index 0000000000..c5fe831cff
--- /dev/null
+++ b/EdgeCraftRAG/kubernetes/helm/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: edgecraftrag
+description: Helm chart for EdgeCraftRAG stack
+type: application
+version: 0.1.0
+appVersion: "25.11"
diff --git a/EdgeCraftRAG/kubernetes/helm/README.md b/EdgeCraftRAG/kubernetes/helm/README.md
new file mode 100644
index 0000000000..137091f31f
--- /dev/null
+++ b/EdgeCraftRAG/kubernetes/helm/README.md
@@ -0,0 +1,94 @@
+# EdgeCraft RAG Helm Chart
+
+This doc introduces the Helm chart for deploying EdgeCraft RAG (ecrag) on a Kubernetes cluster.
+
+## Prerequisites
+
+- A running Kubernetes cluster.
+- Helm installed.
+- Required Docker images available in your registry or locally.
+
+## Configuration
+
+Before installing, you should configure the `edgecraftrag/values.yaml` file according to your environment.
+
+### Key Configurations
+
+1. **Images**: Set the registry and tag for `ecrag` and `vllm`.
+   ```yaml
+   image:
+     ecrag:
+       registry:
+       tag:
+     vllm:
+       registry:
+       tag:
+   ```
+
+2. **Environment Variables**: Configure proxies and host IP.
+   ```yaml
+   env:
+     http_proxy: "http://proxy:port"
+     https_proxy: "http://proxy:port"
+     HOST_IP: ""
+   ```
+
+3. **LLM Settings**: Adjust LLM model paths and parameters.
+   ```yaml
+   llm:
+     LLM_MODEL: "/path/to/model/inside/container" # Ensure this maps to paths.model
+   ```
+
+4. **Persistent Paths**: Ensure the host paths exist for mounting.
+   ```yaml
+   paths:
+     model: /home/user/models
+     docs: /home/user/docs
+   ```
+
+## Installation
+
+To install the chart, please use the command below (with `edgecraftrag` as the release name):
+
+```bash
+cd kubernetes/helm
+helm install edgecraftrag ./
+```
+
+If multiple clusters are available, install the chart with a specific kube config, e.g.:
+
+```bash
+helm install edgecraftrag ./ --kubeconfig /home/user/.kube/nas.yaml
+```
+
+## Verification
+
+### Accessing the Web UI
+
+Once the service is running, you can access the UI via your browser.
+
+1. **Identify the Port**:
+   Check the `nodePort` configured in the `edgecraftrag/values.yaml` file. This is the external access port.
+
+2. **Identify the IP**:
+   Use the IP address of the Kubernetes node where the deployment is running.
+   * If running on your local machine (e.g., MicroK8s), use `localhost` or your machine's LAN IP.
+   * If running on a remote cluster, use that node's IP.
+
+3. **Open in Browser**:
+   Navigate to `http://<node-ip>:<nodePort>`
+   > Example: `http://192.168.1.5:31234`
+
+## Uninstallation
+
+To uninstall/delete the `edgecraftrag` deployment:
+
+```bash
+helm uninstall edgecraftrag
+```
+
+If multiple clusters are available, uninstall the chart with a specific kube config, e.g.:
+
+```bash
+helm uninstall edgecraftrag --kubeconfig /home/user/.kube/nas.yaml
+```
diff --git a/EdgeCraftRAG/kubernetes/helm/README_zh.md b/EdgeCraftRAG/kubernetes/helm/README_zh.md
new file mode 100644
index 0000000000..999830dc1e
--- /dev/null
+++ b/EdgeCraftRAG/kubernetes/helm/README_zh.md
@@ -0,0 +1,94 @@
+# EdgeCraft RAG Helm Chart
+
+此文档将为您介绍如何使用Helm chart在Kubernetes集群上部署EdgeCraft RAG (ecrag)。
+
+## 前置条件
+
+- 您需要一个运行中的Kubernetes集群。
+- 您需要已经安装Helm。
+- 所需的Docker镜像已在您的镜像仓库或本地可用。
+
+## 配置
+
+安装前,请根据您的环境配置 `edgecraftrag/values.yaml` 文件。
+
+### 关键配置
+
+1. **镜像**:设置 `ecrag` 和 `vllm` 的镜像仓库和标签。
+   ```yaml
+   image:
+     ecrag:
+       registry:
+       tag:
+     vllm:
+       registry:
+       tag:
+   ```
+
+2.
**环境变量**:配置代理和主机IP。 + ```yaml + env: + http_proxy: "http://proxy:port" + https_proxy: "http://proxy:port" + HOST_IP: "" + ``` + +3. **LLM设置**:调整LLM模型路径和参数。 + ```yaml + llm: + LLM_MODEL: "/path/to/model/inside/container" # 确保此路径映射到 paths.model + ``` + +4. **持久化路径**:确保主机挂载路径存在。 + ```yaml + paths: + model: /home/user/models + docs: /home/user/docs + ``` + +## 安装 + +请使用如下命令安装helm(以`edgecraftrag`作为发布名为例): + +```bash +cd kubernetes/helm +helm install edgecraftrag ./edgecraftrag +``` + +如果有不同的集群可用,请使用指定的kube config安装chart,例如: + +```bash +helm install edgecraftrag ./edgecraftrag --kubeconfig /home/user/.kube/nas.yaml +``` + +## 验证 + +### 访问Web界面 + +服务运行后,您可以通过浏览器访问UI。 + +1. **确认端口**: + 查看 `edgecraftrag/values.yaml` 文件中配置的 `nodePort`。这是外部访问端口。 + +2. **确认IP**: + 使用部署所运行的Kubernetes节点的IP地址。 + * 如果在本地机器运行(如MicroK8s),使用 `localhost` 或您机器的局域网IP。 + * 如果在远程集群运行,使用该节点的IP。 + +3. **在浏览器中打开**: + 访问 `http://:` + > 示例:`http://192.168.1.5:31234` + +## 卸载 + +卸载/删除部署的`edgecraftrag`: + +```bash +helm uninstall edgecraftrag +``` + +如果有不同的集群可用,请使用指定的kube config卸载chart,例如: + +```bash +helm uninstall edgecraftrag --kubeconfig /home/user/.kube/nas.yaml +``` diff --git a/EdgeCraftRAG/kubernetes/helm/templates/configmap-env.yaml b/EdgeCraftRAG/kubernetes/helm/templates/configmap-env.yaml new file mode 100644 index 0000000000..0fe7f105fb --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/configmap-env.yaml @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: edgecraftrag-env +data: + # Common environment variables + no_proxy: "{{ .Values.env.no_proxy }}" + http_proxy: "{{ .Values.env.http_proxy }}" + https_proxy: "{{ .Values.env.https_proxy }}" + HOST_IP: "{{ .Values.env.HOST_IP }}" + ENABLE_BENCHMARK: "{{ .Values.env.ENABLE_BENCHMARK }}" + CHAT_HISTORY_ROUND: "{{ .Values.env.CHAT_HISTORY_ROUND }}" + METADATA_DATABASE_URL: "{{ .Values.env.METADATA_DATABASE_URL }}" + MEGA_SERVICE_PORT: "{{ .Values.ports.mega }}" + PIPELINE_SERVICE_HOST_IP: edgecraftrag-server + PIPELINE_SERVICE_PORT: "{{ .Values.ports.pipeline }}" + UI_SERVICE_PORT: "{{ .Values.ports.ui.port }}" + VLLM_SERVICE_PORT_B60: "{{ .Values.ports.vllm }}" + + # llm-serving-xpu specific environment variables + LLM_MODEL: "{{ .Values.llm.LLM_MODEL }}" + DTYPE: "{{ .Values.llm.DTYPE }}" + ZE_AFFINITY_MASK: "{{ .Values.llm.ZE_AFFINITY_MASK }}" + ENFORCE_EAGER: "{{ .Values.llm.ENFORCE_EAGER }}" + TRUST_REMOTE_CODE: "{{ .Values.llm.TRUST_REMOTE_CODE }}" + DISABLE_SLIDING_WINDOW: "{{ .Values.llm.DISABLE_SLIDING_WINDOW }}" + GPU_MEMORY_UTIL: "{{ .Values.llm.GPU_MEMORY_UTIL }}" + NO_ENABLE_PREFIX_CACHING: "{{ .Values.llm.NO_ENABLE_PREFIX_CACHING }}" + MAX_NUM_BATCHED_TOKENS: "{{ .Values.llm.MAX_NUM_BATCHED_TOKENS }}" + MAX_MODEL_LEN: "{{ .Values.llm.MAX_MODEL_LEN }}" + DISABLE_LOG_REQUESTS: "{{ .Values.llm.DISABLE_LOG_REQUESTS }}" + BLOCK_SIZE: "{{ .Values.llm.BLOCK_SIZE }}" + QUANTIZATION: "{{ .Values.llm.QUANTIZATION }}" + TP: "{{ .Values.llm.TP }}" + DP: "{{ .Values.llm.DP }}" + diff --git a/EdgeCraftRAG/kubernetes/helm/templates/daemonset-edgecraftrag-server.yaml b/EdgeCraftRAG/kubernetes/helm/templates/daemonset-edgecraftrag-server.yaml new file mode 100644 index 0000000000..e7a68336bb --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/daemonset-edgecraftrag-server.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: edgecraftrag-server +spec: + selector: + matchLabels: + app: edgecraftrag-server + template: + metadata: + labels: + app: edgecraftrag-server + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 
1000 + supplementalGroups: + - {{ .Values.gpu.groups.video }} + - {{ .Values.gpu.groups.render }} + containers: + - name: edgecraftrag-server + image: "{{ .Values.image.ecrag.registry }}/edgecraftrag-server:{{ .Values.image.ecrag.tag }}" + imagePullPolicy: IfNotPresent + envFrom: + - configMapRef: + name: edgecraftrag-env + env: + - name: PIPELINE_SERVICE_HOST_IP + value: "0.0.0.0" + ports: + - containerPort: {{ .Values.ports.pipeline }} + volumeMounts: + - name: model-path + mountPath: /home/user/models + - name: docs-path + mountPath: /home/user/docs + - name: tmpfile-path + mountPath: /home/user/ui_cache + - name: prompt-path + mountPath: /templates/custom + - name: dri-device + mountPath: /dev/dri + volumes: + - name: model-path + hostPath: + path: "{{ .Values.paths.model }}" + - name: docs-path + hostPath: + path: "{{ .Values.paths.docs }}" + - name: tmpfile-path + hostPath: + path: "{{ .Values.paths.tmpfile }}" + - name: prompt-path + hostPath: + path: "{{ .Values.paths.prompt }}" + - name: dri-device + hostPath: + path: /dev/dri diff --git a/EdgeCraftRAG/kubernetes/helm/templates/daemonset-llm-serving-xpu.yaml b/EdgeCraftRAG/kubernetes/helm/templates/daemonset-llm-serving-xpu.yaml new file mode 100644 index 0000000000..5534993d87 --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/daemonset-llm-serving-xpu.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: llm-serving-xpu +spec: + selector: + matchLabels: + app: llm-serving-xpu + template: + metadata: + labels: + app: llm-serving-xpu + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + supplementalGroups: + - {{ .Values.gpu.groups.video }} + - {{ .Values.gpu.groups.render }} + containers: + - name: llm-serving-xpu + image: "{{ .Values.image.vllm.registry }}/llm-scaler-vllm:{{ .Values.image.vllm.tag }}" + imagePullPolicy: IfNotPresent + command: + - "/bin/bash" + - "-c" + - "cd /workspace/vllm/models && source /opt/intel/oneapi/setvars.sh --force && \ + VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=1 TORCH_LLM_ALLREDUCE=1 VLLM_USE_V1=1 \ + CCL_ZE_IPC_EXCHANGE=pidfd VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 VLLM_WORKER_MULTIPROC_METHOD=spawn \ + python3 -m vllm.entrypoints.openai.api_server \ + --model $LLM_MODEL --dtype $DTYPE --enforce-eager --port $VLLM_SERVICE_PORT_B60 \ + --trust-remote-code --disable-sliding-window --gpu-memory-util $GPU_MEMORY_UTIL \ + --no-enable-prefix-caching --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --disable-log-requests --max-model-len $MAX_MODEL_LEN --block-size $BLOCK_SIZE \ + --quantization $QUANTIZATION -tp=$TP -dp=$DP" + envFrom: + - configMapRef: + name: edgecraftrag-env + ports: + - containerPort: {{ .Values.ports.vllm }} + securityContext: + privileged: true + volumeMounts: + - name: model-path + mountPath: /workspace/vllm/models + - name: dri-device + mountPath: /dev/dri + volumes: + - name: model-path + hostPath: + path: "{{ .Values.paths.model }}" + - name: dri-device + hostPath: + path: /dev/dri + tolerations: + - key: "gpu" + operator: "Exists" + effect: "NoSchedule" diff --git a/EdgeCraftRAG/kubernetes/helm/templates/deployment-ecrag.yaml b/EdgeCraftRAG/kubernetes/helm/templates/deployment-ecrag.yaml new file mode 100644 index 0000000000..45f22326eb --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/deployment-ecrag.yaml @@ -0,0 +1,45 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ecrag +spec: + replicas: {{ .Values.replica.ecrag }} + selector: + matchLabels: + app: ecrag + template: + metadata: + labels: + app: ecrag + spec: + 
containers: + - name: ecrag + image: "{{ .Values.image.ecrag.registry }}/edgecraftrag:{{ .Values.image.ecrag.tag }}" + imagePullPolicy: IfNotPresent + envFrom: + - configMapRef: + name: edgecraftrag-env + ports: + - containerPort: {{ .Values.ports.mega }} + volumeMounts: + - name: model-path + mountPath: /home/user/models + - name: docs-path + mountPath: /home/user/docs + - name: tmpfile-path + mountPath: /home/user/ui_cache + - name: prompt-path + mountPath: /templates/custom + volumes: + - name: model-path + hostPath: + path: "{{ .Values.paths.model }}" + - name: docs-path + hostPath: + path: "{{ .Values.paths.docs }}" + - name: tmpfile-path + hostPath: + path: "{{ .Values.paths.tmpfile }}" + - name: prompt-path + hostPath: + path: "{{ .Values.paths.prompt }}" diff --git a/EdgeCraftRAG/kubernetes/helm/templates/deployment-edgecraftrag-ui.yaml b/EdgeCraftRAG/kubernetes/helm/templates/deployment-edgecraftrag-ui.yaml new file mode 100644 index 0000000000..ce4ab4018e --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/deployment-edgecraftrag-ui.yaml @@ -0,0 +1,45 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: edgecraftrag-ui +spec: + replicas: {{ .Values.replica.ecrag_ui }} + selector: + matchLabels: + app: edgecraftrag-ui + template: + metadata: + labels: + app: edgecraftrag-ui + spec: + containers: + - name: edgecraftrag-ui + image: "{{ .Values.image.ecrag.registry }}/edgecraftrag-ui:{{ .Values.image.ecrag.tag }}" + imagePullPolicy: IfNotPresent + envFrom: + - configMapRef: + name: edgecraftrag-env + ports: + - containerPort: {{ .Values.ports.ui.port }} + volumeMounts: + - name: model-path + mountPath: /home/user/models + - name: docs-path + mountPath: /home/user/docs + - name: tmpfile-path + mountPath: /home/user/ui_cache + - name: prompt-path + mountPath: /templates/custom + volumes: + - name: model-path + hostPath: + path: "{{ .Values.paths.model }}" + - name: docs-path + hostPath: + path: "{{ .Values.paths.docs }}" + - name: tmpfile-path + hostPath: + path: "{{ .Values.paths.tmpfile }}" + - name: prompt-path + hostPath: + path: "{{ .Values.paths.prompt }}" diff --git a/EdgeCraftRAG/kubernetes/helm/templates/service-ecrag.yaml b/EdgeCraftRAG/kubernetes/helm/templates/service-ecrag.yaml new file mode 100644 index 0000000000..93ee1d73d4 --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/service-ecrag.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: ecrag +spec: + selector: + app: ecrag + ports: + - protocol: TCP + port: {{ .Values.ports.mega }} + targetPort: {{ .Values.ports.mega }} diff --git a/EdgeCraftRAG/kubernetes/helm/templates/service-edgecraftrag-server.yaml b/EdgeCraftRAG/kubernetes/helm/templates/service-edgecraftrag-server.yaml new file mode 100644 index 0000000000..6f04b40f20 --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/service-edgecraftrag-server.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: edgecraftrag-server +spec: + selector: + app: edgecraftrag-server + ports: + - protocol: TCP + port: {{ .Values.ports.pipeline }} + targetPort: {{ .Values.ports.pipeline }} diff --git a/EdgeCraftRAG/kubernetes/helm/templates/service-edgecraftrag-ui.yaml b/EdgeCraftRAG/kubernetes/helm/templates/service-edgecraftrag-ui.yaml new file mode 100644 index 0000000000..cb02247dbe --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/service-edgecraftrag-ui.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: edgecraftrag-ui +spec: + type: NodePort + selector: + app: edgecraftrag-ui + 
ports: + - protocol: TCP + port: {{ .Values.ports.ui.port }} + targetPort: {{ .Values.ports.ui.port }} + nodePort: {{ .Values.ports.ui.nodePort }} diff --git a/EdgeCraftRAG/kubernetes/helm/templates/service-llm-serving-xpu.yaml b/EdgeCraftRAG/kubernetes/helm/templates/service-llm-serving-xpu.yaml new file mode 100644 index 0000000000..e8bf327f7a --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/templates/service-llm-serving-xpu.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: llm-serving-xpu +spec: + selector: + app: llm-serving-xpu + ports: + - protocol: TCP + port: {{ .Values.ports.vllm }} + targetPort: {{ .Values.ports.vllm }} diff --git a/EdgeCraftRAG/kubernetes/helm/values.yaml b/EdgeCraftRAG/kubernetes/helm/values.yaml new file mode 100644 index 0000000000..203ce236f6 --- /dev/null +++ b/EdgeCraftRAG/kubernetes/helm/values.yaml @@ -0,0 +1,57 @@ +image: + ecrag: + registry: opea + tag: latest + vllm: + registry: intel + tag: 1.1-preview + +replica: + ecrag: 1 + ecrag_ui: 1 + +env: + no_proxy: "" + http_proxy: "" + https_proxy: "" + HOST_IP: "" + ENABLE_BENCHMARK: false + CHAT_HISTORY_ROUND: 0 + METADATA_DATABASE_URL: "" + +llm: + LLM_MODEL: "" + DTYPE: float16 + ZE_AFFINITY_MASK: 0,1 + ENFORCE_EAGER: 1 + TRUST_REMOTE_CODE: 1 + DISABLE_SLIDING_WINDOW: 1 + GPU_MEMORY_UTIL: 0.9 + NO_ENABLE_PREFIX_CACHING: 1 + MAX_NUM_BATCHED_TOKENS: 8192 + MAX_MODEL_LEN: 49152 + DISABLE_LOG_REQUESTS: 1 + BLOCK_SIZE: 64 + QUANTIZATION: sym_int4 + TP: 1 + DP: 1 + + +ports: + pipeline: 16010 + mega: 16011 + ui: + port: 8082 + nodePort: 30082 + vllm: 8086 + +paths: + model: /home/user/models + docs: /home/user/docs + tmpfile: /home/user/ui_cache + prompt: /templates/custom + +gpu: + groups: + video: 44 + render: 991 diff --git a/EdgeCraftRAG/tests/common.sh b/EdgeCraftRAG/tests/common.sh index 67388822c5..8a43fb30ed 100644 --- a/EdgeCraftRAG/tests/common.sh +++ b/EdgeCraftRAG/tests/common.sh @@ -64,4 +64,4 @@ function validate_knowledge() { "data" \ "edgecraftrag-server" \ '{"local_path":"/home/user/ui_cache"}' -} +} \ No newline at end of file diff --git a/EdgeCraftRAG/tests/configs/test_pipeline_local_llm.json b/EdgeCraftRAG/tests/configs/test_pipeline_local_llm.json index 985152ce2c..cb379ca2c5 100644 --- a/EdgeCraftRAG/tests/configs/test_pipeline_local_llm.json +++ b/EdgeCraftRAG/tests/configs/test_pipeline_local_llm.json @@ -20,7 +20,7 @@ { "model": { "model_id": "Qwen/Qwen3-8B", - "model_path": "./models/Qwen/Qwen3-8B/INT4_compressed_weights", + "model_path": "./models/OpenVINO/Qwen3-8B-int4-ov", "device": "auto", "weight": "INT4" }, @@ -30,4 +30,4 @@ } ], "active": "True" -} +} \ No newline at end of file diff --git a/EdgeCraftRAG/tests/test_pipeline_local_llm.json b/EdgeCraftRAG/tests/test_pipeline_local_llm.json index 985152ce2c..cb379ca2c5 100644 --- a/EdgeCraftRAG/tests/test_pipeline_local_llm.json +++ b/EdgeCraftRAG/tests/test_pipeline_local_llm.json @@ -20,7 +20,7 @@ { "model": { "model_id": "Qwen/Qwen3-8B", - "model_path": "./models/Qwen/Qwen3-8B/INT4_compressed_weights", + "model_path": "./models/OpenVINO/Qwen3-8B-int4-ov", "device": "auto", "weight": "INT4" }, @@ -30,4 +30,4 @@ } ], "active": "True" -} +} \ No newline at end of file diff --git a/EdgeCraftRAG/tools/README.md b/EdgeCraftRAG/tools/README.md old mode 100755 new mode 100644 index 0b2c2bde6d..e5fde632bd --- a/EdgeCraftRAG/tools/README.md +++ b/EdgeCraftRAG/tools/README.md @@ -1,54 +1,109 @@ -# EdgeCraftRAG tool scripts - [中文版](README_zh.md) -This directory contains helper scripts for building images and 
starting EC-RAG services. +This directory contains the deployment, startup, and image build scripts for EdgeCraftRAG. + +# 1.Script Overview + +The main scripts in this directory are: + +- `quick_start.sh`: recommended one-click deployment script for new users, with automatic setup and interactive guidance +- `bootstrap.sh`: non-interactive deployment orchestrator that can be used directly or invoked by `quick_start.sh` +- `model_download.sh`: model preparation helper (supports `vllm` / `ov`, optional `model_id` and `model_path` arguments) +- `run_ov_baremetal.sh`: OpenVINO bare-metal startup script +- `run_ov_container.sh`: OpenVINO container startup script +- `run_vllm_baremetal.sh`: vLLM bare-metal startup script +- `run_vllm_container.sh`: vLLM container startup script +- `run_ovms_baremetal.sh`: OVMS bare-metal startup script +- `run_ovms_container.sh`: OVMS container startup script +- `build_images.sh`: container image build script + +Deployment methods: -## Scripts +| Method | Description | Requirements | Milvus Support | +|------|------|----------|-------------| +| baremetal | Start services as Python processes | Python 3.10+ | No (in-memory only) | +| container | Start services in Docker containers | Docker / Docker Compose | Yes (enabled by default) | -- `quick_start.sh`: one-click startup for OpenVINO or vLLM deployment -- `build_images.sh`: build EC-RAG Docker images +Note: If you need Milvus, use the container deployment method. ---- +# 2.Quick Deployment Script (New Users) -## quick_start.sh +## 2.1 One-Command Quick Deployment -Run from the `EdgeCraftRAG` root directory: +Run this from the `EdgeCraftRAG` root directory: ```bash ./tools/quick_start.sh ``` -### Default behavior +The script behaves as follows by default: + +- runs in non-interactive mode +- uses OpenVINO as the default inference backend +- if `INFERENCE_BACKEND` is not set, the script resolves it to `openvino` +- uses `baremetal` as the default deployment method when `DEPLOYMENT_METHOD` is not set + +In the default bare-metal flow, the script automatically: + +- creates and activates `EdgeCraftRAG/ecrag_venv` if it does not exist +- validates the Python version (3.10+ required, 3.10/3.11 recommended) +- checks and installs required Python packages +- checks and installs `npm` for baremetal UI startup when needed +- validates Intel GPU driver/runtime and auto-installs missing packages on apt-based Linux +- checks and auto-downloads missing models (embedding, reranker, OpenVINO LLM) +- writes a deployment environment snapshot to `workspace/bootstrap.env` before invoking `bootstrap.sh` +- calls `bootstrap.sh` to start services + +For vLLM deployments and container deployment method, the script also validates Docker and Docker Compose before deployment. +On Ubuntu 24.04, if Docker or Docker Compose is missing, the script attempts automatic installation and starts/enables Docker service. 
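For example, to go straight to the container flow with a vLLM backend, both choices can be fixed before the run; a minimal sketch using the `DEPLOYMENT_METHOD` and `INFERENCE_BACKEND` variables described in this README (pick the backend value that matches your hardware):

```bash
# Container deployment with the vLLM backend for Intel Arc A770 (illustrative values)
export DEPLOYMENT_METHOD=container
export INFERENCE_BACKEND=vllm_a770
./tools/quick_start.sh
```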
+ +To skip model verification/download when models are already prepared locally: -If no environment variables are provided, the script uses these defaults: +```bash +./tools/quick_start.sh --skip-model-check +``` + +Equivalent environment variable: ```bash -MODEL_PATH=${WORKSPACE}/workspace/models -DOC_PATH=${WORKSPACE}/workspace -TMPFILE_PATH=${WORKSPACE}/workspace -LLM_MODEL=Qwen/Qwen3-8B +export SKIP_MODEL_CHECK=1 +./tools/quick_start.sh ``` -The script will also: +Intel GPU driver/runtime validation can be skipped when needed: + +```bash +./tools/quick_start.sh --skip-gpu-driver-check +``` -- create and activate a Python virtual environment automatically -- install `python3-venv` if needed -- check whether required models exist under `MODEL_PATH` -- automatically download missing embedding, reranker, and LLM models -- print the UI access URL after startup completes +Equivalent environment variables: -### Non-interactive mode +```bash +export SKIP_INTEL_GPU_DRIVER_CHECK=1 +# Or keep validation but disable auto-install: +export AUTO_INSTALL_INTEL_GPU_DRIVER=0 +./tools/quick_start.sh +``` -By default, non-interactive mode starts local OpenVINO services. +To disable automatic npm installation during baremetal preparation: ```bash +export AUTO_INSTALL_NPM=0 ./tools/quick_start.sh ``` +After startup succeeds, the terminal prints a UI access URL such as: + +```text +UI access URL: http://${HOST_IP}:8082 +``` + +Note: If you set `DEPLOYMENT_METHOD=container` in advance, the script skips venv and pip checks and continues with container deployment. + You can override defaults with environment variables: ```bash +export INFERENCE_BACKEND=openvino export MODEL_PATH="${PWD}/workspace/models" export DOC_PATH="${PWD}/workspace" export TMPFILE_PATH="${PWD}/workspace" @@ -58,110 +113,215 @@ export HOST_IP="$(hostname -I | awk '{print $1}')" ./tools/quick_start.sh ``` -### Select deployment mode with `COMPOSE_PROFILES` - -#### OpenVINO on Core Ultra, B60 or A770 +Select the backend with `INFERENCE_BACKEND`: ```bash +# OpenVINO (default) ./tools/quick_start.sh -``` -#### vLLM on Intel Arc A770 - -```bash -export COMPOSE_PROFILES=vLLM_A770 +# vLLM_A770 +export INFERENCE_BACKEND=vllm_a770 ./tools/quick_start.sh -``` -#### vLLM on Intel Arc B60 +# vLLM_B60 +export INFERENCE_BACKEND=vllm_b60 +./tools/quick_start.sh -```bash -export COMPOSE_PROFILES=vLLM_B60 +# OVMS +export INFERENCE_BACKEND=ovms +export OVMS_SOURCE_MODEL=OpenVINO/Qwen3-8B-int4-ov +export OVMS_MODEL_NAME=OpenVINO/Qwen3-8B-int4-ov +export OVMS_TARGET_DEVICE=GPU.0 ./tools/quick_start.sh ``` -Optional B60/vLLM variables: +For OVMS deployments, the tooling now exports the compose-facing variables directly. The most commonly overridden ones are `OVMS_SOURCE_MODEL`, `OVMS_MODEL_NAME`, `OVMS_TARGET_DEVICE`, `OVMS_TOOL_PARSER`, and `OVMS_MAX_NUM_BATCHED_TOKENS`. + +Important OVMS behavior: + +- `OVMS_SOURCE_MODEL` keeps your original model ID as-is (for example `Qwen/Qwen3-8B`). +- `quick_start.sh` and `bootstrap.sh` both persist OVMS variables into `workspace/bootstrap.env` for reuse. +- You can replay the exact OVMS configuration with `source workspace/bootstrap.env && ./tools/bootstrap.sh`. + +Compatibility note: the legacy environment variable `COMPOSE_PROFILES` is still accepted, but new configurations should use `INFERENCE_BACKEND`. 
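+
+As a concrete illustration of the compatibility mapping (the translation is implemented in `bootstrap.sh`), the two invocations below are expected to select the same vLLM A770 backend:
+
+```bash
+# Legacy style (still accepted)
+export COMPOSE_PROFILES=vLLM_A770
+./tools/quick_start.sh
+
+# Preferred new style
+export INFERENCE_BACKEND=vllm_a770
+./tools/quick_start.sh
+```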
+ +Supported `INFERENCE_BACKEND` values: + +- `openvino` +- `vllm_a770` +- `vllm_b60` +- `ovms` + +## 2.2 Interactive Mode ```bash -export VLLM_SERVICE_PORT_B60=8086 -export DTYPE=float16 -export TP=1 -export DP=1 -export ZE_AFFINITY_MASK=0 -export ENFORCE_EAGER=1 -export TRUST_REMOTE_CODE=1 -export DISABLE_SLIDING_WINDOW=1 -export GPU_MEMORY_UTIL=0.8 -export NO_ENABLE_PREFIX_CACHING=1 -export MAX_NUM_BATCHED_TOKENS=8192 -export DISABLE_LOG_REQUESTS=1 -export MAX_MODEL_LEN=49152 -export BLOCK_SIZE=64 -export QUANTIZATION=fp8 +./tools/quick_start.sh -i ``` -### Interactive mode +Interactive mode is suitable for first-time deployment or when you are not sure about the parameters. After you run `./tools/quick_start.sh -i`, the script prompts step by step and generates the deployment configuration for the current run. + +The interactive flow typically includes: + +- choosing the inference backend: OpenVINO / vLLM_A770 / vLLM_B60 / OVMS +- choosing the deployment method: baremetal / container +- configuring key parameters: `HOST_IP`, `MODEL_PATH`, `DOC_PATH`, `TMPFILE_PATH`, `LLM_MODEL` +- confirming the configuration and starting deployment, then printing the access URL at the end + +Interactive mode is recommended when: + +- this is your first installation and you are not familiar with the environment variables or defaults +- you need to switch quickly between different hardware targets or inference backends +- you want to review parameters before deployment to reduce configuration mistakes + +Example: ```bash -bash -i ./tools/quick_start.sh +cd EdgeCraftRAG +./tools/quick_start.sh -i ``` -In interactive mode, the script prompts for: +## 2.3 Common Interactive Input Examples -- deployment mode: `vLLM_A770`, `vLLM_B60`, or `ov` -- `HOST_IP` -- `DOC_PATH` -- `TMPFILE_PATH` -- `MODEL_PATH` -- `LLM_MODEL` -- optional vLLM runtime settings +The following examples show common inputs during the interactive flow. Actual prompt text may vary slightly based on the script. 
-### Model check and auto-download +### Example A: OpenVINO + baremetal (single-machine quick experience) -The script checks these model locations automatically: +```text +Inference backend: OpenVINO +Deployment method: baremetal +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +Confirm deployment: y +``` -#### Shared models +### Example B: vLLM_B60 + container (Milvus required) ```text -${MODEL_PATH}/BAAI/bge-small-en-v1.5 -${MODEL_PATH}/BAAI/bge-reranker-large +Inference backend: vLLM_B60 +Deployment method: container +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +Confirm deployment: y ``` -#### vLLM mode +### Example C: vLLM_A770 + container (recommended for A770) ```text -${MODEL_PATH}/${LLM_MODEL} +Inference backend: vLLM_A770 +Deployment method: container +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +Confirm deployment: y ``` -#### OpenVINO mode +### Example D: OVMS + container ```text -${MODEL_PATH}/${LLM_MODEL}/INT4_compressed_weights +Inference backend: OVMS +Deployment method: container +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +Confirm deployment: y ``` -If a required model is missing, the script downloads it automatically and prints a message. +Notes: -### UI access output +- for a remote server, set `HOST_IP` to an address reachable by the client machine +- if you need persistent vector retrieval data, use the container deployment method +- if the device is Intel Arc A770, prefer the `vllm_a770` configuration -After startup completes, the script prints: +Cleanup: -```text -Service launched successfully. -UI access URL: http://${HOST_IP}:8082 -If you are accessing from another machine, replace ${HOST_IP} with the server's reachable IP or hostname. +```bash +./tools/quick_start.sh cleanup ``` -### Cleanup +# 3.Startup Scripts -To stop and remove the deployed containers: +## 3.1 bootstrap.sh (Non-Interactive Orchestration) + +Run with environment variables defined in advance: ```bash -./tools/quick_start.sh cleanup +export INFERENCE_BACKEND=openvino +export DEPLOYMENT_METHOD=baremetal +./tools/bootstrap.sh +``` + +Use defaults (`openvino` + `baremetal`): + +```bash +./tools/bootstrap.sh ``` ---- +Configuration reuse: + +- `quick_start.sh` writes `workspace/bootstrap.env` before real deployment starts. +- `bootstrap.sh` also persists configuration for reuse. +- For OVMS, this includes `OVMS_SOURCE_MODEL`, `OVMS_MODEL_NAME`, `OVMS_TARGET_DEVICE`, `OVMS_TOOL_PARSER`, and related `OVMS_*` runtime variables. 
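+
+The saved file is a plain shell snippet of `export` statements. A trimmed sketch (paths and values are illustrative, not the literal output) might look like:
+
+```bash
+# workspace/bootstrap.env (illustrative excerpt)
+export INFERENCE_BACKEND="openvino"
+export DEPLOYMENT_METHOD="baremetal"
+export MODEL_PATH="/path/to/EdgeCraftRAG/workspace/models"
+export HOST_IP="192.168.1.20"
+export LLM_MODEL="Qwen/Qwen3-8B"
+export SKIP_VALIDATION=1
+```
+
+To reuse the saved configuration: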
+ +```bash +source workspace/bootstrap.env +./tools/bootstrap.sh +``` + +## 3.3 model_download.sh (Model Preparation) + +Basic usage: + +```bash +./tools/model_download.sh [model_id] [model_path] +``` + +Modes: + +- `vllm`: prepare embedding/reranker OpenVINO models + vLLM LLM model +- `ov`: prepare embedding/reranker OpenVINO models + OpenVINO INT4 LLM model + +Optional arguments: + +- `model_id`: overrides `LLM_MODEL` for current run +- `model_path`: overrides `MODEL_PATH` for current run + +Examples: + +```bash +./tools/model_download.sh vllm +./tools/model_download.sh ov Qwen/Qwen3-8B /data/models +``` + +Environment behavior: + +- if a virtual environment is already active, it is reused +- otherwise, the script creates/activates `ecrag_venv` automatically (same style as `quick_start.sh`) +- missing `python3-venv` / `pip` prerequisites are installed automatically when supported by the system package manager + +## 3.2 Direct Startup Scripts + +You can also call the following scripts directly based on inference backend and deployment method: + +- OpenVINO baremetal: `./tools/run_ov_baremetal.sh` +- OpenVINO container: `./tools/run_ov_container.sh` +- vLLM baremetal: `./tools/run_vllm_baremetal.sh` +- vLLM container: `./tools/run_vllm_container.sh` +- OVMS baremetal: `./tools/run_ovms_baremetal.sh` +- OVMS container: `./tools/run_ovms_container.sh` + +This is useful when you already know your parameters and want to skip the one-click onboarding flow. -## build_images.sh +# 4.Container Image Build Script Build all images: @@ -169,7 +329,7 @@ Build all images: ./tools/build_images.sh ``` -Build selected images only: +Build by component: ```bash ./tools/build_images.sh mega @@ -178,4 +338,4 @@ Build selected images only: ./tools/build_images.sh all ``` -For full deployment details, refer to [../docs/Advanced_Setup.md](../docs/Advanced_Setup.md). +For complete deployment guidance, see [../docs/Advanced_Setup.md](../docs/Advanced_Setup.md). 
diff --git a/EdgeCraftRAG/tools/README_zh.md b/EdgeCraftRAG/tools/README_zh.md index 4a37187eb6..da4c2d34c4 100644 --- a/EdgeCraftRAG/tools/README_zh.md +++ b/EdgeCraftRAG/tools/README_zh.md @@ -1,54 +1,95 @@ -# EdgeCraftRAG 工具脚本 - [English](README.md) -本目录包含用于构建镜像和启动 EC-RAG 服务的辅助脚本。 +本目录包含 EdgeCraftRAG 的部署、启动和镜像构建脚本。 + +# 1.脚本介绍 + +本目录主要脚本如下: + +- `quick_start.sh`:推荐新用户使用的一键部署脚本,支持自动安装与交互引导 +- `bootstrap.sh`:非交互部署编排器(可独立使用,也可由 quick_start 调用) +- `model_download.sh`:模型准备脚本(支持 `vllm` / `ov` 模式,支持可选参数 `model_id` 和 `model_path`) +- `run_ov_baremetal.sh`:OpenVINO 裸金属启动脚本 +- `run_ov_container.sh`:OpenVINO 容器启动脚本 +- `run_vllm_baremetal.sh`:vLLM 裸金属启动脚本 +- `run_vllm_container.sh`:vLLM 容器启动脚本 +- `run_ovms_baremetal.sh`:OVMS 裸金属启动脚本 +- `run_ovms_container.sh`:OVMS 容器启动脚本 +- `build_images.sh`:容器镜像编译脚本 + +部署方式说明: -## 脚本 +| 方式 | 描述 | 环境要求 | Milvus 支持 | +|------|------|----------|-------------| +| baremetal(裸金属) | 以 Python 进程方式启动服务 | Python 3.10+ | 否(仅内存) | +| container(容器) | 以 Docker 容器方式启动服务 | Docker / Docker Compose | 是(默认启用) | -- `quick_start.sh`:一键启动 OpenVINO 或 vLLM 部署 -- `build_images.sh`:构建 EC-RAG Docker 镜像 +提示:如需使用 Milvus,请选择容器部署。 ---- +# 2.快速部署脚本(新用户) -## quick_start.sh +## 2.1 一键快速部署 -请在 `EdgeCraftRAG` 根目录下运行: +推荐在 `EdgeCraftRAG` 根目录执行: ```bash ./tools/quick_start.sh ``` -### 默认行为 +脚本会按以下默认行为执行: -如果未提供环境变量,脚本会使用以下默认值: +- 进入非交互模式(non-interactive) +- 推理后端默认选择 OpenVINO(`INFERENCE_BACKEND` 未设置时,脚本会自动解析为 `openvino`) +- 部署方式默认是 baremetal(`DEPLOYMENT_METHOD` 默认 `baremetal`) + +在 baremetal 默认模式下,会自动执行: + +- 创建并激活 `EdgeCraftRAG/ecrag_venv` 虚拟环境(若不存在) +- 校验 Python 版本(要求 3.10+,推荐 3.10/3.11) +- 检查并安装关键 Python 依赖 +- 在裸金属 UI 启动需要时检查并安装 `npm` +- 校验 Intel GPU 驱动/运行时,若缺失则在 apt 系统上自动安装 +- 检查并自动下载缺失模型(embedding、reranker、OpenVINO LLM) +- 在调用 `bootstrap.sh` 前将本次部署环境快照写入 `workspace/bootstrap.env` +- 调用 `bootstrap.sh` 启动服务 + +对于 vLLM 部署或 container 部署方式,脚本会在部署前校验 Docker 与 Docker Compose。 +在 Ubuntu 24.04 上,如果 Docker 或 Docker Compose 缺失,脚本会尝试自动安装并启动/启用 Docker 服务。 + +如需跳过 Intel GPU 驱动/运行时校验,可使用: ```bash -MODEL_PATH=${WORKSPACE}/workspace/models -DOC_PATH=${WORKSPACE}/workspace -TMPFILE_PATH=${WORKSPACE}/workspace -LLM_MODEL=Qwen/Qwen3-8B +./tools/quick_start.sh --skip-gpu-driver-check ``` -脚本还会自动执行以下操作: - -- 自动创建并激活 Python 虚拟环境 -- 在需要时安装 `python3-venv` -- 检查 `MODEL_PATH` 下必需模型是否存在 -- 自动下载缺失的 embedding、reranker 和 LLM 模型 -- 在启动完成后输出 UI 访问地址 +等价环境变量: -### 非交互模式 +```bash +export SKIP_INTEL_GPU_DRIVER_CHECK=1 +# 或保留校验但禁用自动安装: +export AUTO_INSTALL_INTEL_GPU_DRIVER=0 +./tools/quick_start.sh +``` -默认情况下,非交互模式启动本地 OpenVINO 服务。 +如需禁用 baremetal 准备阶段的 npm 自动安装,可使用: ```bash +export AUTO_INSTALL_NPM=0 ./tools/quick_start.sh ``` -你也可以通过环境变量覆盖默认值: +启动成功后,终端会输出 UI 访问地址,例如: + +```text +UI access URL: http://${HOST_IP}:8082 +``` + +补充:如果你事先设置了 `DEPLOYMENT_METHOD=container`,脚本会跳过 venv/pip 检查,并按容器方式继续部署。 + +可通过环境变量覆盖: ```bash +export INFERENCE_BACKEND=openvino export MODEL_PATH="${PWD}/workspace/models" export DOC_PATH="${PWD}/workspace" export TMPFILE_PATH="${PWD}/workspace" @@ -58,118 +99,224 @@ export HOST_IP="$(hostname -I | awk '{print $1}')" ./tools/quick_start.sh ``` -### 使用 `COMPOSE_PROFILES` 选择部署模式 - -#### Core Ultra、B60 或 A770 上的 OpenVINO +按硬件选择 `INFERENCE_BACKEND`: ```bash +# OpenVINO(默认) ./tools/quick_start.sh -``` - -#### Intel Arc A770 上的 vLLM -```bash -export COMPOSE_PROFILES=vLLM_A770 +# vLLM_A770 +export INFERENCE_BACKEND=vllm_a770 ./tools/quick_start.sh -``` -#### Intel Arc B60 上的 vLLM +# vLLM_B60 +export INFERENCE_BACKEND=vllm_b60 +./tools/quick_start.sh -```bash -export COMPOSE_PROFILES=vLLM_B60 +# OVMS +export 
INFERENCE_BACKEND=ovms +export OVMS_SOURCE_MODEL=OpenVINO/Qwen3-8B-int4-ov +export OVMS_MODEL_NAME=OpenVINO/Qwen3-8B-int4-ov +export OVMS_TARGET_DEVICE=GPU.0 ./tools/quick_start.sh ``` -可选的 B60/vLLM 环境变量: +对于 OVMS 部署,工具脚本会直接导出 compose 所需的 `OVMS_*` 环境变量。常见可覆盖项包括:`OVMS_SOURCE_MODEL`、`OVMS_MODEL_NAME`、`OVMS_TARGET_DEVICE`、`OVMS_TOOL_PARSER`、`OVMS_MAX_NUM_BATCHED_TOKENS`。 + +OVMS 相关行为说明: + +- `OVMS_SOURCE_MODEL` 会保持你提供的原始模型 ID(例如 `Qwen/Qwen3-8B`),不会自动截断。 +- `quick_start.sh` 与 `bootstrap.sh` 都会将 OVMS 变量写入 `workspace/bootstrap.env` 以便复用。 +- 可通过 `source workspace/bootstrap.env && ./tools/bootstrap.sh` 复用同一套 OVMS 配置。 + +兼容说明:历史环境变量 `COMPOSE_PROFILES` 仍可使用,但新配置建议统一使用 `INFERENCE_BACKEND`。 + +`INFERENCE_BACKEND` 支持以下取值: + +- `openvino` +- `vllm_a770` +- `vllm_b60` +- `ovms` + + +## 2.2 交互模式 ```bash -export VLLM_SERVICE_PORT_B60=8086 -export DTYPE=float16 -export TP=1 -export DP=1 -export ZE_AFFINITY_MASK=0 -export ENFORCE_EAGER=1 -export TRUST_REMOTE_CODE=1 -export DISABLE_SLIDING_WINDOW=1 -export GPU_MEMORY_UTIL=0.8 -export NO_ENABLE_PREFIX_CACHING=1 -export MAX_NUM_BATCHED_TOKENS=8192 -export DISABLE_LOG_REQUESTS=1 -export MAX_MODEL_LEN=49152 -export BLOCK_SIZE=64 -export QUANTIZATION=fp8 +./tools/quick_start.sh -i ``` -### 交互模式 +交互模式适合首次部署或不确定参数时使用。执行 `./tools/quick_start.sh -i` 后,脚本会逐步提问并自动生成本次部署配置。 + +交互流程通常包括: + +- 选择推理后端:OpenVINO / vLLM_A770 / vLLM_B60 / OVMS +- 选择部署方式:baremetal / container +- 配置关键参数:`HOST_IP`、`MODEL_PATH`、`DOC_PATH`、`TMPFILE_PATH`、`LLM_MODEL` +- 确认配置后开始部署,并在结束后输出访问地址 + +建议在以下场景使用交互模式: + +- 首次安装,不熟悉环境变量名称和默认值 +- 需要快速切换不同硬件或推理后端 +- 希望先确认参数再执行,降低配置出错概率 + +示例: ```bash -bash -i ./tools/quick_start.sh +cd EdgeCraftRAG +./tools/quick_start.sh -i ``` -在交互模式下,脚本会提示你输入: +## 2.3 交互模式常见输入示例 -- 部署模式:`vLLM_A770`、`vLLM_B60` 或 `ov` -- `HOST_IP` -- `DOC_PATH` -- `TMPFILE_PATH` -- `MODEL_PATH` -- `LLM_MODEL` -- 可选的 vLLM 运行参数 +以下示例用于说明交互过程中常见的输入内容,实际选项名称以终端提示为准。 -### 模型检查与自动下载 +### 示例 A:OpenVINO + baremetal(单机快速体验) -脚本会自动检查以下模型路径: +```text +部署后端: OpenVINO +部署方式: baremetal +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +确认部署: y +``` -#### 公共模型 +### 示例 B:vLLM_B60 + container(需要 Milvus) ```text -${MODEL_PATH}/BAAI/bge-small-en-v1.5 -${MODEL_PATH}/BAAI/bge-reranker-large +部署后端: vLLM_B60 +部署方式: container +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +确认部署: y ``` -#### vLLM 模式 +### 示例 C:vLLM_A770 + container(A770 推荐) ```text -${MODEL_PATH}/${LLM_MODEL} +部署后端: vLLM_A770 +部署方式: container +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +确认部署: y ``` -#### OpenVINO 模式 +### 示例 D:OVMS + container ```text -${MODEL_PATH}/${LLM_MODEL}/INT4_compressed_weights +部署后端: OVMS +部署方式: container +HOST_IP: 192.168.1.20 +MODEL_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace/models +DOC_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace 
+TMPFILE_PATH: /home/scale/edgeai/applications.edge.ai.rag/EdgeCraftRAG/workspace +LLM_MODEL: Qwen/Qwen3-8B +确认部署: y ``` -如果缺少必需模型,脚本会自动下载并输出提示信息。 +提示: -### UI 访问输出 +- 如果是远程服务器,请将 `HOST_IP` 设置为客户端可访问的地址。 +- 如需持久化向量检索数据,请使用 container 部署方式。 +- 若设备为 Intel Arc A770,优先选择 vLLM_A770 对应配置。 -启动完成后,脚本会输出: +清理部署: -```text -Service launched successfully. -UI access URL: http://${HOST_IP}:8082 -If you are accessing from another machine, replace ${HOST_IP} with your server's reachable IP or hostname. +```bash +./tools/quick_start.sh cleanup ``` -### 清理 +# 3.启动脚本 -停止并移除已部署容器: +## 3.1 bootstrap.sh(非交互编排) + +通过环境变量定义部署参数后执行: ```bash -./tools/quick_start.sh cleanup +export INFERENCE_BACKEND=openvino +export DEPLOYMENT_METHOD=baremetal +./tools/bootstrap.sh +``` + +使用默认值(openvino + baremetal): + +```bash +./tools/bootstrap.sh +``` + +配置复用: + +- `quick_start.sh` 在真正部署前会写入 `workspace/bootstrap.env`。 +- `bootstrap.sh` 也会保存配置,便于下次直接复用。 +- 对于 OVMS,上述文件会包含 `OVMS_SOURCE_MODEL`、`OVMS_MODEL_NAME`、`OVMS_TARGET_DEVICE`、`OVMS_TOOL_PARSER` 等 `OVMS_*` 运行参数。 + +```bash +source workspace/bootstrap.env +./tools/bootstrap.sh ``` ---- +## 3.3 model_download.sh(模型准备) + +基础用法: + +```bash +./tools/model_download.sh [model_id] [model_path] +``` + +模式说明: + +- `vllm`:准备 embedding/reranker 的 OpenVINO 模型 + vLLM LLM 模型 +- `ov`:准备 embedding/reranker 的 OpenVINO 模型 + OpenVINO INT4 LLM 模型 + +可选参数: + +- `model_id`:仅对本次执行覆盖 `LLM_MODEL` +- `model_path`:仅对本次执行覆盖 `MODEL_PATH` + +示例: + +```bash +./tools/model_download.sh vllm +./tools/model_download.sh ov Qwen/Qwen3-8B /data/models +``` + +环境行为说明: + +- 若当前已激活虚拟环境,会优先复用 +- 若未激活虚拟环境,脚本会自动创建并激活 `ecrag_venv`(与 `quick_start.sh` 一致) +- 若缺失 `python3-venv` 或 `pip`,脚本会在支持的包管理器上自动安装所需前置依赖 + +## 3.2 直接启动脚本 + +按推理后端与部署方式可直接调用以下脚本: + +- OpenVINO 裸金属:`./tools/run_ov_baremetal.sh` +- OpenVINO 容器:`./tools/run_ov_container.sh` +- vLLM 裸金属:`./tools/run_vllm_baremetal.sh` +- vLLM 容器:`./tools/run_vllm_container.sh` +- OVMS 裸金属:`./tools/run_ovms_baremetal.sh` +- OVMS 容器:`./tools/run_ovms_container.sh` + +适用于你已明确参数、希望跳过一键引导流程的场景。 -## build_images.sh +# 4.容器镜像编译脚本 -构建全部镜像: +编译全部镜像: ```bash ./tools/build_images.sh ``` -只构建指定镜像: +按组件编译: ```bash ./tools/build_images.sh mega @@ -178,4 +325,4 @@ If you are accessing from another machine, replace ${HOST_IP} with your server's ./tools/build_images.sh all ``` -完整部署说明请参考 [../docs/Advanced_Setup_zh.md](../docs/Advanced_Setup_zh.md). +完整部署说明请参考 [../docs/Advanced_Setup_zh.md](../docs/Advanced_Setup_zh.md)。 diff --git a/EdgeCraftRAG/tools/bootstrap.sh b/EdgeCraftRAG/tools/bootstrap.sh new file mode 100755 index 0000000000..da8ff2df83 --- /dev/null +++ b/EdgeCraftRAG/tools/bootstrap.sh @@ -0,0 +1,961 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# EdgeCraftRAG Bootstrap - Non-interactive Deployment Orchestrator +# This script validates system requirements and delegates to appropriate deployment scripts. +# For interactive mode with prompts, use quick_start.sh instead. + +set -euo pipefail + +# Script version +BOOTSTRAP_VERSION="1.0" + +# Script directory and workspace detection +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." 
&& pwd) + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +# Validation flags +SKIP_VALIDATION=${SKIP_VALIDATION:-0} + +# Default values +DEFAULT_INFERENCE_BACKEND="openvino" +DEFAULT_DEPLOYMENT_METHOD="baremetal" + +#============================================================================== +# Banner and Information Display +#============================================================================== + +print_banner() { + echo "" + echo "╔═══════════════════════════════════════════════════════════╗" + echo "║ EdgeCraftRAG Bootstrap v${BOOTSTRAP_VERSION} ║" + echo "║ Deployment Preparation Tool ║" + echo "╚═══════════════════════════════════════════════════════════╝" + echo "" +} + +print_help() { + cat << EOF +Usage: ${0##*/} [options] [command] + +EdgeCraftRAG Bootstrap - Non-interactive deployment orchestrator +This script validates system requirements and delegates to deployment scripts. +For interactive mode, use quick_start.sh instead. + +Commands: + (none) Run deployment (default) + cleanup Stop all services and cleanup + +Options: + --check-only Validate system requirements only, don't deploy + --help Show usage information + --version Show script version + +Environment Variables (all have defaults): + INFERENCE_BACKEND Inference type: openvino|vllm_a770|vllm_b60|ovms (default: openvino) + DEPLOYMENT_METHOD Deployment type: baremetal|container (default: baremetal) + MODEL_PATH Model storage path (default: workspace/models) + DOC_PATH Document storage path (default: workspace) + HOST_IP Server IP address (auto-detected if not set) + LLM_MODEL LLM model name (default: Qwen/Qwen3-8B) + SKIP_VALIDATION Skip system checks: 0|1 (default: 0) + +Examples: + # Default: OpenVINO baremetal deployment + ./tools/bootstrap.sh + + # vLLM A770 baremetal deployment + INFERENCE_BACKEND=vllm_a770 ./tools/bootstrap.sh + + # OpenVINO container deployment + INFERENCE_BACKEND=openvino DEPLOYMENT_METHOD=container ./tools/bootstrap.sh + + # System check only + ./tools/bootstrap.sh --check-only + + # Stop services + ./tools/bootstrap.sh cleanup + + # Reuse previous configuration + source workspace/bootstrap.env + ./tools/bootstrap.sh + +Configuration Persistence: + After successful deployment, configuration is saved to workspace/bootstrap.env + Source this file to reuse the same settings in future deployments. + +For interactive mode with prompts, use: + ./tools/quick_start.sh -i + +For more information, see: EdgeCraftRAG/tools/README.md + +After successful deployment, bootstrap also installs the `ecrag` CLI. +EOF +} + +print_info() { + echo -e "${CYAN}[INFO]${NC} $*" +} + +print_success() { + echo -e "${GREEN}[✓]${NC} $*" +} + +print_warning() { + echo -e "${YELLOW}[!]${NC} $*" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $*" >&2 +} + +print_arrow() { + echo -e "${BLUE}[→]${NC} $*" +} + +#============================================================================== +# System Validation Functions +#============================================================================== + +check_python_version() { + if ! 
command -v python3 &>/dev/null; then + print_error "Python: python3 not found" + echo " → Solution: Install Python 3.10 or later" + echo " → Details: Run: sudo apt update && sudo apt install python3" + return 1 + fi + + local python_version + python_version=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") + local major minor + major=$(echo "$python_version" | cut -d. -f1) + minor=$(echo "$python_version" | cut -d. -f2) + + if [ "$major" -lt 3 ] || { [ "$major" -eq 3 ] && [ "$minor" -lt 10 ]; }; then + print_error "Python: Version $python_version detected, but 3.10+ required" + echo " → Solution: Upgrade Python to 3.10 or later" + echo " → Details: Run: sudo apt update && sudo apt install python3.10" + return 1 + fi + + print_success "Python ${python_version} detected" + return 0 +} + +check_docker() { + local deployment_method="${1:-}" + + # Skip Docker check for baremetal deployment + if [[ "$deployment_method" == "baremetal" ]]; then + return 0 + fi + + if ! command -v docker &>/dev/null; then + print_error "Docker: docker command not found" + echo " → Solution: Install Docker" + echo " → Details: See https://docs.docker.com/engine/install/" + return 1 + fi + + if ! docker info &>/dev/null; then + print_error "Docker: Daemon not running" + echo " → Solution: Start Docker service" + echo " → Details: Run: sudo systemctl start docker" + return 1 + fi + + local docker_version + docker_version=$(docker --version | grep -oP '\d+\.\d+\.\d+' | head -1) + print_success "Docker ${docker_version} running" + return 0 +} + +check_disk_space() { + local available_gb + available_gb=$(df -BG "${WORKPATH}" | awk 'NR==2 {print $4}' | sed 's/G//') + + if [ "$available_gb" -lt 50 ]; then + print_warning "Disk space: Only ${available_gb}GB available, 50GB+ recommended" + echo " → Solution: Free up disk space or use custom MODEL_PATH" + echo " → Details: Models require ~40GB storage" + return 0 # Warning, not error + fi + + print_success "Disk space: ${available_gb}GB available" + return 0 +} + +check_groups() { + local video_gid render_gid + + if getent group video &>/dev/null; then + video_gid=$(getent group video | cut -d: -f3) + else + print_warning "Video group not found (optional for some deployments)" + video_gid="" + fi + + if getent group render &>/dev/null; then + render_gid=$(getent group render | cut -d: -f3) + else + print_warning "Render group not found (optional for some deployments)" + render_gid="" + fi + + if [[ -n "$video_gid" ]] && [[ -n "$render_gid" ]]; then + print_success "Video group (gid:${video_gid}) and render group (gid:${render_gid}) found" + export VIDEOGROUPID="$video_gid" + export RENDERGROUPID="$render_gid" + fi + + return 0 +} + +validate_system_requirements() { + local deployment_method="${1:-}" + + print_info "Validating system requirements..." + + local all_checks_passed=0 + + if ! check_python_version; then + all_checks_passed=1 + fi + + if ! 
check_docker "$deployment_method"; then + all_checks_passed=1 + fi + + check_disk_space # Always continue, just warn + check_groups # Always continue, just warn + + if [ $all_checks_passed -ne 0 ]; then + print_error "System requirements not met" + return 1 + fi + + print_success "All system requirements met" + echo "" + return 0 +} + +#============================================================================== +# Environment Setup Functions +#============================================================================== + +detect_host_ip() { + if [[ -z "${HOST_IP:-}" ]]; then + HOST_IP=$(hostname -I | awk '{print $1}') + if [[ -z "$HOST_IP" ]]; then + HOST_IP="127.0.0.1" + print_warning "Could not detect host IP, using 127.0.0.1" + fi + fi + export HOST_IP +} + +set_default_paths() { + export WORKPATH + export MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} + export DOC_PATH=${DOC_PATH:-"${WORKPATH}/workspace"} + export TMPFILE_PATH=${TMPFILE_PATH:-"${WORKPATH}/workspace"} + export LLM_MODEL=${LLM_MODEL:-"Qwen/Qwen3-8B"} + export MILVUS_ENABLED=${MILVUS_ENABLED:-"1"} + export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-"0"} +} + +setup_environment() { + print_info "Detecting environment..." + + detect_host_ip + set_default_paths + + print_success "Host IP: ${HOST_IP}" + print_success "EdgeCraftRAG root: ${WORKPATH}" + print_success "Model path: ${MODEL_PATH}" + echo "" + + return 0 +} + +save_bootstrap_config() { + local backend="$1" + local method="$2" + local config_file="${WORKPATH}/workspace/bootstrap.env" + + # Create workspace directory if it doesn't exist + mkdir -p "${WORKPATH}/workspace" + + # Save configuration to file + cat > "${config_file}" << EOF +# EdgeCraftRAG Bootstrap Configuration +# Generated: $(date) +# This file contains the environment variables used for deployment. 
+# Source this file to reuse the same configuration: +# source workspace/bootstrap.env +# ./tools/bootstrap.sh + +# Deployment Configuration +export INFERENCE_BACKEND="${backend}" +export DEPLOYMENT_METHOD="${method}" + +# Paths +export MODEL_PATH="${MODEL_PATH}" +export DOC_PATH="${DOC_PATH}" +export TMPFILE_PATH="${TMPFILE_PATH}" + +# Network +export HOST_IP="${HOST_IP}" + +# Model Configuration +export LLM_MODEL="${LLM_MODEL}" +export OV_CONVERSION_METHOD="${OV_CONVERSION_METHOD:-int4}" +export OVMS_SERVICE_PORT="${OVMS_SERVICE_PORT:-8000}" +export OVMS_ENDPOINT="${OVMS_ENDPOINT:-http://${HOST_IP}:${OVMS_SERVICE_PORT:-8000}}" +export OVMS_REST_PORT="${OVMS_REST_PORT:-${OVMS_SERVICE_PORT:-8000}}" +export OVMS_SOURCE_MODEL="${OVMS_SOURCE_MODEL:-${LLM_MODEL}}" +export OVMS_MODEL_REPOSITORY_PATH="${OVMS_MODEL_REPOSITORY_PATH:-/models}" +export OVMS_MODEL_NAME="${OVMS_MODEL_NAME:-${OVMS_SOURCE_MODEL:-${LLM_MODEL}}}" +export OVMS_TARGET_DEVICE="${OVMS_TARGET_DEVICE:-GPU.0}" +export OVMS_TASK="${OVMS_TASK:-text_generation}" +export OVMS_CACHE_DIR="${OVMS_CACHE_DIR:-/models/.ov_cache}" +export OVMS_ENABLE_PREFIX_CACHING="${OVMS_ENABLE_PREFIX_CACHING:-true}" +export OVMS_TOOL_PARSER="${OVMS_TOOL_PARSER:-qwen3coder}" +export OVMS_ENABLE_TOOL_GUIDED_GENERATION="${OVMS_ENABLE_TOOL_GUIDED_GENERATION:-true}" +export OVMS_MAX_NUM_BATCHED_TOKENS="${OVMS_MAX_NUM_BATCHED_TOKENS:-8192}" + +# Services +export MILVUS_ENABLED="${MILVUS_ENABLED}" +export CHAT_HISTORY_ROUND="${CHAT_HISTORY_ROUND}" + +# Skip validation on reuse (system already validated) +export SKIP_VALIDATION=1 +EOF + + chmod 644 "${config_file}" + print_success "Configuration saved to: workspace/bootstrap.env" +} + +install_ecrag_cli() { + print_info "Installing ecrag CLI..." + + export BOOTSTRAP_ECRAG_COMMAND="" + export BOOTSTRAP_ECRAG_PATH_HINT="" + local cli_root="${WORKPATH}/cli" + + if [[ ! -f "${cli_root}/setup.py" ]]; then + print_error "CLI setup script not found: ${cli_root}/setup.py" + return 1 + fi + + # Prefer editable install for local development workflows. + if ! python3 -m pip install -e "${cli_root}" >/dev/null 2>&1; then + print_warning "Editable install failed, trying PEP668-compatible fallback" + if ! python3 -m pip install --break-system-packages -e "${cli_root}" >/dev/null 2>&1; then + print_warning "Fallback editable install failed, trying non-editable install" + if ! python3 -m pip install "${cli_root}" >/dev/null 2>&1; then + if ! python3 -m pip install --break-system-packages "${cli_root}" >/dev/null 2>&1; then + print_error "Failed to install ecrag CLI" + echo " → Try manually: cd ${cli_root} && python3 -m pip install --break-system-packages -e ." + return 1 + fi + fi + fi + fi + + # Refresh command lookup after installation. + hash -r 2>/dev/null || true + + if command -v ecrag >/dev/null 2>&1; then + export BOOTSTRAP_ECRAG_COMMAND="ecrag" + print_success "CLI installed: $(command -v ecrag)" + return 0 + fi + + if [[ -x "${HOME}/.local/bin/ecrag" ]]; then + export BOOTSTRAP_ECRAG_COMMAND="${HOME}/.local/bin/ecrag" + export BOOTSTRAP_ECRAG_PATH_HINT="export PATH=\"${HOME}/.local/bin:\$PATH\"" + print_warning "CLI installed at ${HOME}/.local/bin/ecrag but not in PATH" + echo " → Use directly: ${HOME}/.local/bin/ecrag --help" + echo " → Add to PATH: ${BOOTSTRAP_ECRAG_PATH_HINT}" + return 0 + fi + + print_error "CLI installation finished but command not found" + echo " → Try manually: cd ${cli_root} && python3 -m pip install --break-system-packages -e ." 
+ return 1 +} + +#============================================================================== +# Inference Backend Selection +#============================================================================== + +# Removed interactive menu - use environment variables or defaults + +normalize_inference_backend() { + local backend="$1" + + case "$backend" in + 1|openvino|ov|OpenVINO) + echo "openvino" + ;; + 2|vllm_a770|vLLM_A770|a770) + echo "vllm_a770" + ;; + 3|vllm_b60|vLLM_B60|b60) + echo "vllm_b60" + ;; + 4|ovms|OVMS) + echo "ovms" + ;; + *) + print_error "Invalid inference backend: $backend" + echo " → Valid options: openvino, vllm_a770, vllm_b60, ovms" + return 2 + ;; + esac +} + +get_inference_backend_from_env() { + # Priority 1: INFERENCE_BACKEND + if [[ -n "${INFERENCE_BACKEND:-}" ]]; then + normalize_inference_backend "$INFERENCE_BACKEND" + return + fi + + # Priority 2: COMPOSE_PROFILES (backward compatibility) + if [[ -n "${COMPOSE_PROFILES:-}" ]]; then + case "$COMPOSE_PROFILES" in + vLLM_A770|vllm_on_a770|vLLM) + echo "vllm_a770" + return + ;; + vLLM_B60|vllm_on_b60) + echo "vllm_b60" + return + ;; + ovms|OVMS) + echo "ovms" + return + ;; + *) + echo "openvino" + return + ;; + esac + fi + + # No env var set + echo "" +} + +select_inference_backend() { + local backend + + # Get from environment or use default + backend=$(get_inference_backend_from_env) + + # If not set, use default + if [[ -z "$backend" ]]; then + backend="$DEFAULT_INFERENCE_BACKEND" + fi + + echo "$backend" +} + +#============================================================================== +# Deployment Method Selection +#============================================================================== + +# Removed interactive menu - use environment variables or defaults + +normalize_deployment_method() { + local method="$1" + + case "$method" in + 1|baremetal|bare_metal|local|Baremetal) + echo "baremetal" + ;; + 2|container|docker|Container) + echo "container" + ;; + *) + print_error "Invalid deployment method: $method" + echo " → Valid options: baremetal, container" + return 2 + ;; + esac +} + +get_deployment_method_from_env() { + if [[ -n "${DEPLOYMENT_METHOD:-}" ]]; then + normalize_deployment_method "$DEPLOYMENT_METHOD" + return + fi + + # No env var set + echo "" +} + +select_deployment_method() { + local method + + # Get from environment or use default + method=$(get_deployment_method_from_env) + + # If not set, use default + if [[ -z "$method" ]]; then + method="$DEFAULT_DEPLOYMENT_METHOD" + fi + + echo "$method" +} + +get_backend_display_name() { + local backend="$1" + + case "$backend" in + openvino) + echo "OpenVINO" + ;; + vllm_a770) + echo "vLLM on Arc A770" + ;; + vllm_b60) + echo "vLLM on Arc B60" + ;; + ovms) + echo "OVMS" + ;; + *) + echo "Unknown" + ;; + esac +} + +get_deployment_display_name() { + local method="$1" + + case "$method" in + container) + echo "Container (Docker)" + ;; + baremetal) + echo "Baremetal" + ;; + *) + echo "Unknown" + ;; + esac +} + +#============================================================================== +# Deployment Delegation Functions +#============================================================================== + +validate_backend_deployment_combo() { + local backend="$1" + local method="$2" + + # All combinations are now supported + # vLLM baremetal = vLLM container + EdgeCraftRAG bare-metal services + return 0 +} + +map_deployment_to_script() { + local backend="$1" + local method="$2" + local script="" + local script_args="" + + # Validate combination 
first + if ! validate_backend_deployment_combo "$backend" "$method"; then + return 2 + fi + + if [[ "$method" == "container" ]]; then + case "$backend" in + openvino) + script="${SCRIPT_DIR}/run_ov_container.sh" + script_args="start" + export COMPOSE_PROFILES="" + ;; + vllm_a770) + script="${SCRIPT_DIR}/run_vllm_container.sh" + script_args="start" + export VLLM_BACKEND="a770" + ;; + vllm_b60) + script="${SCRIPT_DIR}/run_vllm_container.sh" + script_args="start" + export VLLM_BACKEND="b60" + ;; + ovms) + script="${SCRIPT_DIR}/run_ovms_container.sh" + script_args="start" + ;; + esac + elif [[ "$method" == "baremetal" ]]; then + case "$backend" in + openvino) + script="${SCRIPT_DIR}/run_ov_baremetal.sh" + script_args="start all" + ;; + vllm_a770) + script="${SCRIPT_DIR}/run_vllm_baremetal.sh" + script_args="start all" + export VLLM_BACKEND="a770" + ;; + vllm_b60) + script="${SCRIPT_DIR}/run_vllm_baremetal.sh" + script_args="start all" + export VLLM_BACKEND="b60" + ;; + ovms) + script="${SCRIPT_DIR}/run_ovms_baremetal.sh" + script_args="start all" + ;; + esac + else + print_error "Unknown deployment method: $method" + return 2 + fi + + if [ ! -f "$script" ]; then + print_error "Deployment script not found: $script" + return 4 + fi + + echo "$script $script_args" +} + +print_deployment_summary() { + local backend="$1" + local method="$2" + local backend_name + local method_name + + backend_name=$(get_backend_display_name "$backend") + method_name=$(get_deployment_display_name "$method") + + echo "" + print_info "Inference Backend: ${backend_name}" + print_info "Deployment Method: ${method_name}" + print_info "Host IP: ${HOST_IP}" + print_info "Model Path: ${MODEL_PATH}" + print_info "LLM Model: ${LLM_MODEL}" + echo "" +} + +delegate_to_deployment_script() { + local backend="$1" + local method="$2" + local script_cmd + + script_cmd=$(map_deployment_to_script "$backend" "$method") + if [ $? -ne 0 ]; then + return 4 + fi + + print_deployment_summary "$backend" "$method" + + print_arrow "Delegating to deployment script..." + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "" + + # Execute the deployment script + if ! bash -c "$script_cmd"; then + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + print_error "Deployment script failed" + return 4 + fi + + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + + return 0 +} + +print_completion_info() { + local backend="$1" + local method="$2" + local cli_command="${BOOTSTRAP_ECRAG_COMMAND:-ecrag}" + + echo "" + echo "╔═══════════════════════════════════════════════════════════╗" + echo "║ Deployment Complete! 
║" + echo "╚═══════════════════════════════════════════════════════════╝" + echo "" + + echo "UI Access: http://${HOST_IP}:8082" + if [[ "$method" == "baremetal" ]]; then + echo "API Endpoint: http://${HOST_IP}:16010" + fi + echo "" + echo "Next steps:" + echo " • Upload documents via the UI" + echo " • Try the chat interface" + echo " • Run CLI: ${cli_command} --help" + if [[ -n "${BOOTSTRAP_ECRAG_PATH_HINT:-}" ]]; then + echo " • Add CLI to PATH: ${BOOTSTRAP_ECRAG_PATH_HINT}" + fi + + # Show appropriate stop/status commands based on backend and method + case "$backend" in + openvino) + case "$method" in + baremetal) + echo " • Stop services: ./tools/run_ov_baremetal.sh stop" + echo " • View status: ./tools/run_ov_baremetal.sh status" + ;; + container) + echo " • Stop services: ./tools/run_ov_container.sh stop" + echo " • View status: ./tools/run_ov_container.sh status" + echo " • View logs: ./tools/run_ov_container.sh logs [server|mega|ui]" + ;; + esac + ;; + vllm_a770|vllm_b60) + case "$method" in + baremetal) + echo " • Stop services: ./tools/run_vllm_baremetal.sh stop" + echo " • View status: ./tools/run_vllm_baremetal.sh status" + ;; + container) + echo " • Stop services: ./tools/run_vllm_container.sh stop" + echo " • View status: ./tools/run_vllm_container.sh status" + echo " • View logs: ./tools/run_vllm_container.sh logs [vllm|server|mega|ui]" + ;; + esac + ;; + ovms) + case "$method" in + baremetal) + echo " • Stop services: ./tools/run_ovms_baremetal.sh stop" + echo " • View status: ./tools/run_ovms_baremetal.sh status" + ;; + container) + echo " • Stop services: ./tools/run_ovms_container.sh stop" + echo " • View status: ./tools/run_ovms_container.sh status" + echo " • View logs: ./tools/run_ovms_container.sh logs [ovms|server|mega|ui]" + ;; + esac + ;; + esac + + echo "" + echo "To reuse this configuration:" + echo " source workspace/bootstrap.env" + echo " ./tools/bootstrap.sh" + echo "" + echo "For troubleshooting: see EdgeCraftRAG/README.md" + echo "" +} + +#============================================================================== +# Cleanup Function +#============================================================================== + +handle_cleanup() { + print_info "Stopping EdgeCraftRAG services..." 
+ + # Try all deployment script cleanups + if [ -f "${SCRIPT_DIR}/run_ov_container.sh" ]; then + bash "${SCRIPT_DIR}/run_ov_container.sh" stop 2>/dev/null || true + fi + + if [ -f "${SCRIPT_DIR}/run_ov_baremetal.sh" ]; then + bash "${SCRIPT_DIR}/run_ov_baremetal.sh" stop 2>/dev/null || true + fi + + if [ -f "${SCRIPT_DIR}/run_vllm_container.sh" ]; then + bash "${SCRIPT_DIR}/run_vllm_container.sh" stop 2>/dev/null || true + fi + + if [ -f "${SCRIPT_DIR}/run_vllm_baremetal.sh" ]; then + bash "${SCRIPT_DIR}/run_vllm_baremetal.sh" stop 2>/dev/null || true + fi + + if [ -f "${SCRIPT_DIR}/run_ovms_container.sh" ]; then + bash "${SCRIPT_DIR}/run_ovms_container.sh" stop 2>/dev/null || true + fi + + if [ -f "${SCRIPT_DIR}/run_ovms_baremetal.sh" ]; then + bash "${SCRIPT_DIR}/run_ovms_baremetal.sh" stop 2>/dev/null || true + fi + + print_success "Cleanup complete" + return 0 +} + +#============================================================================== +# Main Function +#============================================================================== + +main() { + local check_only=0 + local command="" + + # Parse command-line arguments + while [[ $# -gt 0 ]]; do + case "$1" in + --help|-h) + print_help + exit 0 + ;; + --version|-v) + echo "EdgeCraftRAG Bootstrap v${BOOTSTRAP_VERSION}" + exit 0 + ;; + --check-only) + check_only=1 + shift + ;; + cleanup) + command="cleanup" + shift + ;; + *) + print_error "Unknown option: $1" + echo "" + print_help + exit 2 + ;; + esac + done + + # Handle cleanup command + if [[ "$command" == "cleanup" ]]; then + handle_cleanup + exit $? + fi + + # Load previous configuration if it exists. + # Explicitly provided environment variables for the current run must win + # over values persisted from an earlier deployment. + local config_file="${WORKPATH}/workspace/bootstrap.env" + if [[ -f "$config_file" ]]; then + local saved_inference_backend="${INFERENCE_BACKEND-__BOOTSTRAP_UNSET__}" + local saved_deployment_method="${DEPLOYMENT_METHOD-__BOOTSTRAP_UNSET__}" + local saved_model_path="${MODEL_PATH-__BOOTSTRAP_UNSET__}" + local saved_doc_path="${DOC_PATH-__BOOTSTRAP_UNSET__}" + local saved_tmpfile_path="${TMPFILE_PATH-__BOOTSTRAP_UNSET__}" + local saved_host_ip="${HOST_IP-__BOOTSTRAP_UNSET__}" + local saved_llm_model="${LLM_MODEL-__BOOTSTRAP_UNSET__}" + local saved_ov_conversion_method="${OV_CONVERSION_METHOD-__BOOTSTRAP_UNSET__}" + local saved_milvus_enabled="${MILVUS_ENABLED-__BOOTSTRAP_UNSET__}" + local saved_chat_history_round="${CHAT_HISTORY_ROUND-__BOOTSTRAP_UNSET__}" + local saved_skip_validation="${SKIP_VALIDATION-__BOOTSTRAP_UNSET__}" + + print_info "Loading saved configuration from workspace/bootstrap.env" + # Source the file to load environment variables + # shellcheck disable=SC1090 + source "$config_file" + + if [[ "$saved_inference_backend" != "__BOOTSTRAP_UNSET__" ]]; then + export INFERENCE_BACKEND="$saved_inference_backend" + fi + if [[ "$saved_deployment_method" != "__BOOTSTRAP_UNSET__" ]]; then + export DEPLOYMENT_METHOD="$saved_deployment_method" + fi + if [[ "$saved_model_path" != "__BOOTSTRAP_UNSET__" ]]; then + export MODEL_PATH="$saved_model_path" + fi + if [[ "$saved_doc_path" != "__BOOTSTRAP_UNSET__" ]]; then + export DOC_PATH="$saved_doc_path" + fi + if [[ "$saved_tmpfile_path" != "__BOOTSTRAP_UNSET__" ]]; then + export TMPFILE_PATH="$saved_tmpfile_path" + fi + if [[ "$saved_host_ip" != "__BOOTSTRAP_UNSET__" ]]; then + export HOST_IP="$saved_host_ip" + fi + if [[ "$saved_llm_model" != "__BOOTSTRAP_UNSET__" ]]; then + export 
LLM_MODEL="$saved_llm_model" + fi + if [[ "$saved_ov_conversion_method" != "__BOOTSTRAP_UNSET__" ]]; then + export OV_CONVERSION_METHOD="$saved_ov_conversion_method" + fi + if [[ "$saved_milvus_enabled" != "__BOOTSTRAP_UNSET__" ]]; then + export MILVUS_ENABLED="$saved_milvus_enabled" + fi + if [[ "$saved_chat_history_round" != "__BOOTSTRAP_UNSET__" ]]; then + export CHAT_HISTORY_ROUND="$saved_chat_history_round" + fi + if [[ "$saved_skip_validation" != "__BOOTSTRAP_UNSET__" ]]; then + export SKIP_VALIDATION="$saved_skip_validation" + fi + + print_success "Previous configuration loaded" + echo "" + fi + + # Print banner + print_banner + + # Setup environment first (needed for validation) + setup_environment + + # Get inference backend + local inference_backend + inference_backend=$(select_inference_backend) + if [ $? -ne 0 ]; then + exit 2 + fi + + # Get deployment method + local deployment_method + deployment_method=$(select_deployment_method) + if [ $? -ne 0 ]; then + exit 2 + fi + + # Validate system requirements unless skipped + if [ "$SKIP_VALIDATION" -ne 1 ]; then + if ! validate_system_requirements "$deployment_method"; then + exit 1 + fi + fi + + # If check-only, exit here + if [ $check_only -eq 1 ]; then + print_info "System check complete - ready for deployment" + exit 0 + fi + + # Store normalized values for later use + export BOOTSTRAP_INFERENCE_BACKEND="$inference_backend" + export BOOTSTRAP_DEPLOYMENT_METHOD="$deployment_method" + + # Delegate to deployment script + if ! delegate_to_deployment_script "$inference_backend" "$deployment_method"; then + exit 4 + fi + + # Save configuration for reuse + save_bootstrap_config "$inference_backend" "$deployment_method" + + # Install CLI so users can access ecrag directly after bootstrap + if ! install_ecrag_cli; then + exit 5 + fi + + # Print completion information + print_completion_info "$inference_backend" "$deployment_method" + + exit 0 +} + +# Execute main function +main "$@" diff --git a/EdgeCraftRAG/tools/build_images.sh b/EdgeCraftRAG/tools/build_images.sh index abfb0a42dc..6a2bd0ed2f 100755 --- a/EdgeCraftRAG/tools/build_images.sh +++ b/EdgeCraftRAG/tools/build_images.sh @@ -1,7 +1,4 @@ #!/usr/bin/env bash -# Copyright (C) 2026 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - set -euo pipefail PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" diff --git a/EdgeCraftRAG/tools/model_download.sh b/EdgeCraftRAG/tools/model_download.sh new file mode 100755 index 0000000000..ae167ae016 --- /dev/null +++ b/EdgeCraftRAG/tools/model_download.sh @@ -0,0 +1,705 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# EdgeCraftRAG Model Download Tool +# Supports ModelScope (default) and Hugging Face download sources. + +set -e + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." 
&& pwd) +ENV_NAME="${WORKPATH}/ecrag_venv" + +MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} +LLM_MODEL=${LLM_MODEL:-"Qwen/Qwen3-8B"} +EMBEDDING_MODEL=${EMBEDDING_MODEL:-"BAAI/bge-small-en-v1.5"} +RERANKER_MODEL=${RERANKER_MODEL:-"BAAI/bge-reranker-large"} +MODEL_DOWNLOAD_SOURCE=${MODEL_DOWNLOAD_SOURCE:-"modelscope"} +OV_CONVERSION_METHOD=${OV_CONVERSION_METHOD:-"int4"} +EMBEDDING_RERANKER_OV_WEIGHT_FORMAT=${EMBEDDING_RERANKER_OV_WEIGHT_FORMAT:-""} +SKIP_SOURCE_MODEL_DOWNLOAD=${SKIP_SOURCE_MODEL_DOWNLOAD:-"0"} +SOURCE_MODEL_PATH=${SOURCE_MODEL_PATH:-""} + +resolve_python_cmd() { + if [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python" ]]; then + echo "${VIRTUAL_ENV}/bin/python" + return 0 + fi + + if [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then + echo "${CONDA_PREFIX}/bin/python" + return 0 + fi + + if [[ -n "${PYTHON_BIN:-}" ]] && command -v "${PYTHON_BIN}" >/dev/null 2>&1; then + echo "${PYTHON_BIN}" + return 0 + fi + + # Keep Python selection consistent with quick_start.sh. + if command -v python3.11 >/dev/null 2>&1; then + echo "python3.11" + return 0 + fi + + if command -v python3.10 >/dev/null 2>&1; then + echo "python3.10" + return 0 + fi + + if command -v python3 >/dev/null 2>&1; then + echo "python3" + return 0 + fi + + if command -v python >/dev/null 2>&1; then + echo "python" + return 0 + fi + + echo "[Model Check] ERROR: Python interpreter not found (need python3 or python)" + exit 1 +} + +PYTHON_CMD=$(resolve_python_cmd) + +setup_python_venv() { + local base_python_cmd + base_python_cmd=$(resolve_python_cmd) + + if [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python" ]]; then + PYTHON_CMD="${VIRTUAL_ENV}/bin/python" + echo "[Model Check] Using active virtual environment: ${VIRTUAL_ENV}" + return 0 + fi + + if [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then + PYTHON_CMD="${CONDA_PREFIX}/bin/python" + echo "[Model Check] Using active conda environment: ${CONDA_PREFIX}" + return 0 + fi + + if ! "${base_python_cmd}" -c "import ensurepip" >/dev/null 2>&1; then + echo "[Model Check] python3-venv (ensurepip) not found, installing..." + local py_ver + py_ver=$("${base_python_cmd}" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") + + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update + if ! sudo apt-get install -y "python${py_ver}-venv"; then + sudo apt-get install -y python3-venv + fi + elif command -v dnf >/dev/null 2>&1; then + sudo dnf install -y python3-virtualenv + elif command -v yum >/dev/null 2>&1; then + sudo yum install -y python3-virtualenv + else + echo "[Model Check] ERROR: Unsupported package manager. Please install python3-venv manually." + exit 1 + fi + fi + + if [[ ! -f "${ENV_NAME}/bin/activate" && ! -f "${ENV_NAME}/Scripts/activate" ]]; then + echo "[Model Check] Creating virtual environment at ${ENV_NAME}..." 
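+        # Remove any partial or stale venv directory before creating a fresh one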
+ rm -rf "${ENV_NAME}" + "${base_python_cmd}" -m venv "${ENV_NAME}" + fi + + if [[ -f "${ENV_NAME}/bin/activate" ]]; then + # shellcheck disable=SC1090 + source "${ENV_NAME}/bin/activate" + elif [[ -f "${ENV_NAME}/Scripts/activate" ]]; then + # shellcheck disable=SC1090 + source "${ENV_NAME}/Scripts/activate" + else + echo "[Model Check] ERROR: Failed to activate virtual environment at ${ENV_NAME}" + exit 1 + fi + + PYTHON_CMD=$(resolve_python_cmd) + echo "[Model Check] Python virtual environment activated: ${VIRTUAL_ENV}" +} + +ensure_python_venv_support() { + if "${PYTHON_CMD}" -c "import ensurepip" >/dev/null 2>&1; then + return 0 + fi + + echo "[Model Check] python3-venv (ensurepip) not found for ${PYTHON_CMD}, installing..." + local py_ver + py_ver=$("${PYTHON_CMD}" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") + + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update + if ! sudo apt-get install -y "python${py_ver}-venv"; then + sudo apt-get install -y python3-venv + fi + elif command -v dnf >/dev/null 2>&1; then + sudo dnf install -y python3-virtualenv + elif command -v yum >/dev/null 2>&1; then + sudo yum install -y python3-virtualenv + else + echo "[Model Check] ERROR: Unsupported package manager. Please install python3-venv manually." + exit 1 + fi + + if ! "${PYTHON_CMD}" -c "import ensurepip" >/dev/null 2>&1; then + echo "[Model Check] ERROR: ensurepip still unavailable after python3-venv installation" + exit 1 + fi +} + +ensure_pip_available() { + ensure_python_venv_support + + if "${PYTHON_CMD}" -m pip --version >/dev/null 2>&1; then + return 0 + fi + + echo "[Model Check] pip not found for ${PYTHON_CMD}, attempting bootstrap..." + + # First try stdlib ensurepip (works in many environments and virtualenvs). + "${PYTHON_CMD}" -m ensurepip --upgrade >/dev/null 2>&1 || true + + if "${PYTHON_CMD}" -m pip --version >/dev/null 2>&1; then + return 0 + fi + + # Ubuntu fallback for system Python where ensurepip may be unavailable. + if command -v apt-get >/dev/null 2>&1; then + echo "[Model Check] Installing python3-pip via apt..." + sudo apt-get update + if ! sudo apt-get install -y python3-pip; then + echo "[Model Check] ERROR: Failed to install python3-pip" + exit 1 + fi + fi + + if ! "${PYTHON_CMD}" -m pip --version >/dev/null 2>&1; then + echo "[Model Check] ERROR: pip is still unavailable for ${PYTHON_CMD}" + echo "[Model Check] Please install pip manually and rerun" + exit 1 + fi +} + +ensure_openvino_tooling() { + if ! "${PYTHON_CMD}" -c "import optimum.commands.optimum_cli" >/dev/null 2>&1; then + echo "[Model Check] 'optimum-cli' not found, installing optimum-intel[openvino]..." + ensure_pip_available + "${PYTHON_CMD}" -m pip install --upgrade-strategy eager "optimum-intel[openvino]" + fi +} + +run_optimum_cli() { + local optimum_cli_bin + + ensure_openvino_tooling + optimum_cli_bin="$(dirname "${PYTHON_CMD}")/optimum-cli" + + if [[ -x "${optimum_cli_bin}" ]]; then + "${optimum_cli_bin}" "$@" + return 0 + fi + + "${PYTHON_CMD}" -m optimum.commands.optimum_cli "$@" +} + +ensure_modelscope_tooling() { + if ! "${PYTHON_CMD}" -c "import modelscope" >/dev/null 2>&1; then + echo "[Model Check] 'modelscope' not found, installing modelscope..." + ensure_pip_available + "${PYTHON_CMD}" -m pip install modelscope + fi +} + +ensure_huggingface_tooling() { + if ! "${PYTHON_CMD}" -c "import huggingface_hub" >/dev/null 2>&1; then + echo "[Model Check] 'huggingface_hub' not found, installing huggingface_hub..." 
+ ensure_pip_available + "${PYTHON_CMD}" -m pip install huggingface_hub + fi +} + +normalize_ov_conversion_method() { + local method="${1:-int4}" + + case "${method,,}" in + int4) + echo "int4" + ;; + int8) + echo "int8" + ;; + fp16) + echo "fp16" + ;; + *) + echo "[Model Check] ERROR: Unsupported OV_CONVERSION_METHOD='${method}'" >&2 + echo "[Model Check] Supported values: int4 | int8 | fp16" >&2 + exit 1 + ;; + esac +} + +normalize_embedding_reranker_weight_format() { + local format="${1:-}" + + case "${format,,}" in + ""|none) + echo "none" + ;; + auto) + echo "auto" + ;; + int4) + echo "int4" + ;; + int8) + echo "int8" + ;; + fp16) + echo "fp16" + ;; + *) + echo "[Model Check] ERROR: Unsupported EMBEDDING_RERANKER_OV_WEIGHT_FORMAT='${format}'" >&2 + echo "[Model Check] Supported values: | none | auto | int4 | int8 | fp16" >&2 + exit 1 + ;; + esac +} + +get_embedding_or_reranker_target_dir() { + local model_id="$1" + local format="$2" + + if [[ "${format}" == "auto" || "${format}" == "none" ]]; then + echo "${MODEL_PATH}/${model_id}" + else + echo "${MODEL_PATH}/${model_id}-${format}" + fi +} + +get_ov_llm_repo_id() { + local model_id="$1" + local method="$2" + + if [[ "${model_id}" == OpenVINO/*-ov ]]; then + echo "${model_id}" + return 0 + fi + + echo "OpenVINO/${model_id##*/}-${method}-ov" +} + +get_ov_llm_target_dir() { + local method + method=$(normalize_ov_conversion_method "${OV_CONVERSION_METHOD}") + + echo "${MODEL_PATH}/$(get_ov_llm_repo_id "${LLM_MODEL}" "${method}")" +} + +openvino_model_exists() { + local target_dir="$1" + [[ -f "${target_dir}/openvino_model.xml" ]] +} + +source_model_dir_ready() { + local target_dir="$1" + + [[ -d "${target_dir}" ]] || return 1 + + if [[ -f "${target_dir}/openvino_model.xml" ]]; then + return 0 + fi + + if [[ ! -f "${target_dir}/config.json" ]]; then + return 1 + fi + + if compgen -G "${target_dir}/*.safetensors" >/dev/null 2>&1; then + return 0 + fi + + if compgen -G "${target_dir}/*.bin" >/dev/null 2>&1; then + return 0 + fi + + if [[ -f "${target_dir}/model.safetensors.index.json" || -f "${target_dir}/pytorch_model.bin.index.json" ]]; then + return 0 + fi + + return 1 +} + +resolve_source_model_dir() { + local model_id="$1" + local default_source_dir="$2" + local custom_source_dir="${3:-}" + + if [[ -n "${custom_source_dir}" ]] && source_model_dir_ready "${custom_source_dir}"; then + echo "${custom_source_dir}" + return 0 + fi + + if source_model_dir_ready "${default_source_dir}"; then + echo "${default_source_dir}" + return 0 + fi + + if source_model_dir_ready "${MODEL_PATH}/${model_id}" && [[ ! 
-f "${MODEL_PATH}/${model_id}/openvino_model.xml" ]]; then + echo "${MODEL_PATH}/${model_id}" + return 0 + fi + + if source_model_dir_ready "${model_id}"; then + echo "${model_id}" + return 0 + fi + + return 1 +} + +prepare_source_model() { + local model_id="$1" + local default_source_dir="$2" + local custom_source_dir="${3:-}" + local resolved_source_dir + + if resolved_source_dir=$(resolve_source_model_dir "${model_id}" "${default_source_dir}" "${custom_source_dir}"); then + echo "[Model Check] Source model already available, skipping download for ${model_id}: ${resolved_source_dir}" >&2 + echo "${resolved_source_dir}" + return 0 + fi + + if [[ "${SKIP_SOURCE_MODEL_DOWNLOAD}" == "1" ]]; then + echo "[Model Check] ERROR: Source model for '${model_id}' not found locally and SKIP_SOURCE_MODEL_DOWNLOAD=1" >&2 + echo "[Model Check] Expected one of:" >&2 + echo "[Model Check] - ${custom_source_dir:-}" >&2 + echo "[Model Check] - ${default_source_dir}" >&2 + echo "[Model Check] - ${MODEL_PATH}/${model_id}" >&2 + exit 1 + fi + + echo "[Model Check] Downloading source model '${model_id}' via ${MODEL_DOWNLOAD_SOURCE}..." >&2 + download_model "${model_id}" "${default_source_dir}" + + if ! source_model_dir_ready "${default_source_dir}"; then + echo "[Model Check] ERROR: Download completed but source model directory is incomplete: ${default_source_dir}" >&2 + exit 1 + fi + + echo "${default_source_dir}" +} + +export_openvino_llm_model() { + local llm_src_dir="$1" + local target_dir="$2" + local method + + method=$(normalize_ov_conversion_method "${OV_CONVERSION_METHOD}") + + case "${method}" in + int4) + run_optimum_cli export openvino --model "${llm_src_dir}" "${target_dir}" --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 + ;; + int8) + run_optimum_cli export openvino --model "${llm_src_dir}" "${target_dir}" --task text-generation-with-past --weight-format int8 + ;; + fp16) + run_optimum_cli export openvino --model "${llm_src_dir}" "${target_dir}" --task text-generation-with-past --weight-format fp16 + ;; + esac +} + +download_model_with_modelscope() { + local model_id="$1" + local target_dir="$2" + + ensure_modelscope_tooling + mkdir -p "${target_dir}" + + "${PYTHON_CMD}" - "${model_id}" "${target_dir}" <<'PY' >&2 +import sys +from modelscope import snapshot_download + +model_id = sys.argv[1] +target_dir = sys.argv[2] + +snapshot_download( + model_id=model_id, + local_dir=target_dir, +) + +print(f"[Model Check] ModelScope download complete: {model_id} -> {target_dir}", file=sys.stderr) +PY +} + +download_model_with_huggingface() { + local model_id="$1" + local target_dir="$2" + + ensure_huggingface_tooling + mkdir -p "${target_dir}" + + "${PYTHON_CMD}" - "${model_id}" "${target_dir}" <<'PY' >&2 +import sys +from huggingface_hub import snapshot_download + +model_id = sys.argv[1] +target_dir = sys.argv[2] + +snapshot_download( + repo_id=model_id, + local_dir=target_dir, +) + +print(f"[Model Check] Hugging Face download complete: {model_id} -> {target_dir}", file=sys.stderr) +PY +} + +download_model() { + local model_id="$1" + local target_dir="$2" + local source + source=$(echo "${MODEL_DOWNLOAD_SOURCE}" | tr '[:upper:]' '[:lower:]') + + case "${source}" in + modelscope) + download_model_with_modelscope "${model_id}" "${target_dir}" + ;; + huggingface) + download_model_with_huggingface "${model_id}" "${target_dir}" + ;; + *) + echo "[Model Check] ERROR: Unsupported MODEL_DOWNLOAD_SOURCE='${MODEL_DOWNLOAD_SOURCE}'" + echo "[Model Check] Supported values: modelscope | 
huggingface" + exit 1 + ;; + esac +} + +ensure_embedding_and_reranker_models() { + ensure_embedding_model + ensure_reranker_model +} + +ensure_embedding_model() { + local embedding_reranker_format + local embedding_dir + local embedding_src_dir="${MODEL_PATH}/.source_models/${EMBEDDING_MODEL}" + local resolved_embedding_src_dir + + embedding_reranker_format=$(normalize_embedding_reranker_weight_format "${EMBEDDING_RERANKER_OV_WEIGHT_FORMAT}") + embedding_dir=$(get_embedding_or_reranker_target_dir "${EMBEDDING_MODEL}" "${embedding_reranker_format}") + + if [ ! -f "${embedding_dir}/openvino_model.xml" ]; then + echo "[Model Check] Embedding model missing: ${embedding_dir}" + resolved_embedding_src_dir=$(prepare_source_model "${EMBEDDING_MODEL}" "${embedding_src_dir}") + ensure_openvino_tooling + mkdir -p "${embedding_dir}" + if [[ "${embedding_reranker_format}" == "auto" || "${embedding_reranker_format}" == "none" ]]; then + run_optimum_cli export openvino -m "${resolved_embedding_src_dir}" "${embedding_dir}" --task sentence-similarity + else + run_optimum_cli export openvino -m "${resolved_embedding_src_dir}" "${embedding_dir}" --weight-format "${embedding_reranker_format}" --task sentence-similarity + fi + else + echo "[Model Check] Embedding model exists: ${embedding_dir}" + fi +} + +ensure_reranker_model() { + local embedding_reranker_format + local reranker_dir + local reranker_src_dir="${MODEL_PATH}/.source_models/${RERANKER_MODEL}" + local resolved_reranker_src_dir + + embedding_reranker_format=$(normalize_embedding_reranker_weight_format "${EMBEDDING_RERANKER_OV_WEIGHT_FORMAT}") + reranker_dir=$(get_embedding_or_reranker_target_dir "${RERANKER_MODEL}" "${embedding_reranker_format}") + + if [ ! -f "${reranker_dir}/openvino_model.xml" ]; then + echo "[Model Check] Reranker model missing: ${reranker_dir}" + resolved_reranker_src_dir=$(prepare_source_model "${RERANKER_MODEL}" "${reranker_src_dir}") + ensure_openvino_tooling + mkdir -p "${reranker_dir}" + if [[ "${embedding_reranker_format}" == "auto" || "${embedding_reranker_format}" == "none" ]]; then + run_optimum_cli export openvino -m "${resolved_reranker_src_dir}" "${reranker_dir}" --task text-classification + else + run_optimum_cli export openvino -m "${resolved_reranker_src_dir}" "${reranker_dir}" --weight-format "${embedding_reranker_format}" --task text-classification + fi + else + echo "[Model Check] Reranker model exists: ${reranker_dir}" + fi +} + +ensure_llm_model_for_vllm() { + local llm_dir="${MODEL_PATH}/${LLM_MODEL}" + local llm_src_dir="${MODEL_PATH}/.source_models/${LLM_MODEL}" + local resolved_llm_src_dir + + if [ ! -f "${llm_dir}/config.json" ]; then + echo "[Model Check] vLLM LLM model missing: ${llm_dir}" + resolved_llm_src_dir=$(prepare_source_model "${LLM_MODEL}" "${llm_src_dir}" "${SOURCE_MODEL_PATH}") + mkdir -p "${llm_dir}" + if [[ "${resolved_llm_src_dir}" != "${llm_dir}" ]]; then + cp -a "${resolved_llm_src_dir}/." 
"${llm_dir}/" + fi + else + echo "[Model Check] vLLM LLM model exists: ${llm_dir}" + fi +} + +ensure_llm_model_for_ov() { + local ov_llm_dir + local llm_src_dir="${MODEL_PATH}/.source_models/${LLM_MODEL}" + local resolved_llm_src_dir + ov_llm_dir=$(get_ov_llm_target_dir) + + if openvino_model_exists "${ov_llm_dir}"; then + echo "[Model Check] OpenVINO LLM model exists: ${ov_llm_dir}" + return 0 + fi + + echo "[Model Check] OpenVINO LLM model missing: ${ov_llm_dir}" + resolved_llm_src_dir=$(prepare_source_model "${LLM_MODEL}" "${llm_src_dir}" "${SOURCE_MODEL_PATH}") + echo "[Model Check] Converting LLM model '${LLM_MODEL}' to ${OV_CONVERSION_METHOD^^} OpenVINO..." + ensure_openvino_tooling + mkdir -p "${ov_llm_dir}" + export_openvino_llm_model "${resolved_llm_src_dir}" "${ov_llm_dir}" +} + +ensure_required_models_for_vllm() { + echo "" + echo "Checking/downloading models for vLLM deployment..." + ensure_embedding_and_reranker_models + ensure_llm_model_for_vllm + echo "All vLLM models ready." + echo "" +} + +ensure_required_models_for_ov() { + echo "" + echo "Checking/downloading models for OpenVINO deployment..." + ensure_embedding_and_reranker_models + ensure_llm_model_for_ov + echo "All OpenVINO models ready." + echo "" +} + +ensure_required_models_for_embedding_reranker_only() { + echo "" + echo "Checking/downloading embedding and reranker models only (no LLM)..." + ensure_embedding_and_reranker_models + echo "Embedding and reranker models ready." + echo "" +} + +ensure_required_models_for_embedding_only() { + echo "" + echo "Checking/downloading embedding model only (no reranker/LLM)..." + ensure_embedding_model + echo "Embedding model ready." + echo "" +} + +ensure_required_models_for_reranker_only() { + echo "" + echo "Checking/downloading reranker model only (no embedding/LLM)..." + ensure_reranker_model + echo "Reranker model ready." + echo "" +} + +usage() { + cat <<'EOF' +Usage: ./tools/model_download.sh [model_id] [model_path] [source_model_path] + +Modes: + vllm Ensure embedding/reranker OpenVINO models + vLLM LLM model + ov Ensure embedding/reranker OpenVINO models + OpenVINO LLM model + emb-reranker Ensure embedding/reranker OpenVINO models only (no LLM) + embedding Ensure embedding OpenVINO model only + reranker Ensure reranker OpenVINO model only + +Arguments: + model_id Optional. Overrides LLM_MODEL for this run. + model_path Optional. Overrides MODEL_PATH for this run. + source_model_path Optional. Local source model directory, mainly for LLM conversion/reuse. + +Environment: + OV_CONVERSION_METHOD OpenVINO LLM conversion method: int4|int8|fp16 (default: int4) + EMBEDDING_RERANKER_OV_WEIGHT_FORMAT OpenVINO embedding/reranker weight format: |none|auto|int4|int8|fp16 (default: empty, no quantization) + SKIP_SOURCE_MODEL_DOWNLOAD Set to 1 to convert/reuse only local source models, never download. + SOURCE_MODEL_PATH Local source model directory override, mainly for the LLM model. 
+ +Examples: + ./tools/model_download.sh vllm + ./tools/model_download.sh ov Qwen/Qwen3-8B /data/models + ./tools/model_download.sh emb-reranker + ./tools/model_download.sh embedding + ./tools/model_download.sh reranker +EOF +} + +main() { + local mode="${1:-}" + local model_id="${2:-}" + local model_path="${3:-}" + local source_model_path="${4:-}" + + setup_python_venv + + if [[ -n "${model_id}" ]]; then + export LLM_MODEL="${model_id}" + fi + + if [[ -n "${model_path}" ]]; then + export MODEL_PATH="${model_path}" + fi + + if [[ -n "${source_model_path}" ]]; then + export SOURCE_MODEL_PATH="${source_model_path}" + fi + + if [[ -n "${model_id}" || -n "${model_path}" || -n "${source_model_path}" ]]; then + echo "[Model Check] Runtime overrides: LLM_MODEL='${LLM_MODEL}', MODEL_PATH='${MODEL_PATH}', SOURCE_MODEL_PATH='${SOURCE_MODEL_PATH}'" + fi + + export OV_CONVERSION_METHOD + OV_CONVERSION_METHOD=$(normalize_ov_conversion_method "${OV_CONVERSION_METHOD}") + export EMBEDDING_RERANKER_OV_WEIGHT_FORMAT + EMBEDDING_RERANKER_OV_WEIGHT_FORMAT=$(normalize_embedding_reranker_weight_format "${EMBEDDING_RERANKER_OV_WEIGHT_FORMAT}") + + case "${mode}" in + vllm) + ensure_pip_available + ensure_required_models_for_vllm + ;; + ov) + ensure_pip_available + ensure_required_models_for_ov + ;; + emb-reranker|emb_reranker|retrieval) + ensure_pip_available + ensure_required_models_for_embedding_reranker_only + ;; + embedding) + ensure_pip_available + ensure_required_models_for_embedding_only + ;; + reranker) + ensure_pip_available + ensure_required_models_for_reranker_only + ;; + -h|--help|help|"") + usage + ;; + *) + echo "[Model Check] ERROR: Unknown mode '${mode}'" + usage + exit 1 + ;; + esac +} + +main "$@" diff --git a/EdgeCraftRAG/tools/quick_start.sh b/EdgeCraftRAG/tools/quick_start.sh index 76da043fd0..fc48cd3dd6 100755 --- a/EdgeCraftRAG/tools/quick_start.sh +++ b/EdgeCraftRAG/tools/quick_start.sh @@ -2,57 +2,546 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +# EdgeCraftRAG Quick Start +# One-command deployment with automatic model download and setup + set -e SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) WORKPATH=$(cd "${SCRIPT_DIR}/.." 
&& pwd) -ip_address=$(hostname -I | awk '{print $1}') -HOST_IP=$ip_address -# global defaults to avoid docker compose warnings on unset variables +# Default values +ip_address=$(hostname -I | awk '{print $1}') export HOST_IP=${HOST_IP:-"${ip_address}"} export MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} export LLM_MODEL=${LLM_MODEL:-"Qwen/Qwen3-8B"} +export EMBEDDING_MODEL=${EMBEDDING_MODEL:-"BAAI/bge-small-en-v1.5"} +export RERANKER_MODEL=${RERANKER_MODEL:-"BAAI/bge-reranker-large"} +export MODEL_DOWNLOAD_SOURCE=${MODEL_DOWNLOAD_SOURCE:-"modelscope"} +export OV_CONVERSION_METHOD=${OV_CONVERSION_METHOD:-"int4"} export DOC_PATH=${DOC_PATH:-"${WORKPATH}/workspace"} export TMPFILE_PATH=${TMPFILE_PATH:-"${WORKPATH}/workspace"} -export MILVUS_ENABLED=${MILVUS_ENABLED:-"0"} +export MILVUS_ENABLED=${MILVUS_ENABLED:-"1"} export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-"0"} +export SKIP_MODEL_CHECK=${SKIP_MODEL_CHECK:-"0"} +export SKIP_INTEL_GPU_DRIVER_CHECK=${SKIP_INTEL_GPU_DRIVER_CHECK:-"0"} +export AUTO_INSTALL_INTEL_GPU_DRIVER=${AUTO_INSTALL_INTEL_GPU_DRIVER:-"1"} +export AUTO_INSTALL_NPM=${AUTO_INSTALL_NPM:-"1"} +export RESTART_ON_RERUN=${RESTART_ON_RERUN:-"0"} + +# Proxy settings: interactive override > external env > default fallback +export http_proxy=${http_proxy:-${HTTP_PROXY:-""}} +export https_proxy=${https_proxy:-${HTTPS_PROXY:-""}} +export no_proxy=${no_proxy:-${NO_PROXY:-"localhost,127.0.0.1,${HOST_IP},edgecraftrag,edgecraftrag-server"}} +export HTTP_PROXY=${HTTP_PROXY:-"${http_proxy}"} +export HTTPS_PROXY=${HTTPS_PROXY:-"${https_proxy}"} +export NO_PROXY=${NO_PROXY:-"${no_proxy}"} + +# vLLM runtime options +export MAX_MODEL_LEN=${MAX_MODEL_LEN:-"8192"} +export GPU_MEMORY_UTIL=${GPU_MEMORY_UTIL:-"0.8"} +export QUANTIZATION=${QUANTIZATION:-"fp8"} +export TOOL_PARSER=${TOOL_PARSER:-"qwen3_coder"} + +if [[ "${MODEL_DOWNLOAD_SOURCE,,}" == "huggingface" ]]; then + export HF_ENDPOINT=${HF_ENDPOINT:-"https://hf-mirror.com"} +fi + +#============================================================================== +# Python Virtual Environment Setup +#============================================================================== -#use python venv ENV_NAME="${WORKPATH}/ecrag_venv" -# check if python3-venv (ensurepip) is fully available; install if missing -if ! python3 -c "import ensurepip" &>/dev/null; then - echo "python3-venv (ensurepip) not found, installing..." - PY_VER=$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") +setup_python_venv() { + # Prefer Python 3.10 or 3.11 for best compatibility + local PYTHON_CMD="python3" + if command -v python3.11 &>/dev/null; then + PYTHON_CMD="python3.11" + echo "Using Python 3.11 (recommended)" + elif command -v python3.10 &>/dev/null; then + PYTHON_CMD="python3.10" + echo "Using Python 3.10 (recommended)" + else + echo "Using $(python3 --version 2>&1)" + echo "⚠ Note: Python 3.10 or 3.11 recommended for best compatibility" + fi + + # Check if python3-venv (ensurepip) is fully available; install if missing + if ! $PYTHON_CMD -c "import ensurepip" &>/dev/null; then + echo "python3-venv (ensurepip) not found, installing..." 
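+        # Derive "major.minor" so the matching pythonX.Y-venv package can be installed on apt-based systems below.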
+ PY_VER=$($PYTHON_CMD -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") + if command -v apt-get &>/dev/null; then + sudo apt-get install -y "python${PY_VER}-venv" + elif command -v dnf &>/dev/null; then + sudo dnf install -y python3-virtualenv + elif command -v yum &>/dev/null; then + sudo yum install -y python3-virtualenv + else + echo "ERROR: Cannot install python3-venv: unsupported package manager. Please install it manually." + exit 1 + fi + fi + + # Create venv if missing or broken (activate script absent) + if [ ! -f "${ENV_NAME}/bin/activate" ] && [ ! -f "${ENV_NAME}/Scripts/activate" ]; then + echo "Creating virtual environment at ${ENV_NAME}..." + rm -rf "${ENV_NAME}" + $PYTHON_CMD -m venv "${ENV_NAME}" + fi + + # Activate venv + if [ -f "${ENV_NAME}/bin/activate" ]; then + source "${ENV_NAME}/bin/activate" + elif [ -f "${ENV_NAME}/Scripts/activate" ]; then + source "${ENV_NAME}/Scripts/activate" + else + echo "ERROR: Failed to activate virtual environment at ${ENV_NAME}" + exit 1 + fi + + echo "Python virtual environment activated: ${ENV_NAME}" +} + +verify_venv_activated() { + echo "" + echo "[Venv Check] Verifying virtual environment..." + + # Check if VIRTUAL_ENV is set + if [[ -z "${VIRTUAL_ENV:-}" ]]; then + echo "[Venv Check] ERROR: Virtual environment not activated" + echo " VIRTUAL_ENV variable is not set" + exit 1 + fi + + # Check if we're using the expected venv + if [[ "${VIRTUAL_ENV}" != "${ENV_NAME}" ]]; then + echo "[Venv Check] WARNING: Using different venv than expected" + echo " Expected: ${ENV_NAME}" + echo " Active: ${VIRTUAL_ENV}" + else + echo "[Venv Check] ✓ Virtual environment properly activated: ${VIRTUAL_ENV}" + fi + + # Check Python version + python_version=$(python --version 2>&1 | awk '{print $2}') + python_major=$(echo "$python_version" | cut -d. -f1) + python_minor=$(echo "$python_version" | cut -d. -f2) + + if [[ "$python_major" -lt 3 ]] || [[ "$python_major" -eq 3 && "$python_minor" -lt 10 ]]; then + echo "[Venv Check] ERROR: Python 3.10+ required, but found $python_version" + exit 1 + fi + + echo "[Venv Check] ✓ Python version: $python_version" + + # Python 3.12+ is supported with a docarray compatibility pin during pip install. + # Keep this as an explicit warning so users understand why extra handling is applied. + if [[ "$python_major" -eq 3 && "$python_minor" -ge 12 ]]; then + echo "" + echo "[Venv Check] ⚠ Python 3.12+ detected" + echo " Applying docarray compatibility pin during dependency installation" + echo " (recommended fallback: Python 3.10 or 3.11)" + echo "" + fi +} + +check_pip_requirements() { + local requirements_file="${WORKPATH}/edgecraftrag/requirements.txt" + local python_version + python_version=$(python --version 2>&1 | awk '{print $2}') + local python_major python_minor + python_major=$(echo "$python_version" | cut -d. -f1) + python_minor=$(echo "$python_version" | cut -d. -f2) + + echo "" + echo "[Pip Check] Checking Python package requirements..." + + # Check if requirements.txt exists + if [[ ! -f "$requirements_file" ]]; then + echo "[Pip Check] WARNING: requirements.txt not found at $requirements_file" + echo " Skipping package check" + return 0 + fi + + # Check if timeout command is available + local HAS_TIMEOUT=1 + if ! command -v timeout &>/dev/null; then + echo "[Pip Check] Note: 'timeout' command not found, checks may take longer" + HAS_TIMEOUT=0 + fi + + # Upgrade pip if needed + echo "[Pip Check] Ensuring pip is up to date..." 
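+    # Upgrade pip inside the venv before checking or installing project requirements.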
+ python -m pip install --quiet --upgrade pip + + # Ensure docarray compatibility for Python 3.12+. + # Some transitive dependency chains may otherwise resolve an incompatible version. + if [[ "$python_major" -eq 3 && "$python_minor" -ge 12 ]]; then + echo "[Pip Check] Python 3.12+ detected, pinning docarray==0.40.0..." + if ! python -m pip install --quiet "docarray==0.40.0"; then + echo "[Pip Check] ERROR: Failed to install docarray==0.40.0 for Python 3.12+" + exit 1 + fi + fi + + # Check for critical packages + local critical_packages=( + "langchain-core" + "llama-index" + "opea-comps" + "transformers" + ) + + local missing_packages=() + local installed_count=0 + + for package in "${critical_packages[@]}"; do + echo -n "[Pip Check] Checking $package... " + local check_start=$SECONDS + + # Skip import check, just verify package is installed via pip + # Import checks can hang on some packages like llama-index + # Use timeout to prevent pip show from hanging + local show_result=1 + if [[ $HAS_TIMEOUT -eq 1 ]]; then + if timeout 5 python -m pip show "${package}" >/dev/null 2>&1; then + show_result=0 + else + show_result=$? + fi + else + if python -m pip show "${package}" >/dev/null 2>&1; then + show_result=0 + else + show_result=$? + fi + fi + + if [[ $show_result -eq 0 ]]; then + local check_elapsed=$((SECONDS - check_start)) + echo "✓ (${check_elapsed}s)" + installed_count=$((installed_count + 1)) + elif [[ $show_result -eq 124 ]]; then + local check_elapsed=$((SECONDS - check_start)) + echo "⏱ (${check_elapsed}s, timeout - treating as missing)" + missing_packages+=("$package") + else + local check_elapsed=$((SECONDS - check_start)) + echo "✗ (${check_elapsed}s, missing)" + missing_packages+=("$package") + fi + done + + echo "[Pip Check] Package check loop completed: $installed_count installed, ${#missing_packages[@]} missing" + + if [[ ${#missing_packages[@]} -gt 0 ]]; then + echo "" + echo "[Pip Check] Missing ${#missing_packages[@]} critical packages" + echo "[Pip Check] Missing package list: ${missing_packages[*]}" + echo "[Pip Check] Installing requirements from $requirements_file..." + echo "" + + # Install all requirements with PyTorch CPU index + # Note: requirements.txt contains torch==2.8.0+cpu which needs PyTorch's extra index + if python -m pip install -r "$requirements_file" \ + --extra-index-url https://download.pytorch.org/whl/cpu; then + echo "" + echo "[Pip Check] ✓ All requirements installed successfully" + else + echo "" + echo "[Pip Check] ERROR: Failed to install requirements" + echo "[Pip Check] You can manually install with:" + echo " python -m pip install -r $requirements_file \\" + echo " --extra-index-url https://download.pytorch.org/whl/cpu" + exit 1 + fi + else + echo "[Pip Check] ✓ All critical packages are installed ($installed_count/${#critical_packages[@]})" + + # Skip pip check to avoid potential hangs - critical packages are installed + echo "[Pip Check] Skipping full dependency check (critical packages verified)" + fi + + echo "[Pip Check] Completed successfully" +} + +check_npm_requirements() { + echo "" + echo "[NPM Check] Checking Node.js/npm for baremetal UI startup..." 
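+    # npm is only required for baremetal deployments, where the UI runs directly on the host.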
+ + if command -v npm &>/dev/null; then + local npm_version + npm_version=$(npm --version 2>/dev/null || echo "unknown") + echo "[NPM Check] ✓ npm is available: ${npm_version}" + return 0 + fi + + echo "[NPM Check] npm not found" + + if [[ "${AUTO_INSTALL_NPM}" != "1" ]]; then + echo "[NPM Check] ERROR: Auto-install disabled (AUTO_INSTALL_NPM=${AUTO_INSTALL_NPM})" + echo "[NPM Check] Please install Node.js/npm manually, or set AUTO_INSTALL_NPM=1" + exit 1 + fi + + echo "[NPM Check] Attempting to install npm..." if command -v apt-get &>/dev/null; then - sudo apt-get install -y "python${PY_VER}-venv" + sudo apt-get update + sudo apt-get install -y npm elif command -v dnf &>/dev/null; then - sudo dnf install -y python3-virtualenv + sudo dnf install -y npm elif command -v yum &>/dev/null; then - sudo yum install -y python3-virtualenv + sudo yum install -y npm else - echo "ERROR: Cannot install python3-venv: unsupported package manager. Please install it manually." + echo "[NPM Check] ERROR: Unsupported package manager. Please install npm manually." exit 1 fi -fi -# create venv if missing or broken (activate script absent) -if [ ! -f "${ENV_NAME}/bin/activate" ] && [ ! -f "${ENV_NAME}/Scripts/activate" ]; then - echo "Creating virtual environment at ${ENV_NAME}..." - rm -rf "${ENV_NAME}" - python3 -m venv "${ENV_NAME}" -fi + if command -v npm &>/dev/null; then + local installed_npm_version + installed_npm_version=$(npm --version 2>/dev/null || echo "unknown") + echo "[NPM Check] ✓ npm installed successfully: ${installed_npm_version}" + else + echo "[NPM Check] ERROR: npm installation completed but npm is still unavailable" + exit 1 + fi +} -# activate venv -if [ -f "${ENV_NAME}/bin/activate" ]; then - source "${ENV_NAME}/bin/activate" -elif [ -f "${ENV_NAME}/Scripts/activate" ]; then - source "${ENV_NAME}/Scripts/activate" -else - echo "ERROR: Failed to activate virtual environment at ${ENV_NAME}" - exit 1 -fi +#============================================================================== +# Intel GPU Driver Validation and Installation +#============================================================================== + +has_intel_gpu_device() { + if ! command -v lspci &>/dev/null; then + # lspci may be unavailable on minimal systems; fall back to /dev/dri presence. + [[ -e /dev/dri/card0 || -e /dev/dri/renderD128 ]] + return $? + fi + + if lspci | grep -Ei 'VGA|3D|Display' | grep -qi 'intel'; then + return 0 + fi + + return 1 +} + +is_intel_gpu_driver_ready() { + if ! command -v clinfo &>/dev/null; then + echo "[GPU Driver Check] clinfo not found" + return 1 + fi + + if clinfo 2>/dev/null | grep -q "Device Name"; then + return 0 + fi + + echo "[GPU Driver Check] clinfo did not report any Device Name entries" + return 1 +} + +install_intel_gpu_driver_ubuntu() { + local version_codename + local apt_update_log + local missing_key + local candidate_packages + local level_zero_runtime_pkg="" + local level_zero_loader_pkg="" + local available_packages=() + version_codename=$(source /etc/os-release && echo "${VERSION_CODENAME:-}") + + if [[ -z "${version_codename}" ]]; then + echo "[GPU Driver Check] ERROR: Unable to detect Ubuntu codename" + return 1 + fi + + echo "[GPU Driver Check] Installing Intel GPU runtime packages for Ubuntu ${version_codename}..." + + sudo apt-get update + sudo apt-get install -y ca-certificates curl gpg + + # Always refresh Intel repo key to handle key rotation on existing machines. 
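+    # The key is dearmored into /usr/share/keyrings/intel-graphics.gpg and referenced via signed-by in the apt source entry below.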
+ curl -fsSL https://repositories.intel.com/gpu/intel-graphics.key | \ + sudo gpg --dearmor --yes -o /usr/share/keyrings/intel-graphics.gpg + sudo chmod a+r /usr/share/keyrings/intel-graphics.gpg + + echo "deb [signed-by=/usr/share/keyrings/intel-graphics.gpg arch=amd64] https://repositories.intel.com/gpu/ubuntu ${version_codename} unified" | \ + sudo tee /etc/apt/sources.list.d/intel-gpu.list >/dev/null + + apt_update_log=$(mktemp) + if ! sudo apt-get update 2>&1 | tee "${apt_update_log}"; then + missing_key=$(grep -oE 'NO_PUBKEY [0-9A-F]+' "${apt_update_log}" | awk '{print $2}' | head -n1) + if [[ -n "${missing_key}" ]]; then + echo "[GPU Driver Check] Importing missing apt key: ${missing_key}" + if sudo gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys "${missing_key}" && \ + sudo gpg --batch --export "${missing_key}" | sudo gpg --dearmor --yes -o /usr/share/keyrings/intel-graphics.gpg; then + sudo chmod a+r /usr/share/keyrings/intel-graphics.gpg + sudo apt-get update + else + rm -f "${apt_update_log}" + echo "[GPU Driver Check] ERROR: Failed to import missing key ${missing_key}" + return 1 + fi + else + rm -f "${apt_update_log}" + echo "[GPU Driver Check] ERROR: apt-get update failed for Intel GPU repository" + return 1 + fi + fi + rm -f "${apt_update_log}" + + # Prefer newer package names first to avoid conflicts on newer Ubuntu releases + # where libze-intel-gpu1 may break intel-level-zero-gpu. + if apt-cache show libze-intel-gpu1 >/dev/null 2>&1; then + level_zero_runtime_pkg="libze-intel-gpu1" + elif apt-cache show intel-level-zero-gpu >/dev/null 2>&1; then + level_zero_runtime_pkg="intel-level-zero-gpu" + fi + + if apt-cache show libze1 >/dev/null 2>&1; then + level_zero_loader_pkg="libze1" + elif apt-cache show level-zero >/dev/null 2>&1; then + level_zero_loader_pkg="level-zero" + fi + + candidate_packages=(intel-opencl-icd xpu-smi clinfo) + if [[ -n "${level_zero_runtime_pkg}" ]]; then + candidate_packages+=("${level_zero_runtime_pkg}") + fi + if [[ -n "${level_zero_loader_pkg}" ]]; then + candidate_packages+=("${level_zero_loader_pkg}") + fi + for pkg in "${candidate_packages[@]}"; do + if apt-cache show "${pkg}" >/dev/null 2>&1; then + available_packages+=("${pkg}") + else + echo "[GPU Driver Check] WARNING: Package not found in current repos: ${pkg}" + fi + done + + if [[ ${#available_packages[@]} -eq 0 ]]; then + echo "[GPU Driver Check] ERROR: No Intel GPU runtime packages available to install" + return 1 + fi + + if ! sudo apt-get install -y "${available_packages[@]}"; then + echo "[GPU Driver Check] ERROR: Failed to install Intel GPU runtime packages: ${available_packages[*]}" + return 1 + fi +} + +install_intel_gpu_driver() { + if command -v apt-get &>/dev/null; then + install_intel_gpu_driver_ubuntu + return $? + fi + + echo "[GPU Driver Check] ERROR: Automatic Intel GPU driver installation is only supported on apt-based Linux in quick_start.sh" + echo "[GPU Driver Check] Please install Intel GPU drivers manually for your distribution" + return 1 +} + +ensure_intel_gpu_driver_ready() { + if [[ "${SKIP_INTEL_GPU_DRIVER_CHECK}" == "1" ]]; then + echo "[GPU Driver Check] Skipping Intel GPU driver validation (--skip-gpu-driver-check enabled)" + return 0 + fi + + if ! has_intel_gpu_device; then + echo "[GPU Driver Check] No Intel GPU device detected, skipping Intel GPU driver installation" + return 0 + fi + + echo "" + echo "[GPU Driver Check] Validating Intel GPU driver/runtime..." 
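+    # "Ready" here means clinfo reports at least one "Device Name" entry.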
+ + if is_intel_gpu_driver_ready; then + echo "[GPU Driver Check] ✓ Intel GPU driver/runtime looks ready" + return 0 + fi + + echo "[GPU Driver Check] Intel GPU driver/runtime not ready" + + if [[ "${AUTO_INSTALL_INTEL_GPU_DRIVER}" != "1" ]]; then + echo "[GPU Driver Check] ERROR: Auto-install disabled (AUTO_INSTALL_INTEL_GPU_DRIVER=${AUTO_INSTALL_INTEL_GPU_DRIVER})" + echo "[GPU Driver Check] Set AUTO_INSTALL_INTEL_GPU_DRIVER=1 or use --skip-gpu-driver-check" + exit 1 + fi + + echo "[GPU Driver Check] Attempting automatic installation..." + if ! install_intel_gpu_driver; then + echo "[GPU Driver Check] ERROR: Intel GPU driver installation failed" + echo "[GPU Driver Check] Refer to: https://dgpu-docs.intel.com/driver/client/overview.html" + exit 1 + fi + + if is_intel_gpu_driver_ready; then + echo "[GPU Driver Check] ✓ Intel GPU driver/runtime installed successfully" + else + echo "[GPU Driver Check] ERROR: Driver installation finished but GPU runtime is still unavailable" + echo "[GPU Driver Check] Try rebooting the machine and rerun quick_start.sh" + exit 1 + fi +} + +#============================================================================== +# Model Download Functions (Unique Value of quick_start.sh) +#============================================================================== + +run_model_download_tool() { + local mode="$1" + local tool_script="${SCRIPT_DIR}/model_download.sh" + + if [[ ! -f "${tool_script}" ]]; then + echo "[Model Check] ERROR: Model download tool not found: ${tool_script}" + exit 1 + fi + + bash "${tool_script}" "${mode}" +} + +ensure_required_models_for_vllm() { + run_model_download_tool "vllm" +} + +ensure_required_models_for_ov() { + run_model_download_tool "ov" +} + +resolve_download_mode_for_backend() { + local backend="$1" + local llm_model="$2" + + case "$backend" in + openvino|ovms) + if [[ "$llm_model" == OpenVINO/*-ov ]]; then + echo "vllm" + else + echo "ov" + fi + ;; + vllm_a770|vllm_b60) + echo "vllm" + ;; + *) + echo "ov" + ;; + esac +} + +download_required_models_for_backend() { + local backend="$1" + local llm_model="$2" + local download_mode + + download_mode=$(resolve_download_mode_for_backend "$backend" "$llm_model") + echo "[Model Check] Resolved download mode for backend '${backend}': ${download_mode}" + run_model_download_tool "${download_mode}" +} + +#============================================================================== +# Interactive Helper Functions +#============================================================================== get_user_input() { local var_name=$1 @@ -70,519 +559,1041 @@ get_enable_function() { print_ui_access_info() { echo "" - echo "Service launched successfully." + echo "════════════════════════════════════════════════════════════" + echo "Service launched successfully!" + echo "════════════════════════════════════════════════════════════" + echo "" echo "UI access URL: http://${HOST_IP}:8082" - echo "If you are accessing from another machine, replace ${HOST_IP} with the server's reachable IP or hostname." + echo "" + echo "If you are accessing from another machine, replace ${HOST_IP}" + echo "with the server's reachable IP or hostname." + echo "" } -function start_vllm_services() { - COMPOSE_FILE="compose.yaml" - echo "stop former service..." 
- docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE down +restart_services_before_deploy() { + if [[ "${RESTART_ON_RERUN}" != "1" ]]; then + return 0 + fi - ip_address=$(hostname -I | awk '{print $1}') - HOST_IP=$(get_user_input "host ip" "${ip_address}") - DOC_PATH=$(get_user_input "DOC_PATH" "$WORKPATH/workspace") - TMPFILE_PATH=$(get_user_input "TMPFILE_PATH" "$WORKPATH/workspace") - MILVUS_ENABLED=$(get_enable_function "MILVUS DB(Enter 1 for enable)" "0") - CHAT_HISTORY_ROUND=$(get_user_input "chat history round" "0") - LLM_MODEL=$(get_user_input "your LLM model" "Qwen/Qwen3-8B") - MODEL_PATH=$(get_user_input "your model path" "${WORKPATH}/workspace/models") - read -p "Have you prepare models in ${MODEL_PATH}:(yes/no) [yes]" user_input - user_input=${user_input:-"yes"} + echo "" + echo "Restart-on-rerun enabled: stopping existing services before deployment..." + # Best effort cleanup to guarantee a clean restart path. + bash "${SCRIPT_DIR}/bootstrap.sh" cleanup || true +} - if [ "$user_input" == "yes" ]; then - # 模型文件路径请参考以下形式存放, llm为huggingface - # Indexer: ${MODEL_PATH}/BAAI/bge-small-en-v1.5 - # Reranker: ${MODEL_PATH}/BAAI/bge-reranker-large - # llm :${MODEL_PATH}/${LLM_MODEL} (从huggingface或modelscope下载的原始模型,而不是经过OpenVINO转换的模型!) - echo "you skipped model downloading, please make sure you have prepared all models under ${MODEL_PATH}" - ensure_required_models_for_vllm +resolve_runtime_script() { + local backend="$1" + local deployment_method="$2" + + if [[ "$backend" == "openvino" ]]; then + if [[ "$deployment_method" == "container" ]]; then + echo "${SCRIPT_DIR}/run_ov_container.sh" + else + echo "${SCRIPT_DIR}/run_ov_baremetal.sh" + fi + elif [[ "$backend" == "ovms" ]]; then + if [[ "$deployment_method" == "container" ]]; then + echo "${SCRIPT_DIR}/run_ovms_container.sh" + else + echo "${SCRIPT_DIR}/run_ovms_baremetal.sh" + fi else - echo "you have not prepare models, starting to download models into ${MODEL_PATH}..." - mkdir -p $MODEL_PATH - python -m pip install --upgrade-strategy eager "optimum-intel[openvino]" - optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity - optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-reranker-large --task text-classification - pip install huggingface_hub - huggingface-cli download $LLM_MODEL --local-dir "${MODEL_PATH}/${LLM_MODEL}" - fi - HF_CACHE="${HOME}/.cache" - if [ ! -d "${HF_CACHE}" ]; then - mkdir -p "${HF_CACHE}" - echo "Created directory: ${HF_CACHE}" - fi - echo "give permission to related path..." 
- sudo chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} - sudo chown -R 1000:1000 ${HF_CACHE} - HF_ENDPOINT=https://hf-mirror.com - # vllm ENV - export VLLM_SERVICE_PORT_A770=8086 - - read -p "Tensor parallel size(your tp size [1]), press Enter to confirm, or type a new value:" TENSOR_PARALLEL_SIZE; TP=${TP:-1} - CCL_DG2_USM=$(get_user_input "Set USM (Core=1, Xeon=0, default=0)" 0) - export HOST_IP=${HOST_IP} - # export ENV - export MODEL_PATH=${MODEL_PATH} - export DOC_PATH=${DOC_PATH} - export TMPFILE_PATH=${TMPFILE_PATH} - export LLM_MODEL=${LLM_MODEL} - export HF_ENDPOINT=${HF_ENDPOINT} - export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" - export MILVUS_ENABLED=${MILVUS_ENABLED} - export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND} - export TP=${TP} - export CCL_DG2_USM=${CCL_DG2_USM} - export VIDEOGROUPID=$(getent group video | cut -d: -f3) - export RENDERGROUPID=$(getent group render | cut -d: -f3) - - - # Start Docker Containers - docker compose --profile a770 -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d - echo "ipex-llm-serving-xpu is booting, please wait..." - n=0 - until [[ "$n" -ge 100 ]]; do - docker logs ipex-llm-serving-xpu-container-0 > ipex-llm-serving-xpu-container.log 2>&1 - if grep -q "Starting vLLM API server on http://0.0.0.0:" ipex-llm-serving-xpu-container.log; then + if [[ "$deployment_method" == "container" ]]; then + echo "${SCRIPT_DIR}/run_vllm_container.sh" + else + echo "${SCRIPT_DIR}/run_vllm_baremetal.sh" + fi + fi +} + +are_target_services_running() { + local backend="$1" + local deployment_method="$2" + local runtime_script + local status_output + + runtime_script=$(resolve_runtime_script "$backend" "$deployment_method") + if [[ ! -f "$runtime_script" ]]; then + return 1 + fi + + status_output=$(bash "$runtime_script" status 2>/dev/null || true) + + if echo "$status_output" | grep -Eqi "stopped|not running"; then + return 1 + fi + + if echo "$status_output" | grep -Eqi "running"; then + return 0 + fi + + return 1 +} + +check_docker_and_compose_ready() { + echo "" + echo "[Docker Check] Validating Docker and Docker Compose..." + + if ! command -v docker &>/dev/null || ! docker compose version >/dev/null 2>&1; then + echo "[Docker Check] Docker and/or Docker Compose not found. Installing on Ubuntu 24.04..." + + if ! command -v apt-get &>/dev/null; then + echo "[Docker Check] ERROR: apt-get not found. Automatic installation only supports Ubuntu 24.04" + exit 1 + fi + + sudo apt-get update + + # Ubuntu 24.04 package names can differ across mirrors/releases. + # Try the common variants for Compose plugin. + if ! sudo apt-get install -y docker.io docker-compose-v2; then + if ! sudo apt-get install -y docker.io docker-compose-plugin; then + echo "[Docker Check] ERROR: Failed to install docker.io and Docker Compose plugin" + exit 1 + fi + fi + fi + + if ! systemctl is-active --quiet docker; then + echo "[Docker Check] Starting Docker daemon..." + sudo systemctl enable --now docker + sudo systemctl start docker || true + fi + + local docker_ready=0 + local daemon_running=0 + + # Give systemd a short window to finish service activation. 
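+    # Poll "systemctl is-active docker" once per second, for up to 8 attempts.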
+ for _ in {1..8}; do + if systemctl is-active --quiet docker; then + daemon_running=1 break fi - sleep 6s - n=$((n+1)) + sleep 1 done - rm -rf ipex-llm-serving-xpu-container.log - print_ui_access_info + + if docker info >/dev/null 2>&1; then + docker_ready=1 + elif sudo docker info >/dev/null 2>&1; then + echo "[Docker Check] ERROR: Docker daemon is running but current user cannot access Docker socket" + echo "[Docker Check] Run: sudo usermod -aG docker ${USER}" + echo "[Docker Check] Then re-login (or run: newgrp docker) and rerun quick_start.sh" + exit 1 + fi + + if [[ "${docker_ready}" -ne 1 ]]; then + if [[ "${daemon_running}" -ne 1 ]]; then + echo "[Docker Check] ERROR: Docker daemon failed to start after installation" + else + echo "[Docker Check] ERROR: Docker daemon is not available after installation/start attempt" + fi + echo "[Docker Check] Recent docker service logs (last 20 lines):" + sudo journalctl -u docker --no-pager -n 20 || true + exit 1 + fi + + if ! docker compose version >/dev/null 2>&1; then + echo "[Docker Check] ERROR: Docker Compose plugin is still unavailable after installation" + exit 1 + fi + + echo "[Docker Check] ✓ Docker and Docker Compose are ready" +} + +save_bootstrap_env_snapshot() { + local backend="$1" + local deployment_method="$2" + local config_file="${WORKPATH}/workspace/bootstrap.env" + + mkdir -p "${WORKPATH}/workspace" + + { + echo "# EdgeCraftRAG deployment environment snapshot" + echo "# Generated by quick_start.sh on $(date)" + echo "# Reuse with: source workspace/bootstrap.env && ./tools/bootstrap.sh" + echo "" + + printf 'export INFERENCE_BACKEND=%q\n' "${backend}" + printf 'export DEPLOYMENT_METHOD=%q\n' "${deployment_method}" + + local env_vars=( + HOST_IP + MODEL_PATH + DOC_PATH + TMPFILE_PATH + LLM_MODEL + OVMS_SERVICE_PORT + OVMS_ENDPOINT + EMBEDDING_MODEL + RERANKER_MODEL + MODEL_DOWNLOAD_SOURCE + OV_CONVERSION_METHOD + http_proxy + https_proxy + no_proxy + HTTP_PROXY + HTTPS_PROXY + NO_PROXY + HF_ENDPOINT + MILVUS_ENABLED + CHAT_HISTORY_ROUND + SKIP_MODEL_CHECK + SKIP_INTEL_GPU_DRIVER_CHECK + AUTO_INSTALL_INTEL_GPU_DRIVER + AUTO_INSTALL_NPM + RESTART_ON_RERUN + VLLM_BACKEND + TP + DP + DTYPE + MAX_MODEL_LEN + GPU_MEMORY_UTIL + QUANTIZATION + TOOL_PARSER + ZE_AFFINITY_MASK + CCL_DG2_USM + OVMS_REST_PORT + OVMS_SOURCE_MODEL + OVMS_MODEL_REPOSITORY_PATH + OVMS_MODEL_NAME + OVMS_TARGET_DEVICE + OVMS_TASK + OVMS_CACHE_DIR + OVMS_ENABLE_PREFIX_CACHING + OVMS_TOOL_PARSER + OVMS_ENABLE_TOOL_GUIDED_GENERATION + OVMS_MAX_NUM_BATCHED_TOKENS + ) + + local var_name + for var_name in "${env_vars[@]}"; do + if [[ -n "${!var_name+x}" ]]; then + printf 'export %s=%q\n' "${var_name}" "${!var_name}" + fi + done + + # Keep bootstrap checks skipped on replay, matching quick_start behavior. 
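+        # Recording SKIP_VALIDATION=1 lets a replayed bootstrap run skip the same checks as this run.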
+ echo "export SKIP_VALIDATION=1" + } > "${config_file}" + + chmod 644 "${config_file}" + echo "[Config] Saved deployment environment to workspace/bootstrap.env" +} + +set_ovms_defaults() { + export OVMS_SERVICE_PORT=${OVMS_SERVICE_PORT:-8000} + export OVMS_ENDPOINT=${OVMS_ENDPOINT:-"http://${HOST_IP}:${OVMS_SERVICE_PORT}"} + export OVMS_REST_PORT=${OVMS_REST_PORT:-${OVMS_SERVICE_PORT}} + export OVMS_SOURCE_MODEL=${OVMS_SOURCE_MODEL:-${LLM_MODEL}} + export OVMS_MODEL_REPOSITORY_PATH=${OVMS_MODEL_REPOSITORY_PATH:-/models} + export OVMS_MODEL_NAME=${OVMS_MODEL_NAME:-${OVMS_SOURCE_MODEL}} + export OVMS_TARGET_DEVICE=${OVMS_TARGET_DEVICE:-GPU.0} + export OVMS_TASK=${OVMS_TASK:-text_generation} + export OVMS_CACHE_DIR=${OVMS_CACHE_DIR:-/models/.ov_cache} + export OVMS_ENABLE_PREFIX_CACHING=${OVMS_ENABLE_PREFIX_CACHING:-true} + export OVMS_TOOL_PARSER=${OVMS_TOOL_PARSER:-qwen3coder} + export OVMS_ENABLE_TOOL_GUIDED_GENERATION=${OVMS_ENABLE_TOOL_GUIDED_GENERATION:-true} + export OVMS_MAX_NUM_BATCHED_TOKENS=${OVMS_MAX_NUM_BATCHED_TOKENS:-8192} +} + +set_vllm_defaults() { + export MAX_MODEL_LEN=${MAX_MODEL_LEN:-8192} + export GPU_MEMORY_UTIL=${GPU_MEMORY_UTIL:-0.8} + export QUANTIZATION=${QUANTIZATION:-fp8} + export TOOL_PARSER=${TOOL_PARSER:-qwen3_coder} } +#============================================================================== +# Deployment Functions (Delegate to bootstrap.sh) +#============================================================================== -function start_services() { - COMPOSE_FILE="compose.yaml" - echo "stop former service..." - docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE down +deploy_openvino_interactive() { + local force_model_download=0 + echo "" + echo "═══════════════════════════════════════════" + echo " OpenVINO Deployment Setup" + echo "═══════════════════════════════════════════" + echo "" + + # Ask about deployment method + read -p "Deployment method (baremetal/container) [baremetal]: " deployment_method_input + deployment_method_input=${deployment_method_input:-"baremetal"} + export DEPLOYMENT_METHOD="${deployment_method_input}" + + echo "" + echo "Selected deployment method: ${DEPLOYMENT_METHOD}" + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + echo " → Python processes with virtual environment" + else + echo " → Docker containers" + fi + echo "" + + if [[ "${DEPLOYMENT_METHOD}" == "container" ]]; then + check_docker_and_compose_ready + fi + + if [[ "${RESTART_ON_RERUN}" != "1" ]] && are_target_services_running "openvino" "${DEPLOYMENT_METHOD}"; then + echo "OpenVINO ${DEPLOYMENT_METHOD} services are already running." + echo "Skipping redeploy. Use './tools/quick_start.sh restart' to force restart." + return 0 + fi + + # Setup Python environment if baremetal deployment + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + setup_python_venv + verify_venv_activated + check_pip_requirements + check_npm_requirements + echo "" + echo "Python environment ready." 
+ echo "" + fi + + # Gather user inputs ip_address=$(hostname -I | awk '{print $1}') HOST_IP=$(get_user_input "host ip" "${ip_address}") DOC_PATH=$(get_user_input "DOC_PATH" "$WORKPATH/workspace") TMPFILE_PATH=$(get_user_input "TMPFILE_PATH" "$WORKPATH/workspace") - MILVUS_ENABLED=$(get_enable_function "MILVUS DB(Enter 1 for enable)" "0") + MILVUS_ENABLED=$(get_enable_function "MILVUS DB(Enter 0 to disable)" "1") CHAT_HISTORY_ROUND=$(get_user_input "chat history round" "0") LLM_MODEL=$(get_user_input "your LLM model" "Qwen/Qwen3-8B") MODEL_PATH=$(get_user_input "your model path" "${WORKPATH}/workspace/models") - read -p "Have you prepare models in ${MODEL_PATH}:(yes/no) [yes]" user_input + http_proxy=$(get_user_input "http_proxy" "${http_proxy}") + https_proxy=$(get_user_input "https_proxy" "${https_proxy}") + no_proxy=$(get_user_input "no_proxy" "${no_proxy}") + + # Ask about model preparation + read -p "Have you prepared models in ${MODEL_PATH}? (yes/no) [yes]: " user_input user_input=${user_input:-"yes"} + user_input=${user_input,,} - if [ "$user_input" == "yes" ]; then - # 模型文件路径请参考以下形式存放 - # Indexer: ${MODEL_PATH}/BAAI/bge-small-en-v1.5 - # Reranker: ${MODEL_PATH}/BAAI/bge-reranker-large - # llm :${MODEL_PATH}/${LLM_MODEL}/INT4_compressed_weights - echo "you skipped model downloading, please make sure you have prepared all models under ${MODEL_PATH}" - ensure_required_models_for_ov - else - read -p "you have not prepare models, do you need one-click model downloading into ${MODEL_PATH}:(yes/no) [yes]" your_input - your_input=${your_input:-"yes"} - if [ "$your_input" == "yes" ]; then - echo "start to download models..." - mkdir -p $MODEL_PATH - python -m pip install --upgrade-strategy eager "optimum-intel[openvino]" - optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity - optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-reranker-large --task text-classification - optimum-cli export openvino --model ${LLM_MODEL} ${MODEL_PATH}/${LLM_MODEL}/INT4_compressed_weights --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 - else - echo "Please prepare models before launch service..." - exit 0 - fi + if [[ "$user_input" != "yes" && "$user_input" != "y" ]]; then + force_model_download=1 + echo "Models not prepared. Auto downloading required models into ${MODEL_PATH}..." fi - HF_CACHE="${HOME}/.cache" - if [ ! -d "${HF_CACHE}" ]; then - mkdir -p "${HF_CACHE}" - echo "Created directory: ${HF_CACHE}" - fi - echo "give permission to related path..." - sudo chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} - sudo chown -R 1000:1000 ${HF_CACHE} - HF_ENDPOINT=https://hf-mirror.com - - # export ENV - export MODEL_PATH=${MODEL_PATH} - export DOC_PATH=${DOC_PATH} - export TMPFILE_PATH=${TMPFILE_PATH} - export HOST_IP=${HOST_IP} - export LLM_MODEL=${LLM_MODEL} - export HF_ENDPOINT=${HF_ENDPOINT} - export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" - export MILVUS_ENABLED=${MILVUS_ENABLED} - export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND} - export VIDEOGROUPID=$(getent group video | cut -d: -f3) - export RENDERGROUPID=$(getent group render | cut -d: -f3) - export MAX_MODEL_LEN=5000 - - # Start Docker Containers - COMPOSE_FILE="compose.yaml" - echo "starting service..." 
- docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d - print_ui_access_info -} - - -function check_baai_folder() { - local baai_path="${MODEL_PATH}/BAAI" - - if [ -d "${baai_path}" ]; then - return 0 - else - echo "Error: BAAI folder not found in ${MODEL_PATH}!" - echo "Please prepare the models first, then run quick_start_ov_services again." - exit 1 + + # Export environment variables + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export http_proxy + export https_proxy + export no_proxy + export HTTP_PROXY="${http_proxy}" + export HTTPS_PROXY="${https_proxy}" + export NO_PROXY="${no_proxy}" + + # If user explicitly said models are not prepared, force download in interactive mode. + if [[ "${force_model_download}" == "1" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "User selected models not prepared; forcing model download (ignoring --skip-model-check)" + fi + download_required_models_for_backend "openvino" "${LLM_MODEL}" + # In interactive mode, user-confirmed prepared models should skip verification/download. + elif [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + echo "User confirmed models are prepared; skipping model verification/download in interactive mode." fi + + # Delegate to bootstrap.sh + echo "" + echo "Starting OpenVINO deployment..." + restart_services_before_deploy + export INFERENCE_BACKEND="openvino" + # Use existing DEPLOYMENT_METHOD if set, otherwise default to baremetal + export DEPLOYMENT_METHOD="${DEPLOYMENT_METHOD:-baremetal}" + export SKIP_VALIDATION=1 + save_bootstrap_env_snapshot "${INFERENCE_BACKEND}" "${DEPLOYMENT_METHOD}" + bash "${SCRIPT_DIR}/bootstrap.sh" } -ensure_openvino_tooling() { - if ! command -v optimum-cli >/dev/null 2>&1; then - echo "[Model Check] 'optimum-cli' not found, installing optimum-intel[openvino]..." - python -m pip install --upgrade-strategy eager "optimum-intel[openvino]" +deploy_openvino_noninteractive() { + echo "" + echo "Starting OpenVINO deployment (non-interactive)..." + + if [[ "${DEPLOYMENT_METHOD}" == "container" ]]; then + check_docker_and_compose_ready fi -} -ensure_huggingface_tooling() { - if ! command -v huggingface-cli >/dev/null 2>&1; then - echo "[Model Check] 'huggingface-cli' not found, installing huggingface_hub..." - python -m pip install huggingface_hub + if [[ "${RESTART_ON_RERUN}" != "1" ]] && are_target_services_running "openvino" "${DEPLOYMENT_METHOD}"; then + echo "OpenVINO ${DEPLOYMENT_METHOD} services are already running." + echo "Skipping redeploy. Use './tools/quick_start.sh restart' to force restart." 
+ return 0 fi + + # Download/verify models (only for baremetal, containers handle this internally) + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "Skipping model verification/download (--skip-model-check enabled)" + else + ensure_required_models_for_ov + fi + fi + + # Delegate to bootstrap.sh + restart_services_before_deploy + export INFERENCE_BACKEND="openvino" + # Use existing DEPLOYMENT_METHOD if set, otherwise default to baremetal + export DEPLOYMENT_METHOD="${DEPLOYMENT_METHOD:-baremetal}" + export SKIP_VALIDATION=1 + save_bootstrap_env_snapshot "${INFERENCE_BACKEND}" "${DEPLOYMENT_METHOD}" + bash "${SCRIPT_DIR}/bootstrap.sh" } -ensure_embedding_and_reranker_models() { - local embedding_dir="${MODEL_PATH}/BAAI/bge-small-en-v1.5" - local reranker_dir="${MODEL_PATH}/BAAI/bge-reranker-large" +deploy_vllm_interactive() { + local backend=$1 # a770 or b60 + local force_model_download=0 - if [ ! -f "${embedding_dir}/openvino_model.xml" ]; then - echo "[Model Check] Embedding model missing: ${embedding_dir}" - echo "[Model Check] Downloading embedding model..." - ensure_openvino_tooling - mkdir -p "${embedding_dir}" - optimum-cli export openvino -m BAAI/bge-small-en-v1.5 "${embedding_dir}" --task sentence-similarity + echo "" + echo "═══════════════════════════════════════════" + echo " vLLM ${backend^^} Deployment Setup" + echo "═══════════════════════════════════════════" + echo "" + + # Ask about deployment method + read -p "Deployment method (baremetal/container) [baremetal]: " deployment_method_input + deployment_method_input=${deployment_method_input:-"baremetal"} + export DEPLOYMENT_METHOD="${deployment_method_input}" + + echo "" + echo "Selected deployment method: ${DEPLOYMENT_METHOD}" + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + echo " → vLLM container + EdgeCraftRAG services as Python processes" else - echo "[Model Check] Embedding model exists: ${embedding_dir}" + echo " → All services in Docker containers" fi + echo "" - if [ ! -f "${reranker_dir}/openvino_model.xml" ]; then - echo "[Model Check] Reranker model missing: ${reranker_dir}" - echo "[Model Check] Downloading reranker model..." - ensure_openvino_tooling - mkdir -p "${reranker_dir}" - optimum-cli export openvino -m BAAI/bge-reranker-large "${reranker_dir}" --task text-classification - else - echo "[Model Check] Reranker model exists: ${reranker_dir}" + # vLLM deployments always use Docker for model serving. + check_docker_and_compose_ready + + if [[ "${RESTART_ON_RERUN}" != "1" ]] && are_target_services_running "vllm" "${DEPLOYMENT_METHOD}"; then + echo "vLLM ${DEPLOYMENT_METHOD} services are already running." + echo "Skipping redeploy. Use './tools/quick_start.sh restart' to force restart." + return 0 fi -} -ensure_llm_model_for_vllm() { - local llm_dir="${MODEL_PATH}/${LLM_MODEL}" - if [ ! -f "${llm_dir}/config.json" ]; then - echo "[Model Check] vLLM LLM model missing: ${llm_dir}" - echo "[Model Check] Downloading LLM model '${LLM_MODEL}'..." - ensure_huggingface_tooling - mkdir -p "${llm_dir}" - huggingface-cli download "${LLM_MODEL}" --local-dir "${llm_dir}" - else - echo "[Model Check] vLLM LLM model exists: ${llm_dir}" + # Setup Python environment if baremetal deployment + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + setup_python_venv + verify_venv_activated + check_pip_requirements + check_npm_requirements + echo "" + echo "Python environment ready." 
+ echo "" fi -} -ensure_llm_model_for_ov() { - local ov_llm_dir="${MODEL_PATH}/${LLM_MODEL}/INT4_compressed_weights" - if [ ! -f "${ov_llm_dir}/openvino_model.xml" ]; then - echo "[Model Check] OpenVINO LLM model missing: ${ov_llm_dir}" - echo "[Model Check] Downloading and converting LLM model '${LLM_MODEL}' to INT4 OpenVINO..." - ensure_openvino_tooling - mkdir -p "${ov_llm_dir}" - optimum-cli export openvino --model "${LLM_MODEL}" "${ov_llm_dir}" --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 + # Gather user inputs + ip_address=$(hostname -I | awk '{print $1}') + HOST_IP=$(get_user_input "host ip" "${ip_address}") + DOC_PATH=$(get_user_input "DOC_PATH" "$WORKPATH/workspace") + TMPFILE_PATH=$(get_user_input "TMPFILE_PATH" "$WORKPATH/workspace") + MILVUS_ENABLED=$(get_enable_function "MILVUS DB(Enter 0 to disable)" "1") + CHAT_HISTORY_ROUND=$(get_user_input "chat history round" "0") + LLM_MODEL=$(get_user_input "your LLM model" "Qwen/Qwen3-8B") + MODEL_PATH=$(get_user_input "your model path" "${WORKPATH}/workspace/models") + http_proxy=$(get_user_input "http_proxy" "${http_proxy}") + https_proxy=$(get_user_input "https_proxy" "${https_proxy}") + no_proxy=$(get_user_input "no_proxy" "${no_proxy}") + + # Ask about model preparation + read -p "Have you prepared models in ${MODEL_PATH}? (yes/no) [yes]: " user_input + user_input=${user_input:-"yes"} + + if [ "$user_input" != "yes" ]; then + force_model_download=1 + echo "Models not prepared. Auto downloading required models into ${MODEL_PATH}..." + fi + + # Export environment variables + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export VLLM_BACKEND="${backend}" + export http_proxy + export https_proxy + export no_proxy + export HTTP_PROXY="${http_proxy}" + export HTTPS_PROXY="${https_proxy}" + export NO_PROXY="${no_proxy}" + + # vLLM specific parameters + set_vllm_defaults + export MAX_MODEL_LEN=$(get_user_input "MAX_MODEL_LEN" "${MAX_MODEL_LEN}") + export GPU_MEMORY_UTIL=$(get_user_input "GPU_MEMORY_UTIL (e.g. 0.8)" "${GPU_MEMORY_UTIL}") + export QUANTIZATION=$(get_user_input "QUANTIZATION (fp8/sym_int4)" "${QUANTIZATION}") + export TOOL_PARSER=$(get_user_input "tool_parser (qwen3_coder/hermes)" "${TOOL_PARSER}") + + if [ "$backend" == "a770" ]; then + read -p "Tensor parallel size (TP size) [1]: " TP + TP=${TP:-1} + export TP + export CCL_DG2_USM=$(get_user_input "Set USM (Core=1, Xeon=0, default=0)" 0) + elif [ "$backend" == "b60" ]; then + read -p "DP number (how many containers to run) [1]: " DP + export DP=${DP:-1} + read -p "Tensor parallel size (TP size) [1]: " TP + export TP=${TP:-1} + export DTYPE=$(get_user_input "DTYPE (vLLM data type, e.g. float16/bfloat16)" "float16") + export ZE_AFFINITY_MASK=$(get_user_input "ZE_AFFINITY_MASK (GPU affinity mask)" "0") + fi + + # If user explicitly said models are not prepared, force download in interactive mode. 
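+    # The user's explicit answer takes precedence over SKIP_MODEL_CHECK in this branch.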
+ if [[ "${force_model_download}" == "1" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "User selected models not prepared; forcing model download (ignoring --skip-model-check)" + fi + download_required_models_for_backend "vllm_${backend}" "${LLM_MODEL}" + # Download/verify models (only for baremetal, containers handle this internally) + elif [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "Skipping model verification/download (--skip-model-check enabled)" + else + download_required_models_for_backend "vllm_${backend}" "${LLM_MODEL}" + fi + fi + + # Delegate to bootstrap.sh + echo "" + echo "Starting vLLM ${backend^^} deployment..." + restart_services_before_deploy + if [ "$backend" == "a770" ]; then + export INFERENCE_BACKEND="vllm_a770" else - echo "[Model Check] OpenVINO LLM model exists: ${ov_llm_dir}" + export INFERENCE_BACKEND="vllm_b60" fi + # Use existing DEPLOYMENT_METHOD if set, otherwise default to baremetal + export DEPLOYMENT_METHOD="${DEPLOYMENT_METHOD:-baremetal}" + export SKIP_VALIDATION=1 + save_bootstrap_env_snapshot "${INFERENCE_BACKEND}" "${DEPLOYMENT_METHOD}" + bash "${SCRIPT_DIR}/bootstrap.sh" } -ensure_required_models_for_vllm() { - ensure_embedding_and_reranker_models - ensure_llm_model_for_vllm -} +deploy_vllm_noninteractive() { + local backend=$1 # a770 or b60 -ensure_required_models_for_ov() { - ensure_embedding_and_reranker_models - ensure_llm_model_for_ov -} + echo "" + echo "Starting vLLM ${backend^^} deployment (non-interactive)..." + # vLLM deployments always use Docker for model serving. + check_docker_and_compose_ready -function quick_start_vllm_services() { - COMPOSE_FILE="compose.yaml" - EC_RAG_SERVICE_PORT=16010 - docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE down + if [[ "${RESTART_ON_RERUN}" != "1" ]] && are_target_services_running "vllm" "${DEPLOYMENT_METHOD}"; then + echo "vLLM ${backend^^} ${DEPLOYMENT_METHOD} services are already running." + echo "Skipping redeploy. Use './tools/quick_start.sh restart' to force restart." + return 0 + fi - ip_address=$(hostname -I | awk '{print $1}') - export HOST_IP=${HOST_IP:-"${ip_address}"} - export MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} - export DOC_PATH=${DOC_PATH:-"$WORKPATH/workspace"} - export TMPFILE_PATH=${TMPFILE_PATH:-"$WORKPATH/workspace"} - export DP_NUM=${DP_NUM:-1} - export MILVUS_ENABLED=${MILVUS_ENABLED:-1} - export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-2} - export HF_ENDPOINT=${HF_ENDPOINT:-https://hf-mirror.com} - export TP=${TP:-1} - export MAX_NUM_SEQS=${MAX_NUM_SEQS:-64} - export MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} - export MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-10240} - export QUANTIZATION=${QUANTIZATION:-fp8} - export CCL_DG2_USM=${CCL_DG2_USM:-0} - export LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} - export LLM_MODEL_PATH=${LLM_MODEL_PATH:-"${MODEL_PATH}/${LLM_MODEL}"} - export VIDEOGROUPID=$(getent group video | cut -d: -f3) - export RENDERGROUPID=$(getent group render | cut -d: -f3) - export VLLM_SERVICE_PORT_A770=8086 - - ensure_required_models_for_vllm - export HF_CACHE=${HF_CACHE:-"${HOME}/.cache"} - export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" - if [ ! 
-d "${HF_CACHE}" ]; then - mkdir -p "${HF_CACHE}" - echo "Created directory: ${HF_CACHE}" - fi - sudo chown -R 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} - sudo chown -R 1000:1000 ${HF_CACHE} - cd $WORKPATH/docker_compose/intel/gpu/arc - - docker compose --profile a770 -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d - echo "ipex-llm-serving-xpu is booting, please wait..." - n=0 - until [[ "$n" -ge 100 ]]; do - docker logs ipex-llm-serving-xpu-container-0 > ipex-llm-serving-xpu-container.log 2>&1 - if grep -q "Starting vLLM API server on http://0.0.0.0:" ipex-llm-serving-xpu-container.log; then - break + export VLLM_BACKEND="${backend}" + set_vllm_defaults + + # Download/verify models (only for baremetal, containers handle this internally) + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "Skipping model verification/download (--skip-model-check enabled)" + else + ensure_required_models_for_vllm fi - sleep 6s - n=$((n+1)) - done - rm -rf ipex-llm-serving-xpu-container.log - print_ui_access_info + fi + + # Delegate to bootstrap.sh + restart_services_before_deploy + if [ "$backend" == "a770" ]; then + export INFERENCE_BACKEND="vllm_a770" + else + export INFERENCE_BACKEND="vllm_b60" + fi + # Use existing DEPLOYMENT_METHOD if set, otherwise default to baremetal + export DEPLOYMENT_METHOD="${DEPLOYMENT_METHOD:-baremetal}" + export SKIP_VALIDATION=1 + save_bootstrap_env_snapshot "${INFERENCE_BACKEND}" "${DEPLOYMENT_METHOD}" + bash "${SCRIPT_DIR}/bootstrap.sh" } +deploy_ovms_interactive() { + local force_model_download=0 -function quick_start_ov_services() { - COMPOSE_FILE="compose.yaml" - echo "stop former service..." - docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE down + echo "" + echo "═══════════════════════════════════════════" + echo " OVMS Deployment Setup" + echo "═══════════════════════════════════════════" + echo "" - ip_address=$(hostname -I | awk '{print $1}') - export HOST_IP=${HOST_IP:-"${ip_address}"} - export DOC_PATH=${DOC_PATH:-"$WORKPATH/workspace"} - export TMPFILE_PATH=${TMPFILE_PATH:-"$WORKPATH/workspace"} - export MILVUS_ENABLED=${MILVUS_ENABLED:-1} - export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-"0"} - export LLM_MODEL=${LLM_MODEL:-"Qwen/Qwen3-8B"} - export MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} - export VIDEOGROUPID=$(getent group video | cut -d: -f3) - export RENDERGROUPID=$(getent group render | cut -d: -f3) - export MAX_MODEL_LEN=5000 - - ensure_required_models_for_ov - export HF_CACHE=${HF_CACHE:-"${HOME}/.cache"} - if [ ! -d "${HF_CACHE}" ]; then - mkdir -p "${HF_CACHE}" - echo "Created directory: ${HF_CACHE}" - fi - - sudo chown 1000:1000 "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" - sudo chown -R 1000:1000 "${HF_CACHE}" - export HF_ENDPOINT=${HF_ENDPOINT:-"https://hf-mirror.com"} - export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" - export CCL_DG2_USM=${CCL_DG2_USM:-0} + read -p "Deployment method (baremetal/container) [baremetal]: " deployment_method_input + deployment_method_input=${deployment_method_input:-"baremetal"} + export DEPLOYMENT_METHOD="${deployment_method_input}" - echo "Starting service..." 
- docker compose -f "$WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE" up -d - print_ui_access_info -} + echo "" + echo "Selected deployment method: ${DEPLOYMENT_METHOD}" + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + echo " → OVMS container + EdgeCraftRAG services as Python processes" + else + echo " → All services in Docker containers" + fi + echo "" + + check_docker_and_compose_ready + if [[ "${RESTART_ON_RERUN}" != "1" ]] && are_target_services_running "ovms" "${DEPLOYMENT_METHOD}"; then + echo "OVMS ${DEPLOYMENT_METHOD} services are already running." + echo "Skipping redeploy. Use './tools/quick_start.sh restart' to force restart." + return 0 + fi -function start_vLLM_B60_services() { - COMPOSE_FILE="compose.yaml" - echo "stop former service..." - export MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/models"} - docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE down + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + setup_python_venv + verify_venv_activated + check_pip_requirements + check_npm_requirements + echo "" + echo "Python environment ready." + echo "" + fi ip_address=$(hostname -I | awk '{print $1}') HOST_IP=$(get_user_input "host ip" "${ip_address}") DOC_PATH=$(get_user_input "DOC_PATH" "$WORKPATH/workspace") TMPFILE_PATH=$(get_user_input "TMPFILE_PATH" "$WORKPATH/workspace") - MILVUS_ENABLED=$(get_enable_function "MILVUS DB(Enter 1 for enable)" "0") + MILVUS_ENABLED=$(get_enable_function "MILVUS DB(Enter 0 to disable)" "1") CHAT_HISTORY_ROUND=$(get_user_input "chat history round" "0") LLM_MODEL=$(get_user_input "your LLM model" "Qwen/Qwen3-8B") MODEL_PATH=$(get_user_input "your model path" "${WORKPATH}/workspace/models") - read -p "Have you prepare models in ${MODEL_PATH}:(yes/no) [yes]" user_input + http_proxy=$(get_user_input "http_proxy" "${http_proxy}") + https_proxy=$(get_user_input "https_proxy" "${https_proxy}") + no_proxy=$(get_user_input "no_proxy" "${no_proxy}") + OVMS_SERVICE_PORT=$(get_user_input "OVMS service port" "8000") + + # Ask about model preparation + read -p "Have you prepared models in ${MODEL_PATH}? (yes/no) [yes]: " user_input user_input=${user_input:-"yes"} + if [ "$user_input" != "yes" ]; then + force_model_download=1 + echo "Models not prepared. Auto downloading required models into ${MODEL_PATH}..." + fi - if [ "$user_input" == "yes" ]; then - # 模型文件路径请参考以下形式存放, llm为huggingface - # Indexer: ${MODEL_PATH}/BAAI/bge-small-en-v1.5 - # Reranker: ${MODEL_PATH}/BAAI/bge-reranker-large - # llm :${MODEL_PATH}/${LLM_MODEL} (从huggingface或modelscope下载的原始模型,而不是经过OpenVINO转换的模型!) - echo "you skipped model downloading, please make sure you have prepared all models under ${MODEL_PATH}" - ensure_required_models_for_vllm - else - echo "you have not prepare models, starting to download models into ${MODEL_PATH}..." - mkdir -p $MODEL_PATH - python -m pip install --upgrade-strategy eager "optimum-intel[openvino]" - optimum-cli export openvino -m BAAI/bge-small-en-v1.5 ${MODEL_PATH}/BAAI/bge-small-en-v1.5 --task sentence-similarity - optimum-cli export openvino -m BAAI/bge-reranker-large ${MODEL_PATH}/BAAI/bge-reranker-large --task text-classification - pip install huggingface_hub - huggingface-cli download $LLM_MODEL --local-dir "${MODEL_PATH}/${LLM_MODEL}" - fi - echo "give permission to related path..." 
- sudo chown 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} - # vllm ENV - export VLLM_SERVICE_PORT_B60=8086 - export vLLM_ENDPOINT="http://${HOST_IP}:${VLLM_SERVICE_PORT_B60}" - read -p "DP number(how many containers to run B60_vLLM) [4] , press Enter to confirm, or type a new value:" DP; DP=${DP:-4} - read -p "Tensor parallel size(your tp size [1]), press Enter to confirm, or type a new value:" TP; TP=${TP:-1} - DTYPE=$(get_user_input "DTYPE (vLLM data type, e.g. float16/bfloat16)" "float16") - ZE_AFFINITY_MASK=$(get_user_input "ZE_AFFINITY_MASK (GPU affinity mask, multi-GPU use 0,1,2...)" "0,1,2,3") - ENFORCE_EAGER=$(get_user_input "ENFORCE_EAGER (enable eager execution, 1=enable/0=disable)" "1") - TRUST_REMOTE_CODE=$(get_user_input "TRUST_REMOTE_CODE (trust remote code for custom models, 1=enable/0=disable)" "1") - DISABLE_SLIDING_WINDOW=$(get_user_input "DISABLE_SLIDING_WINDOW (disable sliding window attention, 1=disable/0=enable)" "1") - GPU_MEMORY_UTIL=$(get_user_input "GPU_MEMORY_UTIL (GPU memory utilization, range 0.1-1.0)" "0.8") - NO_ENABLE_PREFIX_CACHING=$(get_user_input "NO_ENABLE_PREFIX_CACHING (disable prefix caching, 1=disable/0=enable)" "1") - MAX_NUM_BATCHED_TOKENS=$(get_user_input "MAX_NUM_BATCHED_TOKENS (max number of batched tokens)" "8192") - DISABLE_LOG_REQUESTS=$(get_user_input "DISABLE_LOG_REQUESTS (disable request logs, 1=disable/0=enable)" "1") - MAX_MODEL_LEN=$(get_user_input "MAX_MODEL_LEN (max model context length, e.g. 40000/10240)" "40000") - BLOCK_SIZE=$(get_user_input "BLOCK_SIZE (vLLM block size)" "64") - QUANTIZATION=$(get_user_input "QUANTIZATION (model quantization method, e.g. fp8/int4)" "fp8") - # export ENV - export HOST_IP=${HOST_IP:-"${ip_address}"} - export MODEL_PATH=${MODEL_PATH} - export DOC_PATH=${DOC_PATH} - export TMPFILE_PATH=${TMPFILE_PATH} - export LLM_MODEL=${LLM_MODEL} - export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" - export MILVUS_ENABLED=${MILVUS_ENABLED} - export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND} - export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK} - export VIDEOGROUPID=$(getent group video | cut -d: -f3) - export RENDERGROUPID=$(getent group render | cut -d: -f3) - # export vllm ENV - export DP=${DP} - export TP=${TP} - export DTYPE=${DTYPE} - export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK} - export ENFORCE_EAGER=${ENFORCE_EAGER} - export TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE} - export DISABLE_SLIDING_WINDOW=${DISABLE_SLIDING_WINDOW} - export GPU_MEMORY_UTIL=${GPU_MEMORY_UTIL} - export NO_ENABLE_PREFIX_CACHING=${NO_ENABLE_PREFIX_CACHING} - export MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS} - export DISABLE_LOG_REQUESTS=${DISABLE_LOG_REQUESTS} - export MAX_MODEL_LEN=${MAX_MODEL_LEN} - export BLOCK_SIZE=${BLOCK_SIZE} - export QUANTIZATION=${QUANTIZATION} - - # Start Docker Containers - docker compose --profile b60 -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d - echo "ipex-llm-serving-xpu is booting, please wait..." 
- n=0 - until [[ "$n" -ge 100 ]]; do - docker logs ipex-llm-serving-xpu-container-0 > ipex-llm-serving-xpu-container.log 2>&1 - if grep -q "Starting vLLM API server on http://0.0.0.0:" ipex-llm-serving-xpu-container.log; then - break + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export OVMS_SERVICE_PORT + export http_proxy + export https_proxy + export no_proxy + export HTTP_PROXY="${http_proxy}" + export HTTPS_PROXY="${https_proxy}" + export NO_PROXY="${no_proxy}" + unset OVMS_ENDPOINT OVMS_REST_PORT OVMS_SOURCE_MODEL OVMS_MODEL_REPOSITORY_PATH OVMS_MODEL_NAME \ + OVMS_TARGET_DEVICE OVMS_TASK OVMS_CACHE_DIR OVMS_ENABLE_PREFIX_CACHING OVMS_TOOL_PARSER \ + OVMS_ENABLE_TOOL_GUIDED_GENERATION OVMS_MAX_NUM_BATCHED_TOKENS + set_ovms_defaults + + if [[ "${force_model_download}" == "1" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "User selected models not prepared; forcing model download (ignoring --skip-model-check)" fi - sleep 6s - n=$((n+1)) - done - rm -rf ipex-llm-serving-xpu-container.log - print_ui_access_info + download_required_models_for_backend "ovms" "${LLM_MODEL}" + elif [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "Skipping model verification/download (--skip-model-check enabled)" + else + download_required_models_for_backend "ovms" "${LLM_MODEL}" + fi + fi + + echo "" + echo "Starting OVMS deployment..." + restart_services_before_deploy + export INFERENCE_BACKEND="ovms" + export DEPLOYMENT_METHOD="${DEPLOYMENT_METHOD:-baremetal}" + export SKIP_VALIDATION=1 + save_bootstrap_env_snapshot "${INFERENCE_BACKEND}" "${DEPLOYMENT_METHOD}" + bash "${SCRIPT_DIR}/bootstrap.sh" } +deploy_ovms_noninteractive() { + echo "" + echo "Starting OVMS deployment (non-interactive)..." -function quick_start_vllm_B60_services() { - COMPOSE_FILE="compose.yaml" - EC_RAG_SERVICE_PORT=16010 - docker compose -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE down + check_docker_and_compose_ready - ip_address=$(hostname -I | awk '{print $1}') - export HOST_IP=${HOST_IP:-"${ip_address}"} - export MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} - export DOC_PATH=${DOC_PATH:-"$WORKPATH/workspace"} - export TMPFILE_PATH=${TMPFILE_PATH:-"$WORKPATH/workspace"} - export MILVUS_ENABLED=${MILVUS_ENABLED:-1} - export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-2} - export LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} - export VIDEOGROUPID=$(getent group video | cut -d: -f3) - export RENDERGROUPID=$(getent group render | cut -d: -f3) - # export vllm ENV - export DP=${DP:-1} - export TP=${TP:-1} - export DTYPE=${DTYPE:-float16} - export ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} - export ENFORCE_EAGER=${ENFORCE_EAGER:-1} - export TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-1} - export DISABLE_SLIDING_WINDOW=${DISABLE_SLIDING_WINDOW:-1} - export GPU_MEMORY_UTIL=${GPU_MEMORY_UTIL:-0.8} - export NO_ENABLE_PREFIX_CACHING=${NO_ENABLE_PREFIX_CACHING:-1} - export MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-8192} - export DISABLE_LOG_REQUESTS=${disable_LOG_REQUESTS:-1} - export MAX_MODEL_LEN=${MAX_MODEL_LEN:-40000} - export BLOCK_SIZE=${BLOCK_SIZE:-64} - export QUANTIZATION=${QUANTIZATION:-fp8} + if [[ "${RESTART_ON_RERUN}" != "1" ]] && are_target_services_running "ovms" "${DEPLOYMENT_METHOD}"; then + echo "OVMS ${DEPLOYMENT_METHOD} services are already running." + echo "Skipping redeploy. Use './tools/quick_start.sh restart' to force restart." 
+ return 0 + fi + set_ovms_defaults - ensure_required_models_for_vllm - export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" - sudo chown -R 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} - docker compose --profile b60 -f $WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE up -d - echo "ipex-llm-serving-xpu is booting, please wait..." - n=0 - until [[ "$n" -ge 100 ]]; do - docker logs ipex-llm-serving-xpu-container-0 > ipex-llm-serving-xpu-container.log 2>&1 - if grep -q "Starting vLLM API server on http://0.0.0.0:" ipex-llm-serving-xpu-container.log; then - break + if [[ "${DEPLOYMENT_METHOD}" == "baremetal" ]]; then + if [[ "${SKIP_MODEL_CHECK}" == "1" ]]; then + echo "Skipping model verification/download (--skip-model-check enabled)" + else + ensure_required_models_for_vllm fi - sleep 6s - n=$((n+1)) - done - rm -rf ipex-llm-serving-xpu-container.log - print_ui_access_info + fi + + restart_services_before_deploy + export INFERENCE_BACKEND="ovms" + export DEPLOYMENT_METHOD="${DEPLOYMENT_METHOD:-baremetal}" + export SKIP_VALIDATION=1 + save_bootstrap_env_snapshot "${INFERENCE_BACKEND}" "${DEPLOYMENT_METHOD}" + bash "${SCRIPT_DIR}/bootstrap.sh" } +#============================================================================== +# Usage Information +#============================================================================== + +usage() { + cat << EOF +EdgeCraftRAG Quick Start +One-command deployment with automatic model download and setup + +USAGE: + ./tools/quick_start.sh [COMMAND] [OPTIONS] + +COMMANDS: + (none) Start deployment (interactive or non-interactive mode) + cleanup Stop all services and cleanup + restart Restart all services, then deploy -function quick_cleanup_services() { - COMPOSE_FILE="compose.yaml" - echo "Stopping EdgeCraftRAG services..." - docker compose -f "$WORKPATH/docker_compose/intel/gpu/arc/$COMPOSE_FILE" down - echo "Cleanup completed." +OPTIONS: + -h, --help Show this help message + --version Show script version + -i, --interactive Enable interactive mode (prompt for deployment selection) + --skip-model-check Skip model verification/download steps + --skip-gpu-driver-check Skip Intel GPU driver validation/install steps + +MODES: + Interactive Mode: + ./tools/quick_start.sh -i + + Prompts for: + • Deployment type (OpenVINO/vLLM_A770/vLLM_B60/OVMS) + • Deployment method (baremetal/container) + • Configuration parameters (HOST_IP, MODEL_PATH, etc.) 
+ + Non-Interactive Mode (default): + ./tools/quick_start.sh + + Uses environment variables (see below) + Default: OpenVINO baremetal deployment + +ENVIRONMENT VARIABLES: + Deployment Selection: + INFERENCE_BACKEND Inference backend: openvino|vllm_a770|vllm_b60|ovms (default: openvino) + COMPOSE_PROFILES Backward-compatible alias for legacy profile selection + DEPLOYMENT_METHOD Deployment method: baremetal|container (default: baremetal) + baremetal = Python processes with venv/pip checks + container = Docker containers (skips venv/pip checks) + + Common Configuration: + HOST_IP Server IP address (default: auto-detected) + MODEL_PATH Model storage path (default: workspace/models) + DOC_PATH Document storage path (default: workspace) + TMPFILE_PATH Temporary files path (default: workspace) + LLM_MODEL LLM model name (default: Qwen/Qwen3-8B) + EMBEDDING_MODEL Embedding model ID (default: BAAI/bge-small-en-v1.5) + RERANKER_MODEL Reranker model ID (default: BAAI/bge-reranker-large) + MODEL_DOWNLOAD_SOURCE Model source: modelscope|huggingface (default: modelscope) + MILVUS_ENABLED Enable Milvus DB: 0|1 (default: 1) + CHAT_HISTORY_ROUND Chat history length (default: 0) + SKIP_MODEL_CHECK Skip model verification/download: 0|1 (default: 0) + SKIP_INTEL_GPU_DRIVER_CHECK Skip Intel GPU driver validation/install: 0|1 (default: 0) + AUTO_INSTALL_INTEL_GPU_DRIVER Auto install Intel GPU driver/runtime when missing: 0|1 (default: 1) + AUTO_INSTALL_NPM Auto install npm when missing for baremetal UI startup: 0|1 (default: 1) + RESTART_ON_RERUN Restart services when quick_start is run again: 0|1 (default: 0) + (set to 1 automatically when using 'restart' command) + + vLLM Specific (A770/B60): + MAX_MODEL_LEN Maximum model context length (default: 8192) + GPU_MEMORY_UTIL GPU memory utilization ratio (default: 0.8) + QUANTIZATION Quantization mode: fp8|sym_int4 (default: fp8) + TOOL_PARSER Tool parser: qwen3_coder|hermes (default: qwen3_coder) + + vLLM Specific (A770): + TP Tensor parallel size (default: 1) + CCL_DG2_USM USM setting (default: 0) + + vLLM Specific (B60): + DP Data parallel instances (default: 1) + TP Tensor parallel size (default: 1) + DTYPE Data type: float16|bfloat16 (default: float16) + ZE_AFFINITY_MASK GPU affinity mask (default: 0) + +FEATURES: + ✓ Automatic Python virtual environment setup (baremetal mode) + ✓ Virtual environment activation verification (Python 3.10+ check) + ✓ Automatic pip requirements check and installation (baremetal mode) + ✓ Automatic npm check and installation for baremetal UI startup + ✓ Optional full-service restart via 'restart' command + ✓ Automatic model download (default ModelScope, optional Hugging Face) + ✓ Model download logic extracted to tools/model_download.sh + ✓ Supports both baremetal and container deployment methods + ✓ Delegates to bootstrap.sh for deployment + ✓ Interactive prompts or environment variable configuration + +EXAMPLES: + # Interactive mode (prompts for deployment selection) + ./tools/quick_start.sh -i + + # Non-interactive OpenVINO deployment (default) + ./tools/quick_start.sh + + # Non-interactive vLLM A770 deployment + export INFERENCE_BACKEND=vllm_a770 + export MODEL_PATH=/data/models + ./tools/quick_start.sh + + # Non-interactive vLLM B60 deployment with custom settings + export INFERENCE_BACKEND=vllm_b60 + export MODEL_PATH=/data/models + export DP=2 + export TP=1 + ./tools/quick_start.sh + + # Container deployment (skips venv and pip checks) + export DEPLOYMENT_METHOD=container + export MODEL_PATH=/data/models + 
./tools/quick_start.sh + + # OVMS deployment + export INFERENCE_BACKEND=ovms + ./tools/quick_start.sh + + # Skip model verification/download (models must already exist) + ./tools/quick_start.sh --skip-model-check + + # Skip Intel GPU driver validation/install + ./tools/quick_start.sh --skip-gpu-driver-check + + # Cleanup (stop all services) + ./tools/quick_start.sh cleanup + + # Restart all services, then deploy + ./tools/quick_start.sh restart + +DEPLOYMENT METHOD: + This script supports two deployment methods (set via DEPLOYMENT_METHOD): + + Baremetal (default): + • Runs services as Python processes with virtual environment + • Automatic venv setup and pip requirements installation + • OpenVINO: All services as Python processes (no Docker) + • vLLM: vLLM container + EdgeCraftRAG services as processes + • Benefits: Faster startup, direct log access, easier debugging + + Container: + • Runs all services in Docker containers + • Skips venv and pip checks (containers are pre-built) + • All services managed via Docker Compose + • Benefits: Isolated environment, easier distribution + +LOGS: + OpenVINO: workspace/logs/bare_metal/ + vLLM: workspace/logs/vllm_baremetal/ + +SERVICE MANAGEMENT: + Start: ./tools/quick_start.sh + Status: ./tools/run_ov_baremetal.sh status + ./tools/run_vllm_baremetal.sh status + ./tools/run_ovms_baremetal.sh status + Stop: ./tools/quick_start.sh cleanup + Restart: ./tools/quick_start.sh restart + +NOTES: + • First run will download models automatically if missing (unless --skip-model-check is used) + • Intel GPU driver/runtime is validated automatically and installed when missing (apt-based Linux) + • Python 3.10+ required; 3.10/3.11 recommended for the smoothest setup + • For container deployment, use bootstrap.sh with DEPLOYMENT_METHOD=container + • Backward compatible with previous COMPOSE_PROFILES settings + +For more details, see: EdgeCraftRAG/tools/README.md + +EOF +} + +#============================================================================== +# Cleanup Function +#============================================================================== + +cleanup_services() { + # Delegate to bootstrap.sh + bash "${SCRIPT_DIR}/bootstrap.sh" cleanup } +#============================================================================== +# Main Function +#============================================================================== + +main() { + local command="" + + # Parse command line arguments + while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help|help) + usage + exit 0 + ;; + --version) + echo "EdgeCraftRAG Quick Start v1.0" + exit 0 + ;; + -i|--interactive) + export INTERACTIVE_MODE=1 + ;; + --skip-model-check) + export SKIP_MODEL_CHECK=1 + ;; + --skip-gpu-driver-check) + export SKIP_INTEL_GPU_DRIVER_CHECK=1 + ;; + cleanup) + if [[ -n "${command}" ]]; then + echo "ERROR: Multiple commands provided." + usage + exit 1 + fi + command="cleanup" + ;; + restart) + if [[ -n "${command}" ]]; then + echo "ERROR: Multiple commands provided." + usage + exit 1 + fi + command="restart" + ;; + *) + echo "ERROR: Unknown argument: $1" + usage + exit 1 + ;; + esac + shift + done -function main { - if [[ "${1:-}" == "cleanup" ]]; then - quick_cleanup_services + if [[ "${command}" == "cleanup" ]]; then + cleanup_services exit 0 fi - if [[ $- == *i* ]]; then - read -p "Do you want to start vLLM or local OpenVINO services? 
(vLLM_A770/vLLM_B60/ov) [ov]: " user_input - user_input=${user_input:-"ov"} + if [[ "${command}" == "restart" ]]; then + export RESTART_ON_RERUN=1 + fi + + ensure_intel_gpu_driver_ready + + # For interactive mode, skip venv/pip checks here + # The deploy_*_interactive functions will handle environment setup based on user's choice + if [[ "${INTERACTIVE_MODE:-0}" != "1" ]]; then + # Non-interactive mode: detect deployment method and setup environment + DEPLOYMENT_METHOD=${DEPLOYMENT_METHOD:-baremetal} + + # Skip venv and pip checks for container deployments + if [[ "${DEPLOYMENT_METHOD}" == "container" ]]; then + echo "" + echo "Container deployment detected - skipping Python environment setup" + echo "" + else + # Setup Python virtual environment for baremetal deployments + setup_python_venv + + # Verify venv is activated + verify_venv_activated + + # Check and install pip requirements + check_pip_requirements + + # Check and install npm for baremetal UI startup + check_npm_requirements + + echo "" + echo "Deployment preparation complete." + echo "" + fi + fi + + # Detect interactive vs non-interactive mode + # Use INTERACTIVE_MODE variable set by -i flag or environment + if [[ "${INTERACTIVE_MODE:-0}" == "1" ]]; then + # Interactive mode: prompt user + echo "" + echo "════════════════════════════════════════════════════════════" + echo " EdgeCraftRAG Quick Start - Interactive Mode" + echo "════════════════════════════════════════════════════════════" + echo "" + + # Use timeout with read to prevent hanging + if read -t 60 -p "Do you want to start vLLM, OVMS, or local OpenVINO services? (vLLM_A770/vLLM_B60/ovms/ov) [ov]: " user_input; then + user_input=${user_input:-"ov"} + else + echo "" + echo "No input received (timeout or non-interactive), defaulting to OpenVINO..." + user_input="ov" + fi + if [[ "$user_input" == "vLLM_A770" ]]; then - start_vllm_services + deploy_vllm_interactive "a770" elif [[ "$user_input" == "vLLM_B60" ]]; then - start_vLLM_B60_services + deploy_vllm_interactive "b60" + elif [[ "$user_input" == "ovms" || "$user_input" == "OVMS" ]]; then + deploy_ovms_interactive else - start_services + deploy_openvino_interactive fi else + # Non-interactive mode: resolve INFERENCE_BACKEND with openvino as the default + echo "Running in non-interactive mode..." 
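+        # COMPOSE_PROFILES is honored only as a legacy fallback; when INFERENCE_BACKEND is set it takes precedence.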
export COMPOSE_PROFILES=${COMPOSE_PROFILES:-""} - if [[ "$COMPOSE_PROFILES" == "vLLM_A770" || "$COMPOSE_PROFILES" == "vLLM" || "$COMPOSE_PROFILES" == "vllm_on_a770" ]]; then - quick_start_vllm_services - elif [[ "$COMPOSE_PROFILES" == "vLLM_B60" || "$COMPOSE_PROFILES" == "vLLM_b60" || "$COMPOSE_PROFILES" == "vllm_on_b60" ]]; then - quick_start_vllm_B60_services + + selected_backend="${INFERENCE_BACKEND:-}" + if [[ -z "${selected_backend}" ]]; then + case "${COMPOSE_PROFILES}" in + vLLM_A770|vLLM|vllm_on_a770) + selected_backend="vllm_a770" + ;; + vLLM_B60|vLLM_b60|vllm_on_b60) + selected_backend="vllm_b60" + ;; + ovms|OVMS) + selected_backend="ovms" + ;; + *) + selected_backend="openvino" + ;; + esac + fi + export INFERENCE_BACKEND="${selected_backend}" + + if [[ "$selected_backend" == "vllm_a770" || "$selected_backend" == "vLLM_A770" || "$selected_backend" == "vLLM" || "$selected_backend" == "vllm_on_a770" ]]; then + echo "Detected vLLM A770 inference backend" + deploy_vllm_noninteractive "a770" + elif [[ "$selected_backend" == "vllm_b60" || "$selected_backend" == "vLLM_B60" || "$selected_backend" == "vLLM_b60" || "$selected_backend" == "vllm_on_b60" ]]; then + echo "Detected vLLM B60 inference backend" + deploy_vllm_noninteractive "b60" + elif [[ "$selected_backend" == "ovms" || "$selected_backend" == "OVMS" ]]; then + echo "Detected OVMS inference backend" + deploy_ovms_noninteractive else - quick_start_ov_services + echo "Detected OpenVINO inference backend (default)" + deploy_openvino_noninteractive fi fi } -main +main "$@" diff --git a/EdgeCraftRAG/tools/run_ov_baremetal.sh b/EdgeCraftRAG/tools/run_ov_baremetal.sh new file mode 100755 index 0000000000..c727e8d9ce --- /dev/null +++ b/EdgeCraftRAG/tools/run_ov_baremetal.sh @@ -0,0 +1,378 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(cd "$(dirname "${SCRIPT_PATH}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." && pwd) +WORKSPACE_ROOT="${WORKPATH}/workspace" + +HOST_IP_DEFAULT=$(hostname -I | awk '{print $1}') +HOST_IP=${HOST_IP:-${HOST_IP_DEFAULT}} + +PIPELINE_SERVICE_HOST_IP=${PIPELINE_SERVICE_HOST_IP:-0.0.0.0} +PIPELINE_SERVICE_PORT=${PIPELINE_SERVICE_PORT:-16010} +MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT:-16011} +UI_PORT=${UI_PORT:-8082} + +if [[ -n "${PYTHON_BIN:-}" ]]; then + PYTHON_BIN=${PYTHON_BIN} +elif [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python" ]]; then + PYTHON_BIN="${VIRTUAL_ENV}/bin/python" +elif [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then + PYTHON_BIN="${CONDA_PREFIX}/bin/python" +elif [[ -x "${HOME}/miniforge3/envs/edgeairag/bin/python3" ]]; then + PYTHON_BIN="${HOME}/miniforge3/envs/edgeairag/bin/python3" +else + PYTHON_BIN=$(command -v python3) +fi + +MODEL_PATH=${MODEL_PATH:-"${WORKSPACE_ROOT}/models"} +DOC_PATH=${DOC_PATH:-"${WORKSPACE_ROOT}"} +TMPFILE_PATH=${TMPFILE_PATH:-"${WORKSPACE_ROOT}"} +MILVUS_ENABLED=${MILVUS_ENABLED:-1} +CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-0} +LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} + +LOG_DIR="${WORKSPACE_ROOT}/logs/bare_metal" +PID_DIR="${WORKSPACE_ROOT}/pids" +mkdir -p "${LOG_DIR}" "${PID_DIR}" "${DOC_PATH}" "${TMPFILE_PATH}" +if [[ -L "${MODEL_PATH}" ]]; then + MODEL_PATH_LINK_TARGET=$(readlink "${MODEL_PATH}") + if [[ "${MODEL_PATH_LINK_TARGET}" = /* ]]; then + mkdir -p "${MODEL_PATH_LINK_TARGET}" + else + mkdir -p "$(cd "$(dirname "${MODEL_PATH}")" && pwd)/${MODEL_PATH_LINK_TARGET}" + fi +elif [[ ! 
-d "${MODEL_PATH}" ]]; then + mkdir -p "${MODEL_PATH}" +fi + +SERVER_PID_FILE="${PID_DIR}/edgecraftrag-server.pid" +MEGA_PID_FILE="${PID_DIR}/edgecraftrag.pid" +UI_PID_FILE="${PID_DIR}/edgecraftrag-ui.pid" + +SERVER_LOG="${LOG_DIR}/edgecraftrag-server.log" +MEGA_LOG="${LOG_DIR}/edgecraftrag.log" +UI_LOG="${LOG_DIR}/edgecraftrag-ui.log" + +ensure_cmd() { + local cmd=$1 + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: required command not found: $cmd" + exit 1 + fi +} + +is_pid_running() { + local pid_file=$1 + if [[ -f "$pid_file" ]]; then + local pid + pid=$(cat "$pid_file") + [[ -n "$pid" ]] && kill -0 "$pid" >/dev/null 2>&1 + else + return 1 + fi +} + +start_process() { + local name=$1 + local pid_file=$2 + local log_file=$3 + shift 3 + + if is_pid_running "$pid_file"; then + echo "${name} is already running (pid $(cat "$pid_file"))" + return 0 + fi + + echo "Starting ${name}..." + setsid nohup "$@" >"$log_file" 2>&1 & + local pid=$! + echo "$pid" >"$pid_file" + sleep 2 + if kill -0 "$pid" >/dev/null 2>&1; then + echo "${name} started (pid ${pid}), log: ${log_file}" + else + echo "ERROR: failed to start ${name}. Check log: ${log_file}" + rm -f "$pid_file" + exit 1 + fi +} + +find_listening_pids_by_port() { + local port=$1 + local pids="" + + if command -v lsof >/dev/null 2>&1; then + pids=$(lsof -tiTCP:"${port}" -sTCP:LISTEN 2>/dev/null || true) + elif command -v ss >/dev/null 2>&1; then + pids=$(ss -ltnp "sport = :${port}" 2>/dev/null | sed -nE 's/.*pid=([0-9]+).*/\1/p' | sort -u) + fi + + echo "$pids" +} + +stop_port_listener() { + local port=$1 + local name=$2 + local pids + pids=$(find_listening_pids_by_port "$port") + + if [[ -z "$pids" ]]; then + return 0 + fi + + echo "Port ${port} is already in use by ${name} pid(s): ${pids}" + echo "Stopping stale listener(s) on port ${port}..." + + for pid in $pids; do + kill "$pid" >/dev/null 2>&1 || true + done + + for _ in {1..5}; do + local remaining + remaining=$(find_listening_pids_by_port "$port") + if [[ -z "$remaining" ]]; then + break + fi + sleep 1 + done + + local remaining + remaining=$(find_listening_pids_by_port "$port") + if [[ -n "$remaining" ]]; then + echo "Force killing remaining listener(s) on port ${port}: ${remaining}" + for pid in $remaining; do + kill -9 "$pid" >/dev/null 2>&1 || true + done + fi +} + +stop_process() { + local name=$1 + local pid_file=$2 + + if ! is_pid_running "$pid_file"; then + echo "${name} is not running" + rm -f "$pid_file" + return 0 + fi + + local pid + pid=$(cat "$pid_file") + echo "Stopping ${name} (pid ${pid})..." + kill -TERM -- "-$pid" >/dev/null 2>&1 || kill "$pid" >/dev/null 2>&1 || true + + for _ in {1..10}; do + if kill -0 "$pid" >/dev/null 2>&1; then + sleep 1 + else + break + fi + done + + if kill -0 "$pid" >/dev/null 2>&1; then + echo "Force killing ${name} (pid ${pid})..." 
+ kill -KILL -- "-$pid" >/dev/null 2>&1 || kill -9 "$pid" >/dev/null 2>&1 || true + fi + + rm -f "$pid_file" + echo "${name} stopped" +} + +prepare_runtime_env() { + local default_no_proxy + local merged_no_proxy + + ensure_cmd "$PYTHON_BIN" + + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,${HOST_IP}" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" +} + +start_server() { + prepare_runtime_env + pushd "${WORKPATH}" >/dev/null + start_process \ + "edgecraftrag-server" \ + "$SERVER_PID_FILE" \ + "$SERVER_LOG" \ + env PIPELINE_SERVICE_HOST_IP="${PIPELINE_SERVICE_HOST_IP}" PIPELINE_SERVICE_PORT="${PIPELINE_SERVICE_PORT}" \ + "$PYTHON_BIN" -m edgecraftrag.server + popd >/dev/null +} + +start_mega() { + prepare_runtime_env + pushd "${WORKPATH}" >/dev/null + start_process \ + "edgecraftrag (mega service)" \ + "$MEGA_PID_FILE" \ + "$MEGA_LOG" \ + env MEGA_SERVICE_PORT="${MEGA_SERVICE_PORT}" PIPELINE_SERVICE_HOST_IP="127.0.0.1" PIPELINE_SERVICE_PORT="${PIPELINE_SERVICE_PORT}" \ + "$PYTHON_BIN" chatqna.py + popd >/dev/null +} + +start_ui() { + prepare_runtime_env + ensure_cmd npm + + pushd "${WORKPATH}/ui/vue" >/dev/null + if [[ ! -d node_modules ]]; then + echo "ui/node_modules not found, running npm install..." + npm install + fi + + stop_port_listener "${UI_PORT}" "UI" + + start_process \ + "edgecraftrag-ui (vite)" \ + "$UI_PID_FILE" \ + "$UI_LOG" \ + env ECRAG_LOCAL_PROXY="1" ECRAG_LOCAL_API_PROXY_TARGET="http://127.0.0.1:${PIPELINE_SERVICE_PORT}" ECRAG_LOCAL_CHATBOT_PROXY_TARGET="http://127.0.0.1:${MEGA_SERVICE_PORT}" VITE_API_URL="/" VITE_CHATBOT_URL="/" \ + npm run dev -- --host 0.0.0.0 --port "${UI_PORT}" + popd >/dev/null +} + +stop_server() { + stop_process "edgecraftrag-server" "$SERVER_PID_FILE" +} + +stop_mega() { + stop_process "edgecraftrag (mega service)" "$MEGA_PID_FILE" +} + +stop_ui() { + stop_process "edgecraftrag-ui (vite)" "$UI_PID_FILE" +} + +status_service() { + local name=$1 + local pid_file=$2 + + if is_pid_running "$pid_file"; then + echo "${name}: running (pid $(cat "$pid_file"))" + else + echo "${name}: stopped" + fi +} + +start_all() { + start_server + start_mega + start_ui + + echo "" + echo "All local processes started successfully." 
+ echo "UI: http://${HOST_IP}:${UI_PORT}" + echo "API (server): http://${HOST_IP}:${PIPELINE_SERVICE_PORT}" + echo "Mega service: http://${HOST_IP}:${MEGA_SERVICE_PORT}" + echo "Logs: ${LOG_DIR}" +} + +stop_all() { + stop_ui + stop_mega + stop_server +} + +status_all() { + status_service "edgecraftrag-server" "$SERVER_PID_FILE" + status_service "edgecraftrag (mega service)" "$MEGA_PID_FILE" + status_service "edgecraftrag-ui (vite)" "$UI_PID_FILE" +} + +usage() { + echo "Usage: $0 {start|stop|restart|status} [all|server|mega|ui]" + echo "" + echo "Examples:" + echo " $0 start" + echo " $0 restart ui" + echo " $0 status server" + echo " $0 -h" +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" || "${1:-}" == "help" || "${2:-}" == "-h" || "${2:-}" == "--help" ]]; then + usage + exit 0 +fi + +ACTION=${1:-start} +TARGET=${2:-all} +case "$ACTION" in + start) + case "$TARGET" in + all) start_all ;; + server) start_server ;; + mega) start_mega ;; + ui) start_ui ;; + *) usage; exit 1 ;; + esac + ;; + stop) + case "$TARGET" in + all) stop_all ;; + server) stop_server ;; + mega) stop_mega ;; + ui) stop_ui ;; + *) usage; exit 1 ;; + esac + ;; + restart) + case "$TARGET" in + all) + stop_all + start_all + ;; + server) + stop_server + start_server + ;; + mega) + stop_mega + start_mega + ;; + ui) + stop_ui + start_ui + ;; + *) usage; exit 1 ;; + esac + ;; + status) + case "$TARGET" in + all) status_all ;; + server) status_service "edgecraftrag-server" "$SERVER_PID_FILE" ;; + mega) status_service "edgecraftrag (mega service)" "$MEGA_PID_FILE" ;; + ui) status_service "edgecraftrag-ui (vite)" "$UI_PID_FILE" ;; + *) usage; exit 1 ;; + esac + ;; + *) + usage + exit 1 + ;; +esac diff --git a/EdgeCraftRAG/tools/run_ov_container.sh b/EdgeCraftRAG/tools/run_ov_container.sh new file mode 100755 index 0000000000..48fb0618ac --- /dev/null +++ b/EdgeCraftRAG/tools/run_ov_container.sh @@ -0,0 +1,339 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(cd "$(dirname "${SCRIPT_PATH}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." && pwd) +COMPOSE_DIR="${WORKPATH}/docker_compose/intel/gpu/arc" +COMPOSE_FILE="compose.yaml" + +HOST_IP_DEFAULT=$(hostname -I | awk '{print $1}') +HOST_IP=${HOST_IP:-${HOST_IP_DEFAULT}} + +# Environment variables +MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} +DOC_PATH=${DOC_PATH:-"${WORKPATH}/workspace"} +TMPFILE_PATH=${TMPFILE_PATH:-"${WORKPATH}/workspace"} +MILVUS_ENABLED=${MILVUS_ENABLED:-1} +CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-0} +LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-5000} + +# Container names for status checking +CONTAINER_SERVER="edgecraftrag-server" +CONTAINER_MEGA="edgecraftrag" +CONTAINER_UI="edgecraftrag-ui" + +# Ports +PIPELINE_SERVICE_PORT=${PIPELINE_SERVICE_PORT:-16010} +MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT:-16011} +UI_PORT=${UI_PORT:-8082} + +ensure_cmd() { + local cmd=$1 + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: required command not found: $cmd" + exit 1 + fi +} + +check_docker() { + ensure_cmd docker + + if ! 
docker info >/dev/null 2>&1; then + echo "ERROR: Docker daemon is not running" + echo "Please start Docker: sudo systemctl start docker" + exit 1 + fi +} + +is_container_running() { + local container_name=$1 + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}$" +} + +get_container_status() { + local container_name=$1 + if is_container_running "$container_name"; then + echo "running" + else + if docker ps -a --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}$"; then + echo "stopped" + else + echo "not created" + fi + fi +} + +prepare_directories() { + mkdir -p "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" + + # Ensure proper permissions for Docker (uid:gid 1000:1000) + if [[ ! -w "${MODEL_PATH}" ]] || [[ ! -w "${DOC_PATH}" ]] || [[ ! -w "${TMPFILE_PATH}" ]]; then + echo "Setting permissions for Docker containers..." + sudo chown -R 1000:1000 "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" 2>/dev/null || true + fi + + # Also set cache permissions + if [[ -d "${HOME}/.cache" ]]; then + sudo chown -R 1000:1000 "${HOME}/.cache" 2>/dev/null || true + fi +} + +prepare_runtime_env() { + local default_no_proxy + local merged_no_proxy + + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export MAX_MODEL_LEN + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,${HOST_IP},edgecraftrag,edgecraftrag-server" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" + + # Set GPU group IDs for Docker + if getent group video >/dev/null 2>&1; then + export VIDEOGROUPID=$(getent group video | cut -d: -f3) + fi + + if getent group render >/dev/null 2>&1; then + export RENDERGROUPID=$(getent group render | cut -d: -f3) + fi + + # Set compose profiles (empty for OpenVINO) + export COMPOSE_PROFILES=${COMPOSE_PROFILES:-""} +} + +start_services() { + check_docker + prepare_directories + prepare_runtime_env + + echo "Starting EdgeCraftRAG containers..." + echo " Model path: ${MODEL_PATH}" + echo " Document path: ${DOC_PATH}" + echo " LLM model: ${LLM_MODEL}" + echo " Compose profile: ${COMPOSE_PROFILES:-default (OpenVINO)}" + echo "" + + pushd "${COMPOSE_DIR}" >/dev/null + + if [[ -n "${COMPOSE_PROFILES}" ]]; then + docker compose --profile "${COMPOSE_PROFILES}" -f "${COMPOSE_FILE}" up -d + else + docker compose -f "${COMPOSE_FILE}" up -d + fi + + popd >/dev/null + + echo "" + echo "Waiting for services to be ready..." + sleep 5 + + # Check if containers are running + local all_running=true + if ! is_container_running "${CONTAINER_SERVER}"; then + echo "WARNING: ${CONTAINER_SERVER} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_MEGA}"; then + echo "WARNING: ${CONTAINER_MEGA} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_UI}"; then + echo "WARNING: ${CONTAINER_UI} is not running" + all_running=false + fi + + if [[ "$all_running" == "true" ]]; then + echo "" + echo "All containers started successfully." 
+ echo "UI: http://${HOST_IP}:${UI_PORT}" + echo "API (server): http://${HOST_IP}:${PIPELINE_SERVICE_PORT}" + echo "Mega service: http://${HOST_IP}:${MEGA_SERVICE_PORT}" + echo "" + echo "View logs:" + echo " docker logs -f ${CONTAINER_SERVER}" + echo " docker logs -f ${CONTAINER_MEGA}" + echo " docker logs -f ${CONTAINER_UI}" + else + echo "" + echo "Some containers failed to start. Check Docker logs for details." + exit 1 + fi +} + +stop_services() { + check_docker + prepare_runtime_env + + echo "Stopping EdgeCraftRAG containers..." + + pushd "${COMPOSE_DIR}" >/dev/null + + if [[ -n "${COMPOSE_PROFILES}" ]]; then + docker compose --profile "${COMPOSE_PROFILES}" -f "${COMPOSE_FILE}" down + else + docker compose -f "${COMPOSE_FILE}" down + fi + + popd >/dev/null + + echo "All containers stopped." +} + +restart_services() { + stop_services + echo "" + start_services +} + +status_service() { + local container_name=$1 + local status + status=$(get_container_status "$container_name") + + case "$status" in + running) + local container_id + container_id=$(docker ps -q --filter "name=^${container_name}$") + echo "${container_name}: running (container id: ${container_id})" + ;; + stopped) + echo "${container_name}: stopped" + ;; + not\ created) + echo "${container_name}: not created" + ;; + esac +} + +status_all() { + check_docker + + echo "EdgeCraftRAG Container Status:" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + status_service "${CONTAINER_SERVER}" + status_service "${CONTAINER_MEGA}" + status_service "${CONTAINER_UI}" + echo "" + + # Show additional Milvus status if enabled + if [[ "${MILVUS_ENABLED}" == "1" ]]; then + echo "Additional services (Milvus enabled):" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + docker ps --filter "name=milvus" --filter "name=etcd" --filter "name=minio" --format "table {{.Names}}\t{{.Status}}" 2>/dev/null || echo "No additional services running" + fi +} + +logs_service() { + local container_name=$1 + check_docker + + if ! 
is_container_running "$container_name"; then + echo "Container ${container_name} is not running" + exit 1 + fi + + docker logs -f "$container_name" +} + +usage() { + echo "Usage: $0 {start|stop|restart|status|logs} [service]" + echo "" + echo "Commands:" + echo " start Start all containers" + echo " stop Stop all containers" + echo " restart Restart all containers" + echo " status Show container status" + echo " logs Follow logs for a specific service" + echo "" + echo "Services (for logs command):" + echo " server Pipeline server" + echo " mega Mega service" + echo " ui UI service" + echo "" + echo "Examples:" + echo " $0 start" + echo " $0 restart" + echo " $0 status" + echo " $0 logs server" + echo " $0 -h" + echo "" + echo "Environment Variables:" + echo " HOST_IP Server IP (default: auto-detected)" + echo " MODEL_PATH Model storage path (default: workspace/models)" + echo " DOC_PATH Document storage (default: workspace)" + echo " TMPFILE_PATH Temporary files (default: workspace)" + echo " LLM_MODEL LLM model name (default: Qwen/Qwen3-8B)" + echo " MILVUS_ENABLED Enable Milvus DB: 0|1 (default: 1)" + echo " CHAT_HISTORY_ROUND Chat history length (default: 0)" + echo " COMPOSE_PROFILES Docker compose profile (default: none/OpenVINO)" +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" || "${1:-}" == "help" ]]; then + usage + exit 0 +fi + +ACTION=${1:-start} +SERVICE=${2:-} + +case "$ACTION" in + start) + start_services + ;; + stop) + stop_services + ;; + restart) + restart_services + ;; + status) + status_all + ;; + logs) + if [[ -z "$SERVICE" ]]; then + echo "ERROR: Please specify a service: server, mega, or ui" + echo "" + usage + exit 1 + fi + + case "$SERVICE" in + server) logs_service "${CONTAINER_SERVER}" ;; + mega) logs_service "${CONTAINER_MEGA}" ;; + ui) logs_service "${CONTAINER_UI}" ;; + *) + echo "ERROR: Unknown service: $SERVICE" + usage + exit 1 + ;; + esac + ;; + *) + usage + exit 1 + ;; +esac diff --git a/EdgeCraftRAG/tools/run_ovms_baremetal.sh b/EdgeCraftRAG/tools/run_ovms_baremetal.sh new file mode 100755 index 0000000000..82ebcb5651 --- /dev/null +++ b/EdgeCraftRAG/tools/run_ovms_baremetal.sh @@ -0,0 +1,416 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(cd "$(dirname "${SCRIPT_PATH}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." 
&& pwd) +WORKSPACE_ROOT="${WORKPATH}/workspace" +COMPOSE_DIR="${WORKPATH}/docker_compose/intel/gpu/arc" + +HOST_IP_DEFAULT=$(hostname -I | awk '{print $1}') +HOST_IP=${HOST_IP:-${HOST_IP_DEFAULT}} + +PIPELINE_SERVICE_HOST_IP=${PIPELINE_SERVICE_HOST_IP:-0.0.0.0} +PIPELINE_SERVICE_PORT=${PIPELINE_SERVICE_PORT:-16010} +MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT:-16011} +UI_PORT=${UI_PORT:-8082} + +OVMS_SERVICE_PORT=${OVMS_SERVICE_PORT:-8000} +OVMS_ENDPOINT=${OVMS_ENDPOINT:-"http://${HOST_IP}:${OVMS_SERVICE_PORT}"} +LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} +OVMS_REST_PORT=${OVMS_REST_PORT:-${OVMS_SERVICE_PORT}} +OVMS_SOURCE_MODEL=${OVMS_SOURCE_MODEL:-${LLM_MODEL}} +OVMS_MODEL_REPOSITORY_PATH=${OVMS_MODEL_REPOSITORY_PATH:-/models} +OVMS_MODEL_NAME=${OVMS_MODEL_NAME:-${OVMS_SOURCE_MODEL}} +OVMS_TARGET_DEVICE=${OVMS_TARGET_DEVICE:-GPU.0} +OVMS_TASK=${OVMS_TASK:-text_generation} +OVMS_CACHE_DIR=${OVMS_CACHE_DIR:-/models/.ov_cache} +OVMS_ENABLE_PREFIX_CACHING=${OVMS_ENABLE_PREFIX_CACHING:-true} +OVMS_TOOL_PARSER=${OVMS_TOOL_PARSER:-qwen3coder} +OVMS_ENABLE_TOOL_GUIDED_GENERATION=${OVMS_ENABLE_TOOL_GUIDED_GENERATION:-true} +OVMS_MAX_NUM_BATCHED_TOKENS=${OVMS_MAX_NUM_BATCHED_TOKENS:-8192} + +if [[ -n "${PYTHON_BIN:-}" ]]; then + PYTHON_BIN=${PYTHON_BIN} +elif [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python" ]]; then + PYTHON_BIN="${VIRTUAL_ENV}/bin/python" +elif [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then + PYTHON_BIN="${CONDA_PREFIX}/bin/python" +elif [[ -x "${HOME}/miniforge3/envs/edgeairag/bin/python3" ]]; then + PYTHON_BIN="${HOME}/miniforge3/envs/edgeairag/bin/python3" +else + PYTHON_BIN=$(command -v python3) +fi + +MODEL_PATH=${MODEL_PATH:-"${WORKSPACE_ROOT}/models"} +DOC_PATH=${DOC_PATH:-"${WORKSPACE_ROOT}"} +TMPFILE_PATH=${TMPFILE_PATH:-"${WORKSPACE_ROOT}"} +MILVUS_ENABLED=${MILVUS_ENABLED:-1} +CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-0} + +LOG_DIR="${WORKSPACE_ROOT}/logs/ovms_baremetal" +PID_DIR="${WORKSPACE_ROOT}/pids" +mkdir -p "${LOG_DIR}" "${PID_DIR}" "${DOC_PATH}" "${TMPFILE_PATH}" "${MODEL_PATH}" + +SERVER_PID_FILE="${PID_DIR}/edgecraftrag-server-ovms.pid" +MEGA_PID_FILE="${PID_DIR}/edgecraftrag-ovms.pid" +UI_PID_FILE="${PID_DIR}/edgecraftrag-ui-ovms.pid" + +SERVER_LOG="${LOG_DIR}/edgecraftrag-server.log" +MEGA_LOG="${LOG_DIR}/edgecraftrag.log" +UI_LOG="${LOG_DIR}/edgecraftrag-ui.log" +OVMS_LOG="${LOG_DIR}/ovms-container.log" + +OVMS_CONTAINER="ovms-serving" + +ensure_cmd() { + local cmd=$1 + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: required command not found: $cmd" + exit 1 + fi +} + +check_docker() { + ensure_cmd docker + if ! docker info >/dev/null 2>&1; then + echo "ERROR: Docker daemon is not running" + echo "Please start Docker: sudo systemctl start docker" + exit 1 + fi +} + +is_pid_running() { + local pid_file=$1 + if [[ -f "$pid_file" ]]; then + local pid + pid=$(cat "$pid_file") + [[ -n "$pid" ]] && kill -0 "$pid" >/dev/null 2>&1 + else + return 1 + fi +} + +is_ovms_running() { + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${OVMS_CONTAINER}$" +} + +start_process() { + local name=$1 + local pid_file=$2 + local log_file=$3 + shift 3 + + if is_pid_running "$pid_file"; then + echo "${name} is already running (pid $(cat "$pid_file"))" + return 0 + fi + + echo "Starting ${name}..." + setsid nohup "$@" >"$log_file" 2>&1 & + local pid=$! + echo "$pid" >"$pid_file" + sleep 2 + if kill -0 "$pid" >/dev/null 2>&1; then + echo "${name} started (pid ${pid}), log: ${log_file}" + else + echo "ERROR: failed to start ${name}. 
Check log: ${log_file}" + rm -f "$pid_file" + exit 1 + fi +} + +stop_process() { + local name=$1 + local pid_file=$2 + + if ! is_pid_running "$pid_file"; then + echo "${name} is not running" + rm -f "$pid_file" + return 0 + fi + + local pid + pid=$(cat "$pid_file") + echo "Stopping ${name} (pid ${pid})..." + kill -TERM -- "-$pid" >/dev/null 2>&1 || kill "$pid" >/dev/null 2>&1 || true + + for _ in {1..10}; do + if kill -0 "$pid" >/dev/null 2>&1; then + sleep 1 + else + break + fi + done + + if kill -0 "$pid" >/dev/null 2>&1; then + echo "Force killing ${name} (pid ${pid})..." + kill -KILL -- "-$pid" >/dev/null 2>&1 || kill -9 "$pid" >/dev/null 2>&1 || true + fi + + rm -f "$pid_file" + echo "${name} stopped" +} + +prepare_ovms_env() { + export HOST_IP + export MODEL_PATH + export LLM_MODEL + export OVMS_SERVICE_PORT + export OVMS_ENDPOINT + export OVMS_REST_PORT + export OVMS_SOURCE_MODEL + export OVMS_MODEL_REPOSITORY_PATH + export OVMS_MODEL_NAME + export OVMS_TARGET_DEVICE + export OVMS_TASK + export OVMS_CACHE_DIR + export OVMS_ENABLE_PREFIX_CACHING + export OVMS_TOOL_PARSER + export OVMS_ENABLE_TOOL_GUIDED_GENERATION + export OVMS_MAX_NUM_BATCHED_TOKENS + export OVMS_UID=${OVMS_UID:-$(id -u)} + export OVMS_GID=${OVMS_GID:-$(id -g)} + + if getent group render >/dev/null 2>&1; then + export RENDERGROUPID + RENDERGROUPID=$(getent group render | cut -d: -f3) + fi +} + +start_ovms_container() { + check_docker + prepare_ovms_env + + if is_ovms_running; then + echo "OVMS container is already running" + return 0 + fi + + pushd "${COMPOSE_DIR}" >/dev/null + docker compose -f compose.yaml up -d ovms-serving + popd >/dev/null + + echo "Waiting for OVMS to be ready..." + local n=0 + until [[ "$n" -ge 60 ]]; do + docker logs "${OVMS_CONTAINER}" > "${OVMS_LOG}" 2>&1 || true + if grep -Eqi "Started|listening|REST API" "${OVMS_LOG}"; then + echo "OVMS container is ready" + return 0 + fi + sleep 2 + n=$((n+1)) + done + + echo "WARNING: OVMS startup timeout. Check logs: ${OVMS_LOG}" +} + +stop_ovms_container() { + check_docker + + if ! 
is_ovms_running; then + echo "OVMS container is not running" + return 0 + fi + + pushd "${COMPOSE_DIR}" >/dev/null + docker compose -f compose.yaml stop ovms-serving 2>/dev/null || true + docker compose -f compose.yaml rm -f ovms-serving 2>/dev/null || true + docker rm -f "${OVMS_CONTAINER}" 2>/dev/null || true + popd >/dev/null + + echo "OVMS container stopped" +} + +prepare_runtime_env() { + local default_no_proxy + local merged_no_proxy + + ensure_cmd "$PYTHON_BIN" + + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export OVMS_ENDPOINT + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,${HOST_IP}" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" +} + +start_server() { + prepare_runtime_env + pushd "${WORKPATH}" >/dev/null + start_process \ + "edgecraftrag-server (ovms)" \ + "$SERVER_PID_FILE" \ + "$SERVER_LOG" \ + env PIPELINE_SERVICE_HOST_IP="${PIPELINE_SERVICE_HOST_IP}" PIPELINE_SERVICE_PORT="${PIPELINE_SERVICE_PORT}" OVMS_ENDPOINT="${OVMS_ENDPOINT}" \ + "$PYTHON_BIN" -m edgecraftrag.server + popd >/dev/null +} + +start_mega() { + prepare_runtime_env + pushd "${WORKPATH}" >/dev/null + start_process \ + "edgecraftrag (mega service, ovms)" \ + "$MEGA_PID_FILE" \ + "$MEGA_LOG" \ + env MEGA_SERVICE_PORT="${MEGA_SERVICE_PORT}" PIPELINE_SERVICE_HOST_IP="127.0.0.1" PIPELINE_SERVICE_PORT="${PIPELINE_SERVICE_PORT}" OVMS_ENDPOINT="${OVMS_ENDPOINT}" \ + "$PYTHON_BIN" chatqna.py + popd >/dev/null +} + +start_ui() { + prepare_runtime_env + ensure_cmd npm + + pushd "${WORKPATH}/ui/vue" >/dev/null + if [[ ! -d node_modules ]]; then + echo "ui/node_modules not found, running npm install..." + npm install + fi + + start_process \ + "edgecraftrag-ui (vite, ovms)" \ + "$UI_PID_FILE" \ + "$UI_LOG" \ + env ECRAG_LOCAL_PROXY="1" ECRAG_LOCAL_API_PROXY_TARGET="http://127.0.0.1:${PIPELINE_SERVICE_PORT}" ECRAG_LOCAL_CHATBOT_PROXY_TARGET="http://127.0.0.1:${MEGA_SERVICE_PORT}" VITE_API_URL="/" VITE_CHATBOT_URL="/" \ + npm run dev -- --host 0.0.0.0 --port "${UI_PORT}" + popd >/dev/null +} + +stop_server() { stop_process "edgecraftrag-server (ovms)" "$SERVER_PID_FILE"; } +stop_mega() { stop_process "edgecraftrag (mega service, ovms)" "$MEGA_PID_FILE"; } +stop_ui() { stop_process "edgecraftrag-ui (vite, ovms)" "$UI_PID_FILE"; } + +status_service() { + local name=$1 + local pid_file=$2 + + if is_pid_running "$pid_file"; then + echo "${name}: running (pid $(cat "$pid_file"))" + else + echo "${name}: stopped" + fi +} + +start_all() { + start_ovms_container + start_server + start_mega + start_ui + + echo "" + echo "All OVMS baremetal services started successfully." 
+ echo "OVMS: ${OVMS_ENDPOINT}" + echo "UI: http://${HOST_IP}:${UI_PORT}" + echo "API (server): http://${HOST_IP}:${PIPELINE_SERVICE_PORT}" + echo "Mega service: http://${HOST_IP}:${MEGA_SERVICE_PORT}" + echo "Logs: ${LOG_DIR}" +} + +stop_all() { + stop_ui + stop_mega + stop_server + stop_ovms_container +} + +status_all() { + if is_ovms_running; then + echo "ovms-serving: running" + else + if docker ps -a --format '{{.Names}}' 2>/dev/null | grep -q "^${OVMS_CONTAINER}$"; then + echo "ovms-serving: stopped" + else + echo "ovms-serving: not created" + fi + fi + status_service "edgecraftrag-server (ovms)" "$SERVER_PID_FILE" + status_service "edgecraftrag (mega service, ovms)" "$MEGA_PID_FILE" + status_service "edgecraftrag-ui (vite, ovms)" "$UI_PID_FILE" +} + +usage() { + echo "Usage: $0 {start|stop|restart|status} [all|server|mega|ui|ovms]" +} + +ACTION=${1:-start} +TARGET=${2:-all} +case "$ACTION" in + start) + case "$TARGET" in + all) start_all ;; + server) start_server ;; + mega) start_mega ;; + ui) start_ui ;; + ovms) start_ovms_container ;; + *) usage; exit 1 ;; + esac + ;; + stop) + case "$TARGET" in + all) stop_all ;; + server) stop_server ;; + mega) stop_mega ;; + ui) stop_ui ;; + ovms) stop_ovms_container ;; + *) usage; exit 1 ;; + esac + ;; + restart) + case "$TARGET" in + all) + stop_all + start_all + ;; + server) + stop_server + start_server + ;; + mega) + stop_mega + start_mega + ;; + ui) + stop_ui + start_ui + ;; + ovms) + stop_ovms_container + start_ovms_container + ;; + *) usage; exit 1 ;; + esac + ;; + status) + status_all + ;; + -h|--help|help) + usage + ;; + *) + usage + exit 1 + ;; +esac diff --git a/EdgeCraftRAG/tools/run_ovms_container.sh b/EdgeCraftRAG/tools/run_ovms_container.sh new file mode 100755 index 0000000000..29829c7def --- /dev/null +++ b/EdgeCraftRAG/tools/run_ovms_container.sh @@ -0,0 +1,306 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(cd "$(dirname "${SCRIPT_PATH}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." 
&& pwd) +COMPOSE_DIR="${WORKPATH}/docker_compose/intel/gpu/arc" +COMPOSE_FILE="compose.yaml" + +HOST_IP_DEFAULT=$(hostname -I | awk '{print $1}') +HOST_IP=${HOST_IP:-${HOST_IP_DEFAULT}} + +MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} +DOC_PATH=${DOC_PATH:-"${WORKPATH}/workspace"} +TMPFILE_PATH=${TMPFILE_PATH:-"${WORKPATH}/workspace"} +MILVUS_ENABLED=${MILVUS_ENABLED:-1} +CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-0} +LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-5000} +OVMS_SERVICE_PORT=${OVMS_SERVICE_PORT:-8000} +OVMS_ENDPOINT=${OVMS_ENDPOINT:-"http://${HOST_IP}:${OVMS_SERVICE_PORT}"} +OVMS_REST_PORT=${OVMS_REST_PORT:-${OVMS_SERVICE_PORT}} +OVMS_SOURCE_MODEL=${OVMS_SOURCE_MODEL:-${LLM_MODEL}} +OVMS_MODEL_REPOSITORY_PATH=${OVMS_MODEL_REPOSITORY_PATH:-/models} +OVMS_MODEL_NAME=${OVMS_MODEL_NAME:-${OVMS_SOURCE_MODEL}} +OVMS_TARGET_DEVICE=${OVMS_TARGET_DEVICE:-GPU.0} +OVMS_TASK=${OVMS_TASK:-text_generation} +OVMS_CACHE_DIR=${OVMS_CACHE_DIR:-/models/.ov_cache} +OVMS_ENABLE_PREFIX_CACHING=${OVMS_ENABLE_PREFIX_CACHING:-true} +OVMS_TOOL_PARSER=${OVMS_TOOL_PARSER:-qwen3coder} +OVMS_ENABLE_TOOL_GUIDED_GENERATION=${OVMS_ENABLE_TOOL_GUIDED_GENERATION:-true} +OVMS_MAX_NUM_BATCHED_TOKENS=${OVMS_MAX_NUM_BATCHED_TOKENS:-8192} + +CONTAINER_OVMS="ovms-serving" +CONTAINER_SERVER="edgecraftrag-server" +CONTAINER_MEGA="edgecraftrag" +CONTAINER_UI="edgecraftrag-ui" + +PIPELINE_SERVICE_PORT=${PIPELINE_SERVICE_PORT:-16010} +MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT:-16011} +UI_PORT=${UI_PORT:-8082} + +ensure_cmd() { + local cmd=$1 + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: required command not found: $cmd" + exit 1 + fi +} + +check_docker() { + ensure_cmd docker + + if ! docker info >/dev/null 2>&1; then + echo "ERROR: Docker daemon is not running" + echo "Please start Docker: sudo systemctl start docker" + exit 1 + fi +} + +is_container_running() { + local container_name=$1 + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}$" +} + +get_container_status() { + local container_name=$1 + if is_container_running "$container_name"; then + echo "running" + else + if docker ps -a --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}$"; then + echo "stopped" + else + echo "not created" + fi + fi +} + +prepare_directories() { + mkdir -p "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" +} + +prepare_runtime_env() { + local default_no_proxy + local merged_no_proxy + + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export MAX_MODEL_LEN + export LLM_MODEL + export OVMS_SERVICE_PORT + export OVMS_ENDPOINT + export OVMS_REST_PORT + export OVMS_SOURCE_MODEL + export OVMS_MODEL_REPOSITORY_PATH + export OVMS_MODEL_NAME + export OVMS_TARGET_DEVICE + export OVMS_TASK + export OVMS_CACHE_DIR + export OVMS_ENABLE_PREFIX_CACHING + export OVMS_TOOL_PARSER + export OVMS_ENABLE_TOOL_GUIDED_GENERATION + export OVMS_MAX_NUM_BATCHED_TOKENS + export OVMS_UID=${OVMS_UID:-$(id -u)} + export OVMS_GID=${OVMS_GID:-$(id -g)} + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,${HOST_IP},edgecraftrag,edgecraftrag-server" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export 
no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" + + if getent group video >/dev/null 2>&1; then + export VIDEOGROUPID + VIDEOGROUPID=$(getent group video | cut -d: -f3) + fi + + if getent group render >/dev/null 2>&1; then + export RENDERGROUPID + RENDERGROUPID=$(getent group render | cut -d: -f3) + fi + + export COMPOSE_PROFILES=ovms +} + +start_services() { + check_docker + prepare_directories + prepare_runtime_env + + echo "Starting EdgeCraftRAG with OVMS..." + echo " Model path: ${MODEL_PATH}" + echo " LLM model: ${LLM_MODEL}" + echo " OVMS endpoint: ${OVMS_ENDPOINT}" + echo "" + + pushd "${COMPOSE_DIR}" >/dev/null + docker compose --profile "${COMPOSE_PROFILES}" -f "${COMPOSE_FILE}" up -d + popd >/dev/null + + echo "" + echo "Waiting for services to be ready..." + sleep 5 + + local all_running=true + if ! is_container_running "${CONTAINER_OVMS}"; then + echo "WARNING: ${CONTAINER_OVMS} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_SERVER}"; then + echo "WARNING: ${CONTAINER_SERVER} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_MEGA}"; then + echo "WARNING: ${CONTAINER_MEGA} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_UI}"; then + echo "WARNING: ${CONTAINER_UI} is not running" + all_running=false + fi + + if [[ "$all_running" == "true" ]]; then + echo "" + echo "All containers started successfully." + echo "OVMS: ${OVMS_ENDPOINT}" + echo "UI: http://${HOST_IP}:${UI_PORT}" + echo "API (server): http://${HOST_IP}:${PIPELINE_SERVICE_PORT}" + echo "Mega service: http://${HOST_IP}:${MEGA_SERVICE_PORT}" + else + echo "" + echo "Some containers failed to start. Check Docker logs for details." + exit 1 + fi +} + +stop_services() { + check_docker + prepare_runtime_env + + echo "Stopping EdgeCraftRAG OVMS containers..." + + pushd "${COMPOSE_DIR}" >/dev/null + docker compose --profile "${COMPOSE_PROFILES}" -f "${COMPOSE_FILE}" down + popd >/dev/null + + echo "All containers stopped." +} + +restart_services() { + stop_services + echo "" + start_services +} + +status_service() { + local container_name=$1 + local status + status=$(get_container_status "$container_name") + + case "$status" in + running) + local container_id + container_id=$(docker ps -q --filter "name=^${container_name}$") + echo "${container_name}: running (container id: ${container_id})" + ;; + stopped) + echo "${container_name}: stopped" + ;; + not\ created) + echo "${container_name}: not created" + ;; + esac +} + +status_all() { + check_docker + + echo "EdgeCraftRAG OVMS Container Status:" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + status_service "${CONTAINER_OVMS}" + status_service "${CONTAINER_SERVER}" + status_service "${CONTAINER_MEGA}" + status_service "${CONTAINER_UI}" +} + +logs_service() { + local container_name=$1 + check_docker + + if ! 
is_container_running "$container_name"; then + echo "Container ${container_name} is not running" + exit 1 + fi + + docker logs -f "$container_name" +} + +usage() { + echo "Usage: $0 {start|stop|restart|status|logs} [service]" + echo "" + echo "Commands:" + echo " start Start all OVMS containers" + echo " stop Stop all OVMS containers" + echo " restart Restart all OVMS containers" + echo " status Show container status" + echo " logs Follow logs for a specific service" + echo "" + echo "Services (for logs command):" + echo " ovms OVMS model server" + echo " server Pipeline server" + echo " mega Mega service" + echo " ui UI service" +} + +ACTION=${1:-start} +TARGET=${2:-all} + +case "$ACTION" in + start) + start_services + ;; + stop) + stop_services + ;; + restart) + restart_services + ;; + status) + status_all + ;; + logs) + case "$TARGET" in + ovms) logs_service "${CONTAINER_OVMS}" ;; + server) logs_service "${CONTAINER_SERVER}" ;; + mega) logs_service "${CONTAINER_MEGA}" ;; + ui) logs_service "${CONTAINER_UI}" ;; + *) usage; exit 1 ;; + esac + ;; + -h|--help|help) + usage + ;; + *) + usage + exit 1 + ;; +esac diff --git a/EdgeCraftRAG/tools/run_vllm_baremetal.sh b/EdgeCraftRAG/tools/run_vllm_baremetal.sh new file mode 100755 index 0000000000..a97ab5b18f --- /dev/null +++ b/EdgeCraftRAG/tools/run_vllm_baremetal.sh @@ -0,0 +1,536 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# vLLM Baremetal Deployment +# Runs vLLM container for LLM inference + EdgeCraftRAG services on bare-metal + +set -euo pipefail + +SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(cd "$(dirname "${SCRIPT_PATH}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." && pwd) +WORKSPACE_ROOT="${WORKPATH}/workspace" +COMPOSE_DIR="${WORKPATH}/docker_compose/intel/gpu/arc" + +HOST_IP_DEFAULT=$(hostname -I | awk '{print $1}') +HOST_IP=${HOST_IP:-${HOST_IP_DEFAULT}} + +# EdgeCraftRAG service ports +PIPELINE_SERVICE_HOST_IP=${PIPELINE_SERVICE_HOST_IP:-0.0.0.0} +PIPELINE_SERVICE_PORT=${PIPELINE_SERVICE_PORT:-16010} +MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT:-16011} +UI_PORT=${UI_PORT:-8082} + +# vLLM configuration +VLLM_BACKEND=${VLLM_BACKEND:-a770} # a770 or b60 +NGINX_PORT=${NGINX_PORT:-8086} +vLLM_ENDPOINT=${vLLM_ENDPOINT:-"http://${HOST_IP}:${NGINX_PORT}"} +DP_NUM=${DP_NUM:-1} +TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1} +MAX_NUM_SEQS=${MAX_NUM_SEQS:-64} +MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-8192} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-8192} +LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:-fp8} +SELECTED_XPU_0=${SELECTED_XPU_0:-0} + +# Python detection (same as run_ov_baremetal.sh) +if [[ -n "${PYTHON_BIN:-}" ]]; then + PYTHON_BIN=${PYTHON_BIN} +elif [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python" ]]; then + PYTHON_BIN="${VIRTUAL_ENV}/bin/python" +elif [[ -n "${CONDA_PREFIX:-}" && -x "${CONDA_PREFIX}/bin/python" ]]; then + PYTHON_BIN="${CONDA_PREFIX}/bin/python" +elif [[ -x "${HOME}/miniforge3/envs/edgeairag/bin/python3" ]]; then + PYTHON_BIN="${HOME}/miniforge3/envs/edgeairag/bin/python3" +else + PYTHON_BIN=$(command -v python3) +fi + +MODEL_PATH=${MODEL_PATH:-"${WORKSPACE_ROOT}/models"} +DOC_PATH=${DOC_PATH:-"${WORKSPACE_ROOT}"} +TMPFILE_PATH=${TMPFILE_PATH:-"${WORKSPACE_ROOT}"} +MILVUS_ENABLED=${MILVUS_ENABLED:-1} +CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-0} +LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} +LLM_MODEL_PATH=${LLM_MODEL_PATH:-"${MODEL_PATH}/${LLM_MODEL}"} + +LOG_DIR="${WORKSPACE_ROOT}/logs/vllm_baremetal" +PID_DIR="${WORKSPACE_ROOT}/pids" +mkdir -p "${LOG_DIR}" 
"${PID_DIR}" "${DOC_PATH}" "${TMPFILE_PATH}" + +SERVER_PID_FILE="${PID_DIR}/edgecraftrag-server-vllm.pid" +MEGA_PID_FILE="${PID_DIR}/edgecraftrag-vllm.pid" +UI_PID_FILE="${PID_DIR}/edgecraftrag-ui-vllm.pid" + +SERVER_LOG="${LOG_DIR}/edgecraftrag-server.log" +MEGA_LOG="${LOG_DIR}/edgecraftrag.log" +UI_LOG="${LOG_DIR}/edgecraftrag-ui.log" +VLLM_LOG="${LOG_DIR}/vllm-container.log" + +ensure_cmd() { + local cmd=$1 + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: required command not found: $cmd" + exit 1 + fi +} + +check_docker() { + ensure_cmd docker + + if ! docker info >/dev/null 2>&1; then + echo "ERROR: Docker daemon is not running" + echo "Please start Docker: sudo systemctl start docker" + exit 1 + fi +} + +is_pid_running() { + local pid_file=$1 + if [[ -f "$pid_file" ]]; then + local pid + pid=$(cat "$pid_file") + [[ -n "$pid" ]] && kill -0 "$pid" >/dev/null 2>&1 + else + return 1 + fi +} + +is_vllm_container_running() { + local container_name + container_name=$(get_vllm_container_name) + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}$" +} + +get_vllm_service_name() { + case "$VLLM_BACKEND" in + a770) + echo "llm-serving-xpu-770" + ;; + b60) + echo "llm-serving-xpu-b60" + ;; + *) + echo "ERROR: Invalid VLLM_BACKEND: $VLLM_BACKEND (must be a770 or b60)" >&2 + return 1 + ;; + esac +} + +get_vllm_container_name() { + case "$VLLM_BACKEND" in + a770) + echo "ipex-llm-serving-xpu-770" + ;; + b60) + echo "ipex-serving-xpu-container" + ;; + *) + echo "ERROR: Invalid VLLM_BACKEND: $VLLM_BACKEND (must be a770 or b60)" >&2 + return 1 + ;; + esac +} + +start_process() { + local name=$1 + local pid_file=$2 + local log_file=$3 + shift 3 + + if is_pid_running "$pid_file"; then + echo "${name} is already running (pid $(cat "$pid_file"))" + return 0 + fi + + echo "Starting ${name}..." + setsid nohup "$@" >"$log_file" 2>&1 & + local pid=$! + echo "$pid" >"$pid_file" + sleep 2 + if kill -0 "$pid" >/dev/null 2>&1; then + echo "${name} started (pid ${pid}), log: ${log_file}" + else + echo "ERROR: failed to start ${name}. Check log: ${log_file}" + rm -f "$pid_file" + exit 1 + fi +} + +stop_process() { + local name=$1 + local pid_file=$2 + + if ! is_pid_running "$pid_file"; then + echo "${name} is not running" + rm -f "$pid_file" + return 0 + fi + + local pid + pid=$(cat "$pid_file") + echo "Stopping ${name} (pid ${pid})..." + kill -TERM -- "-$pid" >/dev/null 2>&1 || kill "$pid" >/dev/null 2>&1 || true + + for _ in {1..10}; do + if kill -0 "$pid" >/dev/null 2>&1; then + sleep 1 + else + break + fi + done + + if kill -0 "$pid" >/dev/null 2>&1; then + echo "Force killing ${name} (pid ${pid})..." 
+ kill -KILL -- "-$pid" >/dev/null 2>&1 || kill -9 "$pid" >/dev/null 2>&1 || true + fi + + rm -f "$pid_file" + echo "${name} stopped" +} + +prepare_vllm_env() { + local default_no_proxy + local merged_no_proxy + + export HOST_IP + export MODEL_PATH + export LLM_MODEL + export LLM_MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,192.168.1.1,${HOST_IP}" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" + + # vLLM specific + export NGINX_PORT + export vLLM_ENDPOINT + export DP_NUM + export TENSOR_PARALLEL_SIZE + export MAX_NUM_SEQS + export MAX_NUM_BATCHED_TOKENS + export MAX_MODEL_LEN + export LOAD_IN_LOW_BIT + export SELECTED_XPU_0 + export NGINX_CONFIG_PATH="${WORKPATH}/nginx/nginx.conf" + export VLLM_SERVICE_PORT_0=8100 + export NGINX_PORT_0=8100 +} + +start_vllm_container() { + check_docker + prepare_vllm_env + + if is_vllm_container_running; then + echo "vLLM container is already running" + return 0 + fi + + echo "Starting vLLM container..." + echo " LLM model: ${LLM_MODEL}" + echo " vLLM endpoint: ${vLLM_ENDPOINT}" + echo " DP_NUM: ${DP_NUM}, TP: ${TENSOR_PARALLEL_SIZE}" + echo "" + + # Ensure proper permissions + sudo chown -R 1000:1000 "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" 2>/dev/null || true + sudo chown -R 1000:1000 "${HOME}/.cache" 2>/dev/null || true + + pushd "${COMPOSE_DIR}" >/dev/null + + # Generate nginx config based on DP_NUM when available. + if [[ -f "${WORKPATH}/nginx/nginx-conf-generator.sh" ]]; then + bash "${WORKPATH}/nginx/nginx-conf-generator.sh" "${DP_NUM}" "${NGINX_CONFIG_PATH}" + fi + + local service_name + service_name=$(get_vllm_service_name) + + # Start only the selected vLLM service from the shared compose file. + docker compose -f compose.yaml up -d "${service_name}" + + popd >/dev/null + + echo "Waiting for vLLM container to be ready..." + local n=0 + local container_name + container_name=$(get_vllm_container_name) + until [[ "$n" -ge 100 ]]; do + docker logs "${container_name}" > "${VLLM_LOG}" 2>&1 || true + if grep -q "Starting vLLM API server on http://0.0.0.0:" "${VLLM_LOG}"; then + echo "vLLM container is ready" + return 0 + fi + sleep 3 + n=$((n+1)) + done + + echo "WARNING: vLLM container startup timeout. Check logs: ${VLLM_LOG}" +} + +stop_vllm_container() { + check_docker + + if ! is_vllm_container_running; then + echo "vLLM container is not running" + return 0 + fi + + echo "Stopping vLLM container..." 
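+ # Tear down only the compose service that matches the selected VLLM_BACKEND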
+ + local service_name + service_name=$(get_vllm_service_name) + + pushd "${COMPOSE_DIR}" >/dev/null + docker compose -f compose.yaml stop "${service_name}" 2>/dev/null || true + docker compose -f compose.yaml rm -f "${service_name}" 2>/dev/null || true + docker rm -f ipex-serving-xpu-container ipex-llm-serving-xpu-770 2>/dev/null || true + popd >/dev/null + + echo "vLLM container stopped" +} + +prepare_runtime_env() { + local default_no_proxy + local merged_no_proxy + + ensure_cmd "$PYTHON_BIN" + + export HOST_IP + export MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export LLM_MODEL + export vLLM_ENDPOINT + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,${HOST_IP}" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" +} + +start_server() { + prepare_runtime_env + pushd "${WORKPATH}" >/dev/null + start_process \ + "edgecraftrag-server (vLLM)" \ + "$SERVER_PID_FILE" \ + "$SERVER_LOG" \ + env PIPELINE_SERVICE_HOST_IP="${PIPELINE_SERVICE_HOST_IP}" \ + PIPELINE_SERVICE_PORT="${PIPELINE_SERVICE_PORT}" \ + vLLM_ENDPOINT="${vLLM_ENDPOINT}" \ + "$PYTHON_BIN" -m edgecraftrag.server + popd >/dev/null +} + +start_mega() { + prepare_runtime_env + pushd "${WORKPATH}" >/dev/null + start_process \ + "edgecraftrag (mega service, vLLM)" \ + "$MEGA_PID_FILE" \ + "$MEGA_LOG" \ + env MEGA_SERVICE_PORT="${MEGA_SERVICE_PORT}" \ + PIPELINE_SERVICE_HOST_IP="127.0.0.1" \ + PIPELINE_SERVICE_PORT="${PIPELINE_SERVICE_PORT}" \ + vLLM_ENDPOINT="${vLLM_ENDPOINT}" \ + "$PYTHON_BIN" chatqna.py + popd >/dev/null +} + +start_ui() { + prepare_runtime_env + ensure_cmd npm + + pushd "${WORKPATH}/ui/vue" >/dev/null + if [[ ! -d node_modules ]]; then + echo "ui/node_modules not found, running npm install..." + npm install + fi + + start_process \ + "edgecraftrag-ui (vite, vLLM)" \ + "$UI_PID_FILE" \ + "$UI_LOG" \ + env ECRAG_LOCAL_PROXY="1" \ + ECRAG_LOCAL_API_PROXY_TARGET="http://127.0.0.1:${PIPELINE_SERVICE_PORT}" \ + ECRAG_LOCAL_CHATBOT_PROXY_TARGET="http://127.0.0.1:${MEGA_SERVICE_PORT}" \ + VITE_API_URL="/" \ + VITE_CHATBOT_URL="/" \ + npm run dev -- --host 0.0.0.0 --port "${UI_PORT}" + popd >/dev/null +} + +stop_server() { + stop_process "edgecraftrag-server (vLLM)" "$SERVER_PID_FILE" +} + +stop_mega() { + stop_process "edgecraftrag (mega service, vLLM)" "$MEGA_PID_FILE" +} + +stop_ui() { + stop_process "edgecraftrag-ui (vite, vLLM)" "$UI_PID_FILE" +} + +status_service() { + local name=$1 + local pid_file=$2 + + if is_pid_running "$pid_file"; then + echo "${name}: running (pid $(cat "$pid_file"))" + else + echo "${name}: stopped" + fi +} + +start_all() { + start_vllm_container + echo "" + start_server + start_mega + start_ui + + echo "" + echo "All services started successfully." 
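+ # Show the service endpoints and log directory for quick access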
+ echo "vLLM endpoint: ${vLLM_ENDPOINT}" + echo "UI: http://${HOST_IP}:${UI_PORT}" + echo "API (server): http://${HOST_IP}:${PIPELINE_SERVICE_PORT}" + echo "Mega service: http://${HOST_IP}:${MEGA_SERVICE_PORT}" + echo "Logs: ${LOG_DIR}" +} + +stop_all() { + stop_ui + stop_mega + stop_server + echo "" + stop_vllm_container +} + +status_all() { + echo "vLLM Container Status:" + local container_name + container_name=$(get_vllm_container_name) + if is_vllm_container_running; then + echo " ${container_name}: running" + echo " Endpoint: ${vLLM_ENDPOINT}" + else + echo " ${container_name}: stopped" + fi + echo "" + echo "EdgeCraftRAG Services:" + status_service "edgecraftrag-server (vLLM)" "$SERVER_PID_FILE" + status_service "edgecraftrag (mega service, vLLM)" "$MEGA_PID_FILE" + status_service "edgecraftrag-ui (vite, vLLM)" "$UI_PID_FILE" +} + +usage() { + echo "Usage: $0 {start|stop|restart|status} [all|vllm|server|mega|ui]" + echo "" + echo "Examples:" + echo " $0 start # Start vLLM container + all EdgeCraftRAG services" + echo " $0 start vllm # Start only vLLM container" + echo " $0 restart server # Restart server service" + echo " $0 status # Show status of all services" + echo " $0 stop # Stop all services + vLLM container" + echo " $0 -h" + echo "" + echo "Environment Variables:" + echo " VLLM_BACKEND vLLM backend: a770|b60 (default: a770)" + echo " DP_NUM Number of DP instances (default: 1)" + echo " TENSOR_PARALLEL_SIZE Tensor parallel size (default: 1)" + echo " NGINX_PORT vLLM nginx port (default: 8086)" + echo " LLM_MODEL LLM model name (default: Qwen/Qwen3-8B)" + echo " MODEL_PATH Model storage path" +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" || "${1:-}" == "help" ]]; then + usage + exit 0 +fi + +ACTION=${1:-start} +TARGET=${2:-all} + +case "$ACTION" in + start) + case "$TARGET" in + all) start_all ;; + vllm) start_vllm_container ;; + server) start_server ;; + mega) start_mega ;; + ui) start_ui ;; + *) usage; exit 1 ;; + esac + ;; + stop) + case "$TARGET" in + all) stop_all ;; + vllm) stop_vllm_container ;; + server) stop_server ;; + mega) stop_mega ;; + ui) stop_ui ;; + *) usage; exit 1 ;; + esac + ;; + restart) + case "$TARGET" in + all) + stop_all + start_all + ;; + vllm) + stop_vllm_container + start_vllm_container + ;; + server) + stop_server + start_server + ;; + mega) + stop_mega + start_mega + ;; + ui) + stop_ui + start_ui + ;; + *) usage; exit 1 ;; + esac + ;; + status) + status_all + ;; + *) + usage + exit 1 + ;; +esac diff --git a/EdgeCraftRAG/tools/run_vllm_container.sh b/EdgeCraftRAG/tools/run_vllm_container.sh new file mode 100755 index 0000000000..9cb5fcad91 --- /dev/null +++ b/EdgeCraftRAG/tools/run_vllm_container.sh @@ -0,0 +1,508 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# vLLM Container Deployment +# Runs vLLM container for LLM inference + EdgeCraftRAG services in containers + +set -euo pipefail + +SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}") +SCRIPT_DIR=$(cd "$(dirname "${SCRIPT_PATH}")" && pwd) +WORKPATH=$(cd "${SCRIPT_DIR}/.." 
&& pwd) +COMPOSE_DIR="${WORKPATH}/docker_compose/intel/gpu/arc" +COMPOSE_FILE="compose.yaml" + +HOST_IP_DEFAULT=$(hostname -I | awk '{print $1}') +HOST_IP=${HOST_IP:-${HOST_IP_DEFAULT}} + +# Environment variables +MODEL_PATH=${MODEL_PATH:-"${WORKPATH}/workspace/models"} +DOC_PATH=${DOC_PATH:-"${WORKPATH}/workspace"} +TMPFILE_PATH=${TMPFILE_PATH:-"${WORKPATH}/workspace"} +MILVUS_ENABLED=${MILVUS_ENABLED:-1} +CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND:-0} +LLM_MODEL=${LLM_MODEL:-Qwen/Qwen3-8B} +LLM_MODEL_PATH=${LLM_MODEL_PATH:-"${MODEL_PATH}/${LLM_MODEL}"} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-8192} + +# vLLM backend selection (a770 or b60) +VLLM_BACKEND=${VLLM_BACKEND:-a770} + +# vLLM configuration +NGINX_PORT=${NGINX_PORT:-8086} +vLLM_ENDPOINT=${vLLM_ENDPOINT:-"http://${HOST_IP}:${NGINX_PORT}"} +DP_NUM=${DP_NUM:-1} +TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1} +MAX_NUM_SEQS=${MAX_NUM_SEQS:-64} +MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-8192} +LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:-fp8} +SELECTED_XPU_0=${SELECTED_XPU_0:-0} + +# B60 specific +DTYPE=${DTYPE:-float16} +ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK:-0} +ENFORCE_EAGER=${ENFORCE_EAGER:-1} +TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE:-1} +DISABLE_SLIDING_WINDOW=${DISABLE_SLIDING_WINDOW:-1} +GPU_MEMORY_UTIL=${GPU_MEMORY_UTIL:-0.8} +NO_ENABLE_PREFIX_CACHING=${NO_ENABLE_PREFIX_CACHING:-1} +DISABLE_LOG_REQUESTS=${DISABLE_LOG_REQUESTS:-1} +BLOCK_SIZE=${BLOCK_SIZE:-64} +QUANTIZATION=${QUANTIZATION:-fp8} + +# Container names +CONTAINER_SERVER="edgecraftrag-server" +CONTAINER_MEGA="edgecraftrag" +CONTAINER_UI="edgecraftrag-ui" + +get_vllm_compose_profile() { + case "$VLLM_BACKEND" in + a770) + echo "a770" + ;; + b60) + echo "b60" + ;; + *) + echo "ERROR: Invalid VLLM_BACKEND: $VLLM_BACKEND (must be a770 or b60)" >&2 + return 1 + ;; + esac +} + +get_vllm_service_name() { + case "$VLLM_BACKEND" in + a770) + echo "llm-serving-xpu-770" + ;; + b60) + echo "llm-serving-xpu-b60" + ;; + *) + echo "ERROR: Invalid VLLM_BACKEND: $VLLM_BACKEND (must be a770 or b60)" >&2 + return 1 + ;; + esac +} + +get_vllm_container_name() { + case "$VLLM_BACKEND" in + a770) + echo "ipex-llm-serving-xpu-770" + ;; + b60) + echo "ipex-serving-xpu-container" + ;; + *) + echo "ERROR: Invalid VLLM_BACKEND: $VLLM_BACKEND (must be a770 or b60)" >&2 + return 1 + ;; + esac +} + +# Ports +PIPELINE_SERVICE_PORT=${PIPELINE_SERVICE_PORT:-16010} +MEGA_SERVICE_PORT=${MEGA_SERVICE_PORT:-16011} +UI_PORT=${UI_PORT:-8082} + +ensure_cmd() { + local cmd=$1 + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "ERROR: required command not found: $cmd" + exit 1 + fi +} + +check_docker() { + ensure_cmd docker + + if ! docker info >/dev/null 2>&1; then + echo "ERROR: Docker daemon is not running" + echo "Please start Docker: sudo systemctl start docker" + exit 1 + fi +} + +is_container_running() { + local container_name=$1 + docker ps --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}" +} + +get_container_status() { + local container_name=$1 + if is_container_running "$container_name"; then + echo "running" + else + if docker ps -a --format '{{.Names}}' 2>/dev/null | grep -q "^${container_name}"; then + echo "stopped" + else + echo "not created" + fi + fi +} + +prepare_directories() { + mkdir -p "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" + + # Ensure proper permissions for Docker (uid:gid 1000:1000) + if [[ ! -w "${MODEL_PATH}" ]] || [[ ! -w "${DOC_PATH}" ]] || [[ ! -w "${TMPFILE_PATH}" ]]; then + echo "Setting permissions for Docker containers..." 
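+ # The containers run as uid:gid 1000:1000, so host-mounted paths must be writable by that user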
+ sudo chown -R 1000:1000 "${MODEL_PATH}" "${DOC_PATH}" "${TMPFILE_PATH}" 2>/dev/null || true + fi + + # Also set cache permissions + if [[ -d "${HOME}/.cache" ]]; then + sudo chown -R 1000:1000 "${HOME}/.cache" 2>/dev/null || true + fi +} + +prepare_runtime_env() { + local default_no_proxy + local merged_no_proxy + + export HOST_IP + export MODEL_PATH + export LLM_MODEL + export LLM_MODEL_PATH + export DOC_PATH + export TMPFILE_PATH + export MILVUS_ENABLED + export CHAT_HISTORY_ROUND + export MAX_MODEL_LEN + export HF_CACHE="${HF_CACHE:-${HOME}/.cache}" + export HF_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}" + export http_proxy="${http_proxy:-${HTTP_PROXY:-}}" + export https_proxy="${https_proxy:-${HTTPS_PROXY:-}}" + export HTTP_PROXY="${HTTP_PROXY:-${http_proxy:-}}" + export HTTPS_PROXY="${HTTPS_PROXY:-${https_proxy:-}}" + + default_no_proxy="localhost,127.0.0.1,192.168.1.1,${HOST_IP},edgecraftrag,edgecraftrag-server" + merged_no_proxy="${no_proxy:-${NO_PROXY:-}}" + if [[ -n "${merged_no_proxy}" ]]; then + export no_proxy="${merged_no_proxy},${default_no_proxy}" + else + export no_proxy="${default_no_proxy}" + fi + export NO_PROXY="${no_proxy}" + + # Set GPU group IDs for Docker + if getent group video >/dev/null 2>&1; then + export VIDEOGROUPID=$(getent group video | cut -d: -f3) + fi + + if getent group render >/dev/null 2>&1; then + export RENDERGROUPID=$(getent group render | cut -d: -f3) + fi + + # vLLM specific environment + export NGINX_PORT + export vLLM_ENDPOINT + export DP_NUM + export TENSOR_PARALLEL_SIZE + export MAX_NUM_SEQS + export MAX_NUM_BATCHED_TOKENS + export LOAD_IN_LOW_BIT + export SELECTED_XPU_0 + export NGINX_CONFIG_PATH="${WORKPATH}/nginx/nginx.conf" + export VLLM_SERVICE_PORT_0=8100 + export NGINX_PORT_0=8100 + export VLLM_SERVICE_PORT_B60=${VLLM_SERVICE_PORT_B60:-8086} + + # B60 specific + export DTYPE + export ZE_AFFINITY_MASK + export ENFORCE_EAGER + export TRUST_REMOTE_CODE + export DISABLE_SLIDING_WINDOW + export GPU_MEMORY_UTIL + export NO_ENABLE_PREFIX_CACHING + export DISABLE_LOG_REQUESTS + export BLOCK_SIZE + export QUANTIZATION + + # Set compose profile based on backend + export COMPOSE_PROFILES + COMPOSE_PROFILES=$(get_vllm_compose_profile) +} + +start_services() { + check_docker + prepare_directories + prepare_runtime_env + + echo "Starting EdgeCraftRAG with vLLM (${VLLM_BACKEND})..." + echo " Model path: ${MODEL_PATH}" + echo " LLM model: ${LLM_MODEL}" + echo " vLLM endpoint: ${vLLM_ENDPOINT}" + echo " DP_NUM: ${DP_NUM}, TP: ${TENSOR_PARALLEL_SIZE}" + echo " Compose profile: ${COMPOSE_PROFILES}" + echo "" + + pushd "${COMPOSE_DIR}" >/dev/null + + # For vLLM deployments, may need to generate nginx config + if [[ -f "${WORKPATH}/nginx/nginx-conf-generator.sh" ]]; then + bash "${WORKPATH}/nginx/nginx-conf-generator.sh" "${DP_NUM}" "${NGINX_CONFIG_PATH}" 2>/dev/null || true + fi + + docker compose --profile "${COMPOSE_PROFILES}" -f "${COMPOSE_FILE}" up -d + + popd >/dev/null + + echo "" + echo "Waiting for services to be ready..." + + # Wait for vLLM container to be ready + local vllm_container + vllm_container=$(get_vllm_container_name) + + if is_container_running "${vllm_container}"; then + echo "Waiting for vLLM container to initialize..." + local n=0 + until [[ "$n" -ge 60 ]]; do + if docker logs "${vllm_container}" 2>&1 | grep -q "Starting vLLM API server"; then + echo "vLLM container is ready" + break + fi + sleep 3 + n=$((n+1)) + done + fi + + sleep 5 + + # Check if containers are running + local all_running=true + + if ! 
is_container_running "${vllm_container}"; then + echo "WARNING: vLLM container is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_SERVER}"; then + echo "WARNING: ${CONTAINER_SERVER} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_MEGA}"; then + echo "WARNING: ${CONTAINER_MEGA} is not running" + all_running=false + fi + + if ! is_container_running "${CONTAINER_UI}"; then + echo "WARNING: ${CONTAINER_UI} is not running" + all_running=false + fi + + if [[ "$all_running" == "true" ]]; then + echo "" + echo "All containers started successfully." + echo "vLLM endpoint: ${vLLM_ENDPOINT}" + echo "UI: http://${HOST_IP}:${UI_PORT}" + echo "API (server): http://${HOST_IP}:${PIPELINE_SERVICE_PORT}" + echo "Mega service: http://${HOST_IP}:${MEGA_SERVICE_PORT}" + echo "" + echo "View logs:" + echo " docker logs -f ${vllm_container}" + echo " docker logs -f ${CONTAINER_SERVER}" + echo " docker logs -f ${CONTAINER_MEGA}" + echo " docker logs -f ${CONTAINER_UI}" + else + echo "" + echo "Some containers failed to start. Check Docker logs for details." + exit 1 + fi +} + +stop_services() { + check_docker + prepare_runtime_env + + echo "Stopping EdgeCraftRAG vLLM containers..." + + pushd "${COMPOSE_DIR}" >/dev/null + + docker compose --profile "${COMPOSE_PROFILES}" -f "${COMPOSE_FILE}" down + + # Best effort cleanup for backend-specific vLLM containers that may be left in + # created state after a failed start and are not always selected by profile. + docker rm -f ipex-serving-xpu-container ipex-llm-serving-xpu-770 2>/dev/null || true + + popd >/dev/null + + echo "All containers stopped." +} + +restart_services() { + stop_services + echo "" + start_services +} + +status_service() { + local container_name=$1 + local status + status=$(get_container_status "$container_name") + + case "$status" in + running) + local container_id + container_id=$(docker ps -q --filter "name=^${container_name}" | head -1) + echo "${container_name}: running (container id: ${container_id})" + ;; + stopped) + echo "${container_name}: stopped" + ;; + not\ created) + echo "${container_name}: not created" + ;; + esac +} + +status_all() { + check_docker + + echo "EdgeCraftRAG vLLM Container Status:" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + + # Check vLLM containers (may be multiple for DP) + local vllm_container + vllm_container=$(get_vllm_container_name) + + if docker ps -a --format '{{.Names}}' 2>/dev/null | grep -q "^${vllm_container}$"; then + status_service "${vllm_container}" + else + echo "${vllm_container}: not created" + fi + + status_service "${CONTAINER_SERVER}" + status_service "${CONTAINER_MEGA}" + status_service "${CONTAINER_UI}" + echo "" + echo "vLLM Backend: ${VLLM_BACKEND}" + echo "vLLM Endpoint: ${vLLM_ENDPOINT}" + + # Show additional Milvus status if enabled + if [[ "${MILVUS_ENABLED}" == "1" ]]; then + echo "" + echo "Additional services (Milvus enabled):" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + docker ps --filter "name=milvus" --filter "name=etcd" --filter "name=minio" --format "table {{.Names}}\t{{.Status}}" 2>/dev/null || echo "No additional services running" + fi +} + +logs_service() { + local service=$1 + check_docker + + case "$service" in + vllm) + local vllm_container + vllm_container=$(get_vllm_container_name) + if ! is_container_running "${vllm_container}"; then + echo "vLLM container is not running" + exit 1 + fi + docker logs -f "${vllm_container}" + ;; + server) + if ! 
is_container_running "${CONTAINER_SERVER}"; then + echo "Server container is not running" + exit 1 + fi + docker logs -f "${CONTAINER_SERVER}" + ;; + mega) + if ! is_container_running "${CONTAINER_MEGA}"; then + echo "Mega service container is not running" + exit 1 + fi + docker logs -f "${CONTAINER_MEGA}" + ;; + ui) + if ! is_container_running "${CONTAINER_UI}"; then + echo "UI container is not running" + exit 1 + fi + docker logs -f "${CONTAINER_UI}" + ;; + *) + echo "ERROR: Unknown service: $service" + usage + exit 1 + ;; + esac +} + +usage() { + echo "Usage: $0 {start|stop|restart|status|logs} [service]" + echo "" + echo "Commands:" + echo " start Start all containers (vLLM + EdgeCraftRAG)" + echo " stop Stop all containers" + echo " restart Restart all containers" + echo " status Show container status" + echo " logs Follow logs for a specific service" + echo "" + echo "Services (for logs command):" + echo " vllm vLLM inference container" + echo " server Pipeline server" + echo " mega Mega service" + echo " ui UI service" + echo "" + echo "Examples:" + echo " $0 start" + echo " $0 restart" + echo " $0 status" + echo " $0 logs vllm" + echo " $0 logs server" + echo " $0 -h" + echo "" + echo "Environment Variables:" + echo " VLLM_BACKEND vLLM backend: a770|b60 (default: a770)" + echo " HOST_IP Server IP (default: auto-detected)" + echo " MODEL_PATH Model storage path (default: workspace/models)" + echo " LLM_MODEL LLM model name (default: Qwen/Qwen3-8B)" + echo " DP_NUM Number of DP instances (default: 1)" + echo " TENSOR_PARALLEL_SIZE Tensor parallel size (default: 1)" + echo " NGINX_PORT vLLM nginx port (default: 8086)" + echo " MILVUS_ENABLED Enable Milvus DB: 0|1 (default: 1)" + echo " CHAT_HISTORY_ROUND Chat history length (default: 0)" +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" || "${1:-}" == "help" ]]; then + usage + exit 0 +fi + +ACTION=${1:-start} +SERVICE=${2:-} + +case "$ACTION" in + start) + start_services + ;; + stop) + stop_services + ;; + restart) + restart_services + ;; + status) + status_all + ;; + logs) + if [[ -z "$SERVICE" ]]; then + echo "ERROR: Please specify a service: vllm, server, mega, or ui" + echo "" + usage + exit 1 + fi + logs_service "$SERVICE" + ;; + *) + usage + exit 1 + ;; +esac diff --git a/EdgeCraftRAG/tools/run_vllm_on_double.sh.old b/EdgeCraftRAG/tools/run_vllm_on_double.sh.old new file mode 100755 index 0000000000..a495b8c38c --- /dev/null +++ b/EdgeCraftRAG/tools/run_vllm_on_double.sh.old @@ -0,0 +1,120 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# 执行脚本前请确保镜像已全部下载完毕! 
+ +set -e + +# 请确保WORKPATH指定到EdgeCraftRAG目录 +WORKPATH=$(dirname "$PWD")/EdgeCraftRAG +LOG_PATH=$(dirname "$PWD") + +ip_address=$(hostname -I | awk '{print $1}') +HOST_IP=$ip_address + +COMPOSE_FILE="compose_vllm.yaml" +EC_RAG_SERVICE_PORT=16010 + +# 模型文件路径请参考以下形式存放 +# Indexer: ${HOME}/models/BAAI/bge-small-en-v1.5 +# Reranker: ${HOME}/models/BAAI/bge-reranker-large +MODEL_PATH="${HOME}/models" +# 指定成您自己的DOC_PATH和TMPFILE_PATH +DOC_PATH="$WORKPATH/tests" +TMPFILE_PATH="$WORKPATH/tests" +sudo chown -R 1000:1000 ${MODEL_PATH} ${DOC_PATH} ${TMPFILE_PATH} +sudo chown -R 1000:1000 ${HOME}/.cache/huggingface + +DP_NUM=2 + +MILVUS_ENABLED=0 +MAX_MODEL_LEN=5000 +CHAT_HISTORY_ROUND=5 +NGINX_PORT=8086 +NGINX_PORT_0=8100 +NGINX_PORT_1=8200 +VLLM_SERVICE_PORT_0=8100 +VLLM_SERVICE_PORT_1=8200 +TENSOR_PARALLEL_SIZE=1 +SELECTED_XPU_0=0 +SELECTED_XPU_1=1 +MAX_NUM_SEQS=64 +MAX_NUM_BATCHED_TOKENS=4000 +MAX_MODEL_LEN=3000 +LOAD_IN_LOW_BIT=fp8 +CCL_DG2_USM="" +HF_ENDPOINT=https://hf-mirror.com +vLLM_ENDPOINT="http://${HOST_IP}:${NGINX_PORT}" +LLM_MODEL=Qwen/Qwen3-8B +# 请设置您下载完成的llm模型路径(LLM_MODEL_PATH下请存放从huggingface或modelscope下载的原始模型,而不是经过OpenVINO转换的模型) +LLM_MODEL_PATH="${HOME}/models/Qwen/Qwen3-8B" +NGINX_CONFIG_PATH="$WORKPATH/nginx/nginx.conf" + + +function start_services() { + cd $WORKPATH/docker_compose/intel/gpu/arc + export MODEL_PATH=${MODEL_PATH} + export DOC_PATH=${DOC_PATH} + export TMPFILE_PATH=${TMPFILE_PATH} + export HOST_IP=${HOST_IP} + export LLM_MODEL=${LLM_MODEL} + export HF_ENDPOINT=${HF_ENDPOINT} + export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} + export no_proxy="localhost, 127.0.0.1, 192.168.1.1, ${HOST_IP}" + export LLM_MODEL_PATH=${LLM_MODEL_PATH} + export NGINX_PORT_0=${NGINX_PORT_0} + export NGINX_PORT_1=${NGINX_PORT_1} + export VLLM_SERVICE_PORT_0=${VLLM_SERVICE_PORT_0} + export VLLM_SERVICE_PORT_1=${VLLM_SERVICE_PORT_1} + export TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} + export NGINX_CONFIG_PATH=${NGINX_CONFIG_PATH} + export SELECTED_XPU_0=${SELECTED_XPU_0} + export SELECTED_XPU_1=${SELECTED_XPU_1} + export SELECTED_XPU_2=${SELECTED_XPU_2} + export vLLM_ENDPOINT=${vLLM_ENDPOINT} + export MAX_NUM_SEQS=${MAX_NUM_SEQS} + export MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS} + export MAX_MODEL_LEN=${MAX_MODEL_LEN} + export LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT} + export CCL_DG2_USM=${CCL_DG2_USM} + export DP_NUM=${DP_NUM} + export MILVUS_ENABLED=${MILVUS_ENABLED} + export MAX_MODEL_LEN=${MAX_MODEL_LEN} + export CHAT_HISTORY_ROUND=${CHAT_HISTORY_ROUND} + export HF_CACHE="$HOME/.cache" + # generate nginx config file according to container count + bash $WORKPATH/nginx/nginx-conf-generator.sh $DP_NUM $WORKPATH/nginx/nginx.conf + # generate yaml file according to container count + bash multi-arc-yaml-generator.sh $DP_NUM $COMPOSE_FILE + + # Start Docker Containers + docker compose -f $COMPOSE_FILE up -d + echo "ipex-llm-serving-xpu is booting, please wait." + n=0 + until [[ "$n" -ge 100 ]]; do + docker logs ipex-llm-serving-xpu-container-0 > ${LOG_PATH}/ipex-llm-serving-xpu-container.log 2>&1 + if grep -q "Starting vLLM API server on http://0.0.0.0:" ${LOG_PATH}/ipex-llm-serving-xpu-container.log; then + break + fi + sleep 6s + n=$((n+1)) + done +} + +function stop_docker() { + cd $WORKPATH/docker_compose/intel/gpu/arc + docker compose -f $COMPOSE_FILE down +} + + +function main() { + echo "::group::stop_docker" + stop_docker + echo "::endgroup::" + + echo "::group::start_services" + start_services + echo "The edgecraftrag services has started successfully." 
+} + +main diff --git a/EdgeCraftRAG/ui/vue/.gitignore b/EdgeCraftRAG/ui/vue/.gitignore index 0fd69139e3..a547bf36d8 100644 --- a/EdgeCraftRAG/ui/vue/.gitignore +++ b/EdgeCraftRAG/ui/vue/.gitignore @@ -22,4 +22,3 @@ dist-ssr *.njsproj *.sln *.sw? -package-lock.json diff --git a/EdgeCraftRAG/ui/vue/components.d.ts b/EdgeCraftRAG/ui/vue/components.d.ts index 5c31b7cc21..79fa8ff8f7 100644 --- a/EdgeCraftRAG/ui/vue/components.d.ts +++ b/EdgeCraftRAG/ui/vue/components.d.ts @@ -1,11 +1,8 @@ -// Copyright (C) 2026 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - /* eslint-disable */ // @ts-nocheck // Generated by unplugin-vue-components // Read more: https://github.com/vuejs/core/pull/3399 -export {}; +export {} /* prettier-ignore */ declare module 'vue' { diff --git a/EdgeCraftRAG/ui/vue/package.json b/EdgeCraftRAG/ui/vue/package.json index b11bf4d991..470cc2d3b4 100644 --- a/EdgeCraftRAG/ui/vue/package.json +++ b/EdgeCraftRAG/ui/vue/package.json @@ -10,7 +10,7 @@ }, "dependencies": { "ant-design-vue": "^4.0.0-rc.6", - "axios": "^1.7.9", + "axios": "^1.14.0", "clipboard": "^2.0.11", "dayjs": "^1.11.13", "echarts": "^5.5.1", @@ -30,7 +30,7 @@ "vue": "^3.5.13", "vue-echarts": "^7.0.3", "vue-i18n": "^10.0.5", - "vue-json-pretty": "^2.4.0", + "vue-json-pretty": "^2.6.0", "vue-router": "^4.5.0", "ws": "^8.18.0" }, diff --git a/EdgeCraftRAG/ui/vue/src/api/agent/index.ts b/EdgeCraftRAG/ui/vue/src/api/agent/index.ts index 9bf55f33ac..0a18e450c1 100644 --- a/EdgeCraftRAG/ui/vue/src/api/agent/index.ts +++ b/EdgeCraftRAG/ui/vue/src/api/agent/index.ts @@ -56,6 +56,17 @@ export const requestAgentDelete = (name: String) => { }); }; +export const requestAgentSetActive = (name: String, active: boolean) => { + return request({ + url: `/v1/agents/${name}/active`, + method: "patch", + data: { active }, + showLoading: true, + showSuccessMsg: true, + successMsg: active ? 
"request.agent.activateSucc" : "request.agent.deactivateSucc", + }); +}; + export const getAgentConfigs = (type: String) => { return request({ url: `/v1/agents/configs/${type}`, diff --git a/EdgeCraftRAG/ui/vue/src/api/pipeline/index.ts b/EdgeCraftRAG/ui/vue/src/api/pipeline/index.ts index 4bdecb1f66..ed50587b92 100644 --- a/EdgeCraftRAG/ui/vue/src/api/pipeline/index.ts +++ b/EdgeCraftRAG/ui/vue/src/api/pipeline/index.ts @@ -107,4 +107,13 @@ export const requestUrlVllm = (data: Object) => { }); }; +export const requestUrlOvms = (data: Object) => { + return request({ + url: "/v1/check/ovms", + method: "post", + data, + showLoading: true, + }); +}; + export const importUrl = `${import.meta.env.VITE_API_URL}v1/settings/pipelines/import`; diff --git a/EdgeCraftRAG/ui/vue/src/auto-imports.d.ts b/EdgeCraftRAG/ui/vue/src/auto-imports.d.ts index d07b1f9b7a..f6e2bab3ce 100644 --- a/EdgeCraftRAG/ui/vue/src/auto-imports.d.ts +++ b/EdgeCraftRAG/ui/vue/src/auto-imports.d.ts @@ -1,6 +1,3 @@ -// Copyright (C) 2026 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - /* eslint-disable */ /* prettier-ignore */ // @ts-nocheck @@ -9,98 +6,83 @@ // biome-ignore lint: disable export {} declare global { - const EffectScope: (typeof import("vue"))["EffectScope"]; - const acceptHMRUpdate: (typeof import("pinia"))["acceptHMRUpdate"]; - const computed: (typeof import("vue"))["computed"]; - const createApp: (typeof import("vue"))["createApp"]; - const createPinia: (typeof import("pinia"))["createPinia"]; - const customRef: (typeof import("vue"))["customRef"]; - const defineAsyncComponent: (typeof import("vue"))["defineAsyncComponent"]; - const defineComponent: (typeof import("vue"))["defineComponent"]; - const defineStore: (typeof import("pinia"))["defineStore"]; - const effectScope: (typeof import("vue"))["effectScope"]; - const getActivePinia: (typeof import("pinia"))["getActivePinia"]; - const getCurrentInstance: (typeof import("vue"))["getCurrentInstance"]; - const getCurrentScope: (typeof import("vue"))["getCurrentScope"]; - const h: (typeof import("vue"))["h"]; - const inject: (typeof import("vue"))["inject"]; - const isProxy: (typeof import("vue"))["isProxy"]; - const isReactive: (typeof import("vue"))["isReactive"]; - const isReadonly: (typeof import("vue"))["isReadonly"]; - const isRef: (typeof import("vue"))["isRef"]; - const mapActions: (typeof import("pinia"))["mapActions"]; - const mapGetters: (typeof import("pinia"))["mapGetters"]; - const mapState: (typeof import("pinia"))["mapState"]; - const mapStores: (typeof import("pinia"))["mapStores"]; - const mapWritableState: (typeof import("pinia"))["mapWritableState"]; - const markRaw: (typeof import("vue"))["markRaw"]; - const nextTick: (typeof import("vue"))["nextTick"]; - const onActivated: (typeof import("vue"))["onActivated"]; - const onBeforeMount: (typeof import("vue"))["onBeforeMount"]; - const onBeforeRouteLeave: (typeof import("vue-router"))["onBeforeRouteLeave"]; - const onBeforeRouteUpdate: (typeof import("vue-router"))["onBeforeRouteUpdate"]; - const onBeforeUnmount: (typeof import("vue"))["onBeforeUnmount"]; - const onBeforeUpdate: (typeof import("vue"))["onBeforeUpdate"]; - const onDeactivated: (typeof import("vue"))["onDeactivated"]; - const onErrorCaptured: (typeof import("vue"))["onErrorCaptured"]; - const onMounted: (typeof import("vue"))["onMounted"]; - const onRenderTracked: (typeof import("vue"))["onRenderTracked"]; - const onRenderTriggered: (typeof import("vue"))["onRenderTriggered"]; - const onScopeDispose: (typeof 
import("vue"))["onScopeDispose"]; - const onServerPrefetch: (typeof import("vue"))["onServerPrefetch"]; - const onUnmounted: (typeof import("vue"))["onUnmounted"]; - const onUpdated: (typeof import("vue"))["onUpdated"]; - const onWatcherCleanup: (typeof import("vue"))["onWatcherCleanup"]; - const provide: (typeof import("vue"))["provide"]; - const reactive: (typeof import("vue"))["reactive"]; - const readonly: (typeof import("vue"))["readonly"]; - const ref: (typeof import("vue"))["ref"]; - const resolveComponent: (typeof import("vue"))["resolveComponent"]; - const setActivePinia: (typeof import("pinia"))["setActivePinia"]; - const setMapStoreSuffix: (typeof import("pinia"))["setMapStoreSuffix"]; - const shallowReactive: (typeof import("vue"))["shallowReactive"]; - const shallowReadonly: (typeof import("vue"))["shallowReadonly"]; - const shallowRef: (typeof import("vue"))["shallowRef"]; - const storeToRefs: (typeof import("pinia"))["storeToRefs"]; - const toRaw: (typeof import("vue"))["toRaw"]; - const toRef: (typeof import("vue"))["toRef"]; - const toRefs: (typeof import("vue"))["toRefs"]; - const toValue: (typeof import("vue"))["toValue"]; - const triggerRef: (typeof import("vue"))["triggerRef"]; - const unref: (typeof import("vue"))["unref"]; - const useAttrs: (typeof import("vue"))["useAttrs"]; - const useCssModule: (typeof import("vue"))["useCssModule"]; - const useCssVars: (typeof import("vue"))["useCssVars"]; - const useId: (typeof import("vue"))["useId"]; - const useLink: (typeof import("vue-router"))["useLink"]; - const useModel: (typeof import("vue"))["useModel"]; - const useRoute: (typeof import("vue-router"))["useRoute"]; - const useRouter: (typeof import("vue-router"))["useRouter"]; - const useSlots: (typeof import("vue"))["useSlots"]; - const useTemplateRef: (typeof import("vue"))["useTemplateRef"]; - const watch: (typeof import("vue"))["watch"]; - const watchEffect: (typeof import("vue"))["watchEffect"]; - const watchPostEffect: (typeof import("vue"))["watchPostEffect"]; - const watchSyncEffect: (typeof import("vue"))["watchSyncEffect"]; + const EffectScope: typeof import('vue')['EffectScope'] + const acceptHMRUpdate: typeof import('pinia')['acceptHMRUpdate'] + const computed: typeof import('vue')['computed'] + const createApp: typeof import('vue')['createApp'] + const createPinia: typeof import('pinia')['createPinia'] + const customRef: typeof import('vue')['customRef'] + const defineAsyncComponent: typeof import('vue')['defineAsyncComponent'] + const defineComponent: typeof import('vue')['defineComponent'] + const defineStore: typeof import('pinia')['defineStore'] + const effectScope: typeof import('vue')['effectScope'] + const getActivePinia: typeof import('pinia')['getActivePinia'] + const getCurrentInstance: typeof import('vue')['getCurrentInstance'] + const getCurrentScope: typeof import('vue')['getCurrentScope'] + const h: typeof import('vue')['h'] + const inject: typeof import('vue')['inject'] + const isProxy: typeof import('vue')['isProxy'] + const isReactive: typeof import('vue')['isReactive'] + const isReadonly: typeof import('vue')['isReadonly'] + const isRef: typeof import('vue')['isRef'] + const mapActions: typeof import('pinia')['mapActions'] + const mapGetters: typeof import('pinia')['mapGetters'] + const mapState: typeof import('pinia')['mapState'] + const mapStores: typeof import('pinia')['mapStores'] + const mapWritableState: typeof import('pinia')['mapWritableState'] + const markRaw: typeof import('vue')['markRaw'] + const nextTick: typeof 
import('vue')['nextTick'] + const onActivated: typeof import('vue')['onActivated'] + const onBeforeMount: typeof import('vue')['onBeforeMount'] + const onBeforeRouteLeave: typeof import('vue-router')['onBeforeRouteLeave'] + const onBeforeRouteUpdate: typeof import('vue-router')['onBeforeRouteUpdate'] + const onBeforeUnmount: typeof import('vue')['onBeforeUnmount'] + const onBeforeUpdate: typeof import('vue')['onBeforeUpdate'] + const onDeactivated: typeof import('vue')['onDeactivated'] + const onErrorCaptured: typeof import('vue')['onErrorCaptured'] + const onMounted: typeof import('vue')['onMounted'] + const onRenderTracked: typeof import('vue')['onRenderTracked'] + const onRenderTriggered: typeof import('vue')['onRenderTriggered'] + const onScopeDispose: typeof import('vue')['onScopeDispose'] + const onServerPrefetch: typeof import('vue')['onServerPrefetch'] + const onUnmounted: typeof import('vue')['onUnmounted'] + const onUpdated: typeof import('vue')['onUpdated'] + const onWatcherCleanup: typeof import('vue')['onWatcherCleanup'] + const provide: typeof import('vue')['provide'] + const reactive: typeof import('vue')['reactive'] + const readonly: typeof import('vue')['readonly'] + const ref: typeof import('vue')['ref'] + const resolveComponent: typeof import('vue')['resolveComponent'] + const setActivePinia: typeof import('pinia')['setActivePinia'] + const setMapStoreSuffix: typeof import('pinia')['setMapStoreSuffix'] + const shallowReactive: typeof import('vue')['shallowReactive'] + const shallowReadonly: typeof import('vue')['shallowReadonly'] + const shallowRef: typeof import('vue')['shallowRef'] + const storeToRefs: typeof import('pinia')['storeToRefs'] + const toRaw: typeof import('vue')['toRaw'] + const toRef: typeof import('vue')['toRef'] + const toRefs: typeof import('vue')['toRefs'] + const toValue: typeof import('vue')['toValue'] + const triggerRef: typeof import('vue')['triggerRef'] + const unref: typeof import('vue')['unref'] + const useAttrs: typeof import('vue')['useAttrs'] + const useCssModule: typeof import('vue')['useCssModule'] + const useCssVars: typeof import('vue')['useCssVars'] + const useId: typeof import('vue')['useId'] + const useLink: typeof import('vue-router')['useLink'] + const useModel: typeof import('vue')['useModel'] + const useRoute: typeof import('vue-router')['useRoute'] + const useRouter: typeof import('vue-router')['useRouter'] + const useSlots: typeof import('vue')['useSlots'] + const useTemplateRef: typeof import('vue')['useTemplateRef'] + const watch: typeof import('vue')['watch'] + const watchEffect: typeof import('vue')['watchEffect'] + const watchPostEffect: typeof import('vue')['watchPostEffect'] + const watchSyncEffect: typeof import('vue')['watchSyncEffect'] } // for type re-export declare global { // @ts-ignore - export type { - Component, - ComponentPublicInstance, - ComputedRef, - DirectiveBinding, - ExtractDefaultPropTypes, - ExtractPropTypes, - ExtractPublicPropTypes, - InjectionKey, - PropType, - Ref, - MaybeRef, - MaybeRefOrGetter, - VNode, - WritableComputedRef, - } from "vue"; - import("vue"); + export type { Component, ComponentPublicInstance, ComputedRef, DirectiveBinding, ExtractDefaultPropTypes, ExtractPropTypes, ExtractPublicPropTypes, InjectionKey, PropType, Ref, MaybeRef, MaybeRefOrGetter, VNode, WritableComputedRef } from 'vue' + import('vue') } diff --git a/EdgeCraftRAG/ui/vue/src/i18n/en.ts b/EdgeCraftRAG/ui/vue/src/i18n/en.ts index 3fad3ff70c..7b7fb65fb5 100644 --- a/EdgeCraftRAG/ui/vue/src/i18n/en.ts +++ 
b/EdgeCraftRAG/ui/vue/src/i18n/en.ts @@ -132,9 +132,11 @@ export default { weights: "Weights", local: "Local (OpenVINO)", vllm: "Remote (vLLM)", + ovms: "Remote (OVMS)", vector_url: "Vector Database URL", modelName: "Model Name", vllm_url: "vLLM URL", + ovms_url: "OVMS URL", kbadmin: "kbadmin", addAgent: "Agent Configuration", deleteAgentTip: "Are you sure you want to delete the agent generator configuration?", @@ -169,6 +171,7 @@ export default { kb_vector_url: "IP : Port, (e.g. 192.168.1.1:29530)", vector_url: "IP : Port, (e.g. 192.168.1.1:19530)", vllm_url: "IP : Port, (e.g. 192.168.1.1:8086)", + ovms_url: "IP : Port, (e.g. 192.168.1.1:8000)", urlValid1: "Please enter vector url", urlValid2: "Please enter the correct url", urlValid3: "URL cannot be accessed", @@ -180,15 +183,25 @@ export default { vllmUrlValid3: "URL cannot be accessed", vllmUrlValid4: "Test passed !", vllmUrlValid5: "The vLLM model has not passed verification yet", - nodeParserTypeTip: "Both Indexer Type and Retriever Type will be set to kbadmin at the same time", - indexerTypeTip: "Both Node Parser Type and Retriever Type will be set to kbadmin at the same time", - retrieverTypeTip: "Both Node Parser Type and Indexer Type will be set to kbadmin at the same time", + ovmsUrlValid1: "Please enter OVMS url", + ovmsUrlValid2: "Please enter the correct url", + ovmsUrlValid3: "URL cannot be accessed", + ovmsUrlValid4: "Test passed !", + ovmsUrlValid5: "The OVMS model has not passed verification yet", + remoteUrlValid5: "The remote model has not passed verification yet", + nodeParserTypeTip: + "Both Indexer Type and Retriever Type will be set to kbadmin at the same time", + indexerTypeTip: + "Both Node Parser Type and Retriever Type will be set to kbadmin at the same time", + retrieverTypeTip: + "Both Node Parser Type and Indexer Type will be set to kbadmin at the same time", retrieverChangeTip: "Please go to the Indexer stage to complete the data", indexerTypeValid1: "Indexer type can only select kbadmin", modelRequired: "Please enter embedding model url", modelFormat: "Please enter the correct url", retrieverValid: "Please return to the Indexer stage to supplement information.", modelTip: "Please connect to vLLM service", + ovmsModelTip: "Please connect to OVMS service", }, desc: { name: "The name identifier of the pipeline", @@ -280,7 +293,8 @@ export default { edit: "Edit Knowledge Base", deleteTip: "Are you sure delete this knowledge base?", activeTip: "Are you sure activate this knowledge base?", - uploadTip: "Supports PDF, Word, TXT,Doc,Html,PPT,ZIP formats, with a single file size not exceeding 200M", + uploadTip: + "Supports PDF, Word, TXT,Doc,Html,PPT,ZIP formats, with a single file size not exceeding 200M", notFileTip: "The knowledge base is empty. 
Go upload your files.", name: "Name", des: "Description", @@ -318,7 +332,8 @@ export default { desc: { name: "The name identifier of the knowledge base.", type: "The type identifier of the knowledge base.", - description: "Briefly describe the purpose, content scope, or intended use of this knowledge base.", + description: + "Briefly describe the purpose, content scope, or intended use of this knowledge base.", }, }, request: { @@ -349,6 +364,8 @@ export default { createSucc: "Agent created successfully!", updateSucc: "Agent update successful!", deleteSucc: "Agent deleted successfully!", + activateSucc: "Agent activated successfully!", + deactivateSucc: "Agent deactivated successfully!", }, }, error: { @@ -405,6 +422,8 @@ export default { importErrTip: "Files upload failed!", name: "Name", id: "ID", + type: "Type", + state: "State", status: "Status", operation: "Operation", configs: "Configs", diff --git a/EdgeCraftRAG/ui/vue/src/i18n/zh.ts b/EdgeCraftRAG/ui/vue/src/i18n/zh.ts index d2ddbb7d14..4860f35d36 100644 --- a/EdgeCraftRAG/ui/vue/src/i18n/zh.ts +++ b/EdgeCraftRAG/ui/vue/src/i18n/zh.ts @@ -130,9 +130,11 @@ export default { weights: "权重", local: "本地(OpenVINO)", vllm: "远程(vLLM)", + ovms: "远程(OVMS)", vector_url: "向量数据库地址", modelName: "模型名称", vllm_url: "vLLM 地址", + ovms_url: "OVMS 地址", kbadmin: "kbadmin", addAgent: "智能体生成器配置", deleteAgentTip: "您确定要删除智能生成器体配置吗?", @@ -167,6 +169,7 @@ export default { kb_vector_url: "IP : 端口,(例如 192.168.1.1:29530)", vector_url: "IP : 端口,(例如 192.168.1.1:19530)", vllm_url: "IP : 端口,(例如 192.168.1.1:8086)", + ovms_url: "IP : 端口,(例如 192.168.1.1:8000)", urlValid1: "向量数据库地址不能为空", urlValid2: "请输入正确的向量数据库地址", urlValid3: "向量数据库地址无法访问", @@ -178,6 +181,12 @@ export default { vllmUrlValid3: "vLLM地址无法访问", vllmUrlValid4: "测试通过!", vllmUrlValid5: "vLLM模型还未通过校验", + ovmsUrlValid1: "OVMS地址不能为空", + ovmsUrlValid2: "请输入正确的OVMS地址", + ovmsUrlValid3: "OVMS地址无法访问", + ovmsUrlValid4: "测试通过!", + ovmsUrlValid5: "OVMS模型还未通过校验", + remoteUrlValid5: "远程模型还未通过校验", nodeParserTypeTip: "索引器类型和检索器类型将同时设置为kbadmin", indexerTypeTip: "节点解析器类型和检索器类型将同时设置为kbadmin", retrieverTypeTip: "索引器类型和节点解析器类型将同时设置为kbadmin", @@ -187,6 +196,7 @@ export default { modelFormat: "请输入正确的模型地址", retrieverValid: "请回到Indexer阶段补充信息", modelTip: "请先连接vLLM服务", + ovmsModelTip: "请先连接OVMS服务", }, desc: { name: "Pipeline的名称标识,用于区分不同工作流", @@ -216,7 +226,8 @@ export default { vector: "矢量存储索引", simple: "解析文本,优先选择完整的句子。", hierarchical: "使用NodeParser将文档拆分为递归层次结构的节点。", - sentencewindow: "将文档分割成节点,每个节点代表一个句子。每个节点包含一个来自元数据中周围句子的窗口", + sentencewindow: + "将文档分割成节点,每个节点代表一个句子。每个节点包含一个来自元数据中周围句子的窗口", unstructured: "一个处理非结构化数据的组件", milvusVector: "矢量索引存储在Milvus中", vector_url: "测试Milvus地址是否可用", @@ -344,6 +355,8 @@ export default { createSucc: "智能体创建成功!", updateSucc: "智能体更新成功!", deleteSucc: "智能体删除成功!", + activateSucc: "智能体已激活!", + deactivateSucc: "智能体已停用!", }, }, error: { @@ -401,6 +414,8 @@ export default { importErrTip: "智能体配置导入失败!", name: "名称", id: "ID", + type: "类型", + state: "状态", status: "状态", operation: "操作", configs: "配置", diff --git a/EdgeCraftRAG/ui/vue/src/theme/common.less b/EdgeCraftRAG/ui/vue/src/theme/common.less index e4d9a17650..a1783e1750 100644 --- a/EdgeCraftRAG/ui/vue/src/theme/common.less +++ b/EdgeCraftRAG/ui/vue/src/theme/common.less @@ -239,18 +239,6 @@ .ml-12; } -:deep(.horizontal-form-item) { - // .intel-form-item-row { - // display: flex; - // flex-direction: row; - // } - // .intel-form-item-label { - // width: 100px; - // } - // .intel-form-item-control { - // flex: 1; - // } -} .loopStyle(@counter) when (@counter > 0) 
{ .p-@{counter} { padding: (1px * @counter); diff --git a/EdgeCraftRAG/ui/vue/src/types/global.d.ts b/EdgeCraftRAG/ui/vue/src/types/global.d.ts index 29cff5c80c..220b421b92 100644 --- a/EdgeCraftRAG/ui/vue/src/types/global.d.ts +++ b/EdgeCraftRAG/ui/vue/src/types/global.d.ts @@ -124,6 +124,7 @@ declare type TableColumns = { visible?: boolean; fixed?: string | boolean; minWidth?: number; + width?: number; ellipsis?: boolean; }; diff --git a/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/Chat.vue b/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/Chat.vue index f6149f0770..3a06a8312b 100644 --- a/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/Chat.vue +++ b/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/Chat.vue @@ -3,10 +3,7 @@
-
+
{{ $t("chat.tip2") }}
-
- {{ $t("chat.tip3") }} -
+
{{ $t("chat.tip3") }}
@@ -56,17 +51,14 @@ {{ $t(`chat.${isThink ? "reason" : "think"}`) }} - + {{ $t("knowledge.title") }}
- + - + diff --git a/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/ChatHistory.vue b/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/ChatHistory.vue index b3005be80c..5ee054192a 100644 --- a/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/ChatHistory.vue +++ b/EdgeCraftRAG/ui/vue/src/views/chatbot/components/Chatbot/ChatHistory.vue @@ -88,16 +88,16 @@ diff --git a/EdgeCraftRAG/ui/vue/src/views/chatbot/components/KnowledgeBase/UpdateKBDialog/Indexer.vue b/EdgeCraftRAG/ui/vue/src/views/chatbot/components/KnowledgeBase/UpdateKBDialog/Indexer.vue index a57a63b74d..e9c176e1a5 100644 --- a/EdgeCraftRAG/ui/vue/src/views/chatbot/components/KnowledgeBase/UpdateKBDialog/Indexer.vue +++ b/EdgeCraftRAG/ui/vue/src/views/chatbot/components/KnowledgeBase/UpdateKBDialog/Indexer.vue @@ -9,7 +9,11 @@ class="form-wrap" >
- +
- + {{ item.name }} @@ -35,19 +43,29 @@ :label="$t('pipeline.config.embeddingType')" name="inference_type" > - + {{ $t("pipeline.config.vllm") }} {{ $t("pipeline.config.local") }} - + - +