From 2b133a41b3e65642147da9976d3f61a9a37b9645 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Tue, 31 Mar 2026 13:46:57 +0200 Subject: [PATCH 01/10] Remove py image --- demos/continuous_batching/rag/README.md | 29 ------------------------- 1 file changed, 29 deletions(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index a7951647de..2351b30a1c 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -5,21 +5,6 @@ ### 1. Download the preconfigured models using ovms --pull option from [HugginFaces Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) ::::{tab-set} -:::{tab-item} With Docker -**Required:** Docker Engine installed - -```bash -mkdir models -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank - -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-fp16-ov --model_path OpenVINO/bge-base-en-v1.5-fp16-ov -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path 
/models/config.json --model_name OpenVINO/bge-reranker-base-fp16-ov --model_path OpenVINO/bge-reranker-base-fp16-ov -``` -::: - :::{tab-item} On Baremetal Host **Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. @@ -57,20 +42,6 @@ ovms --add_to_config --config_path c:\models\config.json --model_name OpenVINO/b ### 2. Download the preconfigured models using ovms --pull option for models outside [HugginFaces Hub OpenVINO organization](https://huggingface.co/OpenVINO) in HuggingFace Hub. (Advanced usage) ::::{tab-set} -:::{tab-item} With Docker -**Required:** Docker Engine installed -```bash -mkdir models -docker run --user $(id -u):$(id -g) -e HF_HOME=/hf_home/cache --rm -v $(pwd)/models:/models:rw -v /opt/home/user/.cache/huggingface/:/hf_home/cache openvino/model_server:latest-py --pull --model_repository_path /models --source_model meta-llama/Meta-Llama-3-8B-Instruct --task text_generation --weight-format int8 -docker run --user $(id -u):$(id -g) -e HF_HOME=/hf_home/cache --rm -v $(pwd)/models:/models:rw -v /opt/home/user/.cache/huggingface/:/hf_home/cache openvino/model_server:latest-py --pull --model_repository_path /models --source_model Alibaba-NLP/gte-large-en-v1.5 --task embeddings --weight-format int8 -docker run --user $(id -u):$(id -g) -e HF_HOME=/hf_home/cache --rm -v $(pwd)/models:/models:rw -v /opt/home/user/.cache/huggingface/:/hf_home/cache openvino/model_server:latest-py --pull --model_repository_path /models --source_model BAAI/bge-reranker-large --task rerank --weight-format int8 - -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-py --add_to_config --config_path /models/config.json --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path meta-llama/Meta-Llama-3-8B-Instruct --weight-format int8 -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-py --add_to_config 
--config_path /models/config.json --model_name Alibaba-NLP/gte-large-en-v1.5 --model_path Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 -docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-py --add_to_config --config_path /models/config.json --model_name BAAI/bge-reranker-large --model_path BAAI/bge-reranker-large --weight-format int8 -``` -::: - :::{tab-item} On Baremetal Host **Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. From 726003d2220c6f5cf97d384451f099b20bb7834c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:59:20 +0000 Subject: [PATCH 02/10] Fix typo: HugginFaces -> Hugging Face and remove single-item tab-set in section 2 Agent-Logs-Url: https://github.com/openvinotoolkit/model_server/sessions/76bdc9f4-5f03-4afd-a8ac-6c88629dc6d0 Co-authored-by: rasapala <58549742+rasapala@users.noreply.github.com> --- demos/continuous_batching/rag/README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index 2351b30a1c..d9ac71306f 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -2,7 +2,7 @@ ## Creating models repository for all the endpoints with ovms --pull or python export_model.py script -### 1. Download the preconfigured models using ovms --pull option from [HugginFaces Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) +### 1. Download the preconfigured models using ovms --pull option from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) ::::{tab-set} :::{tab-item} On Baremetal Host @@ -39,10 +39,8 @@ ovms --add_to_config --config_path c:\models\config.json --model_name OpenVINO/b :::: :::: -### 2. 
Download the preconfigured models using ovms --pull option for models outside [HugginFaces Hub OpenVINO organization](https://huggingface.co/OpenVINO) in HuggingFace Hub. (Advanced usage) -::::{tab-set} +### 2. Download the preconfigured models using ovms --pull option for models outside [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) in HuggingFace Hub. (Advanced usage) -:::{tab-item} On Baremetal Host **Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. ```bat @@ -58,8 +56,6 @@ ovms --add_to_config --config_path /models/config.json --model_name meta-llama/M ovms --add_to_config --config_path /models/config.json --model_name Alibaba-NLP/gte-large-en-v1.5 --model_path Alibaba-NLP/gte-large-en-v1.5 ovms --add_to_config --config_path /models/config.json --model_name BAAI/bge-reranker-large --model_path BAAI/bge-reranker-large ``` -::: -:::: ### 3. Export models from HuggingFace Hub including conversion to OpenVINO format using the python script From 45bef5e8f1fb83ef7d6745322d9e0b6c12eaa90d Mon Sep 17 00:00:00 2001 From: rasapala Date: Thu, 2 Apr 2026 14:06:59 +0200 Subject: [PATCH 03/10] Code review --- demos/continuous_batching/rag/README.md | 27 ++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index d9ac71306f..5bd4e32647 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -5,6 +5,21 @@ ### 1. 
Download the preconfigured models using ovms --pull option from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) ::::{tab-set} +:::{tab-item} With Docker +**Required:** Docker Engine installed + +```bash +mkdir models +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank + +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-base-en-v1.5-fp16-ov --model_path OpenVINO/bge-base-en-v1.5-fp16-ov +docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --add_to_config --config_path /models/config.json --model_name OpenVINO/bge-reranker-base-fp16-ov --model_path OpenVINO/bge-reranker-base-fp16-ov +``` +::: + :::{tab-item} On Baremetal Host **Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. @@ -58,7 +73,7 @@ ovms --add_to_config --config_path /models/config.json --model_name BAAI/bge-rer ``` -### 3. Export models from HuggingFace Hub including conversion to OpenVINO format using the python script +### 3. 
Alternatively, export models from HuggingFace Hub including conversion to OpenVINO format using the python script Use this procedure for all the models outside of OpenVINO organization in HuggingFace Hub. @@ -72,6 +87,8 @@ python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1. python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --config_file_path models/config.json ``` +### 4. Alternatively, use the build in ovms functionality in openvino/model_server:latest-py described here [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) + ## Deploying the model server ### With Docker @@ -91,6 +108,14 @@ ovms --rest_port 8000 --config_path models\config.json ```bat sc start ovms ``` + +## Readiness Check + +Wait for the models to load. You can check the status with a simple command: +```console +curl http://localhost:8000/v3/models +``` + ## Using RAG When the model server is deployed and serving all 3 endpoints, run the [jupyter notebook](https://github.com/openvinotoolkit/model_server/blob/main/demos/continuous_batching/rag/rag_demo.ipynb) to use RAG chain with a fully remote execution. From 75db727e5a457a70bce2c483a78c35b4e053fb28 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Tue, 7 Apr 2026 11:19:25 +0200 Subject: [PATCH 04/10] Cleanup --- demos/continuous_batching/rag/README.md | 62 ++++++++---------- demos/continuous_batching/rag/rag_demo.ipynb | 66 ++------------------ 2 files changed, 31 insertions(+), 97 deletions(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index 5bd4e32647..94913bf1be 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -2,7 +2,7 @@ ## Creating models repository for all the endpoints with ovms --pull or python export_model.py script -### 1. 
Download the preconfigured models using ovms --pull option from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) +### Download the preconfigured models using ovms --pull option from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) ::::{tab-set} :::{tab-item} With Docker @@ -52,42 +52,9 @@ ovms --add_to_config --config_path c:\models\config.json --model_name OpenVINO/b ``` ::: :::: -:::: - -### 2. Download the preconfigured models using ovms --pull option for models outside [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) in HuggingFace Hub. (Advanced usage) - -**Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. - -```bat -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt -pip3 install -q -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/continuous_batching/rag/requirements.txt -mkdir models -set HF_HOME=C:\hf_home\cache # export HF_HOME=/hf_home/cache if using linux -ovms --pull --model_repository_path models --source_model meta-llama/Meta-Llama-3-8B-Instruct --task text_generation --weight-format int8 -ovms --pull --model_repository_path models --source_model Alibaba-NLP/gte-large-en-v1.5 --task embeddings --weight-format int8 -ovms --pull --model_repository_path models --source_model BAAI/bge-reranker-large --task rerank --weight-format int8 - -ovms --add_to_config --config_path /models/config.json --model_name meta-llama/Meta-Llama-3-8B-Instruct --model_path meta-llama/Meta-Llama-3-8B-Instruct -ovms --add_to_config --config_path /models/config.json --model_name Alibaba-NLP/gte-large-en-v1.5 --model_path Alibaba-NLP/gte-large-en-v1.5 -ovms --add_to_config --config_path /models/config.json --model_name BAAI/bge-reranker-large --model_path 
BAAI/bge-reranker-large -``` - -### 3. Alternatively, export models from HuggingFace Hub including conversion to OpenVINO format using the python script -Use this procedure for all the models outside of OpenVINO organization in HuggingFace Hub. - -```console -curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py -pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/requirements.txt - -mkdir models -python export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B-Instruct --weight-format int8 --kv_cache_precision u8 --config_file_path models/config.json --model_repository_path models -python export_model.py embeddings_ov --source_model Alibaba-NLP/gte-large-en-v1.5 --weight-format int8 --config_file_path models/config.json -python export_model.py rerank_ov --source_model BAAI/bge-reranker-large --weight-format int8 --config_file_path models/config.json -``` - -### 4. Alternatively, use the build in ovms functionality in openvino/model_server:latest-py described here [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) +### Optionally, if you want to deploy different models use the build-in ovms functionality in openvino/model_server:latest-py described here [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) ## Deploying the model server @@ -115,6 +82,31 @@ Wait for the models to load. 
You can check the status with a simple command: ```console curl http://localhost:8000/v3/models ``` +``` +{ + "data": [ + { + "id": "OpenVINO/Qwen3-8B-int4-ov", + "object": "model", + "created": 1775552853, + "owned_by": "OVMS" + }, + { + "id": "OpenVINO/bge-base-en-v1.5-fp16-ov", + "object": "model", + "created": 1775552853, + "owned_by": "OVMS" + }, + { + "id": "OpenVINO/bge-reranker-base-fp16-ov", + "object": "model", + "created": 1775552853, + "owned_by": "OVMS" + } + ], + "object": "list" +} +``` ## Using RAG diff --git a/demos/continuous_batching/rag/rag_demo.ipynb b/demos/continuous_batching/rag/rag_demo.ipynb index 7bd1503e5a..88e9c67f72 100644 --- a/demos/continuous_batching/rag/rag_demo.ipynb +++ b/demos/continuous_batching/rag/rag_demo.ipynb @@ -47,58 +47,7 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "7212515f-b59b-498c-a66a-f6c59de8fcab", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f97b31c1ba61476fa8d43eb48812691c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "RadioButtons(description='Radio Selector:', options=('OpenVINO models', 'Converted models'), value='OpenVINO m…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ee7b21f697bf4063a90a985e26b1b3f2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Text(value='OpenVINO models', disabled=True)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from ipywidgets import widgets, link\n", - "from IPython.display import display\n", - "options = [\"OpenVINO models\", \"Converted models\"]\n", - "\n", - "# Create the radio buttons and a text box for output\n", - "radio_button = widgets.RadioButtons(options=options, description='Radio Selector:')\n", - "output_text = widgets.Text(disabled=True)\n", - "\n", - "# Link the value of the radio buttons to the 
text box\n", - "link((radio_button, 'value'), (output_text, 'value'))\n", - "\n", - "# Display both widgets\n", - "display(radio_button, output_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "b085cd3f-5473-474e-b35c-a1a548d50f0e", "metadata": {}, "outputs": [ @@ -111,16 +60,9 @@ } ], "source": [ - "print(output_text.value)\n", - "if output_text.value == \"OpenVINO models\":\n", - " embeddings_model = \"OpenVINO/bge-base-en-v1.5-fp16-ov\"\n", - " rerank_model = \"OpenVINO/bge-reranker-base-fp16-ov\"\n", - " chat_model = \"OpenVINO/Qwen3-8B-int4-ov\"\n", - "else:\n", - " embeddings_model = \"Alibaba-NLP/gte-large-en-v1.5\"\n", - " rerank_model = \"BAAI/bge-reranker-large\"\n", - " chat_model = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n", - " " + "embeddings_model = \"OpenVINO/bge-base-en-v1.5-fp16-ov\"\n", + "rerank_model = \"OpenVINO/bge-reranker-base-fp16-ov\"\n", + "chat_model = \"OpenVINO/Qwen3-8B-int4-ov\" " ] }, { From 3c65d33588b0fdd5bcf791e6c35f1903a90f539a Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Tue, 7 Apr 2026 14:40:18 +0200 Subject: [PATCH 05/10] Fix --- demos/continuous_batching/rag/README.md | 46 +++++++------------------ 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index 94913bf1be..283920221f 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -1,8 +1,7 @@ # RAG demo with OpenVINO Model Server {#ovms_demos_continuous_batching_rag} -## Creating models repository for all the endpoints with ovms --pull or python export_model.py script +## Creating models repository for all the endpoints with ovms --pull from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) -### Download the preconfigured models using ovms --pull option from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) (Simple usage) 
::::{tab-set} :::{tab-item} With Docker @@ -26,55 +25,36 @@ docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/mo ```bat mkdir models -ovms --pull --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation -ovms --pull --model_repository_path models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings -ovms --pull --model_repository_path models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank +ovms --pull --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation --target_device GPU +ovms --pull --model_repository_path models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings --target_device GPU +ovms --pull --model_repository_path models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank --target_device GPU ovms --add_to_config --config_path models/config.json --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov ovms --add_to_config --config_path models/config.json --model_name OpenVINO/bge-base-en-v1.5-fp16-ov --model_path OpenVINO/bge-base-en-v1.5-fp16-ov ovms --add_to_config --config_path models/config.json --model_name OpenVINO/bge-reranker-base-fp16-ov --model_path OpenVINO/bge-reranker-base-fp16-ov ``` ::: - -:::{tab-item} Windows service -**Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. -**Assumption:** install_ovms_service.bat was called without additional parameters - using default c:\models config path. 
-```bat -mkdir c:\models - -ovms --pull --model_repository_path c:\models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation -ovms --pull --model_repository_path c:\models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings -ovms --pull --model_repository_path c:\models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank - -ovms --add_to_config --config_path c:\models\config.json --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov -ovms --add_to_config --config_path c:\models\config.json --model_name OpenVINO/bge-base-en-v1.5-fp16-ov --model_path OpenVINO/bge-base-en-v1.5-fp16-ov -ovms --add_to_config --config_path c:\models\config.json --model_name OpenVINO/bge-reranker-base-fp16-ov --model_path OpenVINO/bge-reranker-base-fp16-ov -``` -::: :::: -### Optionally, if you want to deploy different models use the build-in ovms functionality in openvino/model_server:latest-py described here [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) +### Optionally, if you want to deploy different models, use the built-in OVMS functionality in `openvino/model_server:latest-py` described in [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) ## Deploying the model server -### With Docker +::::{tab-set} + +:::{tab-item} With Docker ```bash docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro openvino/model_server:latest --rest_port 8000 --config_path /workspace/config.json ``` -### On Baremetal Unix -```bash -ovms --rest_port 8000 --config_path models/config.json -``` -### Windows -```bat -ovms --rest_port 8000 --config_path models\config.json -``` +::: -### Server as Windows Service +:::{tab-item} On Baremetal Windows ```bat -sc start ovms +ovms --rest_port 8000 --config_path models\config.json ``` +::: +:::: ## Readiness Check From d6e026b6706923ae17a92b78c57eac70fd55dda6 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Tue, 7 Apr 2026 17:03:43 +0200 Subject: [PATCH 06/10] Fix 
doc --- demos/continuous_batching/rag/README.md | 7 ++++--- demos/continuous_batching/rag/rag_demo.ipynb | 4 ---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index 283920221f..aabd68cfad 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -1,6 +1,6 @@ # RAG demo with OpenVINO Model Server {#ovms_demos_continuous_batching_rag} -## Creating models repository for all the endpoints with ovms --pull from [Hugging Face Hub OpenVINO organization](https://huggingface.co/OpenVINO) +## Creating models repository for all the endpoints ::::{tab-set} @@ -19,7 +19,7 @@ docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/mo ``` ::: -:::{tab-item} On Baremetal Host +:::{tab-item} On Baremetal Windows **Required:** OpenVINO Model Server package - see [deployment instructions](../../../docs/deploying_server_baremetal.md) for details. 
```bat @@ -37,7 +37,8 @@ ovms --add_to_config --config_path models/config.json --model_name OpenVINO/bge- :::: -### Optionally, if you want to deploy different models, use the built-in OVMS functionality in `openvino/model_server:latest-py` described in [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) +** Note If you want to deploy modelss in pytorch format you can use the built-in OVMS optimum-cli functionality of `openvino/model_server:latest-py` described in [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) +** Note You can also use [the windows service](../../../docs/windows_service.md) setup for the ease of use and shorter commands ## Deploying the model server diff --git a/demos/continuous_batching/rag/rag_demo.ipynb b/demos/continuous_batching/rag/rag_demo.ipynb index 88e9c67f72..732347e3bc 100644 --- a/demos/continuous_batching/rag/rag_demo.ipynb +++ b/demos/continuous_batching/rag/rag_demo.ipynb @@ -15,10 +15,6 @@ "OpenVINO models:\n", " `OpenVINO/Qwen3-8B-int4-ov` for `chat/completions` and `OpenVINO/bge-base-en-v1.5-fp16-ov` for `embeddings` and `OpenVINO/bge-reranker-base-fp16-ov` for `rerank` endpoint.\n", "\n", - "or\n", - "Converted models:\n", - " `meta-llama/Meta-Llama-3-8B-Instruct` for `chat/completions` and `Alibaba-NLP/gte-large-en-v1.5` for `embeddings` and `BAAI/bge-reranker-large` for `rerank` endpoint. \n", - "\n", "Check https://github.com/openvinotoolkit/model_server/tree/main/demos/continuous_batching/rag/README.md to see how they can be deployed.\n", "LLM model, embeddings and rerank can be on hosted on the same model server instance or separately as needed.\n", "openai_api_base , base_url parameters with the target url and model names in the commands might need to be adjusted. 
\n", From 25e72e1f04486962375ad3b293ca9d2bb407d3c9 Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Wed, 8 Apr 2026 13:29:13 +0200 Subject: [PATCH 07/10] Fix getuuid --- docs/pull_optimum_cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pull_optimum_cli.md b/docs/pull_optimum_cli.md index 0c046d3932..89f6e6ac63 100644 --- a/docs/pull_optimum_cli.md +++ b/docs/pull_optimum_cli.md @@ -56,7 +56,7 @@ ovms --pull --source_model "Qwen/Qwen3-4B" --model_repository_path /models --mod ```bash mkdir -p models -docker run -u $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-py --pull --source_model "Qwen/Qwen3-4B" --model_repository_path /models --model_name Qwen3-4B --task text_generation --weight-format int8 +docker run -u $(id -u):$(id -g) -e HF_HOME=/tmp -e TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor --rm -v $(pwd)/models:/models:rw openvino/model_server:latest-py --pull --source_model "Qwen/Qwen3-4B" --model_repository_path /models --model_name Qwen3-4B --task text_generation --weight-format int8 ``` ::: From de3e286de875fa00741a2b8b701c268f12adaa3e Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Wed, 8 Apr 2026 14:33:21 +0200 Subject: [PATCH 08/10] Code review --- demos/continuous_batching/rag/README.md | 5 +++-- docs/pull_optimum_cli.md | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md index aabd68cfad..fc7b9f578d 100644 --- a/demos/continuous_batching/rag/README.md +++ b/demos/continuous_batching/rag/README.md @@ -37,8 +37,9 @@ ovms --add_to_config --config_path models/config.json --model_name OpenVINO/bge- :::: -** Note If you want to deploy modelss in pytorch format you can use the built-in OVMS optimum-cli functionality of `openvino/model_server:latest-py` described in [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) -** Note You can also use [the windows 
service](../../../docs/windows_service.md) setup for the ease of use and shorter commands +> NOTE: If you want to deploy models in PyTorch format, you can use the built-in OVMS optimum-cli functionality of `openvino/model_server:latest-py` described in [pull mode with optimum cli](../../../docs/pull_optimum_cli.md) + +> NOTE: You can also use [the Windows service](../../../docs/windows_service.md) setup for ease of use and shorter commands - with default model_repository_path and config_path ## Deploying the model server diff --git a/docs/pull_optimum_cli.md b/docs/pull_optimum_cli.md index 89f6e6ac63..e44be7541f 100644 --- a/docs/pull_optimum_cli.md +++ b/docs/pull_optimum_cli.md @@ -85,7 +85,7 @@ You can mount the HuggingFace cache to avoid downloading the original model in c Below is an example pull command with optimum model cache directory sharing for model download: ```bash -docker run -v /etc/passwd:/etc/passwd -e HF_HOME=/hf_home/cache --user $(id -u):$(id -g) --group-add=$(id -g) -v ${HOME}/.cache/huggingface/:/hf_home/cache -v $(pwd)/models:/models:rw openvino/model_server:latest-py --pull --model_repository_path /models --source_model meta-llama/Llama-3.2-1B-Instruct --task text_generation --weight-format int8 +docker run -e TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor -e HF_HOME=/hf_home/cache --user $(id -u):$(id -g) --group-add=$(id -g) -v ${HOME}/.cache/huggingface/:/hf_home/cache -v $(pwd)/models:/models:rw openvino/model_server:latest-py --pull --model_repository_path /models --source_model meta-llama/Llama-3.2-1B-Instruct --task text_generation --weight-format int8 ``` or deploy without caching the model files with passed HF_TOKEN for authorization: From e6464fd676a67413fde778b2766be569fd98bf2a Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Thu, 9 Apr 2026 11:06:26 +0200 Subject: [PATCH 09/10] Add to menu --- docs/prepare_generative_use_cases.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/docs/prepare_generative_use_cases.md b/docs/prepare_generative_use_cases.md index 1add31ba69..5fc49c420a 100644 --- a/docs/prepare_generative_use_cases.md +++ b/docs/prepare_generative_use_cases.md @@ -7,10 +7,13 @@ hidden: --- ovms_docs_pull +ovms_docs_pull_optimum ovms_demos_common_export ``` -Prepare model using OVMS [pull mode](./pull_hf_models.md). +Prepare models using OVMS [pull mode](./pull_hf_models.md). + +Prepare models using OVMS with python[optimum pull mode](./pull_optimum_cli.md). Prepare models using [python script](../demos/common/export_models/README.md). From 35142fa68880f54a95d544bc8bbaef1d641f840a Mon Sep 17 00:00:00 2001 From: Rafal Sapala Date: Thu, 9 Apr 2026 11:28:41 +0200 Subject: [PATCH 10/10] Space fix --- docs/prepare_generative_use_cases.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/prepare_generative_use_cases.md b/docs/prepare_generative_use_cases.md index 5fc49c420a..b07a83ed32 100644 --- a/docs/prepare_generative_use_cases.md +++ b/docs/prepare_generative_use_cases.md @@ -14,6 +14,6 @@ ovms_demos_common_export Prepare models using OVMS [pull mode](./pull_hf_models.md). -Prepare models using OVMS with python[optimum pull mode](./pull_optimum_cli.md). +Prepare models using OVMS with Python [optimum pull mode](./pull_optimum_cli.md). Prepare models using [python script](../demos/common/export_models/README.md).