InfiniTensor · PanZezhong1725 · May 18, 2026
diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp
@@ -132,19 +132,31 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
         -> std::optional<infinicore::Tensor> {
         return t.has_value() ? t.value()->to(device) : t;
     };
+    auto to_device_vec = [&](const std::optional<std::vector<infinicore::Tensor>> &vec)
+        -> std::optional<std::vector<infinicore::Tensor>> {
+        if (!vec.has_value()) {
+            return vec;
+        }
+        std::vector<infinicore::Tensor> result;
+        result.reserve(vec->size());
+        for (const auto &t : vec.value()) {
+            result.push_back(t->to(device));
+        }
+        return result;
+    };
 
     infinilm::InfinilmModel::Input input = {
         to_device(input_ids), // @todo: on device in the future
-        to_device(pixel_values),
         to_device(position_ids),
         to_device(past_sequence_lengths), // @todo: on device in the future
         to_device(total_sequence_lengths),
         to_device(input_offsets),
         to_device(cu_seqlens),
         to_device(block_tables),
         to_device(slot_mapping),
-        to_device(image_bound),
-        to_device(tgt_sizes),
+        to_device_vec(pixel_values),
+        to_device_vec(image_bound),
+        to_device_vec(tgt_sizes),
     };
 
     infinilm::global_state::get_forward_context().attn_metadata = {
@@ -153,8 +165,11 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
         input.input_offsets,
         input.cu_seqlens,
         input.block_tables,
-        input.slot_mapping,
-    };
+        input.slot_mapping};
+
+    global_state::get_forward_context().mm_metadata = {
+        image_req_ids};
+
     return input;
 }
 

diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp
@@ -36,8 +36,6 @@ class RankWorker {
     struct Input {
         /// Token IDs tensor of shape `[batch, seq_len]`.
         std::optional<infinicore::Tensor> input_ids;
-        /// Image pixel values for multi-modal models.
-        std::optional<infinicore::Tensor> pixel_values;
         /// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
         std::optional<infinicore::Tensor> position_ids;
         /// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
@@ -52,10 +50,14 @@ class RankWorker {
         std::optional<infinicore::Tensor> block_tables;
         /// Slot ids for each token `[seq]`. Used for paged cache.
         std::optional<infinicore::Tensor> slot_mapping;
+        /// Image pixel values for multi-modal models.
+        std::optional<std::vector<infinicore::Tensor>> pixel_values;
         /// Image placeholder bounds for MiniCPM-V style replacement.
-        std::optional<infinicore::Tensor> image_bound;
+        std::optional<std::vector<infinicore::Tensor>> image_bound;
         /// Target patch sizes for each image (MiniCPM-V).
-        std::optional<infinicore::Tensor> tgt_sizes;
+        std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
+        /// req_id for each pixel_values among a batch
+        std::optional<std::vector<size_t>> image_req_ids;
 
         float temperature{1};
 

diff --git a/csrc/global_state/forward_context.hpp b/csrc/global_state/forward_context.hpp
@@ -40,8 +40,13 @@ struct AttentionMetadata {
                                                                                        input.slot_mapping) {}
 };
 
+struct MultiModalMetadata {
+    std::optional<std::vector<size_t>> image_req_ids;
+};
+
 struct ForwardContext {
     AttentionMetadata attn_metadata;
+    MultiModalMetadata mm_metadata;
     std::vector<infinicore::Tensor> kv_cache_vec;
 };
 

diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp
@@ -21,9 +21,6 @@ class InfinilmModel : public infinicore::nn::Module {
     struct Input {
         /// Token IDs tensor of shape `[batch, seq_len]`.
         std::optional<infinicore::Tensor> input_ids;
-        /// Image pixel values for multi-modal models.
-        /// Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [batch, 3, patch, seq_len * patch]).
-        std::optional<infinicore::Tensor> pixel_values;
         /// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
         std::optional<infinicore::Tensor> position_ids;
         /// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
@@ -38,12 +35,15 @@ class InfinilmModel : public infinicore::nn::Module {
         std::optional<infinicore::Tensor> block_tables;
         /// Slot ids for each token `[seq]`. Used for paged cache.
         std::optional<infinicore::Tensor> slot_mapping;
+        /// Image pixel values for multi-modal models.
+        /// Vector of tensors. Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [n_patch, 3, filter_H, H * W / filter_H]).
+        std::optional<std::vector<infinicore::Tensor>> pixel_values;
         /// Image placeholder bounds for MiniCPM-V style replacement.
-        /// Tensor shape: [batch, max_ranges, 2] (start, end).
-        std::optional<infinicore::Tensor> image_bound;
+        /// Vector of tensors shape: [n_patch, 2].
+        std::optional<std::vector<infinicore::Tensor>> image_bound;
         /// Target patch sizes for each image (MiniCPM-V).
-        /// Tensor shape: [batch, 2] or [batch, max_slices, 2] if pre-flattened.
-        std::optional<infinicore::Tensor> tgt_sizes;
+        /// Vector of tensors shape: [n_path, 2] if pre-flattened.
+        std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
     };
 
     struct Output {

diff --git a/csrc/models/minicpmv/minicpmv_model.cpp b/csrc/models/minicpmv/minicpmv_model.cpp
@@ -70,36 +70,30 @@ InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input)
     }
     auto input_ids = input.input_ids.value();
 
-    if (input.pixel_values.has_value() && input_ids->size(1) > 1) {
-        if (!input.image_bound.has_value()) {
-            throw std::runtime_error("MiniCPMVModel: image_bound required for multimodal input");
+    if (input.pixel_values.has_value() && input.pixel_values.value().size() > 0) {
+        if (!input.image_bound.has_value() or !input.tgt_sizes.has_value()) {
+            throw std::runtime_error("MiniCPMVModel: image_bound and tgt_sizes must be provided with pixel_values");
+        }
+        if (input.pixel_values->size() != input.image_bound->size() || input.pixel_values->size() != input.tgt_sizes->size()) {
+            throw std::runtime_error("MiniCPMVModel: pixel_values, image_bound and tgt_sizes must have the same number of elements");
         }
-        auto pixel_values = input.pixel_values.value();
-        auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes);
-        auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes);
 
         auto inputs_embeds = llm_->model().embed_tokens(input_ids);
-        auto merged_embeds = replace_embeddings(inputs_embeds, vision_hidden, input.image_bound.value());
-
-        infinicore::Tensor position_ids;
-        if (input.position_ids.has_value()) {
-            position_ids = input.position_ids.value();
-        } else {
-            auto batch = merged_embeds->size(0);
-            auto seq_len = merged_embeds->size(1);
-            auto pos_cpu = infinicore::Tensor::zeros({batch, seq_len}, infinicore::DataType::I64, infinicore::Device::cpu());
-            auto *pos_ptr = reinterpret_cast<int64_t *>(pos_cpu->data());
-            for (size_t b = 0; b < batch; ++b) {
-                for (size_t i = 0; i < seq_len; ++i) {
-                    pos_ptr[b * seq_len + i] = static_cast<int64_t>(i);
-                }
-            }
-            position_ids = pos_cpu->to(merged_embeds->device());
+
+        // inputs_embeds concat tokens from all requests, while images are processed per request
+        // slice inputs_embeds using request offsets to get the embedding of each request
+        infinicore::Tensor input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu());
+        int32_t *offsets = (int32_t *)(input_offsets_cpu->data());
+        for (size_t i : global_state::get_forward_context().mm_metadata.image_req_ids.value()) {
+            auto pixel_values = input.pixel_values.value().at(i);
+            auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes.value().at(i));
+            auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes.value().at(i));
+            inputs_embeds = replace_embeddings(inputs_embeds->narrow({{1, size_t(offsets[i]), size_t(offsets[i + 1] - offsets[i])}}), vision_hidden, input.image_bound.value().at(i));
         }
 
         auto hidden_states = llm_->model().forward_embeds(
-            merged_embeds,
-            position_ids);
+            inputs_embeds,
+            input.position_ids.value());
 
         auto logits = llm_->logits_from_hidden(hidden_states);
         return {logits};

diff --git a/csrc/models/minicpmv/resampler.cpp b/csrc/models/minicpmv/resampler.cpp
@@ -141,7 +141,7 @@ Resampler::Resampler(size_t num_queries,
 }
 
 infinicore::Tensor Resampler::forward(const infinicore::Tensor &x,
-                                      const std::optional<infinicore::Tensor> &tgt_sizes) const {
+                                      const infinicore::Tensor &tgt_sizes) const {
     auto batch_size = x->size(0);
     auto seq_len = x->size(1);
 
@@ -153,12 +153,11 @@ infinicore::Tensor Resampler::forward(const infinicore::Tensor &x,
 
     // Build positional embeddings on CPU
     std::vector<int64_t> tgt_sizes_host;
-    if (tgt_sizes.has_value()) {
-        auto tgt_cpu = tgt_sizes.value()->to(infinicore::Device::cpu());
-        auto n = tgt_cpu->numel();
-        tgt_sizes_host.resize(n);
-        std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
-    }
+
+    auto tgt_cpu = tgt_sizes->to(infinicore::Device::cpu());
+    auto n = tgt_cpu->numel();
+    tgt_sizes_host.resize(n);
+    std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
 
     auto pos_cpu = infinicore::Tensor::zeros({batch_size, seq_len, embed_dim_}, kv->dtype(), infinicore::Device::cpu());
     auto *pos_ptr = reinterpret_cast<std::byte *>(pos_cpu->data());

diff --git a/csrc/models/minicpmv/resampler.hpp b/csrc/models/minicpmv/resampler.hpp
@@ -43,7 +43,7 @@ class Resampler : public infinicore::nn::Module {
               const infinicore::Device &device);
 
     infinicore::Tensor forward(const infinicore::Tensor &x,
-                               const std::optional<infinicore::Tensor> &tgt_sizes) const;
+                               const infinicore::Tensor &tgt_sizes) const;
 
 private:
     size_t num_queries_;

diff --git a/csrc/models/minicpmv/siglip_vision.cpp b/csrc/models/minicpmv/siglip_vision.cpp
@@ -36,7 +36,7 @@ SiglipVisionEmbeddings::SiglipVisionEmbeddings(const nlohmann::json &config,
 }
 
 infinicore::Tensor SiglipVisionEmbeddings::forward(const infinicore::Tensor &pixel_values,
-                                                   const std::optional<infinicore::Tensor> &tgt_sizes) const {
+                                                   const infinicore::Tensor &tgt_sizes) const {
     auto patch_embeds = patch_embedding_->forward(pixel_values);
     auto batch_size = patch_embeds->size(0);
     auto seq_len = patch_embeds->size(2) * patch_embeds->size(3);
@@ -50,12 +50,11 @@ infinicore::Tensor SiglipVisionEmbeddings::forward(const infinicore::Tensor &pix
     const size_t num_patches_per_side = static_cast<size_t>(std::sqrt(static_cast<double>(num_positions_)));
 
     std::vector<int64_t> tgt_sizes_host;
-    if (tgt_sizes.has_value()) {
-        auto tgt_cpu = tgt_sizes.value()->to(infinicore::Device::cpu());
-        auto n = tgt_cpu->numel();
-        tgt_sizes_host.resize(n);
-        std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
-    }
+
+    auto tgt_cpu = tgt_sizes->to(infinicore::Device::cpu());
+    auto n = tgt_cpu->numel();
+    tgt_sizes_host.resize(n);
+    std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
 
     for (size_t b = 0; b < batch_size; ++b) {
         size_t nb_h = num_patches_per_side;
@@ -211,7 +210,7 @@ SiglipVisionModel::SiglipVisionModel(const nlohmann::json &config,
 }
 
 infinicore::Tensor SiglipVisionModel::forward(const infinicore::Tensor &pixel_values,
-                                              const std::optional<infinicore::Tensor> &tgt_sizes) const {
+                                              const infinicore::Tensor &tgt_sizes) const {
     auto hidden_states = embeddings_->forward(pixel_values, tgt_sizes);
     hidden_states = encoder_->forward(hidden_states, std::nullopt);
     return post_layernorm_->forward(hidden_states);

diff --git a/csrc/models/minicpmv/siglip_vision.hpp b/csrc/models/minicpmv/siglip_vision.hpp
@@ -36,7 +36,7 @@ class SiglipVisionEmbeddings : public infinicore::nn::Module {
                            const infinicore::Device &device);
 
     infinicore::Tensor forward(const infinicore::Tensor &pixel_values,
-                               const std::optional<infinicore::Tensor> &tgt_sizes) const;
+                               const infinicore::Tensor &tgt_sizes) const;
 
 private:
     size_t hidden_size_;
@@ -119,7 +119,7 @@ class SiglipVisionModel : public infinicore::nn::Module {
                       bool drop_last_layer);
 
     infinicore::Tensor forward(const infinicore::Tensor &pixel_values,
-                               const std::optional<infinicore::Tensor> &tgt_sizes) const;
+                               const infinicore::Tensor &tgt_sizes) const;
 
 private:
     nlohmann::json config_;

diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp
@@ -67,8 +67,10 @@ inline void bind_infer_engine(py::module &m) {
             return state_dict_tp_all;
         })
         .def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
-        .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
-        .def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
+        .def(
+            "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
+        .def(
+            "reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
         .def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr<cache::CacheConfig> {
             auto cfg = self.get_cache_config();
             return cfg ? std::shared_ptr<cache::CacheConfig>(cfg->unique_copy()) : nullptr; })
@@ -114,8 +116,10 @@ inline void bind_infer_engine(py::module &m) {
             return state_dict_tp_all;
         })
         .def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
-        .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
-        .def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
+        .def(
+            "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
+        .def(
+            "reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
         .def("get_cache_config", [](const InferEngine &self) {
             auto cfg = self.get_cache_config();
             return std::shared_ptr<cache::CacheConfig>(std::move(cfg->unique_copy())); })
@@ -125,29 +129,31 @@ inline void bind_infer_engine(py::module &m) {
         .def(
             py::init([](
                          std::optional<infinicore::Tensor> input_ids,
-                         std::optional<infinicore::Tensor> pixel_values,
                          std::optional<infinicore::Tensor> position_ids,
                          std::optional<infinicore::Tensor> past_sequence_lengths,
                          std::optional<infinicore::Tensor> total_sequence_lengths,
                          std::optional<infinicore::Tensor> input_offsets,
                          std::optional<infinicore::Tensor> cu_seqlens,
                          std::optional<infinicore::Tensor> block_tables,
                          std::optional<infinicore::Tensor> slot_mapping,
-                         std::optional<infinicore::Tensor> image_bound,
-                         std::optional<infinicore::Tensor> tgt_sizes,
+                         std::optional<std::vector<infinicore::Tensor>> pixel_values,
+                         std::optional<std::vector<infinicore::Tensor>> image_bound,
+                         std::optional<std::vector<infinicore::Tensor>> tgt_sizes,
+                         std::optional<std::vector<size_t>> image_req_ids,
                          py::kwargs kwargs) {
                 InferEngine::Input input{
                     std::move(input_ids),
-                    std::move(pixel_values),
                     std::move(position_ids),
                     std::move(past_sequence_lengths),
                     std::move(total_sequence_lengths),
                     std::move(input_offsets),
                     std::move(cu_seqlens),
                     std::move(block_tables),
                     std::move(slot_mapping),
+                    std::move(pixel_values),
                     std::move(image_bound),
                     std::move(tgt_sizes),
+                    std::move(image_req_ids),
                 };
 
                 // Explicit defaults
@@ -182,27 +188,29 @@ inline void bind_infer_engine(py::module &m) {
                 return input;
             }),
             py::arg("input_ids") = std::nullopt,
-            py::arg("pixel_values") = std::nullopt,
             py::arg("position_ids") = std::nullopt,
             py::arg("past_sequence_lengths") = std::nullopt,
             py::arg("total_sequence_lengths") = std::nullopt,
             py::arg("input_offsets") = std::nullopt,
             py::arg("cu_seqlens") = std::nullopt,
             py::arg("block_tables") = std::nullopt,
             py::arg("slot_mapping") = std::nullopt,
+            py::arg("pixel_values") = std::nullopt,
             py::arg("image_bound") = std::nullopt,
-            py::arg("tgt_sizes") = std::nullopt)
+            py::arg("tgt_sizes") = std::nullopt,
+            py::arg("image_req_ids") = std::nullopt)
         .def_readwrite("input_ids", &InferEngine::Input::input_ids)
-        .def_readwrite("pixel_values", &InferEngine::Input::pixel_values)
         .def_readwrite("position_ids", &InferEngine::Input::position_ids)
         .def_readwrite("past_sequence_lengths", &InferEngine::Input::past_sequence_lengths)
         .def_readwrite("total_sequence_lengths", &InferEngine::Input::total_sequence_lengths)
         .def_readwrite("input_offsets", &InferEngine::Input::input_offsets)
         .def_readwrite("cu_seqlens", &InferEngine::Input::cu_seqlens)
         .def_readwrite("block_tables", &InferEngine::Input::block_tables)
         .def_readwrite("slot_mapping", &InferEngine::Input::slot_mapping)
+        .def_readwrite("pixel_values", &InferEngine::Input::pixel_values)
         .def_readwrite("image_bound", &InferEngine::Input::image_bound)
         .def_readwrite("tgt_sizes", &InferEngine::Input::tgt_sizes)
+        .def_readwrite("image_req_ids", &InferEngine::Input::image_req_ids)
         .def_readwrite("temperature", &InferEngine::Input::temperature)
         .def_readwrite("top_k", &InferEngine::Input::top_k)
         .def_readwrite("top_p", &InferEngine::Input::top_p);