Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,19 +132,31 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
-> std::optional<infinicore::Tensor> {
return t.has_value() ? t.value()->to(device) : t;
};
auto to_device_vec = [&](const std::optional<std::vector<infinicore::Tensor>> &vec)
-> std::optional<std::vector<infinicore::Tensor>> {
if (!vec.has_value()) {
return vec;
}
std::vector<infinicore::Tensor> result;
result.reserve(vec->size());
for (const auto &t : vec.value()) {
result.push_back(t->to(device));
}
return result;
};

infinilm::InfinilmModel::Input input = {
to_device(input_ids), // @todo: on device in the future
to_device(pixel_values),
to_device(position_ids),
to_device(past_sequence_lengths), // @todo: on device in the future
to_device(total_sequence_lengths),
to_device(input_offsets),
to_device(cu_seqlens),
to_device(block_tables),
to_device(slot_mapping),
to_device(image_bound),
to_device(tgt_sizes),
to_device_vec(pixel_values),
to_device_vec(image_bound),
to_device_vec(tgt_sizes),
};

infinilm::global_state::get_forward_context().attn_metadata = {
Expand All @@ -153,8 +165,11 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
input.input_offsets,
input.cu_seqlens,
input.block_tables,
input.slot_mapping,
};
input.slot_mapping};

global_state::get_forward_context().mm_metadata = {
image_req_ids};

return input;
}

Expand Down
10 changes: 6 additions & 4 deletions csrc/engine/rank_worker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ class RankWorker {
struct Input {
/// Token IDs tensor of shape `[batch, seq_len]`.
std::optional<infinicore::Tensor> input_ids;
/// Image pixel values for multi-modal models.
std::optional<infinicore::Tensor> pixel_values;
/// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
std::optional<infinicore::Tensor> position_ids;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
Expand All @@ -52,10 +50,14 @@ class RankWorker {
std::optional<infinicore::Tensor> block_tables;
/// Slot ids for each token `[seq]`. Used for paged cache.
std::optional<infinicore::Tensor> slot_mapping;
/// Image pixel values for multi-modal models.
std::optional<std::vector<infinicore::Tensor>> pixel_values;
/// Image placeholder bounds for MiniCPM-V style replacement.
std::optional<infinicore::Tensor> image_bound;
std::optional<std::vector<infinicore::Tensor>> image_bound;
/// Target patch sizes for each image (MiniCPM-V).
std::optional<infinicore::Tensor> tgt_sizes;
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
/// req_id for each pixel_values among a batch
std::optional<std::vector<size_t>> image_req_ids;

float temperature{1};

Expand Down
5 changes: 5 additions & 0 deletions csrc/global_state/forward_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,13 @@ struct AttentionMetadata {
input.slot_mapping) {}
};

struct MultiModalMetadata {
std::optional<std::vector<size_t>> image_req_ids;
};

struct ForwardContext {
AttentionMetadata attn_metadata;
MultiModalMetadata mm_metadata;
std::vector<infinicore::Tensor> kv_cache_vec;
};

Expand Down
14 changes: 7 additions & 7 deletions csrc/models/infinilm_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ class InfinilmModel : public infinicore::nn::Module {
struct Input {
/// Token IDs tensor of shape `[batch, seq_len]`.
std::optional<infinicore::Tensor> input_ids;
/// Image pixel values for multi-modal models.
/// Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [batch, 3, patch, seq_len * patch]).
std::optional<infinicore::Tensor> pixel_values;
/// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
std::optional<infinicore::Tensor> position_ids;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
Expand All @@ -38,12 +35,15 @@ class InfinilmModel : public infinicore::nn::Module {
std::optional<infinicore::Tensor> block_tables;
/// Slot ids for each token `[seq]`. Used for paged cache.
std::optional<infinicore::Tensor> slot_mapping;
/// Image pixel values for multi-modal models.
/// Vector of tensors. Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [n_patch, 3, filter_H, H * W / filter_H]).
std::optional<std::vector<infinicore::Tensor>> pixel_values;
/// Image placeholder bounds for MiniCPM-V style replacement.
/// Tensor shape: [batch, max_ranges, 2] (start, end).
std::optional<infinicore::Tensor> image_bound;
/// Vector of tensors shape: [n_patch, 2].
std::optional<std::vector<infinicore::Tensor>> image_bound;
/// Target patch sizes for each image (MiniCPM-V).
/// Tensor shape: [batch, 2] or [batch, max_slices, 2] if pre-flattened.
std::optional<infinicore::Tensor> tgt_sizes;
/// Vector of tensors shape: [n_path, 2] if pre-flattened.
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
};

struct Output {
Expand Down
42 changes: 18 additions & 24 deletions csrc/models/minicpmv/minicpmv_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,36 +70,30 @@ InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input)
}
auto input_ids = input.input_ids.value();

if (input.pixel_values.has_value() && input_ids->size(1) > 1) {
if (!input.image_bound.has_value()) {
throw std::runtime_error("MiniCPMVModel: image_bound required for multimodal input");
if (input.pixel_values.has_value() && input.pixel_values.value().size() > 0) {
if (!input.image_bound.has_value() or !input.tgt_sizes.has_value()) {
throw std::runtime_error("MiniCPMVModel: image_bound and tgt_sizes must be provided with pixel_values");
}
if (input.pixel_values->size() != input.image_bound->size() || input.pixel_values->size() != input.tgt_sizes->size()) {
throw std::runtime_error("MiniCPMVModel: pixel_values, image_bound and tgt_sizes must have the same number of elements");
}
auto pixel_values = input.pixel_values.value();
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes);
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes);

auto inputs_embeds = llm_->model().embed_tokens(input_ids);
auto merged_embeds = replace_embeddings(inputs_embeds, vision_hidden, input.image_bound.value());

infinicore::Tensor position_ids;
if (input.position_ids.has_value()) {
position_ids = input.position_ids.value();
} else {
auto batch = merged_embeds->size(0);
auto seq_len = merged_embeds->size(1);
auto pos_cpu = infinicore::Tensor::zeros({batch, seq_len}, infinicore::DataType::I64, infinicore::Device::cpu());
auto *pos_ptr = reinterpret_cast<int64_t *>(pos_cpu->data());
for (size_t b = 0; b < batch; ++b) {
for (size_t i = 0; i < seq_len; ++i) {
pos_ptr[b * seq_len + i] = static_cast<int64_t>(i);
}
}
position_ids = pos_cpu->to(merged_embeds->device());

// inputs_embeds concat tokens from all requests, while images are processed per request
// slice inputs_embeds using request offsets to get the embedding of each request
infinicore::Tensor input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu());
int32_t *offsets = (int32_t *)(input_offsets_cpu->data());
for (size_t i : global_state::get_forward_context().mm_metadata.image_req_ids.value()) {
auto pixel_values = input.pixel_values.value().at(i);
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes.value().at(i));
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes.value().at(i));
inputs_embeds = replace_embeddings(inputs_embeds->narrow({{1, size_t(offsets[i]), size_t(offsets[i + 1] - offsets[i])}}), vision_hidden, input.image_bound.value().at(i));
}

auto hidden_states = llm_->model().forward_embeds(
merged_embeds,
position_ids);
inputs_embeds,
input.position_ids.value());

auto logits = llm_->logits_from_hidden(hidden_states);
return {logits};
Expand Down
13 changes: 6 additions & 7 deletions csrc/models/minicpmv/resampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ Resampler::Resampler(size_t num_queries,
}

infinicore::Tensor Resampler::forward(const infinicore::Tensor &x,
const std::optional<infinicore::Tensor> &tgt_sizes) const {
const infinicore::Tensor &tgt_sizes) const {
auto batch_size = x->size(0);
auto seq_len = x->size(1);

Expand All @@ -153,12 +153,11 @@ infinicore::Tensor Resampler::forward(const infinicore::Tensor &x,

// Build positional embeddings on CPU
std::vector<int64_t> tgt_sizes_host;
if (tgt_sizes.has_value()) {
auto tgt_cpu = tgt_sizes.value()->to(infinicore::Device::cpu());
auto n = tgt_cpu->numel();
tgt_sizes_host.resize(n);
std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
}

auto tgt_cpu = tgt_sizes->to(infinicore::Device::cpu());
auto n = tgt_cpu->numel();
tgt_sizes_host.resize(n);
std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));

auto pos_cpu = infinicore::Tensor::zeros({batch_size, seq_len, embed_dim_}, kv->dtype(), infinicore::Device::cpu());
auto *pos_ptr = reinterpret_cast<std::byte *>(pos_cpu->data());
Expand Down
2 changes: 1 addition & 1 deletion csrc/models/minicpmv/resampler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class Resampler : public infinicore::nn::Module {
const infinicore::Device &device);

infinicore::Tensor forward(const infinicore::Tensor &x,
const std::optional<infinicore::Tensor> &tgt_sizes) const;
const infinicore::Tensor &tgt_sizes) const;

private:
size_t num_queries_;
Expand Down
15 changes: 7 additions & 8 deletions csrc/models/minicpmv/siglip_vision.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ SiglipVisionEmbeddings::SiglipVisionEmbeddings(const nlohmann::json &config,
}

infinicore::Tensor SiglipVisionEmbeddings::forward(const infinicore::Tensor &pixel_values,
const std::optional<infinicore::Tensor> &tgt_sizes) const {
const infinicore::Tensor &tgt_sizes) const {
auto patch_embeds = patch_embedding_->forward(pixel_values);
auto batch_size = patch_embeds->size(0);
auto seq_len = patch_embeds->size(2) * patch_embeds->size(3);
Expand All @@ -50,12 +50,11 @@ infinicore::Tensor SiglipVisionEmbeddings::forward(const infinicore::Tensor &pix
const size_t num_patches_per_side = static_cast<size_t>(std::sqrt(static_cast<double>(num_positions_)));

std::vector<int64_t> tgt_sizes_host;
if (tgt_sizes.has_value()) {
auto tgt_cpu = tgt_sizes.value()->to(infinicore::Device::cpu());
auto n = tgt_cpu->numel();
tgt_sizes_host.resize(n);
std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));
}

auto tgt_cpu = tgt_sizes->to(infinicore::Device::cpu());
auto n = tgt_cpu->numel();
tgt_sizes_host.resize(n);
std::memcpy(tgt_sizes_host.data(), tgt_cpu->data(), n * sizeof(int64_t));

for (size_t b = 0; b < batch_size; ++b) {
size_t nb_h = num_patches_per_side;
Expand Down Expand Up @@ -211,7 +210,7 @@ SiglipVisionModel::SiglipVisionModel(const nlohmann::json &config,
}

infinicore::Tensor SiglipVisionModel::forward(const infinicore::Tensor &pixel_values,
const std::optional<infinicore::Tensor> &tgt_sizes) const {
const infinicore::Tensor &tgt_sizes) const {
auto hidden_states = embeddings_->forward(pixel_values, tgt_sizes);
hidden_states = encoder_->forward(hidden_states, std::nullopt);
return post_layernorm_->forward(hidden_states);
Expand Down
4 changes: 2 additions & 2 deletions csrc/models/minicpmv/siglip_vision.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class SiglipVisionEmbeddings : public infinicore::nn::Module {
const infinicore::Device &device);

infinicore::Tensor forward(const infinicore::Tensor &pixel_values,
const std::optional<infinicore::Tensor> &tgt_sizes) const;
const infinicore::Tensor &tgt_sizes) const;

private:
size_t hidden_size_;
Expand Down Expand Up @@ -119,7 +119,7 @@ class SiglipVisionModel : public infinicore::nn::Module {
bool drop_last_layer);

infinicore::Tensor forward(const infinicore::Tensor &pixel_values,
const std::optional<infinicore::Tensor> &tgt_sizes) const;
const infinicore::Tensor &tgt_sizes) const;

private:
nlohmann::json config_;
Expand Down
30 changes: 19 additions & 11 deletions csrc/pybind11/engine/engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ inline void bind_infer_engine(py::module &m) {
return state_dict_tp_all;
})
.def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
.def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
.def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
.def(
"forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
.def(
"reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
.def("get_cache_config", [](const InferEngine &self) -> std::shared_ptr<cache::CacheConfig> {
auto cfg = self.get_cache_config();
return cfg ? std::shared_ptr<cache::CacheConfig>(cfg->unique_copy()) : nullptr; })
Expand Down Expand Up @@ -114,8 +116,10 @@ inline void bind_infer_engine(py::module &m) {
return state_dict_tp_all;
})
.def("process_weights_after_loading", &InferEngine::process_weights_after_loading, "Process the weights after loading on all workers (e.g., for quantization)")
.def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
.def("reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
.def(
"forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments")
.def(
"reset_cache", [](InferEngine &self, std::shared_ptr<cache::CacheConfig> cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none())
.def("get_cache_config", [](const InferEngine &self) {
auto cfg = self.get_cache_config();
return std::shared_ptr<cache::CacheConfig>(std::move(cfg->unique_copy())); })
Expand All @@ -125,29 +129,31 @@ inline void bind_infer_engine(py::module &m) {
.def(
py::init([](
std::optional<infinicore::Tensor> input_ids,
std::optional<infinicore::Tensor> pixel_values,
std::optional<infinicore::Tensor> position_ids,
std::optional<infinicore::Tensor> past_sequence_lengths,
std::optional<infinicore::Tensor> total_sequence_lengths,
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping,
std::optional<infinicore::Tensor> image_bound,
std::optional<infinicore::Tensor> tgt_sizes,
std::optional<std::vector<infinicore::Tensor>> pixel_values,
std::optional<std::vector<infinicore::Tensor>> image_bound,
std::optional<std::vector<infinicore::Tensor>> tgt_sizes,
std::optional<std::vector<size_t>> image_req_ids,
py::kwargs kwargs) {
InferEngine::Input input{
std::move(input_ids),
std::move(pixel_values),
std::move(position_ids),
std::move(past_sequence_lengths),
std::move(total_sequence_lengths),
std::move(input_offsets),
std::move(cu_seqlens),
std::move(block_tables),
std::move(slot_mapping),
std::move(pixel_values),
std::move(image_bound),
std::move(tgt_sizes),
std::move(image_req_ids),
};

// Explicit defaults
Expand Down Expand Up @@ -182,27 +188,29 @@ inline void bind_infer_engine(py::module &m) {
return input;
}),
py::arg("input_ids") = std::nullopt,
py::arg("pixel_values") = std::nullopt,
py::arg("position_ids") = std::nullopt,
py::arg("past_sequence_lengths") = std::nullopt,
py::arg("total_sequence_lengths") = std::nullopt,
py::arg("input_offsets") = std::nullopt,
py::arg("cu_seqlens") = std::nullopt,
py::arg("block_tables") = std::nullopt,
py::arg("slot_mapping") = std::nullopt,
py::arg("pixel_values") = std::nullopt,
py::arg("image_bound") = std::nullopt,
py::arg("tgt_sizes") = std::nullopt)
py::arg("tgt_sizes") = std::nullopt,
py::arg("image_req_ids") = std::nullopt)
.def_readwrite("input_ids", &InferEngine::Input::input_ids)
.def_readwrite("pixel_values", &InferEngine::Input::pixel_values)
.def_readwrite("position_ids", &InferEngine::Input::position_ids)
.def_readwrite("past_sequence_lengths", &InferEngine::Input::past_sequence_lengths)
.def_readwrite("total_sequence_lengths", &InferEngine::Input::total_sequence_lengths)
.def_readwrite("input_offsets", &InferEngine::Input::input_offsets)
.def_readwrite("cu_seqlens", &InferEngine::Input::cu_seqlens)
.def_readwrite("block_tables", &InferEngine::Input::block_tables)
.def_readwrite("slot_mapping", &InferEngine::Input::slot_mapping)
.def_readwrite("pixel_values", &InferEngine::Input::pixel_values)
.def_readwrite("image_bound", &InferEngine::Input::image_bound)
.def_readwrite("tgt_sizes", &InferEngine::Input::tgt_sizes)
.def_readwrite("image_req_ids", &InferEngine::Input::image_req_ids)
.def_readwrite("temperature", &InferEngine::Input::temperature)
.def_readwrite("top_k", &InferEngine::Input::top_k)
.def_readwrite("top_p", &InferEngine::Input::top_p);
Expand Down
Loading