Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions LLama.Benchmark/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ public static string ModelDir
public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

public static string LLavaModelPath => Path.Combine("llava-v1.6-mistral-7b.Q3_K_XS.gguf");
public static string LLavaMmpPath => Path.Combine("mmproj-model-f16.gguf");
public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";

public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
}
}
3 changes: 0 additions & 3 deletions LLama.Unittest/MtmdWeightsTests.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
using System;
using System.IO;
using LLama.Common;
using LLama.Native;
using Xunit;

namespace LLama.Unittest
{
Expand Down
24 changes: 0 additions & 24 deletions LLama.Unittest/NativeAbiTests.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using LLama.Native;
using Xunit;

namespace LLama.Unittest
{
Expand Down Expand Up @@ -79,27 +76,6 @@ public void ContextParamsSizeMatchesNative()
Assert.Equal(expectedSize, Marshal.SizeOf<LLamaContextParams>());
}

[Fact]
public void MtmdContextParamsSizeMatchesNative()
{
var pointerSize = IntPtr.Size;
var fields = new List<(int size, int align)>
{
(sizeof(sbyte), 1), // use_gpu
(sizeof(sbyte), 1), // print_timings
(sizeof(int), 4), // n_threads
(pointerSize, pointerSize), // image_marker
(pointerSize, pointerSize), // media_marker
(sizeof(int), 4), // flash_attn_type
(sizeof(sbyte), 1), // warmup
(sizeof(int), 4), // image_min_tokens
(sizeof(int), 4), // image_max_tokens
};

var expectedSize = ComputeSize(fields);
Assert.Equal(expectedSize, Marshal.SizeOf<NativeApi.mtmd_context_params>());
}

[Fact]
public void ModelParamsBoolBlockMatchesNative()
{
Expand Down
3 changes: 3 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ public class ModelOptions
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

/// <inheritdoc />
public bool UseDirectIO { get; }

/// <inheritdoc />
public bool UseMemoryLock { get; set; } = false;

Expand Down
6 changes: 3 additions & 3 deletions LLama/Abstractions/ILLamaExecutor.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Collections.Generic;
using System.Collections.Generic;
using System.Threading;
using LLama.Native;

Expand All @@ -14,8 +14,8 @@ public interface ILLamaExecutor
/// </summary>
public LLamaContext Context { get; }

// LLava Section
//
// Multimodal Section

/// <summary>
/// Identify if it's a multi-modal model and there is an image to process.
/// </summary>
Expand Down
5 changes: 5 additions & 0 deletions LLama/Abstractions/IModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ public interface IModelParams
/// </summary>
bool UseMemorymap { get; }

/// <summary>
/// Use direct io, takes precedence over use_mmap when supported
/// </summary>
bool UseDirectIO { get; }

/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>
Expand Down
3 changes: 3 additions & 0 deletions LLama/Common/ModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ public record ModelParams
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

/// <inheritdoc />
public bool UseDirectIO { get; set; }

/// <inheritdoc />
public bool UseMemoryLock { get; set; }

Expand Down
1 change: 1 addition & 0 deletions LLama/Extensions/IModelParamsExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.use_direct_io = @params.UseDirectIO;
result.vocab_only = @params.VocabOnly;
result.check_tensors = @params.CheckTensors;

Expand Down Expand Up @@ -119,7 +120,7 @@
if (string.IsNullOrEmpty(name))
continue;

result[name] = buft;

Check warning on line 123 in LLama/Extensions/IModelParamsExtensions.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

Possible null reference argument for parameter 'key' in 'IntPtr Dictionary<string, IntPtr>.this[string key]'.

Check warning on line 123 in LLama/Extensions/IModelParamsExtensions.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

Possible null reference argument for parameter 'key' in 'IntPtr Dictionary<string, IntPtr>.this[string key]'.

Check warning on line 123 in LLama/Extensions/IModelParamsExtensions.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

Possible null reference argument for parameter 'key' in 'IntPtr Dictionary<string, IntPtr>.this[string key]'.
}

return result;
Expand Down
4 changes: 2 additions & 2 deletions LLama/LLamaExecutorBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ public abstract class StatefulExecutorBase : ILLamaExecutor
/// </summary>
protected AntipromptProcessor AntipromptProcessor { get; }

// LLava Section
//
// Multimodal Section

/// <inheritdoc />
public bool IsMultiModal
{
Expand Down
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>506bb6e01009058f355</BinaryReleaseId>
<BinaryReleaseId>ff4affb4c1aa7eb4_v3</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
Expand Down
2 changes: 1 addition & 1 deletion LLama/Native/LLamaModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public bool use_mmap
private sbyte _use_mmap;

/// <summary>
/// use direct io, takes precedence over use_mmap
/// use direct io, takes precedence over use_mmap when supported
/// </summary>
public bool use_direct_io
{
Expand Down
19 changes: 19 additions & 0 deletions LLama/Native/LLamaParamsFitStatus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace LLama.Native;

/// <summary>
/// Status returned by <c>llama_params_fit</c> when attempting to adjust model and
/// context parameters to fit the free device memory.
/// </summary>
public enum LLamaParamsFitStatus
{
/// <summary>
/// Found allocations that are projected to fit
/// </summary>
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,

/// <summary>
/// Could not find allocations that are projected to fit
/// </summary>
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,

/// <summary>
/// A hard error occurred, e.g. because no model could be found at the specified path
/// </summary>
LLAMA_PARAMS_FIT_STATUS_ERROR = 2,
}
6 changes: 3 additions & 3 deletions LLama/Native/Load/NativeLibraryConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -283,13 +283,13 @@ public override string ToString()
public sealed partial class NativeLibraryConfig
{
/// <summary>
/// Set configurations for all the native libraries, including LLama and LLava
/// Set configurations for all the native libraries, including LLama and Multimodal
/// </summary>
[Obsolete("Please use NativeLibraryConfig.All instead, or set configurations for NativeLibraryConfig.LLama and NativeLibraryConfig.LLavaShared respectively.")]
[Obsolete("Please use NativeLibraryConfig.All instead, or set configurations for NativeLibraryConfig.LLama and NativeLibraryConfig.Mtmd respectively.")]
public static NativeLibraryConfigContainer Instance => All;

/// <summary>
/// Set configurations for all the native libraries, including LLama and LLava
/// Set configurations for all the native libraries, including LLama and Multimodal
/// </summary>
public static NativeLibraryConfigContainer All { get; }

Expand Down
1 change: 1 addition & 0 deletions LLama/Native/LoraAdapter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@
public void Unload()
{
Loaded = false;
llama_adapter_lora_free(Pointer);

Check warning on line 44 in LLama/Native/LoraAdapter.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

'llama_adapter_lora_free(IntPtr)' is obsolete: 'adapters are now freed together with the associated model'

Check warning on line 44 in LLama/Native/LoraAdapter.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

'llama_adapter_lora_free(IntPtr)' is obsolete: 'adapters are now freed together with the associated model'

Check warning on line 44 in LLama/Native/LoraAdapter.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

'llama_adapter_lora_free(IntPtr)' is obsolete: 'adapters are now freed together with the associated model'

// Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
[Obsolete("adapters are now freed together with the associated model")]
static extern void llama_adapter_lora_free(IntPtr adapter);
}
}
7 changes: 0 additions & 7 deletions LLama/Native/MtmdContextParams.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System;
using System.Runtime.InteropServices;
using System.Text;

namespace LLama.Native;
Expand All @@ -24,12 +23,6 @@ public class MtmdContextParams
/// </summary>
public int NThreads { get; set; }

/// <summary>
/// Verbosity is no longer supported by mtmd_context_params and is ignored.
/// </summary>
[Obsolete("Verbosity is no longer supported by mtmd_context_params and is ignored.")]
public int Verbosity { get; set; }

/// <summary>
/// Marker token inserted into the text stream to reference an image embedding (deprecated by mtmd).
/// </summary>
Expand Down
20 changes: 0 additions & 20 deletions LLama/Native/MtmdImageEmbed.cs

This file was deleted.

5 changes: 3 additions & 2 deletions LLama/Native/NativeApi.Mtmd.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System;
using System.Runtime.InteropServices;
using System.Text;

namespace LLama.Native;

Expand All @@ -25,6 +23,9 @@ internal struct mtmd_context_params
[MarshalAs(UnmanagedType.I1)] public bool warmup;
public int image_min_tokens;
public int image_max_tokens;

private IntPtr /* ggml_backend_sched_eval_callback */ cb_eval;
private IntPtr cb_eval_user_data;
}

[DllImport(mtmdLibraryName, EntryPoint = "mtmd_default_marker", CallingConvention = CallingConvention.Cdecl)]
Expand Down
46 changes: 44 additions & 2 deletions LLama/Native/NativeApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ public static void llama_empty_call()
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_max_devices();

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern nint llama_max_tensor_buft_overrides();

/// <summary>
/// Maximum number of parallel sequences
/// </summary>
Expand Down Expand Up @@ -72,6 +75,9 @@ public static void llama_empty_call()
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llama_supports_rpc();

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern uint llama_n_ctx_seq(SafeLLamaContextHandle ctx);

/// <summary>
/// Initialize the llama + ggml backend. Call once at the start of the program.
///
Expand Down Expand Up @@ -168,8 +174,11 @@ public static void llama_empty_call()

/// <summary>
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// <br />
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates.
/// See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// </summary>
/// <param name="tmpl">A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.</param>
/// <param name="tmpl">A Jinja template to use for this chat.</param>
/// <param name="chat">Pointer to a list of multiple llama_chat_message</param>
/// <param name="n_msg">Number of llama_chat_message in this chat</param>
/// <param name="add_ass">Whether to end the prompt with the token(s) that indicate the start of an assistant message.</param>
Expand Down Expand Up @@ -312,7 +321,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
/// <param name="il_end"></param>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe int llama_apply_adapter_cvec(SafeLLamaContextHandle ctx, float* data, nuint len, int n_embd, int il_start, int il_end);
public static extern unsafe int llama_set_adapter_cvec(SafeLLamaContextHandle ctx, float* data, nuint len, int n_embd, int il_start, int il_end);

/// <summary>
/// Build a split GGUF final path for this chunk.
Expand Down Expand Up @@ -474,5 +483,38 @@ public static string llama_split_prefix(string splitPath, int splitNo, int split
/// <returns>Name of the buffer type</returns>
[DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern IntPtr ggml_backend_buft_name(IntPtr buft);

/// <summary>
/// Fits mparams and cparams to free device memory (assumes system memory is unlimited)
/// - returns true if the parameters could be successfully modified to fit device memory
/// - this function is NOT thread safe because it modifies the global llama logger state
/// - only parameters that have the same value as in llama_default_model_params are modified
/// with the exception of the context size which is modified if and only if equal to 0
/// </summary>
/// <param name="path"></param>
/// <param name="mparams"></param>
/// <param name="cparams"></param>
/// <param name="tensor_split">Writable buffer for tensor split, needs at least llama_max_devices elements</param>
/// <param name="tensor_buft_overrides">Writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements</param>
/// <param name="margins">Margins of memory to leave per device in bytes</param>
/// <param name="n_ctx_min">Minimum context size to set when trying to reduce memory use</param>
/// <param name="log_level">Minimum log level to print during fitting, lower levels go to debug log</param>
/// <returns>A <see cref="LLamaParamsFitStatus"/> indicating whether the parameters could be modified to fit device memory.</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe LLamaParamsFitStatus llama_params_fit(
string path,
ref LLamaModelParams mparams,
ref LLamaContextParams cparams,
float* tensor_split,
LLamaModelTensorBufferOverride* tensor_buft_overrides,
nint* margins,
uint n_ctx_min,
int /* GGML_LOG_LEVEL */ log_level
);

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_time_us();


}
}
Loading
Loading