Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions LLama.Benchmark/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ public static string ModelDir
public static string Generative7BModelPath => Path.Combine(ModelDir, "llama-2-7b-chat.Q3_K_S.gguf");
public static string EmbeddingModelPath => Path.Combine(ModelDir, "all-MiniLM-L12-v2.Q8_0.gguf");

public static string LLavaModelPath => Path.Combine("llava-v1.6-mistral-7b.Q3_K_XS.gguf");
public static string LLavaMmpPath => Path.Combine("mmproj-model-f16.gguf");
public static string LLavaImage => "Assets/extreme-ironing-taxi-610x427.jpg";

public static string TextCompletionPromptsFilePath => "Assets/TextCompletionPrompts.txt";
}
}
3 changes: 0 additions & 3 deletions LLama.Unittest/MtmdWeightsTests.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
using System;
using System.IO;
using LLama.Common;
using LLama.Native;
using Xunit;

namespace LLama.Unittest
{
Expand Down
24 changes: 0 additions & 24 deletions LLama.Unittest/NativeAbiTests.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using LLama.Native;
using Xunit;

namespace LLama.Unittest
{
Expand Down Expand Up @@ -79,27 +76,6 @@ public void ContextParamsSizeMatchesNative()
Assert.Equal(expectedSize, Marshal.SizeOf<LLamaContextParams>());
}

[Fact]
public void MtmdContextParamsSizeMatchesNative()
{
var pointerSize = IntPtr.Size;
var fields = new List<(int size, int align)>
{
(sizeof(sbyte), 1), // use_gpu
(sizeof(sbyte), 1), // print_timings
(sizeof(int), 4), // n_threads
(pointerSize, pointerSize), // image_marker
(pointerSize, pointerSize), // media_marker
(sizeof(int), 4), // flash_attn_type
(sizeof(sbyte), 1), // warmup
(sizeof(int), 4), // image_min_tokens
(sizeof(int), 4), // image_max_tokens
};

var expectedSize = ComputeSize(fields);
Assert.Equal(expectedSize, Marshal.SizeOf<NativeApi.mtmd_context_params>());
}

[Fact]
public void ModelParamsBoolBlockMatchesNative()
{
Expand Down
3 changes: 3 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ public class ModelOptions
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

/// <inheritdoc />
public bool UseDirectIO { get; }

/// <inheritdoc />
public bool UseMemoryLock { get; set; } = false;

Expand Down
6 changes: 3 additions & 3 deletions LLama/Abstractions/ILLamaExecutor.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Collections.Generic;
using System.Collections.Generic;
using System.Threading;
using LLama.Native;

Expand All @@ -14,8 +14,8 @@ public interface ILLamaExecutor
/// </summary>
public LLamaContext Context { get; }

// LLava Section
//
// Multimodal Section

/// <summary>
/// Identify if it's a multi-modal model and there is an image to process.
/// </summary>
Expand Down
5 changes: 5 additions & 0 deletions LLama/Abstractions/IModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ public interface IModelParams
/// </summary>
bool UseMemorymap { get; }

/// <summary>
/// Use direct io, takes precedence over use_mmap when supported
/// </summary>
bool UseDirectIO { get; }

/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>
Expand Down
3 changes: 3 additions & 0 deletions LLama/Common/ModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ public record ModelParams
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

/// <inheritdoc />
public bool UseDirectIO { get; set; }

/// <inheritdoc />
public bool UseMemoryLock { get; set; }

Expand Down
1 change: 1 addition & 0 deletions LLama/Extensions/IModelParamsExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

result.use_mlock = @params.UseMemoryLock;
result.use_mmap = @params.UseMemorymap;
result.use_direct_io = @params.UseDirectIO;
result.vocab_only = @params.VocabOnly;
result.check_tensors = @params.CheckTensors;

Expand Down Expand Up @@ -119,7 +120,7 @@
if (string.IsNullOrEmpty(name))
continue;

result[name] = buft;

Check warning on line 123 in LLama/Extensions/IModelParamsExtensions.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

Possible null reference argument for parameter 'key' in 'IntPtr Dictionary<string, IntPtr>.this[string key]'.

Check warning on line 123 in LLama/Extensions/IModelParamsExtensions.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

Possible null reference argument for parameter 'key' in 'IntPtr Dictionary<string, IntPtr>.this[string key]'.

Check warning on line 123 in LLama/Extensions/IModelParamsExtensions.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

Possible null reference argument for parameter 'key' in 'IntPtr Dictionary<string, IntPtr>.this[string key]'.
}

return result;
Expand Down
4 changes: 2 additions & 2 deletions LLama/LLamaExecutorBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ public abstract class StatefulExecutorBase : ILLamaExecutor
/// </summary>
protected AntipromptProcessor AntipromptProcessor { get; }

// LLava Section
//
// Multimodal Section

/// <inheritdoc />
public bool IsMultiModal
{
Expand Down
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>506bb6e01009058f355</BinaryReleaseId>
<BinaryReleaseId>ff4affb4c1aa7eb4_v3</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
Expand Down
2 changes: 1 addition & 1 deletion LLama/Native/LLamaModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public bool use_mmap
private sbyte _use_mmap;

/// <summary>
/// use direct io, takes precedence over use_mmap
/// use direct io, takes precedence over use_mmap when supported
/// </summary>
public bool use_direct_io
{
Expand Down
19 changes: 19 additions & 0 deletions LLama/Native/LLamaParamsFitStatus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace LLama.Native;

/// <summary>
/// Status returned by <c>llama_params_fit</c> when attempting to adjust model and
/// context parameters to fit the free device memory.
/// </summary>
public enum LLamaParamsFitStatus
{
/// <summary>
/// Found allocations that are projected to fit
/// </summary>
LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0,

/// <summary>
/// Could not find allocations that are projected to fit
/// </summary>
LLAMA_PARAMS_FIT_STATUS_FAILURE = 1,

/// <summary>
/// A hard error occurred, e.g. because no model could be found at the specified path
/// </summary>
LLAMA_PARAMS_FIT_STATUS_ERROR = 2,
}
6 changes: 3 additions & 3 deletions LLama/Native/Load/NativeLibraryConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -283,13 +283,13 @@ public override string ToString()
public sealed partial class NativeLibraryConfig
{
/// <summary>
/// Set configurations for all the native libraries, including LLama and LLava
/// Set configurations for all the native libraries, including LLama and Multimodal
/// </summary>
[Obsolete("Please use NativeLibraryConfig.All instead, or set configurations for NativeLibraryConfig.LLama and NativeLibraryConfig.LLavaShared respectively.")]
[Obsolete("Please use NativeLibraryConfig.All instead, or set configurations for NativeLibraryConfig.LLama and NativeLibraryConfig.Mtmd respectively.")]
public static NativeLibraryConfigContainer Instance => All;

/// <summary>
/// Set configurations for all the native libraries, including LLama and LLava
/// Set configurations for all the native libraries, including LLama and Multimodal
/// </summary>
public static NativeLibraryConfigContainer All { get; }

Expand Down
1 change: 1 addition & 0 deletions LLama/Native/LoraAdapter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@
public void Unload()
{
Loaded = false;
llama_adapter_lora_free(Pointer);

Check warning on line 44 in LLama/Native/LoraAdapter.cs

View workflow job for this annotation

GitHub Actions / Test (windows-release)

'llama_adapter_lora_free(IntPtr)' is obsolete: 'adapters are now freed together with the associated model'

Check warning on line 44 in LLama/Native/LoraAdapter.cs

View workflow job for this annotation

GitHub Actions / Test (osx-release)

'llama_adapter_lora_free(IntPtr)' is obsolete: 'adapters are now freed together with the associated model'

Check warning on line 44 in LLama/Native/LoraAdapter.cs

View workflow job for this annotation

GitHub Actions / Test (linux-release)

'llama_adapter_lora_free(IntPtr)' is obsolete: 'adapters are now freed together with the associated model'

// Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
[Obsolete("adapters are now freed together with the associated model")]
static extern void llama_adapter_lora_free(IntPtr adapter);
}
}
7 changes: 0 additions & 7 deletions LLama/Native/MtmdContextParams.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System;
using System.Runtime.InteropServices;
using System.Text;

namespace LLama.Native;
Expand All @@ -24,12 +23,6 @@ public class MtmdContextParams
/// </summary>
public int NThreads { get; set; }

/// <summary>
/// Verbosity is no longer supported by mtmd_context_params and is ignored.
/// </summary>
[Obsolete("Verbosity is no longer supported by mtmd_context_params and is ignored.")]
public int Verbosity { get; set; }

/// <summary>
/// Marker token inserted into the text stream to reference an image embedding (deprecated by mtmd).
/// </summary>
Expand Down
20 changes: 0 additions & 20 deletions LLama/Native/MtmdImageEmbed.cs

This file was deleted.

5 changes: 3 additions & 2 deletions LLama/Native/NativeApi.Mtmd.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using System;
using System.Runtime.InteropServices;
using System.Text;

namespace LLama.Native;

Expand All @@ -25,6 +23,9 @@ internal struct mtmd_context_params
[MarshalAs(UnmanagedType.I1)] public bool warmup;
public int image_min_tokens;
public int image_max_tokens;

private IntPtr /* ggml_backend_sched_eval_callback */ cb_eval;
private IntPtr cb_eval_user_data;
}

[DllImport(mtmdLibraryName, EntryPoint = "mtmd_default_marker", CallingConvention = CallingConvention.Cdecl)]
Expand Down
46 changes: 44 additions & 2 deletions LLama/Native/NativeApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ public static void llama_empty_call()
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_max_devices();

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern nint llama_max_tensor_buft_overrides();

/// <summary>
/// Maximum number of parallel sequences
/// </summary>
Expand Down Expand Up @@ -72,6 +75,9 @@ public static void llama_empty_call()
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llama_supports_rpc();

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern uint llama_n_ctx_seq(SafeLLamaContextHandle ctx);

/// <summary>
/// Initialize the llama + ggml backend. Call once at the start of the program.
///
Expand Down Expand Up @@ -168,8 +174,11 @@ public static void llama_empty_call()

/// <summary>
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// <br />
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates.
/// See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// </summary>
/// <param name="tmpl">A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.</param>
/// <param name="tmpl">A Jinja template to use for this chat.</param>
/// <param name="chat">Pointer to a list of multiple llama_chat_message</param>
/// <param name="n_msg">Number of llama_chat_message in this chat</param>
/// <param name="add_ass">Whether to end the prompt with the token(s) that indicate the start of an assistant message.</param>
Expand Down Expand Up @@ -312,7 +321,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
/// <param name="il_end"></param>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe int llama_apply_adapter_cvec(SafeLLamaContextHandle ctx, float* data, nuint len, int n_embd, int il_start, int il_end);
public static extern unsafe int llama_set_adapter_cvec(SafeLLamaContextHandle ctx, float* data, nuint len, int n_embd, int il_start, int il_end);

/// <summary>
/// Build a split GGUF final path for this chunk.
Expand Down Expand Up @@ -474,5 +483,38 @@ public static string llama_split_prefix(string splitPath, int splitNo, int split
/// <returns>Name of the buffer type</returns>
[DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern IntPtr ggml_backend_buft_name(IntPtr buft);

/// <summary>
/// Fits mparams and cparams to free device memory (assumes system memory is unlimited)
/// - returns true if the parameters could be successfully modified to fit device memory
/// - this function is NOT thread safe because it modifies the global llama logger state
/// - only parameters that have the same value as in llama_default_model_params are modified
/// with the exception of the context size which is modified if and only if equal to 0
/// </summary>
/// <param name="path"></param>
/// <param name="mparams"></param>
/// <param name="cparams"></param>
/// <param name="tensor_split">Writable buffer for tensor split, needs at least llama_max_devices elements</param>
/// <param name="tensor_buft_overrides">Writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements</param>
/// <param name="margins">Margins of memory to leave per device in bytes</param>
/// <param name="n_ctx_min">Minimum context size to set when trying to reduce memory use</param>
/// <param name="log_level">Minimum log level to print during fitting, lower levels go to debug log</param>
/// <returns>A <see cref="LLamaParamsFitStatus"/> indicating whether the parameters could be modified to fit device memory.</returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe LLamaParamsFitStatus llama_params_fit(
string path,
ref LLamaModelParams mparams,
ref LLamaContextParams cparams,
float* tensor_split,
LLamaModelTensorBufferOverride* tensor_buft_overrides,
nint* margins,
uint n_ctx_min,
int /* GGML_LOG_LEVEL */ log_level
);

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_time_us();


}
}
Loading
Loading