//
// Copyright(C) 2026 Chris Warren-Smith
//

#include <chrono>
#include <vector>
#include "llama.h"
#include "llama-sb.h"
@@ -14,12 +15,14 @@ Llama::Llama() :
1415 _ctx(nullptr ),
1516 _sampler(nullptr ),
1617 _vocab(nullptr ),
18+ _penalty_last_n(64 ),
19+ _penalty_repeat(1 .1f ),
1720 _temperature(0 ),
1821 _top_k(0 ),
1922 _top_p(1 .0f ),
2023 _min_p(0 .0f ),
2124 _max_tokens(150 ),
22- _log_level(GGML_LOG_LEVEL_NONE ) {
25+ _log_level(GGML_LOG_LEVEL_CONT ) {
2326 llama_log_set ([](enum ggml_log_level level, const char * text, void *user_data) {
2427 Llama *llama = (Llama *)user_data;
2528 if (level > llama->_log_level ) {
@@ -82,6 +85,10 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch) {
8285
8386void Llama::configure_sampler () {
8487 llama_sampler_reset (_sampler);
88+ if (_penalty_last_n != 0 && _penalty_repeat != 1 .0f ) {
89+ auto penalties = llama_sampler_init_penalties (_penalty_last_n, _penalty_repeat, 0 .0f , 0 .0f );
90+ llama_sampler_chain_add (_sampler, penalties);
91+ }
8592 if (_temperature <= 0 .0f ) {
8693 llama_sampler_chain_add (_sampler, llama_sampler_init_greedy ());
8794 } else {
@@ -99,72 +106,104 @@ void Llama::configure_sampler() {
99106 }
100107}
101108
109+ void Llama::reset () {
110+ // llama_kv_cache_clear(it->second->ctx);
111+ _chat_prompt.clear ();
112+ }
113+
102114string Llama::generate (const string &prompt) {
103- string out;
115+ string out = prompt ;
104116
105- // find the number of tokens in the prompt
106- int n_prompt = -llama_tokenize (_vocab, prompt.c_str (), prompt.size (), nullptr , 0 , true , true );
117+ // ---- tokenize prompt ----
118+ int n_prompt = -llama_tokenize (_vocab, prompt.c_str (), prompt.size (),
119+ nullptr , 0 , true , true );
120+
121+ if (n_prompt <= 0 ) {
122+ _last_error = " failed to tokenize prompt" ;
123+ return out;
124+ }
107125
108- // allocate space for the tokens and tokenize the prompt
109126 std::vector<llama_token> prompt_tokens (n_prompt);
110- if (llama_tokenize (_vocab, prompt.c_str (), prompt.size (), prompt_tokens.data (), prompt_tokens.size (), true , true ) < 0 ) {
111- _last_error = " failed tokenize the prompt" ;
127+ if (llama_tokenize (_vocab, prompt.c_str (), prompt.size (),
128+ prompt_tokens.data (), n_prompt, true , true ) < 0 ) {
129+ _last_error = " failed to tokenize prompt" ;
112130 return out;
113131 }
114132
115- // initialize the sampler
133+ // ---- sampler ----
116134 configure_sampler ();
117135
118- // prepare a batch for the prompt
119- llama_batch batch = llama_batch_get_one (prompt_tokens.data (), prompt_tokens.size ());
120- if (llama_model_has_encoder (_model)) {
121- if (llama_encode (_ctx, batch)) {
122- _last_error = " failed to eval" ;
123- return out;
124- }
136+ // ---- decode prompt ----
137+ llama_batch batch = llama_batch_get_one (prompt_tokens.data (), n_prompt);
138+ if (llama_decode (_ctx, batch)) {
139+ _last_error = " failed to eval prompt" ;
140+ return out;
141+ }
125142
143+ // ---- handle encoder models ----
144+ if (llama_model_has_encoder (_model)) {
126145 llama_token decoder_start_token_id = llama_model_decoder_start_token (_model);
127146 if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
128147 decoder_start_token_id = llama_vocab_bos (_vocab);
129148 }
130-
131149 batch = llama_batch_get_one (&decoder_start_token_id, 1 );
132- }
133-
134- for (int n_pos = 0 ; n_pos + batch.n_tokens < n_prompt + _max_tokens;) {
135- // evaluate the current batch with the transformer model
136150 if (llama_decode (_ctx, batch)) {
137- _last_error = " failed to eval" ;
138- break ;
151+ _last_error = " failed to eval decoder start token " ;
152+ return out ;
139153 }
154+ }
155+
156+ // ---- generation loop ----
157+ std::vector<llama_token> decoded;
158+ decoded.reserve (_max_tokens);
140159
141- n_pos += batch.n_tokens ;
160+ int generated = 0 ;
161+ auto t_start = std::chrono::high_resolution_clock::now ();
142162
143- // sample the next token
144- llama_token new_token_id = llama_sampler_sample (_sampler, _ctx, -1 );
163+ while (generated < _max_tokens) {
164+ // sample one token from the current logits
165+ llama_token tok = llama_sampler_sample (_sampler, _ctx, -1 );
145166
146- // is it an end of generation?
147- if (llama_vocab_is_eog (_vocab, new_token_id )) {
167+ // end-of- generation check
168+ if (llama_vocab_is_eog (_vocab, tok )) {
148169 break ;
149170 }
150171
151- char buf[128 ];
152- int n = llama_token_to_piece (_vocab, new_token_id, buf, sizeof (buf), 0 , true );
153- if (n < 0 ) {
154- _last_error = " failed to convert token to piece" ;
172+ // append token to decoded list
173+ decoded.push_back (tok);
174+ ++generated;
175+
176+ // ---- decode the token immediately ----
177+ llama_batch batch = llama_batch_get_one (&tok, 1 );
178+ if (llama_decode (_ctx, batch)) {
179+ _last_error = " failed to eval token during generation" ;
155180 break ;
156- } else if (n > 0 ) {
157- out.append (buf, n);
158181 }
182+ }
159183
160- // prepare the next batch with the sampled token
161- batch = llama_batch_get_one (&new_token_id, 1 );
184+ // ---- detokenize sequentially ----
185+ if (!decoded.empty ()) {
186+ char buf[512 ];
187+ for (llama_token tok : decoded) {
188+ if (llama_vocab_is_control (_vocab, tok)) {
189+ continue ;
190+ }
191+ int n = llama_token_to_piece (_vocab, tok, buf, sizeof (buf), 0 , false );
192+ if (n > 0 ) {
193+ out.append (buf, n);
194+ }
195+ }
162196 }
163197
198+ // ---- timing ----
199+ auto t_end = std::chrono::high_resolution_clock::now ();
200+ double secs = std::chrono::duration<double >(t_end - t_start).count ();
201+ double tokps = secs > 0 ? generated / secs : 0 ;
202+
203+ fprintf (stderr,
204+ " [tok/s=%.2f] generated=%d time=%.3fs\n " ,
205+ tokps, generated, secs);
206+
164207 return out;
165208}
166209