LLM: plugin module - impl add_stop func

Chris Warren-Smith · Chris Warren-Smith · commit fad622ea163e · 2025-12-25T17:09:48.000+10:30
diff --git a/llama/llama-sb.cpp b/llama/llama-sb.cpp
@@ -6,13 +6,15 @@
 // Copyright(C) 2026 Chris Warren-Smith
 
 #include <chrono>
-#include <vector>
 #include "llama.h"
 #include "llama-sb.h"
 
+constexpr int MAX_REPEAT = 5;  // next() ends generation once the same piece repeats this many times in a row
+
 LlamaIter::LlamaIter() :
   _llama(nullptr),
   _tokens_sec(0),
+  _repetition_count(0),  // no repeated piece has been seen yet
   _has_next(false) {
 }
 
@@ -21,20 +23,21 @@ Llama::Llama() :
   _ctx(nullptr),
   _sampler(nullptr),
   _vocab(nullptr),
-  _penalty_last_n(64),
-  _penalty_repeat(1.1f),
+  _penalty_last_n(0),
+  _penalty_repeat(0),
   _temperature(0),
   _top_k(0),
-  _top_p(1.0f),
-  _min_p(0.0f),
-  _max_tokens(150),
+  _top_p(0),
+  _min_p(0),
+  _max_tokens(0),
   _log_level(GGML_LOG_LEVEL_CONT) {
   llama_log_set([](enum ggml_log_level level, const char * text, void *user_data) {
     Llama *llama = (Llama *)user_data;
     if (level > llama->_log_level) {
       fprintf(stderr, "LLAMA: %s", text);
     }
   }, this);
+  reset();
 }
 
 Llama::~Llama() {
@@ -49,6 +52,18 @@ Llama::~Llama() {
   }
 }
 
+void Llama::reset() {  // restores the default settings below; the constructor also calls this
+  _stop_sequences.clear();  // drops every stop sequence registered through add_stop()
+  _last_error = "";
+  _penalty_last_n = 64;
+  _penalty_repeat = 1.1f;
+  _temperature = 0;
+  _top_k = 0;
+  _top_p = 1.0f;
+  _min_p = 0.0f;
+  _max_tokens = 150;
+}  // _model, _ctx, _sampler and _vocab are left untouched
+
 bool Llama::construct(string model_path, int n_ctx, int n_batch) {
   ggml_backend_load_all();
 
@@ -100,31 +115,31 @@ void Llama::configure_sampler() {
   }
 }
 
-void Llama::reset() {
-  llama_sampler_reset(_sampler);
-  _chat_prompt.clear();
-}
-
-bool Llama::generate(LlamaIter &iter, const string &prompt) {
-  int n_prompt = -llama_tokenize(_vocab, prompt.c_str(), prompt.size(),
-                                 nullptr, 0, true, true);
+// Converts prompt to tokens; on failure sets _last_error and returns an empty vector.
+vector<llama_token> Llama::tokenize(const string &prompt) {
+  vector<llama_token> result;
 
+  int n_prompt = -llama_tokenize(_vocab, prompt.c_str(), prompt.size(), nullptr, 0, true, true);
   if (n_prompt <= 0) {
     _last_error = "failed to tokenize prompt";
-    return false;
+  } else {  // n_prompt is the token count reported by the buffer-less sizing call above
+    result.resize(n_prompt);
+    if (llama_tokenize(_vocab, prompt.c_str(), prompt.size(), result.data(), n_prompt, true, true) < 0) {
+      _last_error = "failed to tokenize prompt";
+      result.clear();  // callers detect failure via an empty result, so drop the unfilled buffer
+    }
   }
+  return result;
+}
 
-  std::vector<llama_token> prompt_tokens(n_prompt);
-  if (llama_tokenize(_vocab, prompt.c_str(), prompt.size(),
-                     prompt_tokens.data(), n_prompt, true, true) < 0) {
-    _last_error = "failed to tokenize prompt";
+bool Llama::generate(LlamaIter &iter, const string &prompt) {
+  vector<llama_token> prompt_tokens = tokenize(prompt);
+  if (prompt_tokens.size() == 0) {
     return false;
   }
 
-  configure_sampler();
-
   // decode prompt
-  llama_batch batch = llama_batch_get_one(prompt_tokens.data(), n_prompt);
+  llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
   if (llama_decode(_ctx, batch)) {
     _last_error = "failed to eval prompt";
     return false;
@@ -143,8 +158,9 @@ bool Llama::generate(LlamaIter &iter, const string &prompt) {
     }
   }
 
+  configure_sampler();
+
   iter._llama = this;
-  iter._batch = batch;
   iter._has_next = true;
   iter._tokens_sec = 0;
   return true;
@@ -153,7 +169,7 @@ bool Llama::generate(LlamaIter &iter, const string &prompt) {
 string Llama::next(LlamaIter &iter) {
   string out;
 
-  std::vector<llama_token> decoded;
+  vector<llama_token> decoded;
   decoded.reserve(_max_tokens);
 
   int generated = 0;
@@ -183,14 +199,32 @@ string Llama::next(LlamaIter &iter) {
 
   // detokenize sequentially
   if (!decoded.empty()) {
-    char buf[512];
     for (llama_token tok : decoded) {
-      if (llama_vocab_is_control(_vocab, tok)) {
-        continue;
-      }
-      int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
-      if (n > 0) {
-        out.append(buf, n);
+      if (!llama_vocab_is_control(_vocab, tok)) {
+        char buf[512];
+        int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
+        if (n > 0) {
+          const string piece(buf, n);  // buf holds n bytes and no NUL terminator
+          if (iter._last_word != piece) {
+            iter._repetition_count = 0;
+            iter._last_word = piece;
+          } else if (++iter._repetition_count == MAX_REPEAT) {
+            iter._has_next = false;
+            break;
+          }
+          out += piece;
+          // offset of the first configured stop sequence found in the output so far, if any
+          size_t cut = std::string::npos;
+          for (size_t i = 0; i < _stop_sequences.size() && cut == std::string::npos; i++) {
+            cut = out.find(_stop_sequences[i]);
+          }
+          if (cut != std::string::npos) {
+            // truncate and leave the token loop so later pieces are not appended after the stop
+            out.erase(cut);
+            iter._has_next = false;
+            break;
+          }
+        }
       }
     }
   }
diff --git a/llama/llama-sb.h b/llama/llama-sb.h
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <string>
+#include <vector>
 #include "llama.h"
 
 using namespace std;
@@ -19,8 +20,9 @@ struct LlamaIter {
   ~LlamaIter() {}
 
   Llama *_llama;
-  llama_batch _batch;
   float _tokens_sec;
+  string _last_word;
+  int _repetition_count;
   bool _has_next;
 };
 
@@ -36,6 +38,8 @@ struct Llama {
   string next(LlamaIter &iter);
 
   // generation parameters
+  void add_stop(const char *stop) { if (stop != nullptr && *stop != '\0') { _stop_sequences.push_back(stop); } }  // null/empty ignored: "" would match at offset 0
+  void clear_stops() { _stop_sequences.clear(); }  // forgets every registered stop sequence
   void set_penalty_last_n(int32_t penalty_last_n) { _penalty_last_n = penalty_last_n; }
   void set_penalty_repeat(float penalty_repeat) { _penalty_repeat = penalty_repeat; }
   void set_max_tokens(int max_tokens) { _max_tokens = max_tokens; }
@@ -51,12 +55,13 @@ struct Llama {
 
   private:
   void configure_sampler();
+  vector<llama_token> tokenize(const string &prompt);
 
   llama_model *_model;
   llama_context *_ctx;
   llama_sampler *_sampler;
   const llama_vocab *_vocab;
-  string _chat_prompt;
+  vector<string> _stop_sequences;
   string _last_error;
   int32_t _penalty_last_n;
   float _penalty_repeat;
diff --git a/llama/main.cpp b/llama/main.cpp
@@ -66,6 +66,23 @@ static string expand_path(const char *path) {
   return result;
 }
 
+//
+// llama.add_stop('xyz') - adds a stop sequence to the instance; result is 1 on success, else 0
+//
+static int cmd_llama_add_stop(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
+  int result = 0;
+  if (argc != 1) {
+    error(retval, "llama.add_stop", 1, 1);  // the 1, 1 are presumably the min/max argument counts - confirm
+  } else {
+    int id = get_llama_class_id(self, retval);
+    if (id != -1) {  // on -1 nothing is added and result stays 0
+      Llama &llama = g_llama.at(id);
+      llama.add_stop(get_param_str(argc, arg, 0, "stop"));  // NOTE(review): "stop" looks like a fallback value - confirm
+      result = 1;
+    }
+  }
+  return result;
+}
 
 //
 // llama.set_penalty_repeat(0.8)
@@ -304,15 +321,16 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
   Llama &llama = g_llama[id];
   if (llama.construct(model, n_ctx, n_batch)) {
     map_init_id(retval, id, CLASS_ID_LLAMA);
+    v_create_callback(retval, "add_stop", cmd_llama_add_stop);
+    v_create_callback(retval, "generate", cmd_llama_generate);
+    v_create_callback(retval, "reset", cmd_llama_reset);
     v_create_callback(retval, "set_penalty_repeat", cmd_llama_set_penalty_repeat);
     v_create_callback(retval, "set_penalty_last_n", cmd_llama_set_penalty_last_n);
     v_create_callback(retval, "set_max_tokens", cmd_llama_set_max_tokens);
     v_create_callback(retval, "set_min_p", cmd_llama_set_min_p);
     v_create_callback(retval, "set_temperature", cmd_llama_set_temperature);
     v_create_callback(retval, "set_top_k", cmd_llama_set_top_k);
     v_create_callback(retval, "set_top_p", cmd_llama_set_top_p);
-    v_create_callback(retval, "generate", cmd_llama_generate);
-    v_create_callback(retval, "reset", cmd_llama_reset);
     result = 1;
   } else {
     error(retval, llama.last_error());