smallbasic
diff --git a/include/param.cpp b/include/param.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/include/var.h b/include/var.h
Lines changed: 3 additions & 3 deletions
diff --git a/llama/llama-sb.cpp b/llama/llama-sb.cpp
Lines changed: 29 additions & 33 deletions
diff --git a/llama/llama-sb.h b/llama/llama-sb.h
Lines changed: 14 additions & 8 deletions
@@ -589,14 +589,14 @@ void v_create_func(var_p_t map, const char *name, method cb) {
   var_p_t v_func = map_add_var(map, name, 0);
   v_func->type = V_FUNC;
   v_func->v.fn.cb = cb;
-  v_func->v.fn.mcb = NULL;
+  v_func->v.fn.mcb = nullptr;
   v_func->v.fn.id = 0;
 }
 
 void v_create_callback(var_p_t map, const char *name, callback cb) {
   var_p_t v_func = map_add_var(map, name, 0);
   v_func->type = V_FUNC;
-  v_func->v.fn.cb = NULL;
+  v_func->v.fn.cb = nullptr;
   v_func->v.fn.mcb = cb;
   v_func->v.fn.id = 0;
 }
@@ -80,7 +80,7 @@ typedef struct var_s {
 
     // associative array/map
     struct {
-      // pointer the map structure
+      // pointer to the map structure
       void *map;
 
       uint32_t count;
@@ -132,7 +132,7 @@ typedef struct var_s {
   // non-zero if constant
   uint8_t const_flag;
 
-  // whether help in pooled memory
+  // whether held in pooled memory
   uint8_t pooled;
 } var_t;
 
@@ -154,7 +154,7 @@ var_t *v_new(void);
  *
  * @return a newly created var_t array of the given size
  */
-void v_new_array(var_t *var, unsigned size);
+void v_new_array(var_t *var, uint32_t size);
 
 /**
  * @ingroup var
 
@@ -10,6 +10,12 @@
 #include "llama.h"
 #include "llama-sb.h"
 
+// Idle state: no Llama attached, zeroed batch, no throughput sample, nothing to read.
+LlamaIter::LlamaIter() :
+  _llama(nullptr), _batch(),
+  _tokens_sec(0), _has_next(false) {
+}
+
 Llama::Llama() :
   _model(nullptr),
   _ctx(nullptr),
@@ -43,18 +49,6 @@ Llama::~Llama() {
   }
 }
 
-void Llama::append_response(const string &response) {
-  _chat_prompt += response;
-  _chat_prompt += "\n";
-}
-
-const string Llama::build_chat_prompt(const string &user_msg) {
-  _chat_prompt += "User: ";
-  _chat_prompt += user_msg;
-  _chat_prompt += "\nAssistant: ";
-  return _chat_prompt;
-}
-
 bool Llama::construct(string model_path, int n_ctx, int n_batch) {
   ggml_backend_load_all();
 
@@ -107,40 +101,36 @@ void Llama::configure_sampler() {
 }
 
 void Llama::reset() {
-  // llama_kv_cache_clear(it->second->ctx);
+  llama_sampler_reset(_sampler);
   _chat_prompt.clear();
 }
 
-string Llama::generate(const string &prompt) {
-  string out = prompt;
-
-  // ---- tokenize prompt ----
+bool Llama::generate(LlamaIter &iter, const string &prompt) {
   int n_prompt = -llama_tokenize(_vocab, prompt.c_str(), prompt.size(),
                                  nullptr, 0, true, true);
 
   if (n_prompt <= 0) {
     _last_error = "failed to tokenize prompt";
-    return out;
+    return false;
   }
 
   std::vector<llama_token> prompt_tokens(n_prompt);
   if (llama_tokenize(_vocab, prompt.c_str(), prompt.size(),
                      prompt_tokens.data(), n_prompt, true, true) < 0) {
     _last_error = "failed to tokenize prompt";
-    return out;
+    return false;
   }
 
-  // ---- sampler ----
   configure_sampler();
 
-  // ---- decode prompt ----
+  // decode prompt
   llama_batch batch = llama_batch_get_one(prompt_tokens.data(), n_prompt);
   if (llama_decode(_ctx, batch)) {
     _last_error = "failed to eval prompt";
-    return out;
+    return false;
   }
 
-  // ---- handle encoder models ----
+  // handle encoder models
   if (llama_model_has_encoder(_model)) {
     llama_token decoder_start_token_id = llama_model_decoder_start_token(_model);
     if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
@@ -149,11 +139,20 @@ string Llama::generate(const string &prompt) {
     batch = llama_batch_get_one(&decoder_start_token_id, 1);
     if (llama_decode(_ctx, batch)) {
       _last_error = "failed to eval decoder start token";
-      return out;
+      return false;
     }
   }
 
-  // ---- generation loop ----
+  iter._llama = this;
+  iter._batch = batch;
+  iter._has_next = true;
+  iter._tokens_sec = 0;
+  return true;
+}
+
+string Llama::next(LlamaIter &iter) {
+  string out;
+
   std::vector<llama_token> decoded;
   decoded.reserve(_max_tokens);
 
@@ -166,22 +165,23 @@ string Llama::generate(const string &prompt) {
 
     // end-of-generation check
     if (llama_vocab_is_eog(_vocab, tok)) {
+      iter._has_next = false;
       break;
     }
 
     // append token to decoded list
     decoded.push_back(tok);
     ++generated;
 
-    // ---- decode the token immediately ----
+    // decode the token
     llama_batch batch = llama_batch_get_one(&tok, 1);
     if (llama_decode(_ctx, batch)) {
       _last_error = "failed to eval token during generation";
       break;
     }
   }
 
-  // ---- detokenize sequentially ----
+  // detokenize sequentially
   if (!decoded.empty()) {
     char buf[512];
     for (llama_token tok : decoded) {
@@ -195,14 +195,10 @@ string Llama::generate(const string &prompt) {
     }
   }
 
-  // ---- timing ----
+  // timing
   auto t_end = std::chrono::high_resolution_clock::now();
   double secs = std::chrono::duration<double>(t_end - t_start).count();
-  double tokps = secs > 0 ? generated / secs : 0;
-
-  fprintf(stderr,
-          "[tok/s=%.2f] generated=%d time=%.3fs\n",
-          tokps, generated, secs);
+  iter._tokens_sec = secs > 0 ? generated / secs : 0;
 
   return out;
 }
 
@@ -12,6 +12,18 @@
 
 using namespace std;
 
+struct Llama;
+
+struct LlamaIter {  // iteration state passed to Llama::generate() and Llama::next()
+  explicit LlamaIter();
+  ~LlamaIter() {}
+  // NOTE(review): generate() builds _batch from function-local token storage -- confirm it is never read afterwards.
+  Llama *_llama;       // the Llama that started this generation; generate() stores `this` here
+  llama_batch _batch;  // last batch decoded by generate()
+  float _tokens_sec;   // generated tokens / elapsed seconds, written by next(); reset to 0 by generate()
+  bool _has_next;      // set true by generate(); next() clears it on an end-of-generation token
+};
+
 struct Llama {
   explicit Llama();
   ~Llama();
@@ -20,10 +32,10 @@ struct Llama {
   bool construct(string model_path, int n_ctx, int n_batch);
 
   // generation
-  string generate(const string &prompt);
+  bool generate(LlamaIter &iter, const string &prompt);
+  string next(LlamaIter &iter);
 
   // generation parameters
-
   void set_penalty_last_n(int32_t penalty_last_n) { _penalty_last_n = penalty_last_n; }
   void set_penalty_repeat(float penalty_repeat) { _penalty_repeat = penalty_repeat; }
   void set_max_tokens(int max_tokens) { _max_tokens = max_tokens; }
@@ -32,12 +44,6 @@ struct Llama {
   void set_top_k(int top_k) { _top_k = top_k; }
   void set_top_p(float top_p) { _top_p = top_p; }
 
-  // messages
-  void append_response(const string &response);
-  void append_user_message(const string &user_msg);
-  const string& get_chat_history() const;
-  const string build_chat_prompt(const string &user_msg);
-
   // error handling
   const char *last_error() { return _last_error.c_str(); }
   void set_log_level(int level) { _log_level = level; }