@@ -60,6 +60,13 @@ namespace splashkit_lib
             return {false};
         }
 
+        if (llama_model_has_encoder(model))
+        {
+            llama_model_free(model);
+            CLOG(ERROR, "GenAI") << "Unsupported model, requires encoder-decoder support.";
+            return {false};
+        }
+
         const llama_vocab * vocab = llama_model_get_vocab(model);
         const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
 
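The guard added above rejects encoder-decoder architectures (T5-style models) at load time, since the simplified decode path further down no longer drives an encoder pass. A caller sees this as an ordinary load failure; a rough sketch, where the loader's name and the failure flag are assumptions (the enclosing function's signature is outside this hunk):

    // Hypothetical caller - identifiers are illustrative, not the real API.
    model mdl = open_language_model("t5-base.gguf"); // assumed loader behind "return {false}"
    if (!mdl.loaded)                                 // assumed flag carried by {false}
        return; // CLOG has already reported the unsupported-model error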
@@ -82,7 +89,7 @@ namespace splashkit_lib
         llama_model_free(mdl.model);
     }
 
-    std::string format_chat(model& mdl, const std::vector<message>& messages)
+    std::string format_chat(model& mdl, const std::vector<message>& messages, bool add_assistant)
     {
         std::vector<llama_chat_message> llama_formatted;
         std::vector<char> formatted(0);
@@ -94,27 +101,27 @@ namespace splashkit_lib
             llama_formatted.push_back({msg.role.c_str(), msg.content.c_str()});
         }
 
-        int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size());
+        int new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), add_assistant, formatted.data(), formatted.size());
         if (new_len > (int)formatted.size())
         {
             formatted.resize(new_len);
-            new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), true, formatted.data(), formatted.size());
+            new_len = llama_chat_apply_template(mdl.tmpl, llama_formatted.data(), llama_formatted.size(), add_assistant, formatted.data(), formatted.size());
         }
 
         return std::string(formatted.begin(), formatted.end());
     }
 
-    llama_tokens tokenize_string(model& mdl, const std::string& prompt)
+    llama_tokens tokenize_string(model& mdl, const std::string& prompt, bool is_first)
     {
         // get token count
         // note: returns a negative number: the count of tokens it would have returned had the buffer been large enough
-        const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, true, true);
+        const int n_prompt = -llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), NULL, 0, is_first, true);
 
         // create buffer
         std::vector<llama_token> prompt_tokens(n_prompt);
 
         // receive the tokens
-        if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0)
+        if (llama_tokenize(mdl.vocab, prompt.data(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0)
         {
             CLOG(ERROR, "GenAI") << "Failed to tokenize the prompt.";
             return {};
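The two new flags give callers per-turn control over formatting. Note the two-pass idiom in format_chat: llama_chat_apply_template is first run against the empty buffer to learn the required length, then again after resizing. A minimal first-turn sketch, assuming mdl is an already-loaded model and message carries the role/content fields used above:

    std::vector<message> messages = {
        {"system", "You are a helpful assistant."},
        {"user", "Hello!"}
    };

    // add_assistant = true appends the assistant generation prompt,
    // which is what you want immediately before sampling a reply.
    std::string prompt = format_chat(mdl, messages, true);

    // is_first = true lets the tokenizer prepend BOS; that is only correct
    // for the first chunk fed into a fresh context.
    llama_tokens tokens = tokenize_string(mdl, prompt, true);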
@@ -128,7 +135,7 @@ namespace splashkit_lib
         // Create the context
         llama_context_params ctx_params = llama_context_default_params();
         ctx_params.n_ctx = starting_context.size() + settings.max_length - 1;
-        ctx_params.n_batch = starting_context.size();
+        ctx_params.n_batch = ctx_params.n_ctx;
         ctx_params.no_perf = true;
 
         llama_context * ctx = llama_init_from_model(mdl.model, ctx_params);
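Sizing n_batch to the whole context rather than the starting prompt matters once follow-up turns are injected through add_to_context (added below): a later batch can be larger than the original prompt, and n_batch caps how many tokens a single llama_decode call accepts. With illustrative numbers (assumed, not from the source):

    // starting_context.size() = 50, settings.max_length = 200
    //   n_ctx   = 50 + 200 - 1 = 249  // prompt plus every token we may generate
    //   n_batch = n_ctx        = 249  // a whole injected turn decodes in one call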
@@ -153,60 +160,58 @@ namespace splashkit_lib
         llama_sampler_chain_add(smpl, llama_sampler_init_penalties(64, 0, 0, settings.presence_penalty));
         llama_sampler_chain_add(smpl, llama_sampler_init_dist(settings.seed));
 
-        // Prepare batch and encode starting context
-        llama_batch batch = llama_batch_get_one(starting_context.data(), starting_context.size());
+        // Prepare batch for starting context
+        llama_tokens next_batch = starting_context;
 
-        if (llama_model_has_encoder(mdl.model))
-        {
-            if (llama_encode(ctx, batch))
-            {
-                llama_free(ctx);
-                llama_sampler_free(smpl);
-                CLOG(ERROR, "GenAI") << "Failed to encode prompt.";
-                return {nullptr};
-            }
-
-            llama_token decoder_start_token_id = llama_model_decoder_start_token(mdl.model);
-            if (decoder_start_token_id == LLAMA_TOKEN_NULL)
-            {
-                decoder_start_token_id = llama_vocab_bos(mdl.vocab);
-            }
-
-            batch = llama_batch_get_one(&decoder_start_token_id, 1);
-        }
+        // Cache newline token - we append it manually when closing a message (see manual_end_message)
+        llama_token newline_token;
+        llama_tokenize(mdl.vocab, "\n", 1, &newline_token, 1, false, true);
 
         return
         {
             ctx,
             smpl,
-            batch,
+            next_batch,
             (int)ctx_params.n_ctx,
             mdl.vocab,
+            newline_token,
             0,
-            ""
+            {},
+            false
         };
     }
 
-    int context_step(context& ctx)
+    int context_step(context& ctx, token_result* token)
     {
+        const string THINKING_START = "<think>";
+        const string THINKING_END = "</think>";
+
         if (!ctx.ctx)
             return -1;
 
+        llama_batch batch = llama_batch_get_one(ctx.next_batch.data(), ctx.next_batch.size());
         // Decode current batch with the model
-        if (llama_decode(ctx.ctx, ctx.batch))
+        if (llama_decode(ctx.ctx, batch))
         {
             CLOG(ERROR, "GenAI") << "Failed to process response from language model.";
+            if (token)
+                token->type = token_result::NONE;
             return -1;
         }
 
-        ctx.n_pos += ctx.batch.n_tokens;
+        ctx.total_context.insert(ctx.total_context.end(), ctx.next_batch.begin(), ctx.next_batch.end());
+        ctx.n_pos += batch.n_tokens;
 
         // Sample next token
         llama_token new_token_id = llama_sampler_sample(ctx.smpl, ctx.ctx, -1);
 
         // Has the model finished its response?
         if (llama_vocab_is_eog(ctx.vocab, new_token_id))
+        {
+            if (token)
+                token->type = token_result::NONE;
             return 1;
+        }
 
         char buf[128];
         int n = llama_token_to_piece(ctx.vocab, new_token_id, buf, sizeof(buf), 0, true);
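context_step now reports each sampled piece through an optional token_result out-parameter. Its declaration is not part of this diff; a minimal shape consistent with its usage in context_step would be:

    // Inferred sketch only - the real declaration lives in a header not shown here.
    struct token_result
    {
        enum kind { NONE, META, THINKING, CONTENT };
        kind type;        // NONE: nothing usable (decode error or end of generation)
        std::string text; // the decoded piece; META marks the <think>/</think> tags themselves
    };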
@@ -217,19 +222,46 @@ namespace splashkit_lib
         }
 
         std::string s(buf, n);
-        ctx.ctx_string += s;
+
+        if (token)
+        {
+            bool is_meta = s == THINKING_START || s == THINKING_END;
+            token->text = s;
+            if (is_meta)
+                token->type = token_result::META;
+            else if (ctx.in_thinking)
+                token->type = token_result::THINKING;
+            else
+                token->type = token_result::CONTENT;
+        }
+
+        if (s == THINKING_START)
+            ctx.in_thinking = true;
+        else if (s == THINKING_END)
+            ctx.in_thinking = false;
 
         // prepare the next batch with the sampled token
-        ctx.batch = llama_batch_get_one(&new_token_id, 1);
+        ctx.next_batch = {new_token_id};
 
         // Have we reached the end of the context?
         // If so, stop now.
-        if (ctx.n_pos + ctx.batch.n_tokens >= ctx.ctx_size)
+        if (ctx.n_pos + ctx.next_batch.size() >= ctx.ctx_size)
             return 1;
 
         return 0;
     }
 
+    void add_to_context(context& ctx, llama_tokens& message)
+    {
+        ctx.next_batch.insert(ctx.next_batch.end(), message.begin(), message.end());
+    }
+
+    void manual_end_message(context& ctx)
+    {
+        ctx.next_batch.push_back(llama_vocab_eot(ctx.vocab));
+        ctx.next_batch.push_back(ctx.newline_token);
+    }
+
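add_to_context and manual_end_message are the new multi-turn entry points: the former queues freshly tokenized input into the next batch, the latter closes the assistant's turn with an end-of-turn token plus the cached newline before new input is injected. A caller-side sketch (handle_* and next_user_text are placeholders, and the exact per-turn templating is application-specific):

    token_result tok;
    int status;
    while ((status = context_step(ctx, &tok)) == 0)
    {
        if (tok.type == token_result::CONTENT)
            handle_answer_text(tok.text);    // user-visible output
        else if (tok.type == token_result::THINKING)
            handle_reasoning_text(tok.text); // hidden reasoning stream
        // META pieces are the <think>/</think> markers; nothing to print
    }

    if (status == 1) // end of generation, or the context filled up
    {
        manual_end_message(ctx); // close the assistant turn: EOT + newline
        llama_tokens reply = tokenize_string(mdl, next_user_text, false); // BOS already in context
        add_to_context(ctx, reply); // decoded on the next context_step call
    }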
     void delete_context(context& ctx)
     {
         if (ctx.smpl)
@@ -238,5 +270,18 @@ namespace splashkit_lib
         if (ctx.ctx)
             llama_free(ctx.ctx);
     }
+
+    void __print_debug_context(context& ctx)
+    {
+        for (auto& x : ctx.total_context)
+        {
+            char buf[128];
+            int n = llama_token_to_piece(ctx.vocab, x, buf, sizeof(buf), 0, true);
+
+            std::string s(buf, n);
+            std::cout << "|" << s;
+        }
+        std::cout << std::endl;
+    }
 }
}
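Finally, a matching teardown sketch: __print_debug_context replays every token the context has consumed, prefixing each decoded piece with '|' so token boundaries are visible, and delete_context releases the sampler chain and the llama_context:

    __print_debug_context(ctx); // e.g. prints |Hello| world|!
    delete_context(ctx);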