LLAMA: nitro - now displays thinking text

Chris Warren-Smith · Chris Warren-Smith · commit 0a38fe429470 · 2026-06-01T20:07:20.000+09:30
- fixed generation exiting early on false-positive repetition detection
diff --git a/llama/llama-sb.cpp b/llama/llama-sb.cpp
@@ -14,7 +14,7 @@
 #include "llama.h"
 #include "llama-sb.h"
 
-constexpr int MAX_REPEAT = 5;
+constexpr int MAX_REPEAT = 50;
 
 static bool read_vram(size_t &used, size_t &total) {
   size_t free = 0;
@@ -391,9 +391,18 @@ LlamaMemoryInfo Llama::memory_info() {
     info.vram_percent = 100.0f * info.vram_used / info.vram_total;
   }
 
+  info.model_native_max_ctx = llama_model_n_ctx_train(_model);
+
   // Advice
   ostringstream advice;
 
+  // Check structural limits & model configuration quirks
+  if (info.kv_total > info.model_native_max_ctx) {
+    advice << "WARNING: Configured context size (" << info.kv_total
+           << ") exceeds model native training length (" << info.model_native_max_ctx
+           << "). Logic flaws or repetition bugs will occur unless RoPE scaling options are enabled. ";
+  }
+
   if (n_gpu_layers < info.n_layers_total) {
     advice << "Only " << n_gpu_layers << "/" << info.n_layers_total
            << " layers on GPU - increase n_gpu_layers if VRAM allows. ";
@@ -519,49 +528,6 @@ bool Llama::configure_sampler() {
   return true;
 }
 
-bool Llama::ends_with_sentence_boundary(const string &text) {
-  if (text.empty()) {
-    return false;
-  }
-
-  // Get last few characters (in case of whitespace after punctuation)
-  size_t check_len = std::min(text.length(), (size_t)5);
-  std::string ending = text.substr(text.length() - check_len);
-
-  // Check for various sentence endings
-  // Period followed by space or end
-  if (ending.find(". ") != std::string::npos ||
-      ending.back() == '.') {
-    return true;
-  }
-
-  // Exclamation mark
-  if (ending.find("! ") != std::string::npos ||
-      ending.back() == '!') {
-    return true;
-  }
-
-  // Question mark
-  if (ending.find("? ") != std::string::npos ||
-      ending.back() == '?') {
-    return true;
-  }
-
-  // Newline (paragraph break)
-  if (ending.find('\n') != std::string::npos) {
-    return true;
-  }
-
-  // Quote followed by period: "something."
-  if (ending.find(".\"") != std::string::npos ||
-      ending.find("!\"") != std::string::npos ||
-      ending.find("?\"") != std::string::npos) {
-    return true;
-  }
-
-  return false;
-}
-
 // Makes space in the context for n_tokens by removing old tokens if necessary
 // Returns true if successful, false if impossible to make space
 //
@@ -641,23 +607,23 @@ string Llama::token_to_string(LlamaIter &iter, llama_token tok) {
   char buf[512];
   int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
   if (n > 0) {
-    // detect repetition
-    if (iter._last_word == buf) {
-      if (++iter._repetition_count == MAX_REPEAT) {
-        iter._has_next = false;
+    // detect repetition - only on non-whitespace tokens, otherwise
+    // spaces/newlines trigger false positives almost immediately.
+    string piece(buf, n);
+    bool is_trivial = piece.find_first_not_of(" \t\n\r") == string::npos;
+    if (!is_trivial) {
+      if (iter._last_word == piece) {
+        if (++iter._repetition_count >= MAX_REPEAT) {
+          iter._has_next = false;
+        }
+      } else {
+        iter._repetition_count = 0;
+        iter._last_word = piece;
       }
-    } else {
-      iter._repetition_count = 0;
-      iter._last_word = buf;
     }
 
     result.append(buf, n);
 
-    // detect end of max-tokens
-    if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(result)) {
-      iter._has_next = false;
-    }
-
     // detect stop words
     if (iter._has_next) {
       for (const auto &stop : _stop_sequences) {
diff --git a/llama/llama-sb.h b/llama/llama-sb.h
@@ -33,6 +33,7 @@ struct LlamaMemoryInfo {
   int     n_layers_total; // total model layers
   int     n_layers_gpu;   // layers offloaded to GPU
   int     n_layers_cpu;   // layers on CPU
+  int     model_native_max_ctx;
 
   // Advice
   string  advice;
@@ -117,7 +118,6 @@ struct Llama {
   bool batch_decode_tokens(vector<llama_token> &tokens);
   bool configure_sampler();
   void dirty() {_sampler_dirty = true; }
-  bool ends_with_sentence_boundary(const string &out);
   bool make_space_for_tokens(int n_tokens);
   vector<llama_token> tokenize(const string &prompt);
   string token_to_string(LlamaIter &iter, llama_token tok);
diff --git a/llama/llama.cpp b/llama/llama.cpp
@@ -1 +1 @@
-Subproject commit 4d8cc0c56ffba3f8b7fdb0130627fed2a6f71958
+Subproject commit d749821db3bd587932d1ed57d43626cd552c9909
diff --git a/llama/nitro.cpp b/llama/nitro.cpp