Skip to content

Commit 0a38fe4

Browse files
author
Chris Warren-Smith
committed
LLAMA: nitro - now displays thinking text
- fixed generation exiting early on false-positive repetition detection
1 parent 6c212c0 commit 0a38fe4

4 files changed

Lines changed: 206 additions & 123 deletions

File tree

llama/llama-sb.cpp

Lines changed: 22 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include "llama.h"
1515
#include "llama-sb.h"
1616

17-
constexpr int MAX_REPEAT = 5;
17+
constexpr int MAX_REPEAT = 50;
1818

1919
static bool read_vram(size_t &used, size_t &total) {
2020
size_t free = 0;
@@ -391,9 +391,18 @@ LlamaMemoryInfo Llama::memory_info() {
391391
info.vram_percent = 100.0f * info.vram_used / info.vram_total;
392392
}
393393

394+
info.model_native_max_ctx = llama_model_n_ctx_train(_model);
395+
394396
// Advice
395397
ostringstream advice;
396398

399+
// Check structural limits & model configuration quirks
400+
if (info.kv_total > info.model_native_max_ctx) {
401+
advice << "WARNING: Configured context size (" << info.kv_total
402+
<< ") exceeds model native training length (" << info.model_native_max_ctx
403+
<< "). Logic flaws or repetition bugs will occur unless RoPE scaling options are enabled. ";
404+
}
405+
397406
if (n_gpu_layers < info.n_layers_total) {
398407
advice << "Only " << n_gpu_layers << "/" << info.n_layers_total
399408
<< " layers on GPU - increase n_gpu_layers if VRAM allows. ";
@@ -519,49 +528,6 @@ bool Llama::configure_sampler() {
519528
return true;
520529
}
521530

522-
bool Llama::ends_with_sentence_boundary(const string &text) {
523-
if (text.empty()) {
524-
return false;
525-
}
526-
527-
// Get last few characters (in case of whitespace after punctuation)
528-
size_t check_len = std::min(text.length(), (size_t)5);
529-
std::string ending = text.substr(text.length() - check_len);
530-
531-
// Check for various sentence endings
532-
// Period followed by space or end
533-
if (ending.find(". ") != std::string::npos ||
534-
ending.back() == '.') {
535-
return true;
536-
}
537-
538-
// Exclamation mark
539-
if (ending.find("! ") != std::string::npos ||
540-
ending.back() == '!') {
541-
return true;
542-
}
543-
544-
// Question mark
545-
if (ending.find("? ") != std::string::npos ||
546-
ending.back() == '?') {
547-
return true;
548-
}
549-
550-
// Newline (paragraph break)
551-
if (ending.find('\n') != std::string::npos) {
552-
return true;
553-
}
554-
555-
// Quote followed by period: "something."
556-
if (ending.find(".\"") != std::string::npos ||
557-
ending.find("!\"") != std::string::npos ||
558-
ending.find("?\"") != std::string::npos) {
559-
return true;
560-
}
561-
562-
return false;
563-
}
564-
565531
// Makes space in the context for n_tokens by removing old tokens if necessary
566532
// Returns true if successful, false if impossible to make space
567533
//
@@ -641,23 +607,23 @@ string Llama::token_to_string(LlamaIter &iter, llama_token tok) {
641607
char buf[512];
642608
int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false);
643609
if (n > 0) {
644-
// detect repetition
645-
if (iter._last_word == buf) {
646-
if (++iter._repetition_count == MAX_REPEAT) {
647-
iter._has_next = false;
610+
// detect repetition - only on non-whitespace tokens, otherwise
611+
// spaces/newlines trigger false positives almost immediately.
612+
string piece(buf, n);
613+
bool is_trivial = piece.find_first_not_of(" \t\n\r") == string::npos;
614+
if (!is_trivial) {
615+
if (iter._last_word == piece) {
616+
if (++iter._repetition_count >= MAX_REPEAT) {
617+
iter._has_next = false;
618+
}
619+
} else {
620+
iter._repetition_count = 0;
621+
iter._last_word = piece;
648622
}
649-
} else {
650-
iter._repetition_count = 0;
651-
iter._last_word = buf;
652623
}
653624

654625
result.append(buf, n);
655626

656-
// detect end of max-tokens
657-
if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(result)) {
658-
iter._has_next = false;
659-
}
660-
661627
// detect stop words
662628
if (iter._has_next) {
663629
for (const auto &stop : _stop_sequences) {

llama/llama-sb.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ struct LlamaMemoryInfo {
3333
int n_layers_total; // total model layers
3434
int n_layers_gpu; // layers offloaded to GPU
3535
int n_layers_cpu; // layers on CPU
36+
int model_native_max_ctx;
3637

3738
// Advice
3839
string advice;
@@ -117,7 +118,6 @@ struct Llama {
117118
bool batch_decode_tokens(vector<llama_token> &tokens);
118119
bool configure_sampler();
119120
void dirty() {_sampler_dirty = true; }
120-
bool ends_with_sentence_boundary(const string &out);
121121
bool make_space_for_tokens(int n_tokens);
122122
vector<llama_token> tokenize(const string &prompt);
123123
string token_to_string(LlamaIter &iter, llama_token tok);

llama/llama.cpp

0 commit comments

Comments
 (0)