|
14 | 14 | #include "llama.h" |
15 | 15 | #include "llama-sb.h" |
16 | 16 |
|
17 | | -constexpr int MAX_REPEAT = 5; |
| 17 | +constexpr int MAX_REPEAT = 50; |
18 | 18 |
|
19 | 19 | static bool read_vram(size_t &used, size_t &total) { |
20 | 20 | size_t free = 0; |
@@ -391,9 +391,18 @@ LlamaMemoryInfo Llama::memory_info() { |
391 | 391 | info.vram_percent = 100.0f * info.vram_used / info.vram_total; |
392 | 392 | } |
393 | 393 |
|
| 394 | + info.model_native_max_ctx = llama_model_n_ctx_train(_model); |
| 395 | + |
394 | 396 | // Advice |
395 | 397 | ostringstream advice; |
396 | 398 |
|
| 399 | + // Check structural limits & model configuration quirks |
| 400 | + if (info.kv_total > info.model_native_max_ctx) { |
| 401 | + advice << "WARNING: Configured context size (" << info.kv_total |
| 402 | + << ") exceeds model native training length (" << info.model_native_max_ctx |
| 403 | + << "). Logic flaws or repetition bugs will occur unless RoPE scaling options are enabled. "; |
| 404 | + } |
| 405 | + |
397 | 406 | if (n_gpu_layers < info.n_layers_total) { |
398 | 407 | advice << "Only " << n_gpu_layers << "/" << info.n_layers_total |
399 | 408 | << " layers on GPU - increase n_gpu_layers if VRAM allows. "; |
@@ -519,49 +528,6 @@ bool Llama::configure_sampler() { |
519 | 528 | return true; |
520 | 529 | } |
521 | 530 |
|
522 | | -bool Llama::ends_with_sentence_boundary(const string &text) {
523 | | - if (text.empty()) {
524 | | - return false;
525 | | - }
526 | | -
527 | | - // Inspect only the tail of the text (at most the last 5 characters) so that punctuation followed by a little trailing whitespace is still seen
528 | | - size_t check_len = std::min(text.length(), (size_t)5);
529 | | - std::string ending = text.substr(text.length() - check_len);
530 | | -
531 | | - // Each check below matches anywhere inside the tail, not only at its very end
532 | | - // Period: followed by a space somewhere in the tail, or as the final character
533 | | - if (ending.find(". ") != std::string::npos ||
534 | | - ending.back() == '.') {
535 | | - return true;
536 | | - }
537 | | -
538 | | - // Exclamation mark: followed by a space somewhere in the tail, or as the final character
539 | | - if (ending.find("! ") != std::string::npos ||
540 | | - ending.back() == '!') {
541 | | - return true;
542 | | - }
543 | | -
544 | | - // Question mark: followed by a space somewhere in the tail, or as the final character
545 | | - if (ending.find("? ") != std::string::npos ||
546 | | - ending.back() == '?') {
547 | | - return true;
548 | | - }
549 | | -
550 | | - // Any newline within the tail is treated as a boundary (paragraph break)
551 | | - if (ending.find('\n') != std::string::npos) {
552 | | - return true;
553 | | - }
554 | | -
555 | | - // Sentence punctuation (. ! ?) immediately followed by a double quote, e.g. "something."
556 | | - if (ending.find(".\"") != std::string::npos ||
557 | | - ending.find("!\"") != std::string::npos ||
558 | | - ending.find("?\"") != std::string::npos) {
559 | | - return true;
560 | | - }
561 | | -
562 | | - return false;
563 | | -}
564 | | - |
565 | 531 | // Makes space in the context for n_tokens by removing old tokens if necessary |
566 | 532 | // Returns true if successful, false if impossible to make space |
567 | 533 | // |
@@ -641,23 +607,23 @@ string Llama::token_to_string(LlamaIter &iter, llama_token tok) { |
641 | 607 | char buf[512]; |
642 | 608 | int n = llama_token_to_piece(_vocab, tok, buf, sizeof(buf), 0, false); |
643 | 609 | if (n > 0) { |
644 | | - // detect repetition |
645 | | - if (iter._last_word == buf) { |
646 | | - if (++iter._repetition_count == MAX_REPEAT) { |
647 | | - iter._has_next = false; |
| 610 | + // detect repetition - only on non-whitespace tokens, otherwise |
| 611 | + // spaces/newlines trigger false positives almost immediately. |
| 612 | + string piece(buf, n); |
| 613 | + bool is_trivial = piece.find_first_not_of(" \t\n\r") == string::npos; |
| 614 | + if (!is_trivial) { |
| 615 | + if (iter._last_word == piece) { |
| 616 | + if (++iter._repetition_count >= MAX_REPEAT) { |
| 617 | + iter._has_next = false; |
| 618 | + } |
| 619 | + } else { |
| 620 | + iter._repetition_count = 0; |
| 621 | + iter._last_word = piece; |
648 | 622 | } |
649 | | - } else { |
650 | | - iter._repetition_count = 0; |
651 | | - iter._last_word = buf; |
652 | 623 | } |
653 | 624 |
|
654 | 625 | result.append(buf, n); |
655 | 626 |
|
656 | | - // detect end of max-tokens |
657 | | - if (++iter._tokens_generated > _max_tokens && ends_with_sentence_boundary(result)) { |
658 | | - iter._has_next = false; |
659 | | - } |
660 | | - |
661 | 627 | // detect stop words |
662 | 628 | if (iter._has_next) { |
663 | 629 | for (const auto &stop : _stop_sequences) { |
|
0 commit comments