Skip to content

[BUG] #1198

@w-ahmad1a10

Description

@w-ahmad1a10

I was evaluating LFM2-350M on GSM8K. I ran experiments from 0-shot to 5-shot, so six experiments in total. My code is below, with max_length=None and max_new_tokens=1024; I know that lighteval sets the GSM8K generation size to 256.

code

import torch, gc

# Free as much GPU memory as possible before loading the model.
#
# BUG FIX: the original walked gc.get_objects() and `del obj`-ed every CUDA
# tensor it found.  That is a no-op — `del` removes only the local loop
# binding, not the references that actually keep a tensor alive — and the
# bare `except: pass` silently swallowed every error (including KeyboardInterrupt).
# Collecting cyclic garbage and then asking the CUDA caching allocator to
# release unused cached blocks is the correct (and sufficient) cleanup.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
from lighteval.pipeline import Pipeline, PipelineParameters, ParallelismManager
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig
from transformers import AutoModelForCausalLM
from lighteval.models.model_input import GenerationParameters
import os
import wandb

-------------------------------------------------------------------------------------------------------------------

# Group all runs under one W&B project name.
os.environ["WANDB_PROJECT"] = "my_lighteval_project"
# SECURITY FIX: the original hard-coded a live wandb API key in source
# (anyone reading this file could use it — that key must be rotated).
# Read the key from the WANDB_API_KEY environment variable instead;
# wandb.login() falls back to env/netrc automatically when key is None.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Tracker configuration: persist results and per-sample details locally and
# mirror metrics to Weights & Biases; nothing is pushed to the HF Hub or
# TensorBoard in this run.
_tracker_settings = {
    "output_dir": "./eval_results",        # where results are written locally
    "results_path_template": None,         # default folder layout
    "save_details": True,                  # keep per-sample predictions
    "push_to_hub": False,
    "push_to_tensorboard": False,
    "hub_results_org": "",                 # only relevant when push_to_hub=True
    "tensorboard_metric_prefix": "eval",
    "public": False,
    "nanotron_run_info": None,             # Nanotron-only, unused here
    "use_wandb": True,                     # log metrics to W&B
}
tracker = EvaluationTracker(**_tracker_settings)

── PipelineParameters ────────────────────────────────────────────────────────

# Pipeline-level knobs: single-process run over the full test split.
pipeline_params = PipelineParameters(
    launcher_type=ParallelismManager.NONE,   # no model/data parallelism (single GPU)
    job_id=0,                                # cluster job id; 0 for notebook runs
    dataset_loading_processes=1,
    nanotron_checkpoint_path=None,           # Nanotron-only, unused here
    custom_tasks_directory=None,             # using a registered task
    num_fewshot_seeds=1,                     # one few-shot sampling seed
    max_samples=None,                        # None = evaluate every sample
    cot_prompt=None,
    remove_reasoning_tags=True,              # strip reasoning spans from outputs
    # BUG FIX: the pasted value was [("", "")] because the issue's HTML
    # renderer stripped the literal tags.  lighteval's reasoning-tag pair is
    # <think>...</think>; empty-string tags would make the stripping a no-op.
    reasoning_tags=[("<think>", "</think>")],
    load_responses_from_details_date_id=None,  # don't reuse a previous run's responses
    bootstrap_iters=1000,                    # CI bootstrap iterations
    load_tasks_multilingual=False,
)

── GenerationParameters (nested inside model config) ─────────────────────────

# Decoding parameters (nested inside the model config).
gen_params = GenerationParameters(
    temperature=0.0,            # 0.0 = greedy / deterministic decoding
    top_p=None,
    top_k=None,
    min_p=None,
    # BUG FIX: this line was `max_new_tokens = ,` — a syntax error.  The
    # run's result file reports "max_new_tokens": 1024, so that value is
    # restored.  NOTE(review): the task's generation_size for gsm8k is 256,
    # which lighteval may use as the effective cap — 1024 is an upper bound,
    # not a guarantee of 1024 generated tokens.
    max_new_tokens=1024,
    min_new_tokens=None,
    stop_tokens=None,           # or a list of strings, e.g. ["Question:"]
    seed=42,                    # generation seed for reproducibility
    repetition_penalty=None,    # 1.0 / None = no penalty
    frequency_penalty=None,
    length_penalty=None,        # beam-search length penalty
    presence_penalty=None,
    early_stopping=None,        # beam-search early stop
    truncate_prompt=None,       # truncate over-long prompts instead of erroring
    num_blocks=None,            # block-sparse attention, unused
    block_size=None,            # block-sparse attention, unused
    cache_implementation=None,  # default KV-cache
    response_format=None,       # no structured-output constraint
)

── TransformersModelConfig ───────────────────────────────────────────────────

# Model configuration: LiquidAI/LFM2-350M in fp16 on a single CUDA device.
model_cfg = TransformersModelConfig(
    # --- what to load ---
    model_name="LiquidAI/LFM2-350M",       # HF model id
    tokenizer=None,                        # use the model's own tokenizer
    subfolder=None,
    revision="main",
    cache_dir="/content/hf_cache",         # on-disk weight cache
    trust_remote_code=False,
    model_loading_kwargs={},               # extra from_pretrained kwargs
    # --- how to run it ---
    device="cuda",
    dtype="float16",
    batch_size=50,                         # tune to fit the GPU
    max_length=None,                       # None: fall back to model/tokenizer limit
    model_parallel=None,                   # auto-decide multi-GPU parallelism
    compile=False,                         # no torch.compile
    continuous_batching=False,
    # --- tokenization / prompting ---
    add_special_tokens=True,               # add BOS etc. when tokenizing
    skip_special_tokens=True,              # drop special tokens when decoding
    multichoice_continuations_start_space=None,
    pairwise_tokenization=False,
    override_chat_template=True,           # force the chat template on
    system_prompt=None,
    # --- decoding ---
    generation_parameters=gen_params,      # GenerationParameters defined above
)

── Pipeline ──────────────────────────────────────────────────────────────────

# Assemble the pipeline: 5-shot gsm8k using the config objects built above.
pipeline = Pipeline(
    tasks="|gsm8k|5",                  # "<suite>|<task>|<num_fewshot>" spec
    pipeline_parameters=pipeline_params,
    evaluation_tracker=tracker,
    model_config=model_cfg,
    model=None,                        # lighteval instantiates from model_config
    metric_options=None,               # registered task metrics, no overrides
)

── Run ───────────────────────────────────────────────────────────────────────

# Run the evaluation end-to-end, write results/details to output_dir (and to
# W&B, since use_wandb=True above), then print the aggregated metrics table.
pipeline.evaluate()
pipeline.save_and_push_results()
pipeline.show_results()

query :

When I opened the details file produced by lighteval, I saw `"padded_tokens_count": 178, "reasonings": [], ... "truncated_tokens_count": 2047, "unconditioned_logprobs": null`. I do not understand why the truncated count was 2047 or what it refers to — this happened even in the 0-shot run, and I could not find it documented. My result file for the 5-shot run is below.

result {

"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"max_samples": null,
"job_id": "0",
"start_time": 3275.624964992,
"end_time": 3699.36364977,
"total_evaluation_time_secondes": "423.7386847779999",
"model_config": {
"model_name": "LiquidAI/LFM2-350M",
"generation_parameters": {
"num_blocks": null,
"block_size": null,
"early_stopping": null,
"repetition_penalty": null,
"frequency_penalty": null,
"length_penalty": null,
"presence_penalty": null,
"max_new_tokens": 1024,
"min_new_tokens": null,
"seed": 42,
"stop_tokens": null,
"temperature": 0.0,
"top_k": null,
"min_p": null,
"top_p": null,
"truncate_prompt": null,
"cache_implementation": null,
"response_format": null
},
"system_prompt": null,
"cache_dir": "/content/hf_cache",
"tokenizer": null,
"subfolder": null,
"revision": "main",
"batch_size": 50,
"max_length": null,
"model_loading_kwargs": {},
"add_special_tokens": true,
"skip_special_tokens": true,
"model_parallel": false,
"dtype": "float16",
"device": "cuda",
"trust_remote_code": false,
"compile": false,
"multichoice_continuations_start_space": null,
"pairwise_tokenization": false,
"continuous_batching": false,
"override_chat_template": true
},
"model_name": "LiquidAI/LFM2-350M"
},
"results": {
"gsm8k|5": {
"extractive_match": 0.3502653525398029,
"extractive_match_stderr": 0.013140409455571263
},
"all": {
"extractive_match": 0.3502653525398029,
"extractive_match_stderr": 0.013140409455571263
}
},
"versions": {},
"config_tasks": {
"gsm8k|5": {
"name": "gsm8k",
"prompt_function": "gsm8k_prompt",
"hf_repo": "openai/gsm8k",
"hf_subset": "main",
"metrics": [
{
"metric_name": "extractive_match",
"higher_is_better": true,
"category": "GENERATIVE",
"sample_level_fn": "MultilingualExtractiveMatchMetric(language=Language.ENGLISH, gold_extraction_target=(ExprExtractionConfig(try_extract_without_anchor=True),), pred_extraction_target=(ExprExtractionConfig(try_extract_without_anchor=True), LatexExtractionConfig(try_extract_without_anchor=True, boxed_match_priority=0, normalization_config=NormalizationConfig(basic_latex=True, units=True, malformed_operators=True, nits=True, boxed='all', equations=True))), aggregation_function=max, fallback_mode=first_match, extraction_mode=any_match, precision=5, timeout_seconds=5)",
"corpus_level_fn": "mean",
"batched_compute": false
}
],
"solver": [
"solve",
"solve"
],
"scorer": "score",
"sample_fields": "record_to_sample",
"sample_to_fewshot": "sample_to_fewshot",
"filter": null,
"hf_revision": null,
"hf_filter": null,
"hf_avail_splits": [
"train",
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": "random_sampling_from_train",
"generation_size": 256,
"generation_grammar": null,
"stop_sequence": [
"Question:"
],
"num_samples": null,
"original_num_docs": -1,
"effective_num_docs": -1,
"must_remove_duplicate_docs": false,
"num_fewshots": 5,
"version": 0
}
},
"summary_tasks": {
"gsm8k|5": {
"hashes": {
"hash_examples": "0ed016e24e7512fd",
"hash_full_prompts": "ef46db3751d8e999",
"hash_input_tokens": "d01025ef6535eaa0",
"hash_cont_tokens": "84ac51e597f7a1bb"
},
"truncated": 0,
"non_truncated": 0,
"padded": 0,
"non_padded": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "bc71463e88551d0e",
"hash_full_prompts": "c166e5d20ad58f4e",
"hash_input_tokens": "bdee8939673f2335",
"hash_cont_tokens": "56b056577811391f"
},
"truncated": 0,
"non_truncated": 0,
"padded": 0,
"non_padded": 0
}
}. Please explain what this output means.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions