Skip to content

Unable to run the model #11

@kirtishrinkhala

Description

@kirtishrinkhala

Hi, I am following the steps in the README to run the model.
My goal is to run the model on my own inputs; I don't want to train it.

I did the following:

When I run the following command:

python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py \
    --data_root blip \
    --model declare-lab/flan-alpaca-base \
    --epoch 10 --lr 1e-4 \
    --user_msg seq_future_blip_axis_all0.1_hist8_future4 --img_type blip --img_dim 1408 \
    --bs 4 --eval_bs 16 --input_len 512 --output_len 128 --eval_acc 40 \
    --transform_axis --warmup_ratio 0.05 \
    --all_data 0.1 \
    --use_history 8 \
    --use_future 4 \
    --eval_subset dataset/blip/general_blip \
    --output_dir experiments

I get the following error:

args Namespace(all_data=0.1, bs=4, data_ratio=None, data_root='blip', debug_num=None, epoch=10, eval_acc=40, eval_bs=16, eval_name=None, eval_subset='dataset/blip/general_blip', evaluate_dir=None, final_eval=False, img_dim=1408, img_type='blip', input_len=512, local_rank=-1, lr=0.0001, model='declare-lab/flan-alpaca-base', output_dir='experiments', output_len=128, seed=42, transform_axis=True, use_future=4, use_generate=True, use_history=8, use_img_history=False, use_layout=False, user_msg='seq_future_blip_axis_all0.1_hist8_future4', warmup_ratio=0.05)
====Input Arguments====
{
  "data_root": "blip",
  "output_dir": "experiments",
  "model": "declare-lab/flan-alpaca-base",
  "data_ratio": null,
  "eval_name": null,
  "local_rank": -1,
  "epoch": 10,
  "lr": 0.0001,
  "warmup_ratio": 0.05,
  "bs": 4,
  "debug_num": null,
  "input_len": 512,
  "output_len": 128,
  "img_dim": 1408,
  "eval_bs": 16,
  "eval_acc": 40,
  "all_data": 0.1,
  "eval_subset": "dataset/blip/general_blip",
  "use_history": 8,
  "use_img_history": false,
  "use_future": 4,
  "use_layout": false,
  "transform_axis": true,
  "use_generate": true,
  "final_eval": false,
  "user_msg": "seq_future_blip_axis_all0.1_hist8_future4",
  "img_type": "blip",
  "evaluate_dir": null,
  "seed": 42
}
args Namespace(all_data=0.1, bs=4, data_ratio=None, data_root='blip', debug_num=None, epoch=10, eval_acc=40, eval_bs=16, eval_name=None, eval_subset='dataset/blip/general_blip', evaluate_dir=None, final_eval=False, img_dim=1408, img_type='blip', input_len=512, local_rank=-1, lr=0.0001, model='declare-lab/flan-alpaca-base', output_dir='experiments', output_len=128, seed=42, transform_axis=True, use_future=4, use_generate=True, use_history=8, use_img_history=False, use_layout=False, user_msg='seq_future_blip_axis_all0.1_hist8_future4', warmup_ratio=0.05)
====Input Arguments====
{
  "data_root": "blip",
  "output_dir": "experiments",
  "model": "declare-lab/flan-alpaca-base",
  "data_ratio": null,
  "eval_name": null,
  "local_rank": -1,
  "epoch": 10,
  "lr": 0.0001,
  "warmup_ratio": 0.05,
  "bs": 4,
  "debug_num": null,
  "input_len": 512,
  "output_len": 128,
  "img_dim": 1408,
  "eval_bs": 16,
  "eval_acc": 40,
  "all_data": 0.1,
  "eval_subset": "dataset/blip/general_blip",
  "use_history": 8,
  "use_img_history": false,
  "use_future": 4,
  "use_layout": false,
  "transform_axis": true,
  "use_generate": true,
  "final_eval": false,
  "user_msg": "seq_future_blip_axis_all0.1_hist8_future4",
  "img_type": "blip",
  "evaluate_dir": null,
  "seed": 42
}
args Namespace(all_data=0.1, bs=4, data_ratio=None, data_root='blip', debug_num=None, epoch=10, eval_acc=40, eval_bs=16, eval_name=None, eval_subset='dataset/blip/general_blip', evaluate_dir=None, final_eval=False, img_dim=1408, img_type='blip', input_len=512, local_rank=-1, lr=0.0001, model='declare-lab/flan-alpaca-base', output_dir='experiments', output_len=128, seed=42, transform_axis=True, use_future=4, use_generate=True, use_history=8, use_img_history=False, use_layout=False, user_msg='seq_future_blip_axis_all0.1_hist8_future4', warmup_ratio=0.05)
====Input Arguments====
{
  "data_root": "blip",
  "output_dir": "experiments",
  "model": "declare-lab/flan-alpaca-base",
  "data_ratio": null,
  "eval_name": null,
  "local_rank": -1,
  "epoch": 10,
  "lr": 0.0001,
  "warmup_ratio": 0.05,
  "bs": 4,
  "debug_num": null,
  "input_len": 512,
  "output_len": 128,
  "img_dim": 1408,
  "eval_bs": 16,
  "eval_acc": 40,
  "all_data": 0.1,
  "eval_subset": "dataset/blip/general_blip",
  "use_history": 8,
  "use_img_history": false,
  "use_future": 4,
  "use_layout": false,
  "transform_axis": true,
  "use_generate": true,
  "final_eval": false,
  "user_msg": "seq_future_blip_axis_all0.1_hist8_future4",
  "img_type": "blip",
  "evaluate_dir": null,
  "seed": 42
}
args Namespace(all_data=0.1, bs=4, data_ratio=None, data_root='blip', debug_num=None, epoch=10, eval_acc=40, eval_bs=16, eval_name=None, eval_subset='dataset/blip/general_blip', evaluate_dir=None, final_eval=False, img_dim=1408, img_type='blip', input_len=512, local_rank=-1, lr=0.0001, model='declare-lab/flan-alpaca-base', output_dir='experiments', output_len=128, seed=42, transform_axis=True, use_future=4, use_generate=True, use_history=8, use_img_history=False, use_layout=False, user_msg='seq_future_blip_axis_all0.1_hist8_future4', warmup_ratio=0.05)
====Input Arguments====
{
  "data_root": "blip",
  "output_dir": "experiments",
  "model": "declare-lab/flan-alpaca-base",
  "data_ratio": null,
  "eval_name": null,
  "local_rank": -1,
  "epoch": 10,
  "lr": 0.0001,
  "warmup_ratio": 0.05,
  "bs": 4,
  "debug_num": null,
  "input_len": 512,
  "output_len": 128,
  "img_dim": 1408,
  "eval_bs": 16,
  "eval_acc": 40,
  "all_data": 0.1,
  "eval_subset": "dataset/blip/general_blip",
  "use_history": 8,
  "use_img_history": false,
  "use_future": 4,
  "use_layout": false,
  "transform_axis": true,
  "use_generate": true,
  "final_eval": false,
  "user_msg": "seq_future_blip_axis_all0.1_hist8_future4",
  "img_type": "blip",
  "evaluate_dir": null,
  "seed": 42
}
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
[20:18:30] [Model]: Loading declare-lab/flan-alpaca-base...                                                                                                                                                                                         main.py:83
                                                                                                                                                                                                                                                              
           [Data]: Reading data...                                                                                                                                                                                                                  main.py:84
                                                                                                                                                                                                                                                              
experiments/seq_future_blip_axis_all0.1_hist8_future4_declare-lab-flan-alpaca-base_blip_lr0.0001_bs0_ip512_op128_ep10
model.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 990M/990M [00:17<00:00, 56.1MB/s]
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['mha_layer.out_proj.bias', 'gate_dense.bias', 'mha_layer.in_proj_bias', 'image_dense.weight', 'mha_layer.out_proj.weight', 'mha_layer.in_proj_weight', 'gate_dense.weight', 'image_dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['mha_layer.out_proj.bias', 'gate_dense.bias', 'mha_layer.in_proj_bias', 'gate_dense.weight', 'mha_layer.in_proj_weight', 'mha_layer.out_proj.weight', 'image_dense.weight', 'image_dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['image_dense.bias', 'mha_layer.out_proj.weight', 'image_dense.weight', 'mha_layer.in_proj_bias', 'gate_dense.bias', 'gate_dense.weight', 'mha_layer.out_proj.bias', 'mha_layer.in_proj_weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['mha_layer.out_proj.weight', 'mha_layer.in_proj_weight', 'mha_layer.in_proj_bias', 'gate_dense.bias', 'gate_dense.weight', 'mha_layer.out_proj.bias', 'image_dense.bias', 'image_dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['image_dense.weight', 'mha_layer.out_proj.weight', 'image_dense.bias', 'mha_layer.out_proj.bias', 'mha_layer.in_proj_bias', 'gate_dense.bias', 'mha_layer.in_proj_weight', 'gate_dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['mha_layer.out_proj.bias', 'gate_dense.bias', 'gate_dense.weight', 'mha_layer.in_proj_bias', 'mha_layer.out_proj.weight', 'mha_layer.in_proj_weight', 'image_dense.weight', 'image_dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['mha_layer.in_proj_bias', 'mha_layer.in_proj_weight', 'gate_dense.bias', 'image_dense.weight', 'mha_layer.out_proj.weight', 'mha_layer.out_proj.bias', 'gate_dense.weight', 'image_dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at declare-lab/flan-alpaca-base and are newly initialized: ['mha_layer.in_proj_bias', 'gate_dense.weight', 'gate_dense.bias', 'mha_layer.out_proj.bias', 'mha_layer.in_proj_weight', 'image_dense.bias', 'mha_layer.out_proj.weight', 'image_dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
generation_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 25.5kB/s]
loading general 0
loading general 0
loading general 0
loading general 0loading general
 0
loading general loading general0 
0
loading general 0
loading google_apps 7580
loading google_apps 7580
loading google_apps 7580
loading google_apps 7580
loading google_apps 7580
loading google_apps 7580
loading google_apps 7580
loading google_apps 7580
[2024-01-07 20:20:07,853] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19300 closing signal SIGTERM
[2024-01-07 20:20:07,855] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19301 closing signal SIGTERM
[2024-01-07 20:20:07,855] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19302 closing signal SIGTERM
[2024-01-07 20:20:07,855] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19303 closing signal SIGTERM
[2024-01-07 20:20:07,855] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19304 closing signal SIGTERM
[2024-01-07 20:20:07,855] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19306 closing signal SIGTERM
[2024-01-07 20:20:07,855] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 19307 closing signal SIGTERM
[2024-01-07 20:20:08,928] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: -9) local_rank: 5 (pid: 19305) of binary: /home/skirti/.pyenv/versions/3.8.11/bin/python
Traceback (most recent call last):
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/site-packages/torch/distributed/launch.py", line 196, in <module>
    main()
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/site-packages/torch/distributed/launch.py", line 192, in main
    launch(args)
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/site-packages/torch/distributed/launch.py", line 177, in launch
    run(args)
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/skirti/.pyenv/versions/3.8.11/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
main.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-01-07_20:20:07
  host      : 211b70a3
  rank      : 5 (local_rank: 5)
  exitcode  : -9 (pid: 19305)
  error_file: <N/A>
  traceback : Signal 9 (SIGKILL) received by PID 19305
============================================================

Any pointers on what is causing this?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions