When I run `run_training.sh`, I get the following output:
$ bash run_training.sh
args Namespace(data_root='data', output_dir='experiments', model='allenai/unifiedqa-t5-base', options=['A', 'B', 'C', 'D', 'E'], epoch=20, lr=5e-05, bs=8, input_len=512, output_len=512, eval_bs=4, eval_acc=10, train_split='train', val_split='val', test_split='test', use_generate=False, final_eval=True, user_msg='rationale', img_type='detr', eval_le=None, test_le=None, evaluate_dir=None, caption_file='data/captions.json', use_caption=False, prompt_format='QCM-LE', seed=42)
====Input Arguments====
{
"data_root": "data",
"output_dir": "experiments",
"model": "allenai/unifiedqa-t5-base",
"options": [
"A",
"B",
"C",
"D",
"E"
],
"epoch": 20,
"lr": 5e-05,
"bs": 8,
"input_len": 512,
"output_len": 512,
"eval_bs": 4,
"eval_acc": 10,
"train_split": "train",
"val_split": "val",
"test_split": "test",
"use_generate": false,
"final_eval": true,
"user_msg": "rationale",
"img_type": "detr",
"eval_le": null,
"test_le": null,
"evaluate_dir": null,
"caption_file": "data/captions.json",
"use_caption": false,
"prompt_format": "QCM-LE",
"seed": 42
}
img_features size: (11208, 100, 256)
number of train problems: 12726
number of val problems: 4241
number of test problems: 4241
[16:21:38] [Model]: Loading allenai/unifiedqa-t5-base... main.py:68
[Data]: Reading data... main.py:69
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at allenai/unifiedqa-t5-base and are newly initialized: ['gate_dense.bias', 'mha_layer.in_proj_bias', 'mha_layer.in_proj_weight', 'mha_layer.out_proj.weight', 'mha_layer.out_proj.bias', 'gate_dense.weight', 'image_dense.weight', 'image_dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
model parameters: 226643712
***** Running training *****
Num examples = 12726
Num Epochs = 20
Instantaneous batch size per device = 8
Total train batch size (w. parallel, distributed & accumulation) = 16
Gradient Accumulation steps = 1
Total optimization steps = 15920
0%| | 0/15920 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/Workspace/sxk/2023/mm-cot-main/main.py", line 380, in
T5Trainer(
File "/home/Workspace/sxk/2023/mm-cot-main/main.py", line 269, in T5Trainer
trainer.train()
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/trainer.py", line 1498, in train
return inner_training_loop(
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/trainer.py", line 1740, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/trainer.py", line 2470, in training_step
loss = self.compute_loss(model, inputs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/trainer.py", line 2502, in compute_loss
outputs = model(**inputs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
output.reraise()
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/_utils.py", line 543, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
output = module(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/Workspace/sxk/2023/mm-cot-main/model.py", line 98, in forward
encoder_outputs = self.encoder(
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1035, in forward
layer_outputs = layer_module(
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 666, in forward
self_attention_outputs = self.layer[0](
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 572, in forward
attention_output = self.SelfAttention(
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 498, in forward
query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/anaconda3/envs/s20230223e310mmcot/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)
0%| | 0/15920 [00:03<?, ?it/s]args Namespace(data_root='data', output_dir='experiments', model='allenai/unifiedqa-t5-base', options=['A', 'B', 'C', 'D', 'E'], epoch=20, lr=5e-05, bs=8, input_len=512, output_len=64, eval_bs=4, eval_acc=10, train_split='train', val_split='val', test_split='test', use_generate=False, final_eval=True, user_msg='answer', img_type='detr', eval_le='experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_eval.json', test_le='experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_test.json', evaluate_dir=None, caption_file='data/captions.json', use_caption=False, prompt_format='QCMG-A', seed=42)
====Input Arguments====
{
"data_root": "data",
"output_dir": "experiments",
"model": "allenai/unifiedqa-t5-base",
"options": [
"A",
"B",
"C",
"D",
"E"
],
"epoch": 20,
"lr": 5e-05,
"bs": 8,
"input_len": 512,
"output_len": 64,
"eval_bs": 4,
"eval_acc": 10,
"train_split": "train",
"val_split": "val",
"test_split": "test",
"use_generate": false,
"final_eval": true,
"user_msg": "answer",
"img_type": "detr",
"eval_le": "experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_eval.json",
"test_le": "experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_test.json",
"evaluate_dir": null,
"caption_file": "data/captions.json",
"use_caption": false,
"prompt_format": "QCMG-A",
"seed": 42
}
img_features size: (11208, 100, 256)
number of train problems: 12726
number of val problems: 4241
number of test problems: 4241
[16:22:05] [Model]: Loading allenai/unifiedqa-t5-base... main.py:68
[Data]: Reading data... main.py:69
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at allenai/unifiedqa-t5-base and are newly initialized: ['gate_dense.bias', 'mha_layer.out_proj.bias', 'gate_dense.weight', 'image_dense.weight', 'mha_layer.in_proj_bias', 'image_dense.bias', 'mha_layer.in_proj_weight', 'mha_layer.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Traceback (most recent call last):
File "/home/Workspace/sxk/2023/mm-cot-main/main.py", line 380, in
T5Trainer(
File "/home/Workspace/sxk/2023/mm-cot-main/main.py", line 101, in T5Trainer
eval_set = ScienceQADatasetImg(
File "/home/Workspace/sxk/2023/mm-cot-main/utils_data.py", line 165, in __init__
test_le_data =json.load(open(test_le))["preds"]
FileNotFoundError: [Errno 2] No such file or directory: 'experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_eval.json'
Environment:
Linux version 3.10.0-693.el7.x86_64 ([email protected]) (gcc version 4.8.5 20150623 (Red Hat 4.8.5-16) (GCC) ) #1 SMP Tue Aug 22 21:09:27 UTC 2017
python=3.10.9
conda 4.14.0
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0