Running the standard experiment from README.md results in a CUDA out-of-memory error. I get the same error even when using two 32 GB GPUs (64 GB in total) via:

CUDA_VISIBLE_DEVICES=0,1 python3 ./src/train.py -num_epochs 1 -cuda_devices 0,1
Environment setup:

docker run --gpus all -it pytorch/pytorch
python -m venv venv
source venv/bin/activate
Installed packages:

torch==1.4.0
transformers==2.8.0
allennlp==0.9.0
faiss-gpu==1.6.3
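For reference, these can be installed into the venv in one shot (a direct pip invocation; I did not check whether the repo ships a matching requirements file):

pip install torch==1.4.0 transformers==2.8.0 allennlp==0.9.0 faiss-gpu==1.6.3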
Steps to reproduce:

git clone https://github.com/izuna385/Zero-Shot-Entity-Linking.git
cd Zero-Shot-Entity-Linking
sh preprocessing.sh # ~3 min
python3 ./src/train.py -num_epochs 1
Single-GPU run log:

(venv) root@1a9122ab5fe4:/home/repositories/Zero-Shot-Entity-Linking# python3 ./src/train.py -num_epochs 1
===experiment starts===
===PARAMETERS===
debug False
bert_name bert-base-uncased
word_embedding_dropout 0.05
cuda_devices 0
allen_lazyload True
batch_size_for_train 32
batch_size_for_eval 8
hard_negatives_num 10
num_epochs 1
lr 1e-05
weight_decay 0
beta1 0.9
beta2 0.999
epsilon 1e-08
amsgrad False
max_title_len 12
max_desc_len 50
max_context_len_after_tokenize 100
add_mse_for_biencoder False
search_method indexflatip
add_hard_negatives True
metionPooling CLS
entityPooling CLS
dimentionReduction False
dimentionReductionToThisDim 300
extracted_first_token_for_description 100
extracted_first_token_for_title 16
dataset_dir ./data/
documents_dir ./data/documents/
mentions_dir ./data/mentions/
mentions_splitbyworld_dir ./data/mentions_split_by_world/
mention_leftandright_tokenwindowwidth 40
debugSampleNum 100000000
dir_for_each_world ./data/worlds/
experiment_logdir ./src/experiment_logdir/
===PARAMETERS END===
experiment_logdir: ./src/experiment_logdir/200817_040200/
World american_football is now being loaded...
0%|          | 0/1 [00:00<?, ?it/s]
======Encoding all entites from title and description=====
100%|##########| 31929/31929 [03:44<00:00, 141.96it/s]
250it [03:45, 1.11it/s]
########
HARD NEGATIVE MININGS started
########
100%|##########| 3898/3898 [02:45<00:00, 23.53it/s]
488it [02:45, 2.94it/s]
0%| | 0/1 [00:03<?, ?it/s]
Traceback (most recent call last):
File "./src/train.py", line 131, in <module>
main()
File "./src/train.py", line 76, in main
trainer.train()
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/allennlp/training/trainer.py", line 478, in train
train_metrics = self._train_epoch(epoch)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/allennlp/training/trainer.py", line 320, in _train_epoch
loss = self.batch_loss(batch_group, for_training=True)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/allennlp/training/trainer.py", line 256, in batch_loss
output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/allennlp/training/util.py", line 331, in data_parallel
outputs = parallel_apply(replicas, inputs, moved, used_device_ids)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/_utils.py", line 394, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/src/model.py", line 62, in forward
encoded_entities_from_hard_negatives_idx0isgold = self.entity_encoder(docked_tokenlist).view(batch_, gold_plus_negs_num, -1)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/src/encoders.py", line 41, in forward
entity_emb = self.word_embedder(title_and_desc_concatnated_text)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/allennlp/modules/text_field_embedders/basic_text_field_embedder.py", line 131, in forward
token_vectors = embedder(*tensors, **forward_params_values)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/allennlp/modules/token_embedders/pretrained_transformer_embedder.py", line 26, in forward
return self.transformer_model(token_ids)[0]
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 715, in forward
head_mask=head_mask)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 437, in forward
layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 417, in forward
intermediate_output = self.intermediate(attention_output)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 389, in forward
hidden_states = self.intermediate_act_fn(hidden_states)
File "/home/repositories/Zero-Shot-Entity-Linking/venv/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 142, in gelu
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
RuntimeError: CUDA out of memory. Tried to allocate 260.00 MiB (GPU 0; 31.72 GiB total capacity; 30.05 GiB already allocated; 81.81 MiB free; 30.58 GiB reserved in total by PyTorch)
1%| | 31/3898 [00:03<06:44, 9.56it/s]
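For context on why the second GPU does not help: the trace shows the OOM occurring inside AllenNLP's data_parallel, which replicates the full model on every device and only splits the batch. At src/model.py line 62 the entity encoder runs BERT over batch_ * gold_plus_negs_num sequences in a single forward pass. A back-of-envelope count (my own arithmetic, assuming gold_plus_negs_num = 1 + hard_negatives_num):

python3 -c "print(32 * (1 + 10))"       # 352 BERT sequences per step on one GPU
python3 -c "print(32 * (1 + 10) // 2)"  # still 176 per replica across two GPUs

So each 32 GB card still has to hold a full BERT copy plus activations for roughly 176 concatenated title+description sequences per step, which evidently does not fit.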
Two-GPU run log:

(venv) root@a236a2cd8f06:/home/repositories/Zero-Shot-Entity-Linking# CUDA_VISIBLE_DEVICES=0,1 python3 ./src/train.py -num_epochs 1 -cuda_devices 0,1
===experiment starts===
===PARAMETERS===
(identical to the single-GPU run above, except cuda_devices 0,1)
===PARAMETERS END===
experiment_logdir: ./src/experiment_logdir/200817_043656/
100%|##########| 433/433 [00:00<00:00, 277696.27B/s]
100%|##########| 440473133/440473133 [01:12<00:00, 6085356.67B/s]
100%|##########| 231508/231508 [00:00<00:00, 448844.71B/s]
100%|##########| 407873900/407873900 [01:17<00:00, 5267746.95B/s]
World american_football is now being loaded...
0%|          | 0/1 [00:00<?, ?it/s]
======Encoding all entites from title and description=====
100%|##########| 31929/31929 [03:33<00:00, 149.64it/s]
250it [03:33, 1.17it/s]
########
HARD NEGATIVE MININGS started
########
100%|##########| 3898/3898 [02:34<00:00, 25.24it/s]
488it [02:34, 3.16it/s]
0%| | 0/1 [00:11<?, ?it/s]
Traceback (most recent call last):
[...same traceback as the single-GPU run above...]
RuntimeError: CUDA out of memory. Tried to allocate 260.00 MiB (GPU 0; 31.72 GiB total capacity; 30.05 GiB already allocated; 57.81 MiB free; 30.58 GiB reserved in total by PyTorch)
2%|1 | 63/3898 [00:11<11:38, 5.49it/s]
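A possible workaround I am considering is shrinking the per-step footprint, assuming the CLI flags mirror the parameter names printed above (I have not verified the exact flag names against src/train.py):

CUDA_VISIBLE_DEVICES=0,1 python3 ./src/train.py -num_epochs 1 -cuda_devices 0,1 -batch_size_for_train 8 -hard_negatives_num 5

Is there a recommended batch_size_for_train / hard_negatives_num setting for 32 GB GPUs, or another way to reduce the memory footprint of the hard-negative encoding?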