Step 2: Train
Training command: python /home/work/notebooks/multilingual_nmt/train.py -i temp/run_src_lang_tgt_lang/data --data processed --model_file temp/run_src_lang_tgt_lang/models/model_run_src_lang_tgt_lang.ckpt --best_model_file temp/run_src_lang_tgt_lang/models/model_best_run_src_lang_tgt_lang.ckpt --batchsize 30 --tied --beam_size 5 --epoch 30 --layers 6 --multi_heads 8 --gpu 0 --max_decode_len 70 --dev_hyp temp/run_src_lang_tgt_lang/test/valid.out --test_hyp temp/run_src_lang_tgt_lang/test/test.out --model Transformer --metric bleu --wbatchsize 3000
{
    "input": "temp/run_src_lang_tgt_lang/data",
    "data": "processed",
    "report_every": 50,
    "model": "Transformer",
    "pshare_decoder_param": false,
    "pshare_encoder_param": false,
    "lang1": null,
    "lang2": null,
    "share_sublayer": null,
    "attn_share": null,
    "batchsize": 30,
    "wbatchsize": 3000,
    "epoch": 30,
    "gpu": 0,
    "resume": false,
    "start_epoch": 0,
    "debug": false,
    "grad_accumulator_count": 1,
    "seed": 1234,
    "fp16": false,
    "static_loss_scale": 1,
    "dynamic_loss_scale": false,
    "multi_gpu": [
        0
    ],
    "n_units": 512,
    "n_hidden": 2048,
    "layers": 6,
    "multi_heads": 8,
    "dropout": 0.1,
    "attention_dropout": 0.1,
    "relu_dropout": 0.1,
    "layer_prepostprocess_dropout": 0.1,
    "tied": true,
    "pos_attention": false,
    "label_smoothing": 0.1,
    "embed_position": false,
    "max_length": 500,
    "use_pad_remover": true,
    "optimizer": "Noam",
    "grad_norm_for_yogi": false,
    "warmup_steps": 16000,
    "learning_rate": 0.2,
    "learning_rate_constant": 2.0,
    "optimizer_adam_beta1": 0.9,
    "optimizer_adam_beta2": 0.997,
    "optimizer_adam_epsilon": 1e-09,
    "ema_decay": 0.999,
    "eval_steps": 1000,
    "beam_size": 5,
    "metric": "bleu",
    "alpha": 1.0,
    "max_sent_eval": 500,
    "max_decode_len": 70,
    "out": "results",
    "model_file": "temp/run_src_lang_tgt_lang/models/model_run_src_lang_tgt_lang.ckpt",
    "best_model_file": "temp/run_src_lang_tgt_lang/models/model_best_run_src_lang_tgt_lang.ckpt",
    "dev_hyp": "temp/run_src_lang_tgt_lang/test/valid.out",
    "test_hyp": "temp/run_src_lang_tgt_lang/test/test.out",
    "log_path": "results/log.txt"
}
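For reference, the schedule behind "optimizer": "Noam" above is the warmup-then-decay rule from "Attention Is All You Need": the learning rate grows linearly for warmup_steps updates, peaks, and then decays as the inverse square root of the step. A minimal sketch with the values from this config; how train.py actually combines "learning_rate" and "learning_rate_constant" is an assumption here, only the schedule shape is standard:

def noam_lr(step, d_model=512, warmup=16000, constant=2.0):
    # lr = constant * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    step = max(step, 1)
    return constant * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Rises until step == warmup (peak ~7e-4), then decays as 1/sqrt(step).
for s in (1000, 8000, 16000, 64000):
    print(s, noam_lr(s))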
/usr/local/python3/lib/python3.6/site-packages/torch/nn/functional.py:52: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
- number of parameters: 64387713
encoder: 18903040
decoder: 25200640
Transformer(
  (embed_word): ScaledEmbedding(39041, 512, padding_idx=0)
  (embed_dropout): Dropout(p=0.1)
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout2): Dropout(p=0.1)
      )
      (1): EncoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout2): Dropout(p=0.1)
      )
      (2): EncoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout2): Dropout(p=0.1)
      )
      (3): EncoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout2): Dropout(p=0.1)
      )
      (4): EncoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout2): Dropout(p=0.1)
      )
      (5): EncoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout2): Dropout(p=0.1)
      )
    )
    (ln): LayerNorm()
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0): DecoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (source_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout2): Dropout(p=0.1)
        (ln_3): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout3): Dropout(p=0.1)
      )
      (1): DecoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (source_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout2): Dropout(p=0.1)
        (ln_3): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout3): Dropout(p=0.1)
      )
      (2): DecoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (source_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout2): Dropout(p=0.1)
        (ln_3): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout3): Dropout(p=0.1)
      )
      (3): DecoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (source_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout2): Dropout(p=0.1)
        (ln_3): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout3): Dropout(p=0.1)
      )
      (4): DecoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (source_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout2): Dropout(p=0.1)
        (ln_3): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout3): Dropout(p=0.1)
      )
      (5): DecoderLayer(
        (ln_1): LayerNorm()
        (self_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout1): Dropout(p=0.1)
        (ln_2): LayerNorm()
        (source_attention): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (finishing_linear_layer): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1)
        )
        (dropout2): Dropout(p=0.1)
        (ln_3): LayerNorm()
        (feed_forward): FeedForwardLayer(
          (W_1): Linear(in_features=512, out_features=2048, bias=True)
          (act): ReLU()
          (dropout): Dropout(p=0.1)
          (W_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout3): Dropout(p=0.1)
      )
    )
    (ln): LayerNorm()
  )
  (affine): Linear(in_features=512, out_features=39041, bias=True)
  (criterion): KLDivLoss()
)
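The printed counts can be checked against the shapes in the dump: attention projections are bias-free 512x512 matrices, the feed-forward layers carry biases, and each LayerNorm holds a gain and a bias of size 512. A quick sanity check; the interpretation of the 256,000-parameter remainder in the total is an assumption on my part (it equals max_length * n_units, which suggests a position table is being counted even though embed_position is false):

d, d_ff, n_layers, vocab = 512, 2048, 6, 39041

attn = 4 * d * d                          # W_Q, W_K, W_V, finishing_linear_layer (no bias)
ffn = (d * d_ff + d_ff) + (d_ff * d + d)  # W_1 and W_2, both with biases
ln = 2 * d                                # LayerNorm gain + bias

encoder = n_layers * (attn + ffn + 2 * ln) + ln      # 18903040, as printed
decoder = n_layers * (2 * attn + ffn + 3 * ln) + ln  # 25200640, as printed

embed = vocab * d   # ScaledEmbedding; shared with the affine output layer via --tied
out_bias = vocab    # the affine layer's bias is not shared
rest = 64387713 - (encoder + decoder + embed + out_bias)
print(encoder, decoder, rest)  # rest == 256000 == 500 * 512 (max_length * n_units)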
Approximate number of iter/epoch = 3589
Traceback (most recent call last):
  File "/home/work/notebooks/multilingual_nmt/train.py", line 457, in <module>
    main()
  File "/home/work/notebooks/multilingual_nmt/train.py", line 315, in main
    loss, stat = model(*in_arrays)
  File "/usr/local/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/work/notebooks/multilingual_nmt/models/transformer.py", line 601, in forward
    y_out_block)
  File "/home/work/notebooks/multilingual_nmt/models/transformer.py", line 551, in output_and_loss
    stats = utils.Statistics(loss=loss.data.cpu() * n_total,
RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 'other'
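The crash is a CPU/GPU device mismatch: at transformer.py line 551, loss.data.cpu() has already been moved to the host while n_total is still a CUDA tensor, and PyTorch refuses to multiply tensors that live on different devices. A minimal sketch of the failure and two possible fixes; the variable names mirror the traceback, and the exact patch inside output_and_loss is an assumption:

import torch

if torch.cuda.is_available():
    loss = torch.tensor(2.5, device="cuda")
    n_total = torch.tensor(30.0, device="cuda")

    # loss.cpu() * n_total mixes a torch.FloatTensor with a
    # torch.cuda.FloatTensor and raises the RuntimeError shown above.

    ok_1 = loss.cpu() * n_total.cpu()  # move both operands to the host
    ok_2 = (loss * n_total).cpu()      # multiply on the GPU, then move
    print(ok_1.item(), ok_2.item())

Because training dies on the first batch, no checkpoint is written, and the decoding and evaluation steps below inherit empty outputs.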
BPE decoding/detokenising target to match with references
Step 4a: Evaluate Test
Use of uninitialized value $length_reference in numeric eq (==) at /home/work/notebooks/multilingual_nmt/tools/multi-bleu.perl line 148.
Use of uninitialized value $length_reference in numeric eq (==) at /home/work/notebooks/multilingual_nmt/tools/multi-bleu.perl line 148.
Step 4b: Evaluate Dev
Use of uninitialized value $length_reference in numeric eq (==) at /home/work/notebooks/multilingual_nmt/tools/multi-bleu.perl line 148.
Use of uninitialized value $length_reference in numeric eq (==) at /home/work/notebooks/multilingual_nmt/tools/multi-bleu.perl line 148.
Traceback (most recent call last):
  File "/home/work/notebooks/multilingual_nmt/bin/t2t-bleu", line 208, in <module>
    case_sensitive=False)
  File "/home/work/notebooks/multilingual_nmt/bin/t2t-bleu", line 189, in bleu_wrapper
    assert len(ref_lines) == len(hyp_lines)
AssertionError
[root@ccb234d5f670 multilingual_nmt]#
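Both evaluation failures are downstream of the training crash: with no trained model, valid.out and test.out are empty or missing, so multi-bleu.perl never reads a reference length (hence the uninitialized $length_reference warnings) and t2t-bleu's len(ref_lines) == len(hyp_lines) assertion fails. A small pre-flight check before scoring makes the root cause obvious; check_pair and the reference path used below are hypothetical, not part of the pipeline:

from pathlib import Path

def check_pair(ref_path, hyp_path):
    # Fail early with a useful message instead of a bare AssertionError.
    ref, hyp = Path(ref_path), Path(hyp_path)
    if not hyp.exists() or hyp.stat().st_size == 0:
        raise SystemExit(f"{hyp} is missing or empty; did training finish?")
    n_ref = sum(1 for _ in ref.open(encoding="utf-8"))
    n_hyp = sum(1 for _ in hyp.open(encoding="utf-8"))
    if n_ref != n_hyp:
        raise SystemExit(f"line count mismatch: {n_ref} references vs {n_hyp} hypotheses")

check_pair("temp/run_src_lang_tgt_lang/data/test.tgt",  # hypothetical reference file
           "temp/run_src_lang_tgt_lang/test/test.out")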