First off, thank you for your excellent work on this project! I've encountered an issue while using the multi-GPU functionality. The program runs successfully when the GPU IDs passed to the --devices parameter are contiguous and start from zero (e.g., "0,1,2").
However, it fails when the GPU IDs are non-contiguous or do not start at zero (e.g., "0,2" or "1,2,3"). In those cases, nn.DataParallel raises "AssertionError: Invalid device id" while the model is being built, and the same traceback is printed several times:
Traceback (most recent call last):
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/run_longExp.py", line 117, in <module>
exp = Exp(args) # set experiments
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 24, in __init__
super(Exp_Main, self).__init__(args)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_basic.py", line 10, in __init__
self.model = self._build_model().to(self.device)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 33, in _build_model
model = nn.DataParallel(model, device_ids=self.args.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 159, in __init__
_check_balance(self.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 26, in _check_balance
dev_props = _get_devices_properties(device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in _get_devices_properties
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <listcomp>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 713, in _get_device_attr
return get_member(torch.cuda)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <lambda>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/cuda/__init__.py", line 452, in get_device_properties
raise AssertionError("Invalid device id")
AssertionError: Invalid device id
Traceback (most recent call last):
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/run_longExp.py", line 117, in <module>
exp = Exp(args) # set experiments
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 24, in __init__
super(Exp_Main, self).__init__(args)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_basic.py", line 10, in __init__
self.model = self._build_model().to(self.device)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 33, in _build_model
model = nn.DataParallel(model, device_ids=self.args.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 159, in __init__
_check_balance(self.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 26, in _check_balance
dev_props = _get_devices_properties(device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in _get_devices_properties
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <listcomp>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 713, in _get_device_attr
return get_member(torch.cuda)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <lambda>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/cuda/__init__.py", line 452, in get_device_properties
raise AssertionError("Invalid device id")
AssertionError: Invalid device id
Traceback (most recent call last):
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/run_longExp.py", line 117, in <module>
exp = Exp(args) # set experiments
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 24, in __init__
super(Exp_Main, self).__init__(args)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_basic.py", line 10, in __init__
self.model = self._build_model().to(self.device)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 33, in _build_model
model = nn.DataParallel(model, device_ids=self.args.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 159, in __init__
_check_balance(self.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 26, in _check_balance
dev_props = _get_devices_properties(device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in _get_devices_properties
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <listcomp>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 713, in _get_device_attr
return get_member(torch.cuda)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <lambda>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/cuda/__init__.py", line 452, in get_device_properties
raise AssertionError("Invalid device id")
AssertionError: Invalid device id
Traceback (most recent call last):
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/run_longExp.py", line 117, in <module>
exp = Exp(args) # set experiments
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 24, in __init__
super(Exp_Main, self).__init__(args)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_basic.py", line 10, in __init__
self.model = self._build_model().to(self.device)
File "/home/mateus/repos/forks/TimeMachine/TimeMachine_supervised/exp/exp_main.py", line 33, in _build_model
model = nn.DataParallel(model, device_ids=self.args.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 159, in __init__
_check_balance(self.device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 26, in _check_balance
dev_props = _get_devices_properties(device_ids)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in _get_devices_properties
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <listcomp>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 713, in _get_device_attr
return get_member(torch.cuda)
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/_utils.py", line 734, in <lambda>
return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids]
File "/home/mateus/miniconda3/envs/TimeMachine_supervised/lib/python3.10/site-packages/torch/cuda/__init__.py", line 452, in get_device_properties
raise AssertionError("Invalid device id")
AssertionError: Invalid device id
I appreciate any guidance or updates you can provide on this issue. Thank you!