Hello, I ran into an error while trying to reproduce your results; could you help me? Training on my own dataset fails with the error below:
(ID2) ubuntu@s414g1:~/cssegmentation$ bash scripts/dist_train.sh 4 /home/ubuntu/cssegmentation/csseg/configs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Filtering Images: 100%|█████████████████████████████████████████████| 274/274 [00:00<00:00, 391.28it/s]
Filtering Images: 100%|███████████████████████████████████████████████| 39/39 [00:00<00:00, 411.89it/s]
Selected optimization level O1: Insert automatic casts around Pytorch functions and Tensor methods.
Defaults for this optimization level are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled : True
opt_level : O1
cast_model_type : None
patch_torch_functions : True
keep_batchnorm_fp32 : None
master_weights : None
loss_scale : dynamic
Warning: multi_tensor_applier fused unscale kernel is unavailable, possibly because apex was installed without --cuda_ext --cpp_ext. Using Python fallback. Original ImportError was: ModuleNotFoundError("No module named 'amp_C'")
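For reference, the banner above and this warning both come from apex's amp.initialize; as far as I can tell the warning is non-fatal, since amp falls back to a pure-Python unscale path when apex was built without the --cpp_ext/--cuda_ext extensions. A toy sketch of the kind of call that prints this O1 banner (the model and optimizer here are stand-ins, not the repo's objects):

```python
import torch
from apex import amp  # requires NVIDIA apex to be installed

# Stand-in model/optimizer, only to make the snippet self-contained.
model = torch.nn.Linear(4, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# opt_level='O1' patches torch functions and tensor methods for automatic
# casting; loss_scale='dynamic' matches the defaults printed above.
model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale='dynamic')
```

The failure I am asking about is the RuntimeError below.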
Traceback (most recent call last):
File "/home/ubuntu/cssegmentation/csseg/train.py", line 62, in
trainer_client.start()
File "/home/ubuntu/cssegmentation/csseg/train.py", line 54, in start
runner_client = BuildRunner(mode='TRAIN', cmd_args=cmd_args, runner_cfg=runner_cfg_task)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/builder.py", line 29, in build
return super().build(module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/utils/modulebuilder.py", line 26, in build
module = self.REGISTERED_MODULES[module_type](**module_cfg)
File "/home/ubuntu/cssegmentation/csseg/modules/runners/mib.py", line 18, in init
super(MIBRunner, self).init(
File "/home/ubuntu/cssegmentation/csseg/modules/runners/base.py", line 120, in init
self.segmentor = BuildDistributedModel(model=self.segmentor, model_cfg=parallel_cfg['model_cfg'])
File "/home/ubuntu/cssegmentation/csseg/modules/parallel/model.py", line 12, in BuildDistributedModel
return nn.parallel.DistributedDataParallel(model, **model_cfg)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 448, in init
self._ddp_init_helper()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 503, in _ddp_init_helper
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 91, in replicate
param_copies = _broadcast_coalesced_reshape(params, devices, detach)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/replicate.py", line 67, in _broadcast_coalesced_reshape
return comm.broadcast_coalesced(tensors, devices)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 58, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: All tensors must be on devices[0]: 0
(Each of the other ranks prints the identical traceback, ending in the same RuntimeError.)
Killing subprocess 1686028
Killing subprocess 1686029
Killing subprocess 1686030
Killing subprocess 1686031
Traceback (most recent call last):
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 340, in
main()
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 326, in main
sigkill_handler(signal.SIGTERM, None) # not coming back
File "/home/ubuntu/miniconda3/envs/ID2/lib/python3.9/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/miniconda3/envs/ID2/bin/python', '-u', 'csseg/train.py', '--local_rank=3', '--nproc_per_node', '4', '--cfgfilepath', '/home/ubuntu/cssegmentation/csseg/configs/mib/mib_r101iabnd16_aspp_512x512_tile2-1_overlap.py']' returned non-zero exit status 1.
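For context, my understanding of the error: DDP raises `RuntimeError: All tensors must be on devices[0]` from its single-process multi-GPU replicate() path, which is taken when `device_ids` lists several GPUs (or defaults to all visible ones) and then requires every parameter to already live on `device_ids[0]`. Under torch.distributed.launch, each process should instead pin itself to its own GPU and pass exactly one device id. A minimal sketch of the per-rank setup I would expect (the Conv2d is a stand-in for the real segmentor, not the repo's API):

```python
import argparse

import torch
import torch.distributed as dist
import torch.nn as nn

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)  # injected by the launcher
args = parser.parse_args()

# Pin this process to its own GPU *before* building the model, so all
# parameters end up on cuda:{local_rank}.
torch.cuda.set_device(args.local_rank)
dist.init_process_group(backend='nccl')  # MASTER_ADDR etc. come from the launcher env

model = nn.Conv2d(3, 21, 1).cuda(args.local_rank)  # stand-in for the segmentor

# Exactly one device id per process: DDP then skips the multi-GPU
# replicate() path that raises "All tensors must be on devices[0]".
model = nn.parallel.DistributedDataParallel(
    model, device_ids=[args.local_rank], output_device=args.local_rank,
)
```

If BuildDistributedModel already does the equivalent, then the suspect would be whatever `parallel_cfg['model_cfg']` passes as `device_ids`, or a mismatch with CUDA_VISIBLE_DEVICES. Could you point me to the intended setting?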