Right now, when trying to run training of the DL models, DataLoader multiprocessing throws an error on my machine (MacBook Pro with M1 Pro, where Python's default multiprocessing start method is `spawn`, not `fork`).
python -m icu_benchmarks.run train \
-c configs/hirid/Classification/LSTM.gin \
-l logs/random_search/24h_multiclass/LSTM/run \
-t Phenotyping_APACHEGroup \
--num-class 15 \
--maxlen 288 \
-rs True \
-lr 3e-4 1e-4 3e-5 1e-5 \
-sd 1111 2222 3333 \
--hidden 32 64 128 256 \
--do 0.0 0.1 0.2 0.3 0.4 \
--depth 1 2 3
OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2022-08-30 11:16:08,290 - INFO: Model will be trained using CPU Hardware. This should be considerably slower
Traceback (most recent call last):
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/Users/hendrikschmidt/projects/thesis/YAIB/icu_benchmarks/run.py", line 426, in <module>
main()
File "/Users/hendrikschmidt/projects/thesis/YAIB/icu_benchmarks/run.py", line 406, in main
train_with_gin(model_dir=log_dir_seed,
File "/Users/hendrikschmidt/projects/thesis/YAIB/icu_benchmarks/models/train.py", line 46, in train_with_gin
train_common(model_dir, overwrite, load_weights)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/gin/config.py", line 1531, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/gin/config.py", line 1508, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/Users/hendrikschmidt/projects/thesis/YAIB/icu_benchmarks/models/train.py", line 88, in train_common
model.train(dataset, val_dataset, weight)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/gin/config.py", line 1531, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/gin/config.py", line 1508, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/Users/hendrikschmidt/projects/thesis/YAIB/icu_benchmarks/models/wrappers.py", line 179, in train
train_loss, train_metric_results = self._do_training(train_loader, weight, metrics)
File "/Users/hendrikschmidt/projects/thesis/YAIB/icu_benchmarks/models/wrappers.py", line 134, in _do_training
for t, elem in tqdm(enumerate(train_loader)):
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 355, in __iter__
return self._get_iterator()
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 301, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 914, in __init__
w.start()
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/context.py", line 284, in _Popen
return Popen(process_obj)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/popen_spawn_posix.py", line 32, in __init__
super().__init__(process_obj)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/popen_spawn_posix.py", line 47, in _launch
reduction.dump(process_obj, fp)
File "/Users/hendrikschmidt/opt/anaconda3/envs/icu-benchmark/lib/python3.8/multiprocessing/reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
AttributeError: Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'
In call to configurable 'train' (<function DLWrapper.train at 0x7fcc5bec0790>)
In call to configurable 'train_common' (<function train_common at 0x7fcc3b5000d0>)
Closing remaining open files:/Users/hendrikschmidt/projects/thesis/data/hirid_preprocessed/ml_stage/ml_stage_12h.h5...done/Users/hendrikschmidt/projects/thesis/data/hirid_preprocessed/ml_stage/ml_stage_12h.h5...done
The issue is likely the way the H5 file is opened in the dataset/dataloader: with the `spawn` start method, worker processes must pickle the dataset, and the open h5py file handle cannot be pickled (hence the `Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'` error). A possible solution — opening the H5 file lazily inside each worker instead of in `__init__` — is described here: pytorch/pytorch#11929 (comment).
Ideally, training would work on all platforms, not only Linux, to facilitate development speed.