Hello,
I am running the following command on an Ubuntu 22.04 LTS system, as suggested in the documentation, in order to try your model: python main.py --finetune '' --eval --resume vim_t_midclstok_76p1acc.pth --model vim_tiny_patch16_224_bimambav2_final_pool_mean_abs_pos_embed_with_midclstok_div2 --data-path imagenet-mini --data-set 'IMNET'
I installed mamba-ssm using pip wheels because your recommended approach does not work, as noted in another issue.
However, it seems that the pretrained model is not loaded correctly as shown in the error message below. What could I do to fix this?
Traceback (most recent call last):
File "/rds/general/user/kp4718/home/code/Vim/vim/main.py", line 545, in
main(args)
File "/rds/general/user/kp4718/home/code/Vim/vim/main.py", line 448, in main
model_without_ddp.load_state_dict(checkpoint['model'])
File "/rds/general/user/kp4718/home/anaconda3/envs/mambaenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2152, in load_state_dict
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for VisionMamba:
Unexpected key(s) in state_dict: "layers.0.mixer.A_b_log", "layers.0.mixer.D_b", "layers.0.mixer.conv1d_b.weight", "layers.0.mixer.conv1d_b.bias", "layers.0.mixer.x_proj_b.weight", "layers.0.mixer.dt_proj_b.weight", "layers.0.mixer.dt_proj_b.bias", "layers.1.mixer.A_b_log", "layers.1.mixer.D_b", "layers.1.mixer.conv1d_b.weight", "layers.1.mixer.conv1d_b.bias", "layers.1.mixer.x_proj_b.weight", "layers.1.mixer.dt_proj_b.weight", "layers.1.mixer.dt_proj_b.bias", "layers.2.mixer.A_b_log", "layers.2.mixer.D_b", "layers.2.mixer.conv1d_b.weight", "layers.2.mixer.conv1d_b.bias", "layers.2.mixer.x_proj_b.weight", "layers.2.mixer.dt_proj_b.weight", "layers.2.mixer.dt_proj_b.bias", "layers.3.mixer.A_b_log", "layers.3.mixer.D_b", "layers.3.mixer.conv1d_b.weight", "layers.3.mixer.conv1d_b.bias", "layers.3.mixer.x_proj_b.weight", "layers.3.mixer.dt_proj_b.weight", "layers.3.mixer.dt_proj_b.bias", "layers.4.mixer.A_b_log", "layers.4.mixer.D_b", "layers.4.mixer.conv1d_b.weight", "layers.4.mixer.conv1d_b.bias", "layers.4.mixer.x_proj_b.weight", "layers.4.mixer.dt_proj_b.weight", "layers.4.mixer.dt_proj_b.bias", "layers.5.mixer.A_b_log", "layers.5.mixer.D_b", "layers.5.mixer.conv1d_b.weight", "layers.5.mixer.conv1d_b.bias", "layers.5.mixer.x_proj_b.weight", "layers.5.mixer.dt_proj_b.weight", "layers.5.mixer.dt_proj_b.bias", "layers.6.mixer.A_b_log", "layers.6.mixer.D_b", "layers.6.mixer.conv1d_b.weight", "layers.6.mixer.conv1d_b.bias", "layers.6.mixer.x_proj_b.weight", "layers.6.mixer.dt_proj_b.weight", "layers.6.mixer.dt_proj_b.bias", "layers.7.mixer.A_b_log", "layers.7.mixer.D_b", "layers.7.mixer.conv1d_b.weight", "layers.7.mixer.conv1d_b.bias", "layers.7.mixer.x_proj_b.weight", "layers.7.mixer.dt_proj_b.weight", "layers.7.mixer.dt_proj_b.bias", "layers.8.mixer.A_b_log", "layers.8.mixer.D_b", "layers.8.mixer.conv1d_b.weight", "layers.8.mixer.conv1d_b.bias", "layers.8.mixer.x_proj_b.weight", "layers.8.mixer.dt_proj_b.weight", "layers.8.mixer.dt_proj_b.bias", 
"layers.9.mixer.A_b_log", "layers.9.mixer.D_b", "layers.9.mixer.conv1d_b.weight", "layers.9.mixer.conv1d_b.bias", "layers.9.mixer.x_proj_b.weight", "layers.9.mixer.dt_proj_b.weight", "layers.9.mixer.dt_proj_b.bias", "layers.10.mixer.A_b_log", "layers.10.mixer.D_b", "layers.10.mixer.conv1d_b.weight", "layers.10.mixer.conv1d_b.bias", "layers.10.mixer.x_proj_b.weight", "layers.10.mixer.dt_proj_b.weight", "layers.10.mixer.dt_proj_b.bias", "layers.11.mixer.A_b_log", "layers.11.mixer.D_b", "layers.11.mixer.conv1d_b.weight", "layers.11.mixer.conv1d_b.bias", "layers.11.mixer.x_proj_b.weight", "layers.11.mixer.dt_proj_b.weight", "layers.11.mixer.dt_proj_b.bias", "layers.12.mixer.A_b_log", "layers.12.mixer.D_b", "layers.12.mixer.conv1d_b.weight", "layers.12.mixer.conv1d_b.bias", "layers.12.mixer.x_proj_b.weight", "layers.12.mixer.dt_proj_b.weight", "layers.12.mixer.dt_proj_b.bias", "layers.13.mixer.A_b_log", "layers.13.mixer.D_b", "layers.13.mixer.conv1d_b.weight", "layers.13.mixer.conv1d_b.bias", "layers.13.mixer.x_proj_b.weight", "layers.13.mixer.dt_proj_b.weight", "layers.13.mixer.dt_proj_b.bias", "layers.14.mixer.A_b_log", "layers.14.mixer.D_b", "layers.14.mixer.conv1d_b.weight", "layers.14.mixer.conv1d_b.bias", "layers.14.mixer.x_proj_b.weight", "layers.14.mixer.dt_proj_b.weight", "layers.14.mixer.dt_proj_b.bias", "layers.15.mixer.A_b_log", "layers.15.mixer.D_b", "layers.15.mixer.conv1d_b.weight", "layers.15.mixer.conv1d_b.bias", "layers.15.mixer.x_proj_b.weight", "layers.15.mixer.dt_proj_b.weight", "layers.15.mixer.dt_proj_b.bias", "layers.16.mixer.A_b_log", "layers.16.mixer.D_b", "layers.16.mixer.conv1d_b.weight", "layers.16.mixer.conv1d_b.bias", "layers.16.mixer.x_proj_b.weight", "layers.16.mixer.dt_proj_b.weight", "layers.16.mixer.dt_proj_b.bias", "layers.17.mixer.A_b_log", "layers.17.mixer.D_b", "layers.17.mixer.conv1d_b.weight", "layers.17.mixer.conv1d_b.bias", "layers.17.mixer.x_proj_b.weight", "layers.17.mixer.dt_proj_b.weight", "layers.17.mixer.dt_proj_b.bias", 
"layers.18.mixer.A_b_log", "layers.18.mixer.D_b", "layers.18.mixer.conv1d_b.weight", "layers.18.mixer.conv1d_b.bias", "layers.18.mixer.x_proj_b.weight", "layers.18.mixer.dt_proj_b.weight", "layers.18.mixer.dt_proj_b.bias", "layers.19.mixer.A_b_log", "layers.19.mixer.D_b", "layers.19.mixer.conv1d_b.weight", "layers.19.mixer.conv1d_b.bias", "layers.19.mixer.x_proj_b.weight", "layers.19.mixer.dt_proj_b.weight", "layers.19.mixer.dt_proj_b.bias", "layers.20.mixer.A_b_log", "layers.20.mixer.D_b", "layers.20.mixer.conv1d_b.weight", "layers.20.mixer.conv1d_b.bias", "layers.20.mixer.x_proj_b.weight", "layers.20.mixer.dt_proj_b.weight", "layers.20.mixer.dt_proj_b.bias", "layers.21.mixer.A_b_log", "layers.21.mixer.D_b", "layers.21.mixer.conv1d_b.weight", "layers.21.mixer.conv1d_b.bias", "layers.21.mixer.x_proj_b.weight", "layers.21.mixer.dt_proj_b.weight", "layers.21.mixer.dt_proj_b.bias", "layers.22.mixer.A_b_log", "layers.22.mixer.D_b", "layers.22.mixer.conv1d_b.weight", "layers.22.mixer.conv1d_b.bias", "layers.22.mixer.x_proj_b.weight", "layers.22.mixer.dt_proj_b.weight", "layers.22.mixer.dt_proj_b.bias", "layers.23.mixer.A_b_log", "layers.23.mixer.D_b", "layers.23.mixer.conv1d_b.weight", "layers.23.mixer.conv1d_b.bias", "layers.23.mixer.x_proj_b.weight", "layers.23.mixer.dt_proj_b.weight", "layers.23.mixer.dt_proj_b.bias".