Hey,
I wanted to add the attention layer from the 'ia' model to the 'it' model just before the pooling layer, i.e. between the last Conv1d layer and the pooling layer, and I am making the changes in forward_feature_3. But there is some issue.
CODE:
def forward_feature_3(self, x_audio, x_visual):
    x_audio = self.audio_model.forward_stage1(x_audio)
    x_visual = self.visual_model.forward_features(x_visual)
    x_visual = self.visual_model.forward_stage1(x_visual)

    proj_x_a = x_audio.permute(0, 2, 1)
    proj_x_v = x_visual.permute(0, 2, 1)

    h_av = self.av1(proj_x_v, proj_x_a)
    h_va = self.va1(proj_x_a, proj_x_v)

    h_av = h_av.permute(0, 2, 1)
    h_va = h_va.permute(0, 2, 1)

    x_audio = h_av + x_audio
    x_visual = h_va + x_visual

    x_audio = self.audio_model.forward_stage2(x_audio)
    x_visual = self.visual_model.forward_stage2(x_visual)

    proj_x_a = x_audio
    proj_x_v = x_visual

    h_av_new = self.av_new(proj_x_v, proj_x_a)
    h_va_new = self.va_new(proj_x_a, proj_x_v)

    if h_av_new.size(1) > 1:  # if more than 1 head, take average
        h_av_new = torch.mean(h_av_new, dim=1).unsqueeze(1)
    h_av_new = h_av_new.sum([-2])

    if h_va_new.size(1) > 1:  # if more than 1 head, take average
        h_va_new = torch.mean(h_va_new, dim=1).unsqueeze(1)
    h_va_new = h_va_new.sum([-2])

    x_audio = h_va_new * x_audio
    x_visual = h_av_new * x_visual

    audio_pooled = x_audio.mean([-1])  # mean across temporal dimension
    video_pooled = x_visual.mean([-1])

    x = torch.cat((audio_pooled, video_pooled), dim=-1)
    x1 = self.classifier_1(x)
    return x1
ERROR:
    train_epoch(i, train_loader, model, criterion, optimizer, opt,
  File "C:\Users\HP pav\Desktop\Capstone\multimodal-emotion-recognition\train.py", line 119, in train_epoch
    train_epoch_multimodal(epoch, data_loader, model, criterion, optimizer, opt, epoch_logger, batch_logger)
  File "C:\Users\HP pav\Desktop\Capstone\multimodal-emotion-recognition\train.py", line 64, in train_epoch_multimodal
    outputs = model(audio_inputs, visual_inputs)
  File "C:\Python310\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Python310\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\HP pav\Desktop\Capstone\multimodal-emotion-recognition\models\multimodalcnn2.py", line 208, in forward
    return self.forward_feature_3(x_audio, x_visual)
  File "C:\Users\HP pav\Desktop\Capstone\multimodal-emotion-recognition\models\multimodalcnn2.py", line 235, in forward_feature_3
    h_av_new = self.av_new(proj_x_v, proj_x_a)
  File "C:\Python310\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Python310\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Python310\lib\site-packages\torch\nn\modules\transformer_timm.py", line 88, in forward
    q = self.q(x_q).reshape(B, Nq, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4)
  File "C:\Python310\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Python310\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Python310\lib\site-packages\torch\nn\modules\linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x144 and 128x128)
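
My best guess from the shapes: self.q in the new attention block is a Linear layer with a 128x128 weight, so it expects 128 input features, but it receives 144. After forward_stage2 the features are still laid out as (batch, channels, time), and unlike the first attention block I pass them to av_new/va_new without permute(0, 2, 1), so one of the unpermuted tensors, shaped (batch, 128, 144), reaches the linear layer with the 144-step temporal axis where the 128 channels should be (mat1 is 4096x144 because the input is flattened to batch * 128 rows). A minimal sketch of the fix I have in mind, assuming av_new/va_new expect (batch, sequence, channels) inputs exactly like av1/va1 do:

    # permute to (batch, time, channels) before the new attention,
    # mirroring what the first attention block does
    proj_x_a = x_audio.permute(0, 2, 1)
    proj_x_v = x_visual.permute(0, 2, 1)

    h_av_new = self.av_new(proj_x_v, proj_x_a)
    h_va_new = self.va_new(proj_x_a, proj_x_v)

Whether the later h_va_new * x_audio and h_av_new * x_visual weighting still broadcasts after that depends on what av_new/va_new actually return (the attended sequence or the raw attention map), so the head-averaging and sum([-2]) steps may need the same shape treatment as in the ia model.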