Hello, thanks for sharing this amazing repo! Could we have more information on how to process our own data for training and inference, please?
The inference demo works perfectly, but any attempt to use my own MusicXML file throws an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-d11c35a85dab> in <module>
----> 1 data = get_data_from_musicxml('data/haendel_hallelujah3.musicxml', 110, convert_stress=True)
2 panning = {'Soprano': [-60, -30], 'Alto': [-40, -10], 'Tenor': [30, 60], 'Bass': [10, 40]}
C:\mellotron\mellotron_utils.py in get_data_from_musicxml(filepath, bpm, phoneme_durations, convert_stress)
460 continue
461
--> 462 events = track2events(v)
463 events = adjust_words(events)
464 events_arpabet = [events2eventsarpabet(e) for e in events]
C:\mellotron\mellotron_utils.py in track2events(track)
285 events = []
286 for e in track:
--> 287 events.extend(adjust_event(e))
288 group_ids = [i for i in range(len(events))
289 if events[i][0] in [' '] or events[i][0].isupper()]
C:\mellotron\mellotron_utils.py in adjust_event(event, hop_length, sampling_rate)
230
231 def adjust_event(event, hop_length=256, sampling_rate=22050):
--> 232 tokens, freq, start_time, end_time = event
233
234 if tokens == ' ':
ValueError: not enough values to unpack (expected 4, got 2)
I can confirm that even changing a single letter in the lyrics of "haendel_hallelujah.musicxml" (e.g. "jah" to "yah") will trigger an error, and changing it back to "jah" makes it work again. So I doubt the problem is my text editor or a malformed MusicXML file (there are only tiny differences in how the text is organized depending on which software exported it). The error I get is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-19-f648a3b7ff04> in <module>
18 with torch.no_grad():
19 mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = tacotron.inference_noattention(
---> 20 (text_encoded, mel, speaker_id, pitch_contour*frequency_scaling, rhythm))
21
22 audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
C:\mellotron\model.py in inference_noattention(self, inputs)
665
666 mel_outputs, gate_outputs, alignments = self.decoder.inference_noattention(
--> 667 encoder_outputs, f0s, attention_map)
668
669 mel_outputs_postnet = self.postnet(mel_outputs)
C:\mellotron\model.py in inference_noattention(self, memory, f0s, attention_map)
523 attention = attention_map[i]
524 decoder_input = torch.cat((self.prenet(decoder_input), f0), dim=1)
--> 525 mel_output, gate_output, alignment = self.decode(decoder_input, attention)
526
527 mel_outputs += [mel_output.squeeze(1)]
C:\mellotron\model.py in decode(self, decoder_input, attention_weights)
382 self.attention_context, self.attention_weights = self.attention_layer(
383 self.attention_hidden, self.memory, self.processed_memory,
--> 384 attention_weights_cat, self.mask, attention_weights)
385
386 self.attention_weights_cum += self.attention_weights
C:\ProgramData\Anaconda3\envs\ptlast37\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
539 result = self._slow_forward(*input, **kwargs)
540 else:
--> 541 result = self.forward(*input, **kwargs)
542 for hook in self._forward_hooks.values():
543 hook_result = hook(self, input, result)
C:\mellotron\model.py in forward(self, attention_hidden_state, memory, processed_memory, attention_weights_cat, mask, attention_weights)
84
85 attention_weights = F.softmax(alignment, dim=1)
---> 86 attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
87 attention_context = attention_context.squeeze(1)
88
RuntimeError: invalid argument 6: wrong matrix size at C:/w/1/s/tmp_conda_3.7_104508/conda/conda-bld/pytorch_1572950778684/work/aten/src\THC/generic/THCTensorMathBlas.cu:534
I also tried training with my own audio data. The files are in WAV format (22050 Hz, 16-bit, mono, 1 to 4 seconds long). I pointed "ljs_audiopaths_text_sid_train_filelist.txt" and "ljs_audiopaths_text_sid_val_filelist.txt" at my data, with each line formatted like this: data/speaker/audiofile1.wav|hello world|0
Used this command: python train.py --output_directory=outdir --log_directory=logdir -c models/mellotron_libritts.pt --warm_start
But it throws this error:
Traceback (most recent call last):
File "train.py", line 297, in <module>
args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
File "train.py", line 187, in train
train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams)
File "train.py", line 44, in prepare_dataloaders
trainset = TextMelLoader(hparams.training_files, hparams)
File "C:\mellotron\data_utils.py", line 45, in __init__
self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
File "C:\mellotron\data_utils.py", line 52, in create_speaker_lookup_table
d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
File "C:\mellotron\data_utils.py", line 52, in <dictcomp>
d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
ValueError: invalid literal for int() with base 10: ''
Any information on how to solve this would be much appreciated, thanks!