Implementation of Spear-TTS, a multi-speaker text-to-speech attention network, in PyTorch.
The text-to-semantic module built here will be used for conditioning in SoundStorm.
@comment{arXiv preprint entry. Accented given names are written as LaTeX escapes so the entry also sorts and labels correctly under 8-bit BibTeX.}
@misc{kharitonov2023speak,
  title         = {Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision},
  author        = {Kharitonov, Eugene and Vincent, Damien and Borsos, Zal{\'a}n and Marinier, Rapha{\"e}l and Girgin, Sertan and Pietquin, Olivier and Sharifi, Matt and Tagliasacchi, Marco and Zeghidour, Neil},
  year          = {2023},
  eprint        = {2302.03540},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
}