The following datasets can be used to train a language-independent LPCNet model.
A good choice is to include all the data from these datasets, except for
hi_fi_tts, for which only a small subset is recommended (since it's very large
but has few speakers). Note that this data typically needs to be resampled
before it can be used.

https://www.openslr.org/resources/30/si_lk.tar.gz
https://www.openslr.org/resources/32/af_za.tar.gz
https://www.openslr.org/resources/32/st_za.tar.gz
https://www.openslr.org/resources/32/tn_za.tar.gz
https://www.openslr.org/resources/32/xh_za.tar.gz
https://www.openslr.org/resources/37/bn_bd.zip
https://www.openslr.org/resources/37/bn_in.zip
https://www.openslr.org/resources/41/jv_id_female.zip
https://www.openslr.org/resources/41/jv_id_male.zip
https://www.openslr.org/resources/42/km_kh_male.zip
https://www.openslr.org/resources/43/ne_np_female.zip
https://www.openslr.org/resources/44/su_id_female.zip
https://www.openslr.org/resources/44/su_id_male.zip
https://www.openslr.org/resources/61/es_ar_female.zip
https://www.openslr.org/resources/61/es_ar_male.zip
https://www.openslr.org/resources/63/ml_in_female.zip
https://www.openslr.org/resources/63/ml_in_male.zip
https://www.openslr.org/resources/64/mr_in_female.zip
https://www.openslr.org/resources/65/ta_in_female.zip
https://www.openslr.org/resources/65/ta_in_male.zip
https://www.openslr.org/resources/66/te_in_female.zip
https://www.openslr.org/resources/66/te_in_male.zip
https://www.openslr.org/resources/69/ca_es_female.zip
https://www.openslr.org/resources/69/ca_es_male.zip
https://www.openslr.org/resources/70/en_ng_female.zip
https://www.openslr.org/resources/70/en_ng_male.zip
https://www.openslr.org/resources/71/es_cl_female.zip
https://www.openslr.org/resources/71/es_cl_male.zip
https://www.openslr.org/resources/72/es_co_female.zip
https://www.openslr.org/resources/72/es_co_male.zip
https://www.openslr.org/resources/73/es_pe_female.zip
https://www.openslr.org/resources/73/es_pe_male.zip
https://www.openslr.org/resources/74/es_pr_female.zip
https://www.openslr.org/resources/75/es_ve_female.zip
https://www.openslr.org/resources/75/es_ve_male.zip
https://www.openslr.org/resources/76/eu_es_female.zip
https://www.openslr.org/resources/76/eu_es_male.zip
https://www.openslr.org/resources/77/gl_es_female.zip
https://www.openslr.org/resources/77/gl_es_male.zip
https://www.openslr.org/resources/78/gu_in_female.zip
https://www.openslr.org/resources/78/gu_in_male.zip
https://www.openslr.org/resources/79/kn_in_female.zip
https://www.openslr.org/resources/79/kn_in_male.zip
https://www.openslr.org/resources/80/my_mm_female.zip
https://www.openslr.org/resources/83/irish_english_male.zip
https://www.openslr.org/resources/83/midlands_english_female.zip
https://www.openslr.org/resources/83/midlands_english_male.zip
https://www.openslr.org/resources/83/northern_english_female.zip
https://www.openslr.org/resources/83/northern_english_male.zip
https://www.openslr.org/resources/83/scottish_english_female.zip
https://www.openslr.org/resources/83/scottish_english_male.zip
https://www.openslr.org/resources/83/southern_english_female.zip
https://www.openslr.org/resources/83/southern_english_male.zip
https://www.openslr.org/resources/83/welsh_english_female.zip
https://www.openslr.org/resources/83/welsh_english_male.zip
https://www.openslr.org/resources/86/yo_ng_female.zip
https://www.openslr.org/resources/86/yo_ng_male.zip
https://www.openslr.org/resources/109/hi_fi_tts_v0.tar.gz

The corresponding citations for all these datasets are:

Conventions used in the entries below: author names are written in the
unambiguous "Last, First" form; titles are in title case with braces only
around the words (proper nouns, acronyms) whose capitalisation must survive
a bibliography style that converts titles to sentence case; accented letters
are written as brace-wrapped LaTeX escapes so classic 8-bit BibTeX sorts and
prints them correctly.

@inproceedings{demirsahin-etal-2020-open,
  author    = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},
  title     = {Open-source Multi-speaker Corpora of the {English} Accents in the {British Isles}},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
  month     = may,
  year      = {2020},
  pages     = {6532--6541},
  address   = {Marseille, France},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.804},
  isbn      = {979-10-95546-34-4},
}

@inproceedings{kjartansson-etal-2020-open,
  author    = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},
  title     = {Open-Source High Quality Speech Datasets for {Basque}, {Catalan} and {Galician}},
  booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},
  month     = may,
  year      = {2020},
  pages     = {21--27},
  address   = {Marseille, France},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://www.aclweb.org/anthology/2020.sltu-1.3},
  isbn      = {979-10-95546-35-1},
}

@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,
  author    = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},
  title     = {Crowdsourcing {Latin American Spanish} for Low-Resource Text-to-Speech},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
  month     = may,
  year      = {2020},
  pages     = {6504--6513},
  address   = {Marseille, France},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.801},
  isbn      = {979-10-95546-34-4},
}

@inproceedings{he-etal-2020-open,
  author    = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin, Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},
  title     = {Open-source Multi-speaker Speech Corpora for Building {Gujarati}, {Kannada}, {Malayalam}, {Marathi}, {Tamil} and {Telugu} Speech Synthesis Systems},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
  month     = may,
  year      = {2020},
  pages     = {6494--6503},
  address   = {Marseille, France},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.800},
  isbn      = {979-10-95546-34-4},
}

@inproceedings{kjartansson-etal-tts-sltu2018,
  author    = {Sodimana, Keshan and Pipatsrisawat, Knot and Ha, Linne and Jansche, Martin and Kjartansson, Oddur and De Silva, Pasindu and Sarin, Supheakmungkol},
  title     = {A Step-by-Step Process for Building {TTS} Voices Using Open Source Data and Framework for {Bangla}, {Javanese}, {Khmer}, {Nepali}, {Sinhala}, and {Sundanese}},
  booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},
  month     = aug,
  year      = {2018},
  pages     = {66--70},
  address   = {Gurugram, India},
  doi       = {10.21437/SLTU.2018-14},
  url       = {https://doi.org/10.21437/SLTU.2018-14},
}

@inproceedings{oo-etal-2020-burmese,
  author    = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin, Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},
  title     = {{Burmese} Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application to Text-to-Speech},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
  month     = may,
  year      = {2020},
  pages     = {6328--6339},
  address   = {Marseille, France},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://www.aclweb.org/anthology/2020.lrec-1.777},
  isbn      = {979-10-95546-34-4},
}

@inproceedings{van-niekerk-etal-2017,
  author    = {van Niekerk, Daniel and van Heerden, Charl and Davel, Marelie and Kleynhans, Neil and Kjartansson, Oddur and Jansche, Martin and Ha, Linne},
  title     = {Rapid Development of {TTS} Corpora for Four {South African} Languages},
  booktitle = {Proc. Interspeech 2017},
  month     = aug,
  year      = {2017},
  pages     = {2178--2182},
  address   = {Stockholm, Sweden},
  doi       = {10.21437/Interspeech.2017-1139},
  url       = {https://doi.org/10.21437/Interspeech.2017-1139},
}

@inproceedings{gutkin-et-al-yoruba2020,
  author    = {Gutkin, Alexander and Demir{\c{s}}ahin, I{\c{s}}{\i}n and Kjartansson, Oddur and Rivera, Clara and T{\'u}b{\d{\`o}}s{\'u}n, K{\d{\'o}}l{\'a}},
  title     = {Developing an Open-Source Corpus of {Yoruba} Speech},
  booktitle = {Proceedings of Interspeech 2020},
  month     = oct,
  year      = {2020},
  pages     = {404--408},
  address   = {Shanghai, China},
  publisher = {International Speech and Communication Association (ISCA)},
  doi       = {10.21437/Interspeech.2020-1096},
  url       = {https://doi.org/10.21437/Interspeech.2020-1096},
}

@misc{bakhturina2021hi,
  author        = {Bakhturina, Evelina and Lavrukhin, Vitaly and Ginsburg, Boris and Zhang, Yang},
  title         = {{Hi-Fi} Multi-Speaker {English} {TTS} Dataset},
  year          = {2021},
  eprint        = {2104.01497},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2104.01497},
}