doc: update audio-related tasks page. (#721)

Vaibhavs10 · pcuenca · web-flow · commit e38b70562a27 · 2024-05-30T14:17:57.000+02:00
---------

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/about.md b/packages/tasks/src/tasks/automatic-speech-recognition/about.md
@@ -18,7 +18,7 @@ The use of Multilingual ASR has become popular, the idea of maintaining just a s
 
 ## Inference
 
-The Hub contains over [~9,000 ASR models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) that you can use right away by trying out the widgets directly in the browser or calling the models as a service using Inference Endpoints. Here is a simple code snippet to do exactly this:
+The Hub contains over [17,000 ASR models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) that you can test right away in your browser using the model page widgets. You can also use any model as a service using the Inference API. Here is a simple code snippet to do exactly this:
 
 ```python
 import json
@@ -39,12 +39,12 @@ data = query("sample1.flac")
 You can also use libraries such as [transformers](https://huggingface.co/models?library=transformers&pipeline_tag=automatic-speech-recognition&sort=downloads), [speechbrain](https://huggingface.co/models?library=speechbrain&pipeline_tag=automatic-speech-recognition&sort=downloads), [NeMo](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=nemo&sort=downloads) and [espnet](https://huggingface.co/models?library=espnet&pipeline_tag=automatic-speech-recognition&sort=downloads) if you want one-click managed Inference without any hassle.
 
 ```python
+# pip install --upgrade transformers
+
 from transformers import pipeline
 
-with open("sample.flac", "rb") as f:
-  data = f.read()
+pipe = pipeline("automatic-speech-recognition", "openai/whisper-large-v3")
 
-pipe = pipeline("automatic-speech-recognition", "openai/whisper-large-v2")
 pipe("sample.flac")
 # {'text': "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES IN DRAUGHTY SCHOOL ROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS"}
 ```
@@ -57,7 +57,7 @@ import { HfInference } from "@huggingface/inference";
 const inference = new HfInference(HF_TOKEN);
 await inference.automaticSpeechRecognition({
 	data: await (await fetch("sample.flac")).blob(),
-	model: "openai/whisper-large-v2",
+	model: "openai/whisper-large-v3",
 });
 ```
 
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/data.ts b/packages/tasks/src/tasks/automatic-speech-recognition/data.ts
@@ -3,16 +3,16 @@ import type { TaskDataCustom } from "..";
 const taskData: TaskDataCustom = {
 	datasets: [
 		{
-			description: "18,000 hours of multilingual audio-text dataset in 108 languages.",
-			id: "mozilla-foundation/common_voice_13_0",
+			description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
+			id: "mozilla-foundation/common_voice_17_0",
 		},
 		{
 			description: "An English dataset with 1,000 hours of data.",
 			id: "librispeech_asr",
 		},
 		{
-			description: "High quality, multi-speaker audio data and their transcriptions in various languages.",
-			id: "openslr",
+			description: "A multi-lingual audio dataset with 370K hours of audio.",
+			id: "espnet/yodas",
 		},
 	],
 	demo: {
@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
 			id: "openai/whisper-large-v3",
 		},
 		{
-			description: "A good generic ASR model by MetaAI.",
-			id: "facebook/wav2vec2-base-960h",
+			description: "A good generic speech model by MetaAI for fine-tuning.",
+			id: "facebook/w2v-bert-2.0",
 		},
 		{
 			description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
-			id: "facebook/s2t-small-mustc-en-fr-st",
+			id: "facebook/seamless-m4t-v2-large",
 		},
 	],
 	spaces: [
diff --git a/packages/tasks/src/tasks/text-to-speech/about.md b/packages/tasks/src/tasks/text-to-speech/about.md
@@ -58,8 +58,6 @@ await inference.textToSpeech({
 
 - [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/chapter6/introduction)
 - [ML for Audio Study Group - Text to Speech Deep Dive](https://www.youtube.com/watch?v=aLBedWj-5CQ)
-- [An introduction to SpeechT5, a multi-purpose speech recognition and synthesis model](https://huggingface.co/blog/speecht5).
-- [A guide on Fine-tuning Whisper For Multilingual ASR with 🤗Transformers](https://huggingface.co/blog/fine-tune-whisper)
 - [Speech Synthesis, Recognition, and More With SpeechT5](https://huggingface.co/blog/speecht5)
 - [Optimizing a Text-To-Speech model using 🤗 Transformers](https://huggingface.co/blog/optimizing-bark)
--
+- [Train your own TTS models with Parler-TTS](https://github.com/huggingface/parler-tts)
diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts
@@ -4,8 +4,8 @@ const taskData: TaskDataCustom = {
 	canonicalId: "text-to-audio",
 	datasets: [
 		{
-			description: "Thousands of short audio clips of a single speaker.",
-			id: "lj_speech",
+			description: "10K hours of multi-speaker English dataset.",
+			id: "parler-tts/mls_eng_10k",
 		},
 		{
 			description: "Multi-speaker English dataset.",
@@ -43,8 +43,8 @@ const taskData: TaskDataCustom = {
 			id: "facebook/mms-tts",
 		},
 		{
-			description: "An end-to-end speech synthesis model.",
-			id: "microsoft/speecht5_tts",
+			description: "A prompt based, powerful TTS model.",
+			id: "parler-tts/parler_tts_mini_v0.1",
 		},
 	],
 	spaces: [
@@ -57,8 +57,8 @@ const taskData: TaskDataCustom = {
 			id: "coqui/xtts",
 		},
 		{
-			description: "An application that synthesizes speech for various speaker types.",
-			id: "Matthijs/speecht5-tts-demo",
+			description: "An application that synthesizes speech for diverse speaker prompts.",
+			id: "parler-tts/parler_tts_mini",
 		},
 	],
 	summary: