Tasks: update new models and Spaces (#980)

merveenoyan · pcuenca · web-flow · commit 2e7ec43882e2 · 2024-10-22T19:07:24.000+02:00
Updated the recommended models and Spaces listed in the task data for several tasks.

---------

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
diff --git a/packages/tasks/src/tasks/depth-estimation/data.ts b/packages/tasks/src/tasks/depth-estimation/data.ts
@@ -33,11 +33,15 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A strong monocular depth estimation model.",
-			id: "Bingxin/Marigold",
+			id: "jingheya/lotus-depth-g-v1-0",
 		},
 		{
-			description: "A metric depth estimation model trained on NYU dataset.",
-			id: "Intel/zoedepth-nyu",
+			description: "A depth estimation model that predicts depth in videos.",
+			id: "tencent/DepthCrafter",
+		},
+		{
+			description: "A robust depth estimation model.",
+			id: "apple/DepthPro",
 		},
 	],
 	spaces: [
@@ -46,12 +50,16 @@ const taskData: TaskDataCustom = {
 			id: "radames/dpt-depth-estimation-3d-voxels",
 		},
 		{
-			description: "An application on cutting-edge depth estimation.",
-			id: "depth-anything/Depth-Anything-V2",
+			description: "An application for bleeding-edge depth estimation.",
+			id: "akhaliq/depth-pro",
+		},
+		{
+			description: "An application on cutting-edge depth estimation in videos.",
+			id: "tencent/DepthCrafter",
 		},
 		{
-			description: "An application to try state-of-the-art depth estimation.",
-			id: "merve/compare_depth_models",
+			description: "A human-centric depth estimation application.",
+			id: "facebook/sapiens-depth",
 		},
 	],
 	summary: "Depth estimation is the task of predicting depth of the objects present in an image.",
diff --git a/packages/tasks/src/tasks/image-segmentation/data.ts b/packages/tasks/src/tasks/image-segmentation/data.ts
@@ -57,11 +57,11 @@ const taskData: TaskDataCustom = {
 			id: "ZhengPeng7/BiRefNet",
 		},
 		{
-			description: "Semantic segmentation model trained on ADE20k dataset.",
-			id: "nvidia/segformer-b0-finetuned-ade-512-512",
+			description: "Powerful human-centric image segmentation model.",
+			id: "facebook/sapiens-seg-1b",
 		},
 		{
-			description: "Panoptic segmentation model trained COCO (common objects) dataset.",
+			description: "Panoptic segmentation model trained on the COCO (common objects) dataset.",
 			id: "facebook/mask2former-swin-large-coco-panoptic",
 		},
 	],
@@ -75,8 +75,8 @@ const taskData: TaskDataCustom = {
 			id: "jbrinkma/segment-anything",
 		},
 		{
-			description: "A semantic segmentation application that predicts human silhouettes.",
-			id: "keras-io/Human-Part-Segmentation",
+			description: "A human-centric segmentation model.",
+			id: "facebook/sapiens-pose",
 		},
 		{
 			description: "An instance segmentation application to predict neuronal cell types from microscopy images.",
diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts
@@ -47,20 +47,24 @@ const taskData: TaskDataCustom = {
 			id: "meta-llama/Llama-3.2-11B-Vision-Instruct",
 		},
 		{
-			description: "Cutting-edge conversational vision language model that can take multiple image inputs.",
-			id: "HuggingFaceM4/idefics2-8b-chatty",
+			description: "Cutting-edge vision language models.",
+			id: "allenai/Molmo-7B-D-0924",
 		},
 		{
 			description: "Small yet powerful model.",
 			id: "vikhyatk/moondream2",
 		},
 		{
-			description: "Strong image-text-to-text model made to understand documents.",
-			id: "mPLUG/DocOwl1.5",
+			description: "Strong image-text-to-text model.",
+			id: "Qwen/Qwen2-VL-7B-Instruct",
 		},
 		{
 			description: "Strong image-text-to-text model.",
-			id: "microsoft/Phi-3.5-vision-instruct",
+			id: "mistralai/Pixtral-12B-2409",
+		},
+		{
+			description: "Strong image-text-to-text model focused on documents.",
+			id: "stepfun-ai/GOT-OCR2_0",
 		},
 	],
 	spaces: [
@@ -74,15 +78,19 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Powerful vision-language model assistant.",
-			id: "liuhaotian/LLaVA-1.6",
+			id: "akhaliq/Molmo-7B-D-0924",
+		},
+		{
+			description: "An image-text-to-text application focused on documents.",
+			id: "stepfun-ai/GOT_official_online_demo",
 		},
 		{
 			description: "An application to compare outputs of different vision language models.",
 			id: "merve/compare_VLMs",
 		},
 		{
-			description: "An application for document vision language tasks.",
-			id: "mPLUG/DocOwl",
+			description: "An application for chatting with an image-text-to-text model.",
+			id: "GanymedeNil/Qwen2-VL-7B",
 		},
 	],
 	summary:
diff --git a/packages/tasks/src/tasks/keypoint-detection/data.ts b/packages/tasks/src/tasks/keypoint-detection/data.ts
@@ -29,7 +29,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
-			id: "qualcomm/MediaPipe-Pose-Estimation",
+			id: "facebook/sapiens-pose-1b",
 		},
 	],
 	spaces: [
diff --git a/packages/tasks/src/tasks/text-generation/data.ts b/packages/tasks/src/tasks/text-generation/data.ts
@@ -58,10 +58,6 @@ const taskData: TaskDataCustom = {
 			description: "A text-generation model trained to follow instructions.",
 			id: "google/gemma-2-2b-it",
 		},
-		{
-			description: "A code generation model that can generate code in 80+ languages.",
-			id: "bigcode/starcoder",
-		},
 		{
 			description: "Very powerful text generation model trained to follow instructions.",
 			id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -75,19 +71,23 @@ const taskData: TaskDataCustom = {
 			id: "AI-MO/NuminaMath-7B-TIR",
 		},
 		{
-			description: "Strong coding assistant model.",
-			id: "HuggingFaceH4/starchat2-15b-v0.1",
+			description: "Strong text generation model to follow instructions.",
+			id: "Qwen/Qwen2.5-7B-Instruct",
 		},
 		{
 			description: "Very strong open-source large language model.",
-			id: "mistralai/Mistral-Nemo-Instruct-2407",
+			id: "nvidia/Llama-3.1-Nemotron-70B-Instruct",
 		},
 	],
 	spaces: [
 		{
 			description: "A leaderboard to compare different open-source text generation models based on various benchmarks.",
 			id: "open-llm-leaderboard/open_llm_leaderboard",
 		},
+		{
+			description: "A leaderboard for comparing chain-of-thought performance of models.",
+			id: "logikon/open_cot_leaderboard",
+		},
 		{
 			description: "An text generation based application based on a very powerful LLaMA2 model.",
 			id: "ysharma/Explore_llamav2_with_TGI",
diff --git a/packages/tasks/src/tasks/text-to-image/data.ts b/packages/tasks/src/tasks/text-to-image/data.ts
@@ -71,8 +71,8 @@ const taskData: TaskDataCustom = {
 			id: "jbilcke-hf/ai-comic-factory",
 		},
 		{
-			description: "A text-to-image application that can generate coherent text inside the image.",
-			id: "DeepFloyd/IF",
+			description: "An application to match multiple custom image generation models.",
+			id: "multimodalart/flux-lora-lab",
 		},
 		{
 			description: "A powerful yet very fast image generation application.",
diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts
@@ -57,9 +57,13 @@ const taskData: TaskDataCustom = {
 			id: "suno/bark",
 		},
 		{
-			description: "XTTS is a Voice generation model that lets you clone voices into different languages.",
+			description: "An application on XTTS, a voice generation model that lets you clone voices into different languages.",
 			id: "coqui/xtts",
 		},
+		{
+			description: "An application that generates speech in different styles in English and Chinese.",
+			id: "mrfakename/E2-F5-TTS",
+		},
 		{
 			description: "An application that synthesizes speech for diverse speaker prompts.",
 			id: "parler-tts/parler_tts_mini",
diff --git a/packages/tasks/src/tasks/text-to-video/data.ts b/packages/tasks/src/tasks/text-to-video/data.ts
@@ -67,30 +67,30 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "A strong model for video generation.",
-			id: "Vchitect/LaVie",
+			description: "A strong model for consistent video generation.",
+			id: "rain1011/pyramid-flow-sd3",
 		},
 		{
 			description: "A robust model for text-to-video generation.",
-			id: "damo-vilab/text-to-video-ms-1.7b",
+			id: "VideoCrafter/VideoCrafter2",
 		},
 		{
-			description: "A text-to-video generation model with high quality and smooth outputs.",
-			id: "hotshotco/Hotshot-XL",
+			description: "A cutting-edge text-to-video generation model.",
+			id: "TIGER-Lab/T2V-Turbo-V2",
 		},
 	],
 	spaces: [
 		{
 			description: "An application that generates video from text.",
-			id: "fffiloni/zeroscope",
+			id: "VideoCrafter/VideoCrafter",
 		},
 		{
-			description: "An application that generates video from image and text.",
-			id: "Vchitect/LaVie",
+			description: "Consistent video generation application.",
+			id: "TIGER-Lab/T2V-Turbo-V2",
 		},
 		{
-			description: "An application that generates videos from text and provides multi-model support.",
-			id: "ArtGAN/Video-Diffusion-WebUI",
+			description: "A cutting edge video generation application.",
+			id: "Pyramid-Flow/pyramid-flow",
 		},
 	],
 	summary:
diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts
@@ -10,6 +10,10 @@ const taskData: TaskDataCustom = {
 			description: "A dataset of instructions and question-answer pairs about videos.",
 			id: "lmms-lab/VideoChatGPT",
 		},
+		{
+			description: "Large video understanding dataset.",
+			id: "HuggingFaceFV/finevideo",
+		},
 	],
 	demo: {
 		inputs: [
@@ -48,6 +52,10 @@ const taskData: TaskDataCustom = {
 			description: "An application to chat with a video-text-to-text model.",
 			id: "llava-hf/video-llava",
 		},
+		{
+			description: "A leaderboard for various video-text-to-text models.",
+			id: "opencompass/openvlm_video_leaderboard",
+		},
 	],
 	summary:
 		"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ const taskData: TaskDataCustom = {`
`29`	`29`	`},`
`30`	`30`	`{`
`31`	`31`	`description: "Strong keypoint detection model used to detect human pose.",`
`32`		`- id: "qualcomm/MediaPipe-Pose-Estimation",`
	`32`	`+ id: "facebook/sapiens-pose-1b",`
`33`	`33`	`},`
`34`	`34`	`],`
`35`	`35`	`spaces: [`
Original file line number	Diff line number	Diff line change
`@@ -71,8 +71,8 @@ const taskData: TaskDataCustom = {`
`71`	`71`	`id: "jbilcke-hf/ai-comic-factory",`
`72`	`72`	`},`
`73`	`73`	`{`
`74`		`- description: "A text-to-image application that can generate coherent text inside the image.",`
`75`		`- id: "DeepFloyd/IF",`
	`74`	`+ description: "An application to match multiple custom image generation models.",`
	`75`	`+ id: "multimodalart/flux-lora-lab",`
`76`	`76`	`},`
`77`	`77`	`{`
`78`	`78`	`description: "A powerful yet very fast image generation application.",`