Skip to content

Commit 2e7ec43

Browse files
merveenoyan and pcuenca authored
Tasks: update new models and Spaces (#980)
Updated new models and Spaces.

Co-authored-by: Pedro Cuenca <[email protected]>
1 parent 327fa1b commit 2e7ec43

File tree

9 files changed

+69
-41
lines changed

9 files changed

+69
-41
lines changed

packages/tasks/src/tasks/depth-estimation/data.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,15 @@ const taskData: TaskDataCustom = {
3333
},
3434
{
3535
description: "A strong monocular depth estimation model.",
36-
id: "Bingxin/Marigold",
36+
id: "jingheya/lotus-depth-g-v1-0",
3737
},
3838
{
39-
description: "A metric depth estimation model trained on NYU dataset.",
40-
id: "Intel/zoedepth-nyu",
39+
description: "A depth estimation model that predicts depth in videos.",
40+
id: "tencent/DepthCrafter",
41+
},
42+
{
43+
description: "A robust depth estimation model.",
44+
id: "apple/DepthPro",
4145
},
4246
],
4347
spaces: [
@@ -46,12 +50,16 @@ const taskData: TaskDataCustom = {
4650
id: "radames/dpt-depth-estimation-3d-voxels",
4751
},
4852
{
49-
description: "An application on cutting-edge depth estimation.",
50-
id: "depth-anything/Depth-Anything-V2",
53+
description: "An application for bleeding-edge depth estimation.",
54+
id: "akhaliq/depth-pro",
55+
},
56+
{
57+
description: "An application on cutting-edge depth estimation in videos.",
58+
id: "tencent/DepthCrafter",
5159
},
5260
{
53-
description: "An application to try state-of-the-art depth estimation.",
54-
id: "merve/compare_depth_models",
61+
description: "A human-centric depth estimation application.",
62+
id: "facebook/sapiens-depth",
5563
},
5664
],
5765
summary: "Depth estimation is the task of predicting depth of the objects present in an image.",

packages/tasks/src/tasks/image-segmentation/data.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ const taskData: TaskDataCustom = {
5757
id: "ZhengPeng7/BiRefNet",
5858
},
5959
{
60-
description: "Semantic segmentation model trained on ADE20k dataset.",
61-
id: "nvidia/segformer-b0-finetuned-ade-512-512",
60+
description: "Powerful human-centric image segmentation model.",
61+
id: "facebook/sapiens-seg-1b",
6262
},
6363
{
64-
description: "Panoptic segmentation model trained COCO (common objects) dataset.",
64+
description: "Panoptic segmentation model trained on the COCO (common objects) dataset.",
6565
id: "facebook/mask2former-swin-large-coco-panoptic",
6666
},
6767
],
@@ -75,8 +75,8 @@ const taskData: TaskDataCustom = {
7575
id: "jbrinkma/segment-anything",
7676
},
7777
{
78-
description: "A semantic segmentation application that predicts human silhouettes.",
79-
id: "keras-io/Human-Part-Segmentation",
78+
description: "A human-centric segmentation model.",
79+
id: "facebook/sapiens-pose",
8080
},
8181
{
8282
description: "An instance segmentation application to predict neuronal cell types from microscopy images.",

packages/tasks/src/tasks/image-text-to-text/data.ts

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,20 +47,24 @@ const taskData: TaskDataCustom = {
4747
id: "meta-llama/Llama-3.2-11B-Vision-Instruct",
4848
},
4949
{
50-
description: "Cutting-edge conversational vision language model that can take multiple image inputs.",
51-
id: "HuggingFaceM4/idefics2-8b-chatty",
50+
description: "Cutting-edge vision language models.",
51+
id: "allenai/Molmo-7B-D-0924",
5252
},
5353
{
5454
description: "Small yet powerful model.",
5555
id: "vikhyatk/moondream2",
5656
},
5757
{
58-
description: "Strong image-text-to-text model made to understand documents.",
59-
id: "mPLUG/DocOwl1.5",
58+
description: "Strong image-text-to-text model.",
59+
id: "Qwen/Qwen2-VL-7B-Instruct",
6060
},
6161
{
6262
description: "Strong image-text-to-text model.",
63-
id: "microsoft/Phi-3.5-vision-instruct",
63+
id: "mistralai/Pixtral-12B-2409",
64+
},
65+
{
66+
description: "Strong image-text-to-text model focused on documents.",
67+
id: "stepfun-ai/GOT-OCR2_0",
6468
},
6569
],
6670
spaces: [
@@ -74,15 +78,19 @@ const taskData: TaskDataCustom = {
7478
},
7579
{
7680
description: "Powerful vision-language model assistant.",
77-
id: "liuhaotian/LLaVA-1.6",
81+
id: "akhaliq/Molmo-7B-D-0924",
82+
},
83+
{
84+
description: "An image-text-to-text application focused on documents.",
85+
id: "stepfun-ai/GOT_official_online_demo",
7886
},
7987
{
8088
description: "An application to compare outputs of different vision language models.",
8189
id: "merve/compare_VLMs",
8290
},
8391
{
84-
description: "An application for document vision language tasks.",
85-
id: "mPLUG/DocOwl",
92+
description: "An application for chatting with an image-text-to-text model.",
93+
id: "GanymedeNil/Qwen2-VL-7B",
8694
},
8795
],
8896
summary:

packages/tasks/src/tasks/keypoint-detection/data.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const taskData: TaskDataCustom = {
2929
},
3030
{
3131
description: "Strong keypoint detection model used to detect human pose.",
32-
id: "qualcomm/MediaPipe-Pose-Estimation",
32+
id: "facebook/sapiens-pose-1b",
3333
},
3434
],
3535
spaces: [

packages/tasks/src/tasks/text-generation/data.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,6 @@ const taskData: TaskDataCustom = {
5858
description: "A text-generation model trained to follow instructions.",
5959
id: "google/gemma-2-2b-it",
6060
},
61-
{
62-
description: "A code generation model that can generate code in 80+ languages.",
63-
id: "bigcode/starcoder",
64-
},
6561
{
6662
description: "Very powerful text generation model trained to follow instructions.",
6763
id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
@@ -75,19 +71,23 @@ const taskData: TaskDataCustom = {
7571
id: "AI-MO/NuminaMath-7B-TIR",
7672
},
7773
{
78-
description: "Strong coding assistant model.",
79-
id: "HuggingFaceH4/starchat2-15b-v0.1",
74+
description: "Strong text generation model to follow instructions.",
75+
id: "Qwen/Qwen2.5-7B-Instruct",
8076
},
8177
{
8278
description: "Very strong open-source large language model.",
83-
id: "mistralai/Mistral-Nemo-Instruct-2407",
79+
id: "nvidia/Llama-3.1-Nemotron-70B-Instruct",
8480
},
8581
],
8682
spaces: [
8783
{
8884
description: "A leaderboard to compare different open-source text generation models based on various benchmarks.",
8985
id: "open-llm-leaderboard/open_llm_leaderboard",
9086
},
87+
{
88+
description: "A leaderboard for comparing chain-of-thought performance of models.",
89+
id: "logikon/open_cot_leaderboard",
90+
},
9191
{
9292
description: "An text generation based application based on a very powerful LLaMA2 model.",
9393
id: "ysharma/Explore_llamav2_with_TGI",

packages/tasks/src/tasks/text-to-image/data.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ const taskData: TaskDataCustom = {
7171
id: "jbilcke-hf/ai-comic-factory",
7272
},
7373
{
74-
description: "A text-to-image application that can generate coherent text inside the image.",
75-
id: "DeepFloyd/IF",
74+
description: "An application to match multiple custom image generation models.",
75+
id: "multimodalart/flux-lora-lab",
7676
},
7777
{
7878
description: "A powerful yet very fast image generation application.",

packages/tasks/src/tasks/text-to-speech/data.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,13 @@ const taskData: TaskDataCustom = {
5757
id: "suno/bark",
5858
},
5959
{
60-
description: "XTTS is a Voice generation model that lets you clone voices into different languages.",
60+
description: "An application on XTTS, a voice generation model that lets you clone voices into different languages.",
6161
id: "coqui/xtts",
6262
},
63+
{
64+
description: "An application that generates speech in different styles in English and Chinese.",
65+
id: "mrfakename/E2-F5-TTS",
66+
},
6367
{
6468
description: "An application that synthesizes speech for diverse speaker prompts.",
6569
id: "parler-tts/parler_tts_mini",

packages/tasks/src/tasks/text-to-video/data.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -67,30 +67,30 @@ const taskData: TaskDataCustom = {
6767
],
6868
models: [
6969
{
70-
description: "A strong model for video generation.",
71-
id: "Vchitect/LaVie",
70+
description: "A strong model for consistent video generation.",
71+
id: "rain1011/pyramid-flow-sd3",
7272
},
7373
{
7474
description: "A robust model for text-to-video generation.",
75-
id: "damo-vilab/text-to-video-ms-1.7b",
75+
id: "VideoCrafter/VideoCrafter2",
7676
},
7777
{
78-
description: "A text-to-video generation model with high quality and smooth outputs.",
79-
id: "hotshotco/Hotshot-XL",
78+
description: "A cutting-edge text-to-video generation model.",
79+
id: "TIGER-Lab/T2V-Turbo-V2",
8080
},
8181
],
8282
spaces: [
8383
{
8484
description: "An application that generates video from text.",
85-
id: "fffiloni/zeroscope",
85+
id: "VideoCrafter/VideoCrafter",
8686
},
8787
{
88-
description: "An application that generates video from image and text.",
89-
id: "Vchitect/LaVie",
88+
description: "Consistent video generation application.",
89+
id: "TIGER-Lab/T2V-Turbo-V2",
9090
},
9191
{
92-
description: "An application that generates videos from text and provides multi-model support.",
93-
id: "ArtGAN/Video-Diffusion-WebUI",
92+
description: "A cutting edge video generation application.",
93+
id: "Pyramid-Flow/pyramid-flow",
9494
},
9595
],
9696
summary:

packages/tasks/src/tasks/video-text-to-text/data.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ const taskData: TaskDataCustom = {
1010
description: "A dataset of instructions and question-answer pairs about videos.",
1111
id: "lmms-lab/VideoChatGPT",
1212
},
13+
{
14+
description: "Large video understanding dataset.",
15+
id: "HuggingFaceFV/finevideo",
16+
},
1317
],
1418
demo: {
1519
inputs: [
@@ -48,6 +52,10 @@ const taskData: TaskDataCustom = {
4852
description: "An application to chat with a video-text-to-text model.",
4953
id: "llava-hf/video-llava",
5054
},
55+
{
56+
description: "A leaderboard for various video-text-to-text models.",
57+
id: "opencompass/openvlm_video_leaderboard",
58+
},
5159
],
5260
summary:
5361
"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",

0 commit comments

Comments
 (0)