Commit fc57b44

[Inference snippets] VLM hf_hub, oai snippets (#985)
Thanks to #976, we can now show `hf_hub` and `oai` snippets for VLMs ("conversational" image-text-to-text models).
1 parent 9cb9c0d commit fc57b44

File tree: 3 files changed (+58 -98 lines)

packages/tasks/src/snippets/curl.ts

Lines changed: 19 additions & 32 deletions
@@ -26,9 +26,24 @@ export const snippetTextGeneration = (
 	if (model.tags.includes("conversational")) {
 		// Conversational model detected, so we display a code snippet that features the Messages API
 		const streaming = opts?.streaming ?? true;
-		const messages: ChatCompletionInputMessage[] = opts?.messages ?? [
-			{ role: "user", content: "What is the capital of France?" },
-		];
+		const exampleMessages: ChatCompletionInputMessage[] =
+			model.pipeline_tag === "text-generation"
+				? [{ role: "user", content: "What is the capital of France?" }]
+				: [
+						{
+							role: "user",
+							content: [
+								{
+									type: "image_url",
+									image_url: {
+										url: "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+									},
+								},
+								{ type: "text", text: "Describe this image in one sentence." },
+							],
+						},
+					];
+		const messages = opts?.messages ?? exampleMessages;
 
 		const config = {
 			...(opts?.temperature ? { temperature: opts.temperature } : undefined),
@@ -63,34 +78,6 @@ export const snippetTextGeneration = (
 	}
 };
 
-export const snippetImageTextToTextGeneration = (model: ModelDataMinimal, accessToken: string): InferenceSnippet => {
-	if (model.tags.includes("conversational")) {
-		// Conversational model detected, so we display a code snippet that features the Messages API
-		return {
-			content: `curl 'https://api-inference.huggingface.co/models/${model.id}/v1/chat/completions' \\
-	-H "Authorization: Bearer ${accessToken || `{API_TOKEN}`}" \\
-	-H 'Content-Type: application/json' \\
-	-d '{
-		"model": "${model.id}",
-		"messages": [
-			{
-				"role": "user",
-				"content": [
-					{"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}},
-					{"type": "text", "text": "Describe this image in one sentence."}
-				]
-			}
-		],
-		"max_tokens": 500,
-		"stream": false
-	}'
-`,
-		};
-	} else {
-		return snippetBasic(model, accessToken);
-	}
-};
-
 export const snippetZeroShotClassification = (model: ModelDataMinimal, accessToken: string): InferenceSnippet => ({
 	content: `curl https://api-inference.huggingface.co/models/${model.id} \\
 	-X POST \\
@@ -122,7 +109,7 @@ export const curlSnippets: Partial<
 	summarization: snippetBasic,
 	"feature-extraction": snippetBasic,
 	"text-generation": snippetTextGeneration,
-	"image-text-to-text": snippetImageTextToTextGeneration,
+	"image-text-to-text": snippetTextGeneration,
 	"text2text-generation": snippetBasic,
 	"fill-mask": snippetBasic,
 	"sentence-similarity": snippetBasic,

packages/tasks/src/snippets/js.ts

Lines changed: 19 additions & 34 deletions
@@ -40,9 +40,24 @@ export const snippetTextGeneration = (
 	if (model.tags.includes("conversational")) {
 		// Conversational model detected, so we display a code snippet that features the Messages API
 		const streaming = opts?.streaming ?? true;
-		const messages: ChatCompletionInputMessage[] = opts?.messages ?? [
-			{ role: "user", content: "What is the capital of France?" },
-		];
+		const exampleMessages: ChatCompletionInputMessage[] =
+			model.pipeline_tag === "text-generation"
+				? [{ role: "user", content: "What is the capital of France?" }]
+				: [
+						{
+							role: "user",
+							content: [
+								{
+									type: "image_url",
+									image_url: {
+										url: "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+									},
+								},
+								{ type: "text", text: "Describe this image in one sentence." },
+							],
+						},
+					];
+		const messages = opts?.messages ?? exampleMessages;
 		const messagesStr = stringifyMessages(messages, { sep: ",\n\t\t", start: "[\n\t\t", end: "\n\t]" });
 
 		const config = {
@@ -148,36 +163,6 @@ console.log(chatCompletion.choices[0].message);`,
 	}
 };
 
-export const snippetImageTextToTextGeneration = (model: ModelDataMinimal, accessToken: string): InferenceSnippet => {
-	if (model.tags.includes("conversational")) {
-		// Conversational model detected, so we display a code snippet that features the Messages API
-		return {
-			content: `import { HfInference } from "@huggingface/inference";
-
-const inference = new HfInference("${accessToken || `{API_TOKEN}`}");
-const imageUrl = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg";
-
-for await (const chunk of inference.chatCompletionStream({
-	model: "${model.id}",
-	messages: [
-		{
-			"role": "user",
-			"content": [
-				{"type": "image_url", "image_url": {"url": imageUrl}},
-				{"type": "text", "text": "Describe this image in one sentence."},
-			],
-		}
-	],
-	max_tokens: 500,
-})) {
-	process.stdout.write(chunk.choices[0]?.delta?.content || "");
-}`,
-		};
-	} else {
-		return snippetBasic(model, accessToken);
-	}
-};
-
 export const snippetZeroShotClassification = (model: ModelDataMinimal, accessToken: string): InferenceSnippet => ({
 	content: `async function query(data) {
 	const response = await fetch(
@@ -307,7 +292,7 @@ export const jsSnippets: Partial<
 	summarization: snippetBasic,
 	"feature-extraction": snippetBasic,
 	"text-generation": snippetTextGeneration,
-	"image-text-to-text": snippetImageTextToTextGeneration,
+	"image-text-to-text": snippetTextGeneration,
 	"text2text-generation": snippetBasic,
 	"fill-mask": snippetBasic,
 	"sentence-similarity": snippetBasic,

packages/tasks/src/snippets/python.ts

Lines changed: 20 additions & 32 deletions
@@ -16,9 +16,24 @@ export const snippetConversational = (
 	}
 ): InferenceSnippet[] => {
 	const streaming = opts?.streaming ?? true;
-	const messages: ChatCompletionInputMessage[] = opts?.messages ?? [
-		{ role: "user", content: "What is the capital of France?" },
-	];
+	const exampleMessages: ChatCompletionInputMessage[] =
+		model.pipeline_tag === "text-generation"
+			? [{ role: "user", content: "What is the capital of France?" }]
+			: [
+					{
+						role: "user",
+						content: [
+							{
+								type: "image_url",
+								image_url: {
+									url: "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+								},
+							},
+							{ type: "text", text: "Describe this image in one sentence." },
+						],
+					},
+				];
+	const messages = opts?.messages ?? exampleMessages;
 	const messagesStr = stringifyMessages(messages, {
 		sep: ",\n\t",
 		start: `[\n\t`,
@@ -121,30 +136,6 @@ print(completion.choices[0].message)`,
 	}
 };
 
-export const snippetConversationalWithImage = (model: ModelDataMinimal, accessToken: string): InferenceSnippet => ({
-	content: `from huggingface_hub import InferenceClient
-
-client = InferenceClient(api_key="${accessToken || "{API_TOKEN}"}")
-
-image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
-
-for message in client.chat_completion(
-	model="${model.id}",
-	messages=[
-		{
-			"role": "user",
-			"content": [
-				{"type": "image_url", "image_url": {"url": image_url}},
-				{"type": "text", "text": "Describe this image in one sentence."},
-			],
-		}
-	],
-	max_tokens=500,
-	stream=True,
-):
-	print(message.choices[0].delta.content, end="")`,
-});
-
 export const snippetZeroShotClassification = (model: ModelDataMinimal): InferenceSnippet => ({
 	content: `def query(payload):
 	response = requests.post(API_URL, headers=headers, json=payload)
@@ -282,7 +273,7 @@ export const pythonSnippets: Partial<
 	"feature-extraction": snippetBasic,
 	"text-generation": snippetBasic,
 	"text2text-generation": snippetBasic,
-	"image-text-to-text": snippetConversationalWithImage,
+	"image-text-to-text": snippetConversational,
 	"fill-mask": snippetBasic,
 	"sentence-similarity": snippetBasic,
 	"automatic-speech-recognition": snippetFile,
@@ -306,12 +297,9 @@ export function getPythonInferenceSnippet(
 	accessToken: string,
 	opts?: Record<string, unknown>
 ): InferenceSnippet | InferenceSnippet[] {
-	if (model.pipeline_tag === "text-generation" && model.tags.includes("conversational")) {
+	if (model.tags.includes("conversational")) {
 		// Conversational model detected, so we display a code snippet that features the Messages API
 		return snippetConversational(model, accessToken, opts);
-	} else if (model.pipeline_tag === "image-text-to-text" && model.tags.includes("conversational")) {
-		// Example sending an image to the Message API
-		return snippetConversationalWithImage(model, accessToken);
 	} else {
 		let snippets =
 			model.pipeline_tag && model.pipeline_tag in pythonSnippets
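
Downstream of this file, any conversational model now takes the same path through getPythonInferenceSnippet, so a VLM gets the full set of conversational snippets (per the commit description, the hf_hub and oai variants). A minimal sketch, with illustrative import paths, model id, and token:

import { getPythonInferenceSnippet } from "./python"; // illustrative relative path
import type { ModelDataMinimal } from "./types"; // illustrative relative path

const vlm = {
	id: "org/some-vlm", // hypothetical model id
	pipeline_tag: "image-text-to-text",
	tags: ["conversational"],
} as ModelDataMinimal;

// The "conversational" tag alone routes to snippetConversational, which returns
// InferenceSnippet[]; [result].flat() handles both members of the return union.
const result = getPythonInferenceSnippet(vlm, "hf_***");
for (const snippet of [result].flat()) {
	console.log(snippet.content);
}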
