Skip to content

Enable multimodal response generation in android #6901

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions firebase-vertexai/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* [fixed] Fixed an issue with `LiveContentResponse` audio data not being present when the model was
interrupted or the turn completed. (#6870)
* [fixed] Fixed an issue with `LiveSession` not converting exceptions to `FirebaseVertexAIException`. (#6870)
* [feature] Enable response generation in multiple modalities. (#6901)


# 16.3.0
Expand Down
9 changes: 6 additions & 3 deletions firebase-vertexai/api.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,9 @@ package com.google.firebase.vertexai.java {
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> send(String text);
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendFunctionResponse(java.util.List<com.google.firebase.vertexai.type.FunctionResponsePart> functionList);
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendMediaStream(java.util.List<com.google.firebase.vertexai.type.MediaData> mediaChunks);
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.vertexai.type.FunctionCallPart,com.google.firebase.vertexai.type.FunctionResponsePart>? functionCallHandler);
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> stopAudioConversation();
method public abstract void stopReceiving();
field public static final com.google.firebase.vertexai.java.LiveSessionFutures.Companion Companion;
}
Expand Down Expand Up @@ -330,11 +330,13 @@ package com.google.firebase.vertexai.type {
ctor public GenerateContentResponse(java.util.List<com.google.firebase.vertexai.type.Candidate> candidates, com.google.firebase.vertexai.type.PromptFeedback? promptFeedback, com.google.firebase.vertexai.type.UsageMetadata? usageMetadata);
method public java.util.List<com.google.firebase.vertexai.type.Candidate> getCandidates();
method public java.util.List<com.google.firebase.vertexai.type.FunctionCallPart> getFunctionCalls();
method public java.util.List<com.google.firebase.vertexai.type.InlineDataPart> getInlineDataParts();
method public com.google.firebase.vertexai.type.PromptFeedback? getPromptFeedback();
method public String? getText();
method public com.google.firebase.vertexai.type.UsageMetadata? getUsageMetadata();
property public final java.util.List<com.google.firebase.vertexai.type.Candidate> candidates;
property public final java.util.List<com.google.firebase.vertexai.type.FunctionCallPart> functionCalls;
property public final java.util.List<com.google.firebase.vertexai.type.InlineDataPart> inlineDataParts;
property public final com.google.firebase.vertexai.type.PromptFeedback? promptFeedback;
property public final String? text;
property public final com.google.firebase.vertexai.type.UsageMetadata? usageMetadata;
Expand All @@ -352,6 +354,7 @@ package com.google.firebase.vertexai.type {
field public Integer? maxOutputTokens;
field public Float? presencePenalty;
field public String? responseMimeType;
field public java.util.List<com.google.firebase.vertexai.type.ResponseModality>? responseModalities;
field public com.google.firebase.vertexai.type.Schema? responseSchema;
field public java.util.List<java.lang.String>? stopSequences;
field public Float? temperature;
Expand Down Expand Up @@ -690,7 +693,7 @@ package com.google.firebase.vertexai.type {
public final class RequestTimeoutException extends com.google.firebase.vertexai.type.FirebaseVertexAIException {
}

@com.google.firebase.vertexai.type.PublicPreviewAPI public final class ResponseModality {
public final class ResponseModality {
method public int getOrdinal();
property public final int ordinal;
field public static final com.google.firebase.vertexai.type.ResponseModality AUDIO;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,18 @@ public class GenerateContentResponse(
candidates.first().content.parts.filterIsInstance<FunctionCallPart>()
}

/**
 * Convenience accessor for the inline data carried by the first candidate.
 *
 * The result is built from the first candidate's content parts in two groups: every [ImagePart]
 * converted to an [InlineDataPart] comes first, followed by the parts that are already
 * [InlineDataPart]s. The value is computed lazily on first access.
 *
 * Accessing this property when [candidates] is empty throws [NoSuchElementException], because the
 * first candidate is read unconditionally.
 */
public val inlineDataParts: List<InlineDataPart> by lazy {
  val firstCandidateParts = candidates.first().content.parts
  // Images are exposed through the same type as every other inline payload.
  val convertedImages =
    firstCandidateParts.filterIsInstance<ImagePart>().map { imagePart ->
      imagePart.toInlineDataPart()
    }
  val nativeInlineData = firstCandidateParts.filterIsInstance<InlineDataPart>()
  convertedImages + nativeInlineData
}

@Serializable
internal data class Internal(
val candidates: List<Candidate.Internal>? = null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ import kotlinx.serialization.Serializable
* @property responseSchema Output schema of the generated candidate text. If set, a compatible
* [responseMimeType] must also be set.
*
* @property responseModalities The data formats (modalities) in which the model should respond.
*
* Compatible MIME types:
* - `application/json`: Schema for JSON response.
*
Expand All @@ -88,6 +90,7 @@ private constructor(
internal val stopSequences: List<String>?,
internal val responseMimeType: String?,
internal val responseSchema: Schema?,
internal val responseModalities: List<ResponseModality>?,
) {

/**
Expand Down Expand Up @@ -115,6 +118,9 @@ private constructor(
* @property responseMimeType See [GenerationConfig.responseMimeType].
*
* @property responseSchema See [GenerationConfig.responseSchema].
*
* @property responseModalities See [GenerationConfig.responseModalities].
*
* @see [generationConfig]
*/
public class Builder {
Expand All @@ -128,6 +134,7 @@ private constructor(
@JvmField public var stopSequences: List<String>? = null
@JvmField public var responseMimeType: String? = null
@JvmField public var responseSchema: Schema? = null
@JvmField public var responseModalities: List<ResponseModality>? = null

/** Create a new [GenerationConfig] with the attached arguments. */
public fun build(): GenerationConfig =
Expand All @@ -142,6 +149,7 @@ private constructor(
frequencyPenalty = frequencyPenalty,
responseMimeType = responseMimeType,
responseSchema = responseSchema,
responseModalities = responseModalities
)
}

Expand All @@ -156,7 +164,8 @@ private constructor(
frequencyPenalty = frequencyPenalty,
presencePenalty = presencePenalty,
responseMimeType = responseMimeType,
responseSchema = responseSchema?.toInternal()
responseSchema = responseSchema?.toInternal(),
responseModalities = responseModalities?.map { it.toInternal() }
)

@Serializable
Expand All @@ -171,6 +180,7 @@ private constructor(
@SerialName("presence_penalty") val presencePenalty: Float? = null,
@SerialName("frequency_penalty") val frequencyPenalty: Float? = null,
@SerialName("response_schema") val responseSchema: Schema.Internal? = null,
@SerialName("response_modalities") val responseModalities: List<String>? = null
)

public companion object {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,14 @@ public class TextPart(public val text: String) : Part {
*
* @param image [Bitmap] to convert into a [Part]
*/
public class ImagePart(public val image: Bitmap) : Part
public class ImagePart(public val image: Bitmap) : Part {

  /**
   * Converts this image into an [InlineDataPart].
   *
   * The bitmap is first encoded to a Base64 string by `encodeBitmapToBase64Png`, then decoded back
   * into raw bytes using the same `BASE_64_FLAGS`, and finally wrapped with the `image/jpeg` MIME
   * type.
   *
   * NOTE(review): the helper's name mentions PNG while the MIME type declared here is
   * `image/jpeg`; confirm which format the helper actually emits so the two stay in agreement.
   */
  internal fun toInlineDataPart(): InlineDataPart {
    val base64Image = encodeBitmapToBase64Png(image)
    val rawBytes = android.util.Base64.decode(base64Image, BASE_64_FLAGS)
    return InlineDataPart(rawBytes, "image/jpeg")
  }
}

/**
* Represents binary data with an associated MIME type sent to and received from requests.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import kotlinx.serialization.KSerializer
import kotlinx.serialization.Serializable

/** Represents the type of content present in a response (e.g., text, image, audio). */
@PublicPreviewAPI
public class ResponseModality private constructor(public val ordinal: Int) {

@Serializable(Internal.Serializer::class)
Expand Down
Loading