Commit 1d4841f

New OpenAIService audio functions/endpoints - createAudioTranscription and createAudioTranslation (impl, settings, response, json formatting)

1 parent 3befc71 commit 1d4841f

11 files changed (+214, -25 lines)

openai-client-stream/README.md

Lines changed: 4 additions & 4 deletions
@@ -1,15 +1,15 @@
-# OpenAI Scala Client - Stream Support [![version](https://img.shields.io/badge/version-0.2.0-green.svg)](https://cequence.io) [![License](https://img.shields.io/badge/License-MIT-lightgrey.svg)](https://opensource.org/licenses/MIT)
+# OpenAI Scala Client - Stream Support [![version](https://img.shields.io/badge/version-0.3.0-green.svg)](https://cequence.io) [![License](https://img.shields.io/badge/License-MIT-lightgrey.svg)](https://opensource.org/licenses/MIT)
 
 This module provides streaming support for the client. Note that the full project documentation can be found [here](../README.md).
 
 ## Installation 🚀
 
-The currently supported Scala versions are **2.12** and **2.13**.
+The currently supported Scala versions are **2.12, 2.13**, and **3**.
 
 To pull the library you have to add the following dependency to your *build.sbt*
 
 ```
-"io.cequence" %% "openai-scala-client-stream" % "0.2.0"
+"io.cequence" %% "openai-scala-client-stream" % "0.3.0"
 ```
 
 or to *pom.xml* (if you use maven)
@@ -18,6 +18,6 @@ or to *pom.xml* (if you use maven)
   <dependency>
     <groupId>io.cequence</groupId>
     <artifactId>openai-scala-client-stream_2.12</artifactId>
-    <version>0.2.0</version>
+    <version>0.3.0</version>
   </dependency>
 ```

openai-client/src/main/scala/io/cequence/openaiscala/service/Command.scala

Lines changed: 3 additions & 1 deletion
@@ -9,6 +9,8 @@ object Command extends Enumeration {
   val images_edits = Value("images/edits")
   val images_variations = Value("images/variations")
   val embeddings = Value
+  val audio_transcriptions = Value("audio/transcriptions")
+  val audio_translations = Value("audio/translations")
   val files = Value
   val fine_tunes = Value("fine-tunes")
   val moderations = Value
@@ -23,5 +25,5 @@ object Tag extends Enumeration {
     input, image, mask, instruction, size, response_format, file, purpose, file_id,
     training_file, validation_file, n_epochs, batch_size, learning_rate_multiplier, prompt_loss_weight,
     compute_classification_metrics, classification_n_classes, classification_positive_class,
-    classification_betas, fine_tune_id = Value
+    classification_betas, fine_tune_id, language = Value
 }
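
The new `audio_transcriptions` and `audio_translations` values use `Value("...")` to override Scala's default `Enumeration` naming, so a value's `toString` yields the endpoint path directly. A minimal sketch of that mechanism (the base URL here is an assumption for illustration, not taken from this diff):

```scala
object Command extends Enumeration {
  val embeddings = Value                                    // toString == "embeddings" (derived from the val name)
  val audio_transcriptions = Value("audio/transcriptions")  // toString == "audio/transcriptions" (explicit name)
}

// Hypothetical URL assembly, assuming the standard OpenAI base URL:
val url = s"https://api.openai.com/v1/${Command.audio_transcriptions}"
// url == "https://api.openai.com/v1/audio/transcriptions"
```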

openai-client/src/main/scala/io/cequence/openaiscala/service/OpenAIServiceImpl.scala

Lines changed: 66 additions & 1 deletion
@@ -2,7 +2,7 @@ package io.cequence.openaiscala.service
 
 import akka.stream.Materializer
 import play.api.libs.ws.StandaloneWSRequest
-import play.api.libs.json.JsObject
+import play.api.libs.json.{JsArray, JsObject, JsValue, Json}
 import io.cequence.openaiscala.JsonUtil.JsonOps
 import io.cequence.openaiscala.JsonFormats._
 import io.cequence.openaiscala.OpenAIScalaClientException
@@ -259,6 +259,71 @@ private class OpenAIServiceImpl(
       _.asSafe[EmbeddingResponse]
     )
 
+  override def createAudioTranscription(
+    file: File,
+    prompt: Option[String],
+    settings: CreateTranscriptionSettings
+  ): Future[TranscriptResponse] =
+    execPOSTMultipartWithStatusString(
+      Command.audio_transcriptions,
+      fileParams = Seq(Tag.file -> file),
+      bodyParams = Seq(
+        Tag.prompt -> prompt,
+        Tag.model -> Some(settings.model),
+        Tag.response_format -> settings.response_format.map(_.toString),
+        Tag.temperature -> settings.temperature,
+        Tag.language -> settings.language
+      )
+    ).map(processAudioTranscriptResponse(settings.response_format))
+
+  override def createAudioTranslation(
+    file: File,
+    prompt: Option[String],
+    settings: CreateTranslationSettings
+  ): Future[TranscriptResponse] =
+    execPOSTMultipartWithStatusString(
+      Command.audio_translations,
+      fileParams = Seq(Tag.file -> file),
+      bodyParams = Seq(
+        Tag.prompt -> prompt,
+        Tag.model -> Some(settings.model),
+        Tag.response_format -> settings.response_format.map(_.toString),
+        Tag.temperature -> settings.temperature
+      )
+    ).map(processAudioTranscriptResponse(settings.response_format))
+
+  private def processAudioTranscriptResponse(
+    responseFormat: Option[TranscriptResponseFormatType.Value])(
+    stringRichResponse: RichStringResponse
+  ) = {
+    val stringResponse = handleErrorResponse(stringRichResponse)
+
+    def textFromJsonString(json: JsValue) =
+      (json.asSafe[JsObject] \ "text").toOption.map {
+        _.asSafe[String]
+      }.getOrElse(
+        throw new OpenAIScalaClientException(s"The attribute 'text' is not present in the response: ${stringResponse}.")
+      )
+
+    val FormatType = TranscriptResponseFormatType
+
+    responseFormat.getOrElse(FormatType.json) match {
+      case FormatType.json =>
+        val json = Json.parse(stringResponse)
+        TranscriptResponse(textFromJsonString(json))
+
+      case FormatType.verbose_json =>
+        val json = Json.parse(stringResponse)
+        TranscriptResponse(
+          text = textFromJsonString(json),
+          verboseJson = Some(Json.prettyPrint(json))
+        )
+
+      case FormatType.text | FormatType.srt | FormatType.vtt =>
+        TranscriptResponse(stringResponse)
+    }
+  }
+
   override def listFiles: Future[Seq[FileInfo]] =
     execGET(Command.files).map { response =>
       (response.asSafe[JsObject] \ "data").toOption.map {
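
The response handling mirrors the `response_format` options: `json` and `verbose_json` responses are parsed and the `text` attribute is extracted (with `verbose_json` also retaining the pretty-printed payload), while `text`, `srt`, and `vtt` responses are returned verbatim. A standalone sketch of the `json` branch using plain Play JSON, with `asOpt` standing in for the project's `asSafe` helper and a fabricated sample payload:

```scala
import play.api.libs.json.Json

case class TranscriptResponse(text: String, verboseJson: Option[String] = None)

val raw  = """{"text": "Hello there!"}""" // sample (made-up) transcription response body
val json = Json.parse(raw)

// extract the mandatory 'text' attribute, failing loudly when it is absent
val response = (json \ "text").asOpt[String] match {
  case Some(text) => TranscriptResponse(text)
  case None       => sys.error(s"The attribute 'text' is not present in the response: $raw")
}
// response.text == "Hello there!"
```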

openai-client/src/main/scala/io/cequence/openaiscala/service/ws/WSRequestHelper.scala

Lines changed: 48 additions & 15 deletions
@@ -32,8 +32,9 @@ trait WSRequestHelper extends WSHelper {
 
   private val defaultAcceptableStatusCodes = Seq(200)
 
-  protected type RichJsResponse = Either[JsValue, (Int, String)]
-  protected type RichStringResponse = Either[String, (Int, String)]
+  protected type RichResponse[T] = Either[T, (Int, String)]
+  protected type RichJsResponse = RichResponse[JsValue]
+  protected type RichStringResponse = RichResponse[String]
 
   /////////
   // GET //
@@ -105,22 +106,42 @@ trait WSRequestHelper extends WSHelper {
     acceptableStatusCodes: Seq[Int] = defaultAcceptableStatusCodes
   ): Future[RichJsResponse] = {
     val request = getWSRequestOptional(Some(endPoint), endPointParam, params)
+    val formData = createMultipartFormData(fileParams, bodyParams)
 
-    // create a multipart form data holder contain classic data (key-value) parts as well as file parts
-    val formData = MultipartFormData(
-      dataParts = bodyParams.collect { case (key, Some(value)) =>
-        (key.toString, Seq(value.toString))
-      }.toMap,
+    implicit val writeable: BodyWritable[MultipartFormData] = writeableOf_MultipartFormData("utf-8")
 
-      // TODO: we can potentially use here header-file-names as well (if provided as function's params)
-      files = fileParams.map { case (key, file) => FilePart(key.toString, file.getPath) }
-    )
+    execPOSTJsonAux(request, formData, Some(endPoint), acceptableStatusCodes)
+  }
+
+  protected def execPOSTMultipartWithStatusString(
+    endPoint: PEP,
+    endPointParam: Option[String] = None,
+    params: Seq[(PT, Option[Any])] = Nil,
+    fileParams: Seq[(PT, File)] = Nil,
+    bodyParams: Seq[(PT, Option[Any])] = Nil,
+    acceptableStatusCodes: Seq[Int] = defaultAcceptableStatusCodes
+  ): Future[RichStringResponse] = {
+    val request = getWSRequestOptional(Some(endPoint), endPointParam, params)
+    val formData = createMultipartFormData(fileParams, bodyParams)
 
     implicit val writeable: BodyWritable[MultipartFormData] = writeableOf_MultipartFormData("utf-8")
 
-    execPOSTAux(request, formData, Some(endPoint), acceptableStatusCodes)
+    execPOSTStringAux(request, formData, Some(endPoint), acceptableStatusCodes)
   }
 
+  // create a multipart form data holder containing classic data (key-value) parts as well as file parts
+  private def createMultipartFormData(
+    fileParams: Seq[(PT, File)] = Nil,
+    bodyParams: Seq[(PT, Option[Any])] = Nil
+  ) = MultipartFormData(
+    dataParts = bodyParams.collect { case (key, Some(value)) =>
+      (key.toString, Seq(value.toString))
+    }.toMap,
+
+    // TODO: we can potentially use here header-file-names as well (if provided as function's params)
+    files = fileParams.map { case (key, file) => FilePart(key.toString, file.getPath) }
+  )
+
   protected def execPOST(
     endPoint: PEP,
     endPointParam: Option[String] = None,
@@ -141,10 +162,10 @@ trait WSRequestHelper extends WSHelper {
     val request = getWSRequestOptional(Some(endPoint), endPointParam, params)
     val bodyParamsX = bodyParams.collect { case (fieldName, Some(jsValue)) => (fieldName.toString, jsValue) }
 
-    execPOSTAux(request, JsObject(bodyParamsX), Some(endPoint), acceptableStatusCodes)
+    execPOSTJsonAux(request, JsObject(bodyParamsX), Some(endPoint), acceptableStatusCodes)
   }
 
-  protected def execPOSTAux[T: BodyWritable](
+  protected def execPOSTJsonAux[T: BodyWritable](
     request: StandaloneWSRequest,
     body: T,
     endPointForLogging: Option[PEP], // only for logging
@@ -156,6 +177,18 @@ trait WSRequestHelper extends WSHelper {
       endPointForLogging
     )
 
+  protected def execPOSTStringAux[T: BodyWritable](
+    request: StandaloneWSRequest,
+    body: T,
+    endPointForLogging: Option[PEP], // only for logging
+    acceptableStatusCodes: Seq[Int] = defaultAcceptableStatusCodes
+  ) =
+    execRequestStringAux(
+      request, _.post(body),
+      acceptableStatusCodes,
+      endPointForLogging
+    )
+
   ////////////
   // DELETE //
   ////////////
@@ -279,9 +312,9 @@ trait WSRequestHelper extends WSHelper {
     )
     params.map { case (paramName, value) => (paramName, value.map(toJson)) }
 
-  protected def handleErrorResponse(response: RichJsResponse) =
+  protected def handleErrorResponse[T](response: RichResponse[T]) =
     response match {
-      case Left(json) => json
+      case Left(data) => data
 
       case Right((errorCode, message)) => throw new OpenAIScalaClientException(s"Code ${errorCode} : ${message}")
     }
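
The key refactoring here is generalizing the two concrete `Either` aliases into `RichResponse[T]`, which lets a single `handleErrorResponse` serve both the JSON and plain-string execution paths. A self-contained sketch of the pattern (with `RuntimeException` standing in for the project's `OpenAIScalaClientException`):

```scala
object RichResponseSketch {
  // Left holds the successful payload; Right holds an (HTTP status, message) error pair
  type RichResponse[T] = Either[T, (Int, String)]

  def handleErrorResponse[T](response: RichResponse[T]): T =
    response match {
      case Left(data) => data
      case Right((errorCode, message)) =>
        throw new RuntimeException(s"Code ${errorCode} : ${message}")
    }

  def main(args: Array[String]): Unit = {
    println(handleErrorResponse(Left("all good")))       // prints: all good
    // handleErrorResponse(Right((401, "unauthorized"))) // would throw
  }
}
```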
openai-core/src/main/scala/io/cequence/openaiscala/domain/response/TranscriptResponse.scala

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+package io.cequence.openaiscala.domain.response
+
+case class TranscriptResponse(
+  text: String,
+  verboseJson: Option[String] = None
+)

openai-core/src/main/scala/io/cequence/openaiscala/domain/settings/CreateCompletionSettings.scala

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@ case class CreateCompletionSettings(
   // Most models have a context length of 2048 tokens (except for the newest models, which support 4096). Defaults to 16.
   max_tokens: Option[Int] = None,
 
-  // What sampling temperature to use. Higher values means the model will take more risks.
-  // Try 0.9 for more creative applications, and 0 (argmax sampling) for ones with a well-defined answer.
+  // What sampling temperature to use, between 0 and 2.
+  // Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
   // We generally recommend altering this or top_p but not both. Defaults to 1.
   temperature: Option[Double] = None,
 
openai-core/src/main/scala/io/cequence/openaiscala/domain/settings/CreateImageSettings.scala

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@ case class CreateImageSettings(
   size: Option[ImageSizeType.Value] = None,
 
   // The format in which the generated images are returned. Must be one of url or b64_json. Defaults to url
-  response_format: Option[ResponseFormatType.Value] = None,
+  response_format: Option[ImageResponseFormatType.Value] = None,
 
   // A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
   user: Option[String] = None
@@ -20,6 +20,6 @@ object ImageSizeType extends Enumeration {
   val Large = Value("1024x1024")
 }
 
-object ResponseFormatType extends Enumeration {
+object ImageResponseFormatType extends Enumeration {
   val url, b64_json = Value
 }
openai-core/src/main/scala/io/cequence/openaiscala/domain/settings/CreateTranscriptionSettings.scala

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+package io.cequence.openaiscala.domain.settings
+
+case class CreateTranscriptionSettings(
+  // ID of the model to use. Only whisper-1 is currently available.
+  model: String,
+
+  // The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
+  // Defaults to json.
+  response_format: Option[TranscriptResponseFormatType.Value] = None,
+
+  // The sampling temperature, between 0 and 1.
+  // Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+  // If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+  // Defaults to 0.
+  temperature: Option[Double] = None,
+
+  // The language of the input audio.
+  // Supplying the input language in ISO-639-1 ('en', 'de', 'es', etc.) format will improve accuracy and latency.
+  language: Option[String] = None
+)
+
+object TranscriptResponseFormatType extends Enumeration {
+  val json, text, srt, verbose_json, vtt = Value
+}
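
For illustration, requesting a verbose JSON transcript of German audio could be configured as below; the concrete values are hypothetical, not taken from this commit:

```scala
val settings = CreateTranscriptionSettings(
  model = "whisper-1",
  response_format = Some(TranscriptResponseFormatType.verbose_json),
  temperature = Some(0.2),  // low temperature keeps the transcript focused
  language = Some("de")     // ISO-639-1 code for German
)
```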
openai-core/src/main/scala/io/cequence/openaiscala/domain/settings/CreateTranslationSettings.scala

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+package io.cequence.openaiscala.domain.settings
+
+case class CreateTranslationSettings(
+  // ID of the model to use. Only whisper-1 is currently available.
+  model: String,
+
+  // The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
+  // Defaults to json.
+  response_format: Option[TranscriptResponseFormatType.Value] = None,
+
+  // The sampling temperature, between 0 and 1.
+  // Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+  // If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.
+  // Defaults to 0.
+  temperature: Option[Double] = None
+)

openai-core/src/main/scala/io/cequence/openaiscala/service/OpenAIService.scala

Lines changed: 34 additions & 0 deletions
@@ -164,6 +164,40 @@ trait OpenAIService extends OpenAIServiceConsts {
     settings: CreateEmbeddingsSettings = DefaultSettings.CreateEmbeddings
   ): Future[EmbeddingResponse]
 
+  /**
+   * Transcribes audio into the input language.
+   *
+   * @param file The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
+   * @param prompt An optional text to guide the model's style or continue a previous audio segment.
+   *               The prompt should match the audio language.
+   * @param settings
+   * @return transcription text
+   *
+   * @see <a href="https://platform.openai.com/docs/api-reference/audio/create">OpenAI Doc</a>
+   */
+  def createAudioTranscription(
+    file: File,
+    prompt: Option[String] = None,
+    settings: CreateTranscriptionSettings = DefaultSettings.CreateTranscription
+  ): Future[TranscriptResponse]
+
+  /**
+   * Translates audio into English.
+   *
+   * @param file The audio file to translate, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.
+   * @param prompt An optional text to guide the model's style or continue a previous audio segment.
+   *               The prompt should match the audio language.
+   * @param settings
+   * @return translation text
+   *
+   * @see <a href="https://platform.openai.com/docs/api-reference/audio/create">OpenAI Doc</a>
+   */
+  def createAudioTranslation(
+    file: File,
+    prompt: Option[String] = None,
+    settings: CreateTranslationSettings = DefaultSettings.CreateTranslation
+  ): Future[TranscriptResponse]
+
   /**
    * Returns a list of files that belong to the user's organization.
    *
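
Taken together with the defaults defined in `OpenAIServiceConsts`, calling the two new endpoints might look like the sketch below. Obtaining a `service` instance and the audio file names are outside this diff, so treat those parts as assumptions:

```scala
import java.io.File
import scala.concurrent.ExecutionContext.Implicits.global

val service: OpenAIService = ??? // e.g. obtained via the project's service factory

for {
  transcription <- service.createAudioTranscription(new File("speech.mp3"))
  translation   <- service.createAudioTranslation(new File("speech_de.mp3"))
} yield {
  println(transcription.text) // text in the audio's own language
  println(translation.text)   // English translation
}
```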

openai-core/src/main/scala/io/cequence/openaiscala/service/OpenAIServiceConsts.scala

Lines changed: 9 additions & 0 deletions
@@ -49,6 +49,15 @@ trait OpenAIServiceConsts {
     model = ModelId.text_embedding_ada_002
   )
 
+  val CreateTranscription = CreateTranscriptionSettings(
+    model = ModelId.whisper_1,
+    language = Some("en")
+  )
+
+  val CreateTranslation = CreateTranslationSettings(
+    model = ModelId.whisper_1
+  )
+
   val UploadFile = UploadFileSettings(
     purpose = "fine-tune"
   )
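
Note that `CreateTranscription` defaults `language` to `Some("en")`, so transcribing non-English audio would presumably mean overriding that hint; a sketch with a hypothetical file name:

```scala
service.createAudioTranscription(
  new File("speech_fr.mp3"),
  settings = DefaultSettings.CreateTranscription.copy(language = Some("fr"))
)
```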
