/*
 * Copyright 2018 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.video;

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.videointelligence.v1p1beta1.AnnotateVideoProgress;
import com.google.cloud.videointelligence.v1p1beta1.AnnotateVideoRequest;
import com.google.cloud.videointelligence.v1p1beta1.AnnotateVideoResponse;
import com.google.cloud.videointelligence.v1p1beta1.EmotionAttribute;
import com.google.cloud.videointelligence.v1p1beta1.FaceConfig;
import com.google.cloud.videointelligence.v1p1beta1.FaceDetectionAnnotation;
import com.google.cloud.videointelligence.v1p1beta1.FaceDetectionFrame;
import com.google.cloud.videointelligence.v1p1beta1.FaceSegment;
import com.google.cloud.videointelligence.v1p1beta1.Feature;
import com.google.cloud.videointelligence.v1p1beta1.NormalizedBoundingBox;
import com.google.cloud.videointelligence.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.videointelligence.v1p1beta1.SpeechTranscription;
import com.google.cloud.videointelligence.v1p1beta1.SpeechTranscriptionConfig;
import com.google.cloud.videointelligence.v1p1beta1.VideoAnnotationResults;
import com.google.cloud.videointelligence.v1p1beta1.VideoContext;
import com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient;
import com.google.cloud.videointelligence.v1p1beta1.WordInfo;
import java.util.concurrent.TimeUnit;

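/**
 * Demonstrates face detection (bounding boxes and emotions) and speech transcription
 * using the beta (v1p1beta1) Video Intelligence API.
 *
 * <p>Note: the clients created below authenticate with Application Default Credentials;
 * point GOOGLE_APPLICATION_CREDENTIALS at a service account key file if credentials are
 * not already configured in the environment.
 */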
public class Detect {
  /**
   * Detects faces' bounding boxes, face emotions, and speech transcriptions using the Video
   * Intelligence API.
   *
   * @param args specifies features to detect and the path to the video on Google Cloud Storage.
   */
  public static void main(String[] args) {
    try {
      argsHelper(args);
    } catch (Exception e) {
      System.out.println("Exception while running:\n" + e.getMessage() + "\n");
      e.printStackTrace(System.out);
    }
  }

  /**
   * Helper that handles the input passed to the program.
   *
   * @param args specifies features to detect and the path to the video on Google Cloud Storage.
   * @throws Exception on input/output or API errors.
   */
  public static void argsHelper(String[] args) throws Exception {
    if (args.length < 1) {
      System.out.println("Usage:");
      System.out.printf(
          "\tjava %1$s \"<command>\" \"<path-to-video>\"\n"
              + "Commands:\n"
              + "\tfaces-bounding-boxes | faces-emotions | speech-transcription\n"
              + "Path:\n\tA URI for a Cloud Storage resource (gs://...)\n"
              + "Examples:\n"
              + "\tjava %1$s faces-bounding-boxes gs://YOUR_BUCKET/YOUR_VIDEO.mp4\n",
          Detect.class.getCanonicalName());
      return;
    }
    String command = args[0];
    String path = args.length > 1 ? args[1] : "";

    if (command.equals("faces-bounding-boxes")) {
      analyzeFacesBoundingBoxes(path);
    } else if (command.equals("faces-emotions")) {
      analyzeFaceEmotions(path);
    } else if (command.equals("speech-transcription")) {
      speechTranscription(path);
    }
  }
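
  // The samples below can also be called programmatically; the URI here is a placeholder:
  //   Detect.analyzeFacesBoundingBoxes("gs://YOUR_BUCKET/YOUR_VIDEO.mp4");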

  // [START video_face_bounding_boxes]
  /**
   * Detects faces' bounding boxes in the video at the provided Cloud Storage path.
   *
   * @param gcsUri the path to the video file to analyze.
   * @throws Exception on input/output or API errors.
   */
  public static void analyzeFacesBoundingBoxes(String gcsUri) throws Exception {
    // Instantiate a com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient
    try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
      // Set the configuration to include bounding boxes
      FaceConfig config = FaceConfig.newBuilder()
          .setIncludeBoundingBoxes(true)
          .build();

      // Set the video context with the above configuration
      VideoContext context = VideoContext.newBuilder()
          .setFaceDetectionConfig(config)
          .build();

      // Create the request
      AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.FACE_DETECTION)
          .setVideoContext(context)
          .build();

      // Asynchronously perform face detection on the video
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      boolean faceFound = false;
      // Display the results, waiting up to 15 minutes for the operation to finish
      for (VideoAnnotationResults results : response.get(900, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
        int faceCount = 0;
        // Display the results for each face
        for (FaceDetectionAnnotation faceAnnotation : results.getFaceDetectionAnnotationsList()) {
          faceFound = true;
          System.out.println("\nFace: " + ++faceCount);
          // Each FaceDetectionAnnotation has only one segment.
          for (FaceSegment segment : faceAnnotation.getSegmentsList()) {
            // A time offset is a Duration: whole seconds plus a nanosecond remainder.
            double startTime = segment.getSegment().getStartTimeOffset().getSeconds()
                + segment.getSegment().getStartTimeOffset().getNanos() / 1e9;
            double endTime = segment.getSegment().getEndTimeOffset().getSeconds()
                + segment.getSegment().getEndTimeOffset().getNanos() / 1e9;
            System.out.printf("Segment location: %.3fs to %.3fs\n", startTime, endTime);
          }
          // There are typically many frames for each face; here we process only the first.
          try {
            if (faceAnnotation.getFramesCount() > 0) {
              FaceDetectionFrame frame = faceAnnotation.getFrames(0); // get the first frame
              double timeOffset = frame.getTimeOffset().getSeconds()
                  + frame.getTimeOffset().getNanos() / 1e9;
              System.out.printf("First frame time offset: %.3fs\n", timeOffset);
              // Print the first normalized bounding box; coordinates are relative to the
              // frame size, in the range [0, 1].
              NormalizedBoundingBox box = frame.getAttributes(0).getNormalizedBoundingBox();
              System.out.printf("\tLeft: %.3f\n", box.getLeft());
              System.out.printf("\tTop: %.3f\n", box.getTop());
              System.out.printf("\tBottom: %.3f\n", box.getBottom());
              System.out.printf("\tRight: %.3f\n", box.getRight());
            } else {
              System.out.println("No frames found in annotation");
            }
          } catch (IndexOutOfBoundsException ioe) {
            System.out.println("Could not retrieve frame: " + ioe.getMessage());
          }
        }
      }

      if (!faceFound) {
        System.out.println("No faces detected in " + gcsUri);
      }
    }
  }
  // [END video_face_bounding_boxes]

  // [START video_face_emotions]
  /**
   * Analyzes face emotions over frames in the video at the provided Cloud Storage path.
   *
   * @param gcsUri the path to the video file to analyze.
   * @throws Exception on input/output or API errors.
   */
  public static void analyzeFaceEmotions(String gcsUri) throws Exception {
    // Instantiate a com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient
    try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
      // Set the configuration to include emotions
      FaceConfig config = FaceConfig.newBuilder()
          .setIncludeEmotions(true)
          .build();

      // Set the video context with the above configuration
      VideoContext context = VideoContext.newBuilder()
          .setFaceDetectionConfig(config)
          .build();

      // Create the request
      AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.FACE_DETECTION)
          .setVideoContext(context)
          .build();

      // Asynchronously perform facial analysis on the video
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      boolean faceFound = false;
      // Display the results, waiting up to 10 minutes for the operation to finish
      for (VideoAnnotationResults results : response.get(600, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
        int faceCount = 0;
        // Display the results for each face
        for (FaceDetectionAnnotation faceAnnotation : results.getFaceDetectionAnnotationsList()) {
          faceFound = true;
          System.out.println("\nFace: " + ++faceCount);
          // Each FaceDetectionAnnotation has only one segment.
          for (FaceSegment segment : faceAnnotation.getSegmentsList()) {
            // A time offset is a Duration: whole seconds plus a nanosecond remainder.
            double startTime = segment.getSegment().getStartTimeOffset().getSeconds()
                + segment.getSegment().getStartTimeOffset().getNanos() / 1e9;
            double endTime = segment.getSegment().getEndTimeOffset().getSeconds()
                + segment.getSegment().getEndTimeOffset().getNanos() / 1e9;
            System.out.printf("Segment location: %.3fs to %.3fs\n", startTime, endTime);
          }

          try {
            // Print each frame's highest-scoring emotion
            for (FaceDetectionFrame frame : faceAnnotation.getFramesList()) {
              double timeOffset = frame.getTimeOffset().getSeconds()
                  + frame.getTimeOffset().getNanos() / 1e9;
              float highestScore = 0.0f;
              String emotion = "";
              // Get the highest-scoring emotion for the current frame
              for (EmotionAttribute emotionAttribute : frame.getAttributes(0).getEmotionsList()) {
                if (emotionAttribute.getScore() > highestScore) {
                  highestScore = emotionAttribute.getScore();
                  emotion = emotionAttribute.getEmotion().name();
                }
              }
              System.out.printf("\t%4.2fs: %14s %4.3f\n", timeOffset, emotion, highestScore);
            }
          } catch (IndexOutOfBoundsException ioe) {
            System.out.println("Could not retrieve frame: " + ioe.getMessage());
          }
        }
      }

      if (!faceFound) {
        System.out.println("No faces detected in " + gcsUri);
      }
    }
  }
  // [END video_face_emotions]

  // [START video_speech_transcription]
  /**
   * Transcribes speech from the video at the provided Cloud Storage path.
   *
   * @param gcsUri the path to the video file to analyze.
   * @throws Exception on input/output or API errors.
   */
  public static void speechTranscription(String gcsUri) throws Exception {
    // Instantiate a com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient
    try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
      // Set the language code
      SpeechTranscriptionConfig config = SpeechTranscriptionConfig.newBuilder()
          .setLanguageCode("en-US")
          .build();

      // Set the video context with the above configuration
      VideoContext context = VideoContext.newBuilder()
          .setSpeechTranscriptionConfig(config)
          .build();

      // Create the request
      AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.SPEECH_TRANSCRIPTION)
          .setVideoContext(context)
          .build();

      // Asynchronously perform speech transcription on the video
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      // Display the results, waiting up to 3 minutes for the operation to finish
      for (VideoAnnotationResults results : response.get(180, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
        for (SpeechTranscription speechTranscription : results.getSpeechTranscriptionsList()) {
          try {
            // Print the transcription
            if (speechTranscription.getAlternativesCount() > 0) {
              SpeechRecognitionAlternative alternative = speechTranscription.getAlternatives(0);

              System.out.printf("Transcript: %s\n", alternative.getTranscript());
              System.out.printf("Confidence: %.2f\n", alternative.getConfidence());

              System.out.println("Word level information:");
              for (WordInfo wordInfo : alternative.getWordsList()) {
                double startTime = wordInfo.getStartTime().getSeconds()
                    + wordInfo.getStartTime().getNanos() / 1e9;
                double endTime = wordInfo.getEndTime().getSeconds()
                    + wordInfo.getEndTime().getNanos() / 1e9;
                System.out.printf("\t%4.2fs - %4.2fs: %s\n",
                    startTime, endTime, wordInfo.getWord());
              }
            } else {
              System.out.println("No transcription found");
            }
          } catch (IndexOutOfBoundsException ioe) {
            System.out.println("Could not retrieve transcription: " + ioe.getMessage());
          }
        }
      }
    }
  }
  // [END video_speech_transcription]
}