
Commit e073e9b

nnegreychingor13
authored and committed
samples: Add video v1p1beta samples for face detection and video transcription (#1070)
* Add video v1p1beta samples for face detection and video transcription
* Update based on feedback
* Clean up READMEs
* Add timeout for tests
1 parent 254882f commit e073e9b

2 files changed: 386 additions & 0 deletions
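
For orientation, the sketch below (not part of the commit) shows how the three new samples could be driven programmatically. The driver class name and the gs:// path are hypothetical placeholders, and it assumes application default credentials with access to the Video Intelligence API.

package com.example.video;

// Hypothetical driver class (placeholder name), not part of this commit.
public class DetectUsageSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder URI: substitute a Cloud Storage video you can read.
    String gcsUri = "gs://your-bucket/your-video.mp4";

    Detect.analyzeFacesBoundingBoxes(gcsUri);  // face bounding boxes
    Detect.analyzeFaceEmotions(gcsUri);        // per-frame face emotions
    Detect.speechTranscription(gcsUri);        // speech transcription
  }
}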
Detect.java (new file): 308 additions & 0 deletions
/*
 * Copyright 2018 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.video;

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.videointelligence.v1p1beta1.AnnotateVideoProgress;
import com.google.cloud.videointelligence.v1p1beta1.AnnotateVideoRequest;
import com.google.cloud.videointelligence.v1p1beta1.AnnotateVideoResponse;
import com.google.cloud.videointelligence.v1p1beta1.EmotionAttribute;
import com.google.cloud.videointelligence.v1p1beta1.FaceConfig;
import com.google.cloud.videointelligence.v1p1beta1.FaceDetectionAnnotation;
import com.google.cloud.videointelligence.v1p1beta1.FaceDetectionFrame;
import com.google.cloud.videointelligence.v1p1beta1.FaceSegment;
import com.google.cloud.videointelligence.v1p1beta1.Feature;
import com.google.cloud.videointelligence.v1p1beta1.NormalizedBoundingBox;
import com.google.cloud.videointelligence.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.videointelligence.v1p1beta1.SpeechTranscription;
import com.google.cloud.videointelligence.v1p1beta1.SpeechTranscriptionConfig;
import com.google.cloud.videointelligence.v1p1beta1.VideoAnnotationResults;
import com.google.cloud.videointelligence.v1p1beta1.VideoContext;
import com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient;
import com.google.cloud.videointelligence.v1p1beta1.WordInfo;
import java.io.IOException;
import java.util.concurrent.TimeUnit;

public class Detect {
  /**
   * Detects faces' bounding boxes and emotions, and transcribes speech in a video, using the
   * Video Intelligence API.
   *
   * @param args specifies features to detect and the path to the video on Google Cloud Storage.
   */
  public static void main(String[] args) {
    try {
      argsHelper(args);
    } catch (Exception e) {
      System.out.println("Exception while running:\n" + e.getMessage() + "\n");
      e.printStackTrace(System.out);
    }
  }

  /**
   * Helper that handles the input passed to the program.
   *
   * @param args specifies features to detect and the path to the video on Google Cloud Storage.
   * @throws IOException on Input/Output errors.
   */
  public static void argsHelper(String[] args) throws Exception {
    if (args.length < 1) {
      System.out.println("Usage:");
      System.out.printf(
          "\tjava %s \"<command>\" \"<path-to-video>\"\n"
              + "Commands:\n"
              + "\tfaces-bounding-boxes | faces-emotions | speech-transcription\n"
              + "Path:\n\tA URI for a Cloud Storage resource (gs://...)\n"
              + "Examples: ",
          Detect.class.getCanonicalName());
      return;
    }
    String command = args[0];
    String path = args.length > 1 ? args[1] : "";

    if (command.equals("faces-bounding-boxes")) {
      analyzeFacesBoundingBoxes(path);
    }
    if (command.equals("faces-emotions")) {
      analyzeFaceEmotions(path);
    }
    if (command.equals("speech-transcription")) {
      speechTranscription(path);
    }
  }

  // [START video_face_bounding_boxes]
  /**
   * Detects faces' bounding boxes on the video at the provided Cloud Storage path.
   *
   * @param gcsUri the path to the video file to analyze.
   */
  public static void analyzeFacesBoundingBoxes(String gcsUri) throws Exception {
    // Instantiate a com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient
    try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
      // Set the configuration to include bounding boxes
      FaceConfig config = FaceConfig.newBuilder()
          .setIncludeBoundingBoxes(true)
          .build();

      // Set the video context with the above configuration
      VideoContext context = VideoContext.newBuilder()
          .setFaceDetectionConfig(config)
          .build();

      // Create the request
      AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.FACE_DETECTION)
          .setVideoContext(context)
          .build();

      // asynchronously perform facial analysis on videos
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      boolean faceFound = false;
      // Display the results
      for (VideoAnnotationResults results : response.get(900, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
        int faceCount = 0;
        // Display the results for each face
        for (FaceDetectionAnnotation faceAnnotation : results.getFaceDetectionAnnotationsList()) {
          faceFound = true;
          System.out.println("\nFace: " + ++faceCount);
          // Each FaceDetectionAnnotation has only one segment.
          for (FaceSegment segment : faceAnnotation.getSegmentsList()) {
            double startTime = segment.getSegment().getStartTimeOffset().getSeconds()
                + segment.getSegment().getStartTimeOffset().getNanos() / 1e9;
            double endTime = segment.getSegment().getEndTimeOffset().getSeconds()
                + segment.getSegment().getEndTimeOffset().getNanos() / 1e9;
            System.out.printf("Segment location: %.3fs to %.3fs\n", startTime, endTime);
          }
          // There are typically many frames for each face.
          try {
            // Here we process only the first frame.
            if (faceAnnotation.getFramesCount() > 0) {
              FaceDetectionFrame frame = faceAnnotation.getFrames(0); // get the first frame
              double timeOffset = frame.getTimeOffset().getSeconds()
                  + frame.getTimeOffset().getNanos() / 1e9;
              System.out.printf("First frame time offset: %.3fs\n", timeOffset);
              // print info on the first normalized bounding box
              NormalizedBoundingBox box = frame.getAttributes(0).getNormalizedBoundingBox();
              System.out.printf("\tLeft: %.3f\n", box.getLeft());
              System.out.printf("\tTop: %.3f\n", box.getTop());
              System.out.printf("\tBottom: %.3f\n", box.getBottom());
              System.out.printf("\tRight: %.3f\n", box.getRight());
            } else {
              System.out.println("No frames found in annotation");
            }
          } catch (IndexOutOfBoundsException ioe) {
            System.out.println("Could not retrieve frame: " + ioe.getMessage());
          }
        }
      }

      if (!faceFound) {
        System.out.println("No faces detected in " + gcsUri);
      }
    }
  }
  // [END video_face_bounding_boxes]

  // [START video_face_emotions]
  /**
   * Analyzes faces' emotions over frames on the video at the provided Cloud Storage path.
   *
   * @param gcsUri the path to the video file to analyze.
   */
  public static void analyzeFaceEmotions(String gcsUri) throws Exception {
    // Instantiate a com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient
    try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
      // Set the configuration to include emotions
      FaceConfig config = FaceConfig.newBuilder()
          .setIncludeEmotions(true)
          .build();

      // Set the video context with the above configuration
      VideoContext context = VideoContext.newBuilder()
          .setFaceDetectionConfig(config)
          .build();

      // Create the request
      AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.FACE_DETECTION)
          .setVideoContext(context)
          .build();

      // asynchronously perform facial analysis on videos
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      boolean faceFound = false;
      // Display the results
      for (VideoAnnotationResults results : response.get(600, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
        int faceCount = 0;
        // Display the results for each face
        for (FaceDetectionAnnotation faceAnnotation : results.getFaceDetectionAnnotationsList()) {
          faceFound = true;
          System.out.println("\nFace: " + ++faceCount);
          // Each FaceDetectionAnnotation has only one segment.
          for (FaceSegment segment : faceAnnotation.getSegmentsList()) {
            double startTime = segment.getSegment().getStartTimeOffset().getSeconds()
                + segment.getSegment().getStartTimeOffset().getNanos() / 1e9;
            double endTime = segment.getSegment().getEndTimeOffset().getSeconds()
                + segment.getSegment().getEndTimeOffset().getNanos() / 1e9;
            System.out.printf("Segment location: %.3fs to %.3fs\n", startTime, endTime);
          }

          try {
            // Print each frame's highest emotion
            for (FaceDetectionFrame frame : faceAnnotation.getFramesList()) {
              double timeOffset = frame.getTimeOffset().getSeconds()
                  + frame.getTimeOffset().getNanos() / 1e9;
              float highestScore = 0.0f;
              String emotion = "";
              // Get the highest scoring emotion for the current frame
              for (EmotionAttribute emotionAttribute : frame.getAttributes(0).getEmotionsList()) {
                if (emotionAttribute.getScore() > highestScore) {
                  highestScore = emotionAttribute.getScore();
                  emotion = emotionAttribute.getEmotion().name();
                }
              }
              System.out.printf("\t%4.2fs: %14s %4.3f\n", timeOffset, emotion, highestScore);
            }

          } catch (IndexOutOfBoundsException ioe) {
            System.out.println("Could not retrieve frame: " + ioe.getMessage());
          }
        }
      }

      if (!faceFound) {
        System.out.println("No faces detected in " + gcsUri);
      }
    }
  }
  // [END video_face_emotions]

  // [START video_speech_transcription]
  /**
   * Transcribes speech from a video stored on GCS.
   *
   * @param gcsUri the path to the video file to analyze.
   */
  public static void speechTranscription(String gcsUri) throws Exception {
    // Instantiate a com.google.cloud.videointelligence.v1p1beta1.VideoIntelligenceServiceClient
    try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
      // Set the language code
      SpeechTranscriptionConfig config = SpeechTranscriptionConfig.newBuilder()
          .setLanguageCode("en-US")
          .build();

      // Set the video context with the above configuration
      VideoContext context = VideoContext.newBuilder()
          .setSpeechTranscriptionConfig(config)
          .build();

      // Create the request
      AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.SPEECH_TRANSCRIPTION)
          .setVideoContext(context)
          .build();

      // asynchronously perform speech transcription on videos
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      // Display the results
      for (VideoAnnotationResults results : response.get(180, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
        for (SpeechTranscription speechTranscription : results.getSpeechTranscriptionsList()) {
          try {
            // Print the transcription
            if (speechTranscription.getAlternativesCount() > 0) {
              SpeechRecognitionAlternative alternative = speechTranscription.getAlternatives(0);

              System.out.printf("Transcript: %s\n", alternative.getTranscript());
              System.out.printf("Confidence: %.2f\n", alternative.getConfidence());

              System.out.println("Word level information:");
              for (WordInfo wordInfo : alternative.getWordsList()) {
                double startTime = wordInfo.getStartTime().getSeconds()
                    + wordInfo.getStartTime().getNanos() / 1e9;
                double endTime = wordInfo.getEndTime().getSeconds()
                    + wordInfo.getEndTime().getNanos() / 1e9;
                System.out.printf("\t%4.2fs - %4.2fs: %s\n",
                    startTime, endTime, wordInfo.getWord());
              }
            } else {
              System.out.println("No transcription found");
            }
          } catch (IndexOutOfBoundsException ioe) {
            System.out.println("Could not retrieve transcription: " + ioe.getMessage());
          }
        }
      }
    }
  }
  // [END video_speech_transcription]
}
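
Side note: each timestamp above is computed as getSeconds() plus getNanos() / 1e9 on a protobuf time offset. If that repeated conversion were ever factored out, it could look like the small helper sketched below. This is illustrative only, assumes the time-offset getters return com.google.protobuf.Duration, and is not part of the commit.

import com.google.protobuf.Duration;

// Illustrative helper (not in this commit): the repeated seconds + nanos / 1e9 conversion.
final class TimeOffsets {
  private TimeOffsets() {}

  static double toSeconds(Duration offset) {
    return offset.getSeconds() + offset.getNanos() / 1e9;
  }
}

// Example use (hypothetical):
//   double startTime = TimeOffsets.toSeconds(segment.getSegment().getStartTimeOffset());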
DetectIT.java (new file): 78 additions & 0 deletions
/*
 * Copyright 2018 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.video;

import static com.google.common.truth.Truth.assertThat;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/** Tests for video analysis sample. */
@RunWith(JUnit4.class)
@SuppressWarnings("checkstyle:abbreviationaswordinname")
public class DetectIT {

  private ByteArrayOutputStream bout;
  private PrintStream out;

  static final String FACES_FILE_LOCATION =
      "gs://java-docs-samples-testing/video/googlework_short.mp4";

  @Before
  public void setUp() {
    bout = new ByteArrayOutputStream();
    out = new PrintStream(bout);
    System.setOut(out);
  }

  @After
  public void tearDown() {
    System.setOut(null);
  }

  @Test
  public void testFacesBoundingBoxes() throws Exception {
    String[] args = {"faces-bounding-boxes", FACES_FILE_LOCATION};
    Detect.argsHelper(args);
    String got = bout.toString();

    assertThat(got).contains("Top:");
  }

  @Test
  public void testFacesEmotions() throws Exception {
    String[] args = {"faces-emotions", FACES_FILE_LOCATION};
    Detect.argsHelper(args);
    String got = bout.toString();

    assertThat(got).contains("CONCENTRATION");
  }

  @Test
  public void testSpeechTranscription() throws Exception {
    String[] args = {"speech-transcription", FACES_FILE_LOCATION};
    Detect.argsHelper(args);
    String got = bout.toString();

    assertThat(got).contains("cultural");
  }
}
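
One note on the fixture: tearDown() sets System.out to null, so anything that prints to standard out later in the same JVM would throw a NullPointerException. A common alternative, sketched below with a hypothetical class name and not part of this commit, is to remember and restore the original stream.

package com.example.video;

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import org.junit.After;
import org.junit.Before;

// Sketch only (hypothetical, not in this commit): restore System.out instead of nulling it.
public class RestoreStdoutFixtureSketch {

  private PrintStream originalOut;
  private ByteArrayOutputStream bout;

  @Before
  public void setUp() {
    originalOut = System.out;              // remember the real stream
    bout = new ByteArrayOutputStream();
    System.setOut(new PrintStream(bout));  // capture output for assertions
  }

  @After
  public void tearDown() {
    System.setOut(originalOut);            // restore rather than set to null
  }
}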
