Draft 1: indefinitely-long streaming transcription

Jerjou Cheng · Jerjou Cheng · commit c6f5f1137747 · 2018-05-31T10:28:35.000-07:00
diff --git a/speech/cloud-client/transcribe_streaming_indefinite.py b/speech/cloud-client/transcribe_streaming_indefinite.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python
+
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Google Cloud Speech API sample application using the streaming API.
+
+NOTE: This module requires the additional dependency `pyaudio`. To install
+using pip:
+
+    pip install pyaudio
+
+Example usage:
+    python transcribe_streaming_indefinite.py
+"""
+
+# [START import_libraries]
+from __future__ import division
+
+import collections
+import itertools
+import re
+import sys
+
+from google.cloud import speech
+from google.cloud.speech import enums
+from google.cloud.speech import types
+from google import gax
+import grpc
+import pyaudio
+from six.moves import queue
+# [END import_libraries]
+
+# Audio recording parameters
+RATE = 16000  # sample rate in Hz (also sent as sample_rate_hertz)
+CHUNK = int(RATE / 10)  # frames per buffer: 100ms of audio
+
+
+class MicrophoneStream(object):
+    """Opens a recording stream as a generator yielding the audio chunks.
+
+    A bounded buffer sized from `max_replay_secs` keeps sent audio for replay.
+    """
+    def __init__(self, rate, chunk_size, max_replay_secs=5):
+        self._rate = rate
+        self._chunk_size = chunk_size
+        self._max_replay_secs = max_replay_secs
+        # Create a thread-safe buffer of audio data
+        self._buff = queue.Queue()
+        self.closed = True
+
+    def __enter__(self):
+        num_channels = 1
+        self._audio_interface = pyaudio.PyAudio()
+        self._audio_stream = self._audio_interface.open(
+            format=pyaudio.paInt16,
+            # The API currently only supports mono audio: https://goo.gl/z757pE
+            channels=num_channels, rate=self._rate,
+            input=True, frames_per_buffer=self._chunk_size,
+            # Run the stream asynchronously so the input device's buffer doesn't
+            # overflow while the calling thread makes network requests, etc.
+            stream_callback=self._fill_buffer,
+        )
+        self.closed = False
+        bytes_per_sample = 2 * num_channels  # 2 bytes in 16 bit samples
+        self._bytes_per_second = self._rate * bytes_per_sample
+        bytes_per_chunk = self._chunk_size * bytes_per_sample
+        chunks_per_second = self._bytes_per_second / bytes_per_chunk
+        # deque needs an integer maxlen, but `/` is true division here.
+        self._untranscribed = collections.deque(
+            maxlen=int(self._max_replay_secs * chunks_per_second))
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self._audio_stream.stop_stream()
+        self._audio_stream.close()
+        self.closed = True
+        # Signal the generator to terminate so that the client's
+        # streaming_recognize method will not block the process termination.
+        self._buff.put(None)
+        self._audio_interface.terminate()
+
+    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
+        """Continuously collect data from the audio stream, into the buffer."""
+        self._buff.put(in_data)
+        return None, pyaudio.paContinue
+
+    def on_transcribe(self, end_time):
+        """Drop replay chunks that end before `end_time` (in seconds)."""
+        while self._untranscribed and end_time > self._untranscribed[0][1]:
+            self._untranscribed.popleft()
+
+    def generator(self, resume=False):
+        """Yield audio bytes; if `resume`, replay untranscribed audio first."""
+        total_bytes_sent = 0
+        if resume:
+            # total_bytes_sent restarts at 0, so re-stamp replayed chunks; the
+            # snapshot avoids iterating the deque while on_transcribe() pops.
+            replay = [chunk for chunk, _ in self._untranscribed]
+            self._untranscribed.clear()
+            for chunk in replay:
+                total_bytes_sent += len(chunk)
+                self._untranscribed.append(
+                    (chunk, total_bytes_sent / self._bytes_per_second))
+            for chunk in replay:
+                yield chunk
+        while not self.closed:
+            # Block until at least one chunk arrives; None marks end of audio.
+            chunk = self._buff.get()
+            if chunk is None:
+                return
+            data = [chunk]
+            # Now consume whatever other data's still buffered.
+            while True:
+                try:
+                    chunk = self._buff.get(block=False)
+                    if chunk is None:
+                        return
+                    data.append(chunk)
+                except queue.Empty:
+                    break
+            byte_data = b''.join(data)
+            # Populate the replay buffer of untranscribed audio bytes
+            total_bytes_sent += len(byte_data)
+            chunk_end_time = total_bytes_sent / self._bytes_per_second
+            self._untranscribed.append((byte_data, chunk_end_time))
+            yield byte_data
+# [END audio_stream]
+
+
+def duration_to_secs(duration):
+    return duration.nanos / 1e9 + duration.seconds  # fractional + whole secs
+
+
+def listen_print_loop(responses, stream):
+    """Iterates through server responses and prints them.
+
+    The responses passed is a generator that will block until a response
+    is provided by the server.
+
+    Each response may contain multiple results, and each result may contain
+    multiple alternatives; for details, see https://goo.gl/tjCPAU.  Here we
+    print only the transcription for the top alternative of the top result.
+
+    In this case, responses are provided for interim results as well. If the
+    response is an interim one, print a carriage return at the end of it, to
+    allow the next result to overwrite it, until the response is a final one.
+    For the final one, print a newline to preserve the finalized transcription
+    and report its last word's end time (if any) to `stream.on_transcribe()`.
+    """
+    num_chars_printed = 0
+    for response in responses:
+        if not response.results:
+            continue
+
+        # The `results` list is consecutive. For streaming, we only care about
+        # the first result being considered, since once it's `is_final`, it
+        # moves on to considering the next utterance.
+        result = response.results[0]
+        if not result.alternatives:
+            continue
+
+        top_alternative = result.alternatives[0]
+        # Display the transcription of the top alternative.
+        transcript = top_alternative.transcript
+
+        # Display interim results, but with a carriage return at the end of the
+        # line, so subsequent lines will overwrite them.
+        #
+        # If the previous result was longer than this one, we need to print
+        # some extra spaces to overwrite the previous result
+        overwrite_chars = ' ' * (num_chars_printed - len(transcript))
+
+        if not result.is_final:
+            sys.stdout.write(transcript + overwrite_chars + '\r')
+            sys.stdout.flush()
+            num_chars_printed = len(transcript)
+        else:
+            print(transcript + overwrite_chars)
+            # Exit recognition if any of the transcribed phrases could be
+            # one of our keywords.
+            if re.search(r'\b(exit|quit)\b', transcript, re.I):
+                print('Exiting..')
+                break
+
+            num_chars_printed = 0
+            # Keep track of what transcripts we've received, so we can resume
+            # intelligently when we hit the deadline. Results without word
+            # timings are skipped: `words[-1]` would raise IndexError.
+            if top_alternative.words:
+                stream.on_transcribe(duration_to_secs(
+                        top_alternative.words[-1].end_time))
+
+
+def main():
+    """Transcribe microphone audio, resuming the stream on deadline errors."""
+    # See http://g.co/cloud/speech/docs/languages for supported languages.
+    language_code = 'en-US'  # a BCP-47 language tag
+
+    client = speech.SpeechClient()
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=RATE,
+        language_code=language_code,
+        max_alternatives=1,
+        enable_word_time_offsets=True)
+    streaming_config = types.StreamingRecognitionConfig(
+        config=config,
+        interim_results=True)
+
+    with MicrophoneStream(RATE, CHUNK) as stream:
+        resume = False
+        while True:
+            audio_generator = stream.generator(resume=resume)
+            requests = (types.StreamingRecognizeRequest(audio_content=content)
+                        for content in audio_generator)
+
+            responses = client.streaming_recognize(
+                    streaming_config, requests,
+                    options=gax.CallOptions(timeout=(60 * 4)))
+
+            try:
+                # Now, put the transcription responses to use.
+                listen_print_loop(responses, stream)
+                break
+            except grpc.RpcError as e:  # TODO: wrong exception
+                if e.code() != grpc.StatusCode.INVALID_ARGUMENT:
+                    raise
+
+                details = e.details()
+                if 'deadline too short' not in details:
+                    raise
+
+                print('Resuming..')
+                resume = True
+
+
+if __name__ == '__main__':  # run the sample only when executed as a script
+    main()