Commit 1c71119

Author: Jerjou Cheng (committed)
Clean up & refactor of indefinite speech transcrib
1 parent c6f5f11 commit 1c71119

File tree: 2 files changed (+143, -154 lines)


speech/cloud-client/transcribe_streaming_indefinite.py

Lines changed: 119 additions & 133 deletions
@@ -28,10 +28,13 @@
 # [START import_libraries]
 from __future__ import division
 
+import argparse
 import collections
 import itertools
 import re
 import sys
+import threading
+import time
 
 from google.cloud import speech
 from google.cloud.speech import enums
@@ -40,64 +43,32 @@
 import grpc
 import pyaudio
 from six.moves import queue
+import six
+
+import transcribe_streaming_mic
 # [END import_libraries]
 
-# Audio recording parameters
-RATE = 16000
-CHUNK = int(RATE / 10)  # 100ms
+
+def duration_to_secs(duration):
+    return duration.seconds + (duration.nanos / float(1e9))
 
 
-class MicrophoneStream(object):
+class ResumableMicrophoneStream(transcribe_streaming_mic.MicrophoneStream):
     """Opens a recording stream as a generator yielding the audio chunks."""
     def __init__(self, rate, chunk_size, max_replay_secs=5):
-        self._rate = rate
-        self._chunk_size = chunk_size
+        super(ResumableMicrophoneStream, self).__init__(rate, chunk_size)
         self._max_replay_secs = max_replay_secs
 
-        # Create a thread-safe buffer of audio data
-        self._buff = queue.Queue()
-        self.closed = True
-
-    def __enter__(self):
-        num_channels = 1
-        self._audio_interface = pyaudio.PyAudio()
-        self._audio_stream = self._audio_interface.open(
-            format=pyaudio.paInt16,
-            # The API currently only supports 1-channel (mono) audio
-            # https://goo.gl/z757pE
-            channels=num_channels, rate=self._rate,
-            input=True, frames_per_buffer=self._chunk_size,
-            # Run the audio stream asynchronously to fill the buffer object.
-            # This is necessary so that the input device's buffer doesn't
-            # overflow while the calling thread makes network requests, etc.
-            stream_callback=self._fill_buffer,
-        )
-
-        self.closed = False
+        # Some useful numbers
+        # 2 bytes in 16 bit samples
+        self._bytes_per_sample = 2 * self._num_channels
+        self._bytes_per_second = self._rate * self._bytes_per_sample
 
-        bytes_per_sample = 2 * num_channels  # 2 bytes in 16 bit samples
-        self._bytes_per_second = self._rate * bytes_per_sample
-
-        bytes_per_chunk = (self._chunk_size * bytes_per_sample)
-        chunks_per_second = self._bytes_per_second / bytes_per_chunk
+        self._bytes_per_chunk = (self._chunk_size * self._bytes_per_sample)
+        self._chunks_per_second = (
+            self._bytes_per_second / self._bytes_per_chunk)
         self._untranscribed = collections.deque(
-            maxlen=self._max_replay_secs * chunks_per_second)
-
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self._audio_stream.stop_stream()
-        self._audio_stream.close()
-        self.closed = True
-        # Signal the generator to terminate so that the client's
-        # streaming_recognize method will not block the process termination.
-        self._buff.put(None)
-        self._audio_interface.terminate()
-
-    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
-        """Continuously collect data from the audio stream, into the buffer."""
-        self._buff.put(in_data)
-        return None, pyaudio.paContinue
+            maxlen=self._max_replay_secs * self._chunks_per_second)
 
     def on_transcribe(self, end_time):
         while self._untranscribed and end_time > self._untranscribed[0][1]:
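The body of on_transcribe is cut off at the hunk boundary above. As a rough sketch of the bookkeeping it performs (only the while condition is visible in this diff; the popleft() body is an assumption), the method trims replay-buffer entries whose audio the server has already acknowledged:

    def on_transcribe(self, end_time):
        # self._untranscribed holds (byte_data, chunk_end_time) pairs appended
        # in generator(); drop everything transcribed up to end_time so that a
        # resumed stream replays only unacknowledged audio.
        while self._untranscribed and end_time > self._untranscribed[0][1]:
            self._untranscribed.popleft()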
@@ -106,145 +77,160 @@ def on_transcribe(self, end_time):
     def generator(self, resume=False):
         total_bytes_sent = 0
         if resume:
+            # Make a copy, in case on_transcribe is called while yielding them
+            catchup = list(self._untranscribed)
             # Yield all the untranscribed chunks first
-            for chunk, _ in self._untranscribed:
+            for chunk, _ in catchup:
                 yield chunk
-        while not self.closed:
-            # Use a blocking get() to ensure there's at least one chunk of
-            # data, and stop iteration if the chunk is None, indicating the
-            # end of the audio stream.
-            chunk = self._buff.get()
-            if chunk is None:
-                return
-            data = [chunk]
-
-            # Now consume whatever other data's still buffered.
-            while True:
-                try:
-                    chunk = self._buff.get(block=False)
-                    if chunk is None:
-                        return
-                    data.append(chunk)
-                except queue.Empty:
-                    break
-
-            byte_data = b''.join(data)
 
+        for byte_data in super(ResumableMicrophoneStream, self).generator():
             # Populate the replay buffer of untranscribed audio bytes
             total_bytes_sent += len(byte_data)
             chunk_end_time = total_bytes_sent / self._bytes_per_second
             self._untranscribed.append((byte_data, chunk_end_time))
 
             yield byte_data
-# [END audio_stream]
 
 
-def duration_to_secs(duration):
-    return duration.seconds + (duration.nanos / float(1e9))
+class SimulatedMicrophoneStream(ResumableMicrophoneStream):
+    def __init__(self, audio_src, *args, **kwargs):
+        super(SimulatedMicrophoneStream, self).__init__(*args, **kwargs)
+        self._audio_src = audio_src
 
+    def _delayed(self, get_data):
+        total_bytes_read = 0
+        start_time = time.time()
 
-def listen_print_loop(responses, stream):
-    """Iterates through server responses and prints them.
+        chunk = get_data(self._bytes_per_chunk)
 
-    The responses passed is a generator that will block until a response
-    is provided by the server.
+        while chunk and not self.closed:
+            total_bytes_read += len(chunk)
+            expected_yield_time = start_time + (
+                total_bytes_read / self._bytes_per_second)
+            now = time.time()
+            if expected_yield_time > now:
+                time.sleep(expected_yield_time - now)
 
-    Each response may contain multiple results, and each result may contain
-    multiple alternatives; for details, see https://goo.gl/tjCPAU. Here we
-    print only the transcription for the top alternative of the top result.
+            yield chunk
 
-    In this case, responses are provided for interim results as well. If the
-    response is an interim one, print a line feed at the end of it, to allow
-    the next result to overwrite it, until the response is a final one. For the
-    final one, print a newline to preserve the finalized transcription.
-    """
-    num_chars_printed = 0
-    for response in responses:
-        if not response.results:
-            continue
-
-        # The `results` list is consecutive. For streaming, we only care about
-        # the first result being considered, since once it's `is_final`, it
-        # moves on to considering the next utterance.
-        result = response.results[0]
-        if not result.alternatives:
-            continue
-
-        top_alternative = result.alternatives[0]
-        # Display the transcription of the top alternative.
-        transcript = top_alternative.transcript
-
-        # Display interim results, but with a carriage return at the end of the
-        # line, so subsequent lines will overwrite them.
-        #
-        # If the previous result was longer than this one, we need to print
-        # some extra spaces to overwrite the previous result
-        overwrite_chars = ' ' * (num_chars_printed - len(transcript))
-
-        if not result.is_final:
-            sys.stdout.write(transcript + overwrite_chars + '\r')
-            sys.stdout.flush()
-
-            num_chars_printed = len(transcript)
-        else:
-            print(transcript + overwrite_chars)
-
-            # Exit recognition if any of the transcribed phrases could be
-            # one of our keywords.
-            if re.search(r'\b(exit|quit)\b', transcript, re.I):
-                print('Exiting..')
-                break
+            chunk = get_data(self._bytes_per_chunk)
+
+    def _stream_from_file(self, audio_src):
+        with open(audio_src, 'rb') as f:
+            for chunk in self._delayed(
+                    lambda b_per_chunk: f.read(b_per_chunk)):
+                yield chunk
+
+        # Continue sending silence - 10s worth
+        trailing_silence = six.StringIO(
+            b'\0' * self._bytes_per_second * 10)
+        for chunk in self._delayed(trailing_silence.read):
+            yield chunk
+
+    def _thread(self):
+        for chunk in self._stream_from_file(self._audio_src):
+            self._fill_buffer(chunk)
+        self._fill_buffer(None)
+
+    def __enter__(self):
+        self.closed = False
+
+        threading.Thread(target=self._thread).start()
+
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.closed = True
 
-            num_chars_printed = 0
 
+def _record_keeper(responses, stream):
+    """Calls the stream's on_transcribe callback for each final response.
+
+    Args:
+        responses - a generator of responses. The responses must already be
+            filtered for ones with results and alternatives.
+        stream - a ResumableMicrophoneStream.
+    """
+    for r in responses:
+        result = r.results[0]
+        if result.is_final:
+            top_alternative = result.alternatives[0]
             # Keep track of what transcripts we've received, so we can resume
             # intelligently when we hit the deadline
             stream.on_transcribe(duration_to_secs(
                 top_alternative.words[-1].end_time))
+        yield r
+
 
+def listen_print_loop(responses, stream):
+    """Iterates through server responses and prints them.
 
-def main():
+    Same as in transcribe_streaming_mic, but keeps track of when a sent
+    audio_chunk has been transcribed.
+    """
+    with_results = (r for r in responses if (
+        r.results and r.results[0].alternatives))
+    transcribe_streaming_mic.listen_print_loop(
+        _record_keeper(with_results, stream))
+
+
+def main(sample_rate, audio_src):
     # See http://g.co/cloud/speech/docs/languages
     # for a list of supported languages.
     language_code = 'en-US'  # a BCP-47 language tag
 
     client = speech.SpeechClient()
     config = types.RecognitionConfig(
         encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
-        sample_rate_hertz=RATE,
+        sample_rate_hertz=sample_rate,
         language_code=language_code,
         max_alternatives=1,
         enable_word_time_offsets=True)
     streaming_config = types.StreamingRecognitionConfig(
         config=config,
         interim_results=True)
 
-    with MicrophoneStream(RATE, CHUNK) as stream:
+    if audio_src:
+        mic_manager = SimulatedMicrophoneStream(
+            audio_src, sample_rate, int(sample_rate / 10))
+    else:
+        mic_manager = ResumableMicrophoneStream(
+            sample_rate, int(sample_rate / 10))
+
+    with mic_manager as stream:
         resume = False
         while True:
             audio_generator = stream.generator(resume=resume)
             requests = (types.StreamingRecognizeRequest(audio_content=content)
                         for content in audio_generator)
 
-            responses = client.streaming_recognize(
-                streaming_config, requests,
-                options=gax.CallOptions(timeout=(60 * 4)))
+            responses = client.streaming_recognize(streaming_config, requests)
 
             try:
                 # Now, put the transcription responses to use.
                 listen_print_loop(responses, stream)
                 break
-            except grpc.RpcError, e:  # TODO: wrong exception
-                if e.code() != grpc.StatusCode.INVALID_ARGUMENT:
+            except grpc.RpcError, e:
+                if e.code() not in (grpc.StatusCode.INVALID_ARGUMENT,
+                                    grpc.StatusCode.OUT_OF_RANGE):
                     raise
-
                 details = e.details()
-                if 'deadline too short' not in details:
-                    raise
+                if e.code() == grpc.StatusCode.INVALID_ARGUMENT:
+                    if 'deadline too short' not in details:
+                        raise
+                else:
+                    if 'maximum allowed stream duration' not in details:
+                        raise
 
             print('Resuming..')
             resume = True
 
 
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('--rate', default=16000, help='Sample rate.', type=int)
+    parser.add_argument('--audio_src', help='File to simulate streaming of.')
+    args = parser.parse_args()
+    main(args.rate, args.audio_src)
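Based on the argparse flags added at the end of the file, the refactored sample would presumably be driven along these lines; the audio path is a hypothetical placeholder, the --audio_src file is expected to be raw LINEAR16 audio matching --rate, and transcribe_streaming_mic.py must be importable from the same directory:

    # Hypothetical usage sketch (not part of this commit)
    #
    #   python transcribe_streaming_indefinite.py                 # live mic, 16000 Hz default
    #   python transcribe_streaming_indefinite.py --rate 8000 --audio_src /path/to/audio.raw
    #
    # or programmatically, reusing main() from this module:
    import transcribe_streaming_indefinite

    # audio_src=None selects ResumableMicrophoneStream (live microphone);
    # a file path selects SimulatedMicrophoneStream instead.
    transcribe_streaming_indefinite.main(16000, None)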
