Skip to content

Commit dcd119a

Browse files
blechdomdandhlee
authored and committed
adding infinite streaming sample to samples folder, unlike indefinite… [(#2161)](#2161)
* adding infinite streaming sample to samples folder, unlike indefinite streaming, this sample uses result_end_time to calculate unfinalized audio, and to resend it to maintain context across into the next streaming request * fixed some travis ci lint issues * fixed some travis ci lint issues * fixed some travis ci lint issues * fixed some travis ci lint issues
1 parent 9e41834 commit dcd119a

File tree

1 file changed

+294
-0
lines changed

1 file changed

+294
-0
lines changed
Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2019 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""Google Cloud Speech API sample application using the streaming API.
18+
19+
NOTE: This module requires the dependencies `pyaudio` and `termcolor`.
20+
To install using pip:
21+
22+
pip install pyaudio
23+
pip install termcolor
24+
25+
Example usage:
26+
python transcribe_streaming_infinite.py
27+
"""
28+
29+
# [START speech_transcribe_infinite_streaming]
30+
31+
import time
32+
import re
33+
import sys
34+
35+
# uses result_end_time currently only available in v1p1beta, will be in v1 soon
36+
from google.cloud import speech_v1p1beta1 as speech
37+
import pyaudio
38+
from six.moves import queue
39+
40+
# Audio recording parameters
STREAMING_LIMIT = 10000  # ms; each streaming request is restarted after this
# NOTE(review): 10 s is a very short restart interval; consider raising it
# (the API itself allows much longer streams) — confirm intended value.
SAMPLE_RATE = 16000  # Hz; passed to RecognitionConfig.sample_rate_hertz in main()
CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms of audio per buffered chunk

# ANSI terminal color codes used for transcript output:
# RED = interim result, GREEN = final result, YELLOW = status messages.
RED = '\033[0;31m'
GREEN = '\033[0;32m'
YELLOW = '\033[0;33m'
48+
49+
50+
def get_current_time():
    """Return the current wall-clock time as integer milliseconds."""
    seconds = time.time()
    return int(round(seconds * 1000))
54+
55+
56+
class ResumableMicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks.

    Keeps the raw audio of the previous streaming request
    (``last_audio_input``) so that, when a new request starts after the
    streaming time limit, audio the API never finalized can be re-sent to
    preserve context across requests.
    """

    def __init__(self, rate, chunk_size):
        """Open an asynchronous PyAudio input stream and init bookkeeping.

        Args:
            rate: Audio sample rate in hertz.
            chunk_size: Number of frames per buffered audio chunk.
        """
        self._rate = rate
        self.chunk_size = chunk_size
        self._num_channels = 1
        # Thread-safe buffer: filled by the PyAudio callback thread in
        # _fill_buffer(), drained by generator().
        self._buff = queue.Queue()
        self.closed = True
        # Wall-clock start (ms) of the current streaming request.
        self.start_time = get_current_time()
        # Number of streaming requests completed so far.
        self.restart_counter = 0
        # Raw audio chunks captured during the current request.
        self.audio_input = []
        # Raw audio chunks captured during the previous request.
        self.last_audio_input = []
        # End timestamp (ms) of the latest result from the API.
        self.result_end_time = 0
        # End timestamp (ms) of the latest *final* result.
        self.is_final_end_time = 0
        # is_final_end_time of the previous request (set by main()); marks
        # where finalized audio ended, for the replay logic below.
        self.final_request_end_time = 0
        # ms of replayed (unfinalized) audio prepended to the new request;
        # listen_print_loop() subtracts it to correct displayed timestamps.
        self.bridging_offset = 0
        self.last_transcript_was_final = False
        self.new_stream = True
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=self._num_channels,
            rate=self._rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )

    def __enter__(self):
        """Mark the stream open so generator() starts yielding audio."""
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        """Stop recording and unblock any consumer of the audio generator."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, *args, **kwargs):
        """Continuously collect data from the audio stream, into the buffer."""
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        """Stream Audio from microphone to API and to local buffer"""
        while not self.closed:
            data = []

            # At the start of a restarted request, replay the tail of the
            # previous request's audio that the API never finalized.
            if self.new_stream and self.last_audio_input:

                # Approximate duration (ms) covered by one stored chunk,
                # assuming last_audio_input spans the full STREAMING_LIMIT.
                chunk_time = STREAMING_LIMIT / len(self.last_audio_input)

                if chunk_time != 0:

                    # Clamp the offset into [0, final_request_end_time].
                    if self.bridging_offset < 0:
                        self.bridging_offset = 0

                    if self.bridging_offset > self.final_request_end_time:
                        self.bridging_offset = self.final_request_end_time

                    # Index of the first chunk after the finalized audio.
                    chunks_from_ms = round((self.final_request_end_time -
                                            self.bridging_offset) / chunk_time)

                    # Duration (ms) of the audio being replayed; used by
                    # listen_print_loop() to correct displayed timestamps.
                    self.bridging_offset = (round((
                        len(self.last_audio_input) - chunks_from_ms)
                        * chunk_time))

                    for i in range(chunks_from_ms, len(self.last_audio_input)):
                        data.append(self.last_audio_input[i])

                self.new_stream = False

            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            self.audio_input.append(chunk)

            if chunk is None:
                return
            data.append(chunk)
            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)

                    if chunk is None:
                        return
                    data.append(chunk)
                    self.audio_input.append(chunk)

                except queue.Empty:
                    break

            yield b''.join(data)
162+
163+
164+
def listen_print_loop(responses, stream):
    """Print transcription results as they arrive from the server.

    The ``responses`` argument is a generator that blocks until the server
    provides a response. Each response may contain multiple results, and
    each result multiple alternatives; for details, see
    https://goo.gl/tjCPAU. Only the transcription for the top alternative
    of the top result is printed.

    Interim results are written with a trailing carriage return so the next
    result overwrites them in place; final results end with a newline so
    the finalized transcription is preserved. The loop stops early when the
    streaming time limit is reached (so the caller can start a new request)
    or when the user says "exit" or "quit".
    """
    for response in responses:
        # Restart bookkeeping: once this request has run for the streaming
        # limit, reset the clock and hand control back to the caller.
        elapsed = get_current_time() - stream.start_time
        if elapsed > STREAMING_LIMIT:
            stream.start_time = get_current_time()
            break

        if not response.results:
            continue
        top_result = response.results[0]
        if not top_result.alternatives:
            continue

        transcript = top_result.alternatives[0].transcript

        # Convert the result's end timestamp into whole milliseconds.
        end_time = top_result.result_end_time
        seconds = end_time.seconds or 0
        nanos = end_time.nanos or 0
        stream.result_end_time = int(seconds * 1000 + nanos / 1000000)

        # Shift the timestamp so it stays continuous across restarted
        # requests (replayed audio is discounted via bridging_offset).
        corrected_time = (stream.result_end_time - stream.bridging_offset
                          + STREAMING_LIMIT * stream.restart_counter)

        if top_result.is_final:
            sys.stdout.write(GREEN)
            sys.stdout.write('\033[K')
            sys.stdout.write(f'{corrected_time}: {transcript}\n')

            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                sys.stdout.write(YELLOW)
                sys.stdout.write('Exiting...\n')
                stream.closed = True
                break
        else:
            # Interim result: carriage return lets the next write overwrite
            # this line in place.
            sys.stdout.write(RED)
            sys.stdout.write('\033[K')
            sys.stdout.write(f'{corrected_time}: {transcript}\r')

            stream.last_transcript_was_final = False
236+
237+
238+
def main():
    """Start bidirectional streaming from microphone input to speech API."""

    client = speech.SpeechClient()
    # Raw 16-bit linear PCM at SAMPLE_RATE, US English, single best
    # alternative per result.
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code='en-US',
        max_alternatives=1)
    # interim_results=True makes the API stream partial (non-final)
    # transcripts, which listen_print_loop() overwrites in place.
    streaming_config = speech.types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
    print(mic_manager.chunk_size)
    sys.stdout.write(YELLOW)
    sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
    sys.stdout.write('End (ms) Transcript Results/Status\n')
    sys.stdout.write('=====================================================\n')

    with mic_manager as stream:

        # Each loop iteration is one streaming request; the stream object
        # carries audio and timestamp state across iterations so context is
        # preserved when a request is restarted.
        while not stream.closed:
            sys.stdout.write(YELLOW)
            sys.stdout.write('\n' + str(
                STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')

            stream.audio_input = []
            audio_generator = stream.generator()

            requests = (speech.types.StreamingRecognizeRequest(
                audio_content=content)for content in audio_generator)

            responses = client.streaming_recognize(streaming_config,
                                                   requests)

            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream)

            # Roll state over for the next request: remember where the last
            # final result ended and keep this request's raw audio so the
            # generator can replay the unfinalized tail. Order matters here.
            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = []
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            # Terminate the interim line (which ended in '\r') so the
            # "NEW REQUEST" banner starts on a fresh line.
            if not stream.last_transcript_was_final:
                sys.stdout.write('\n')
            stream.new_stream = True
288+
289+
290+
# Run the sample only when executed as a script, not when imported.
if __name__ == '__main__':

    main()
293+
294+
# [END speech_transcribe_infinite_streaming]

0 commit comments

Comments
 (0)