Merging samples code #2452

Merged
merged 5 commits on Oct 7, 2019
110 changes: 110 additions & 0 deletions composer/workflows/codelab.py
@@ -0,0 +1,110 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Example Airflow DAG that checks if a local file exists, creates a Cloud Dataproc cluster, runs the Hadoop
wordcount example, and deletes the cluster.

This DAG relies on three Airflow variables
https://airflow.apache.org/concepts.html#variables
* gcp_project - Google Cloud Project to use for the Cloud Dataproc cluster.
* gce_zone - Google Compute Engine zone where Cloud Dataproc cluster should be
created.
* gcs_bucket - Google Cloud Storage bucket to use for result of Hadoop job.
See https://cloud.google.com/storage/docs/creating-buckets for creating a
bucket.
"""

import datetime
import os

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.operators import BashOperator
from airflow.utils import trigger_rule

# Output file for Cloud Dataproc job.
output_file = os.path.join(
    models.Variable.get('gcs_bucket'), 'wordcount',
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep
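# For example, with gcs_bucket set to 'gs://example-bucket' (a placeholder),
# output_file resolves to something like:
#   gs://example-bucket/wordcount/20191007-153000/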
# Path to Hadoop wordcount example available on every Dataproc cluster.
WORDCOUNT_JAR = (
    'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'
)

# Path to input file for Hadoop job.
input_file = '/home/airflow/gcs/data/rose.txt'

# Arguments to pass to Cloud Dataproc job.
wordcount_args = ['wordcount', input_file, output_file]

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

default_dag_args = {
    # Setting the start date to yesterday starts the DAG immediately when it
    # is detected in the Cloud Storage bucket.
    'start_date': yesterday,
    # To email on failure or retry, set the 'email' arg to your email address
    # and enable emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes.
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}
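# With these defaults, a failing task runs at most twice: the initial try,
# then one retry after at least a five-minute wait.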

with models.DAG(
        'Composer_sample_quickstart',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Check if the input file exists.
    check_file_existence = BashOperator(
        task_id='check_file_existence',
        bash_command='if [ ! -f "{}" ]; then exit 1; fi'.format(input_file))
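    # With the default input_file, the rendered command is:
    #   if [ ! -f "/home/airflow/gcs/data/rose.txt" ]; then exit 1; fi
    # A non-zero exit code marks this task failed, so the downstream tasks
    # are not run.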

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')


    # Run the Hadoop wordcount example installed on the Cloud Dataproc
    # cluster master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete the Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    check_file_existence >> create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
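# A local sanity check that this file parses into a DAG (an illustrative
# sketch: it assumes Airflow is installed and the three variables above are
# set, and the folder path is a placeholder):
#
#   from airflow.models import DagBag
#   dag_bag = DagBag(dag_folder='composer/workflows')
#   assert 'Composer_sample_quickstart' in dag_bag.dags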

61 changes: 32 additions & 29 deletions speech/microphone/transcribe_streaming_infinite.py
@@ -22,36 +22,35 @@
pip install pyaudio
pip install termcolor

pyaudio also requires installing "portaudio"

Example usage:
python transcribe_streaming_infinite.py
"""

# [START speech_transcribe_infinite_streaming]
# [START speech_transcribe_infinite_streaming_imports]

import time
import re
import sys

# uses result_end_time currently only available in v1p1beta, will be in v1 soon
from google.cloud import speech_v1p1beta1 as speech
import pyaudio
from six.moves import queue

# Audio recording parameters
STREAMING_LIMIT = 10000
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms

RED = '\033[0;31m'
GREEN = '\033[0;32m'
YELLOW = '\033[0;33m'

# [END speech_transcribe_infinite_streaming_imports]

def get_current_time():
    """Return the current time in milliseconds."""

    return int(round(time.time() * 1000))

# [START speech_transcribe_infinite_streaming_globals]

# Audio recording parameters
STREAMING_LIMIT = 10000
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms
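# A quick sanity check of the chunk math above (illustrative only):
#
#   chunk_ms = 1000 * CHUNK_SIZE / SAMPLE_RATE
#   assert chunk_ms == 100.0  # each yielded buffer holds ~100 ms of audio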


class ResumableMicrophoneStream:
"""Opens a recording stream as a generator yielding the audio chunks."""
@@ -85,6 +84,8 @@ def __init__(self, rate, chunk_size):
            stream_callback=self._fill_buffer,
        )

    # [END speech_transcribe_infinite_streaming_globals]

    def __enter__(self):

        self.closed = False
@@ -105,6 +106,10 @@ def _fill_buffer(self, in_data, *args, **kwargs):

        self._buff.put(in_data)
        return None, pyaudio.paContinue

    # [END speech_transcribe_infinite_streaming_init]

    # [START speech_transcribe_infinite_streaming_generator]

    def generator(self):
        """Stream audio from the microphone to the API and local buffer."""
@@ -160,6 +165,9 @@ def generator(self):

            yield b''.join(data)

    # [END speech_transcribe_infinite_streaming_generator]

# [START speech_transcribe_infinite_streaming_output]

def listen_print_loop(responses, stream):
"""Iterates through server responses and prints them.
@@ -212,28 +220,26 @@ def listen_print_loop(responses, stream):

        if result.is_final:

            sys.stdout.write(GREEN)
            sys.stdout.write('\033[K')
            sys.stdout.write(str(corrected_time) + ': ' + transcript + '\n')
            print(str(corrected_time) + ': ' + transcript + '\n')

            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r'\b(exit|quit)\b', transcript, re.I):
                sys.stdout.write(YELLOW)
                sys.stdout.write('Exiting...\n')
                print('Exiting...\n')
                stream.closed = True
                break

        else:
            sys.stdout.write(RED)
            sys.stdout.write('\033[K')
            sys.stdout.write(str(corrected_time) + ': ' + transcript + '\r')
            print(str(corrected_time) + ': ' + "PROCESSING:" + transcript + '\r')

            stream.last_transcript_was_final = False

# [END speech_transcribe_infinite_streaming_output]
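# A quick check of the exit-keyword regex used above (illustrative only):
#
#   import re
#   pattern = r'\b(exit|quit)\b'
#   bool(re.search(pattern, 'please quit', re.I))   # True
#   bool(re.search(pattern, 'exiting now', re.I))   # False: 'exiting' is not
#                                                   # the whole word 'exit'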

# [START speech_transcribe_infinite_streaming_main]

def main():
    """Start bidirectional streaming from microphone input to speech API."""
@@ -250,17 +256,14 @@

    mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
    print(mic_manager.chunk_size)
    sys.stdout.write(YELLOW)
    sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
    sys.stdout.write('End (ms) Transcript Results/Status\n')
    sys.stdout.write('=====================================================\n')
    print('\nListening, say "Quit" or "Exit" to stop.\n\n')
    print('End (ms) Transcript Results/Status\n')
    print('=====================================================\n')

    with mic_manager as stream:

        while not stream.closed:
            sys.stdout.write(YELLOW)
            sys.stdout.write('\n' + str(
                STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')
            print('\n' + str(STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')

            stream.audio_input = []
            audio_generator = stream.generator()
@@ -283,12 +286,12 @@
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write('\n')
                print('\n')
            stream.new_stream = True


if __name__ == '__main__':

    main()

# [END speech_transcribe_infinite_streaming]
# [END speech_transcribe_infinite_streaming_main]
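# Note on the restart cadence (inferred from the code above, not stated in
# the PR): with STREAMING_LIMIT = 10000 the generator ends each streaming
# request after roughly ten seconds, main() opens a new one, and the
# 'NEW REQUEST' banner prints the cumulative offset in milliseconds,
# i.e. STREAMING_LIMIT * stream.restart_counter (0, 10000, 20000, ...).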