Merging samples code (#2452)

bradmiro · web-flow · commit 872c62cd52e7 · 2019-10-07T17:54:24.000-04:00
* Added codelab.py

* Fixed copyright year in codelab.py

* Added tags to infinite streaming sample

* refactored and added tags to infinite speech streaming sample
diff --git a/composer/workflows/codelab.py b/composer/workflows/codelab.py
@@ -0,0 +1,110 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Example Airflow DAG that checks if a local file exists, creates a Cloud Dataproc cluster, runs the Hadoop
+wordcount example, and deletes the cluster.
+
+This DAG relies on three Airflow variables
+https://airflow.apache.org/concepts.html#variables
+* gcp_project - Google Cloud Project to use for the Cloud Dataproc cluster.
+* gce_zone - Google Compute Engine zone where Cloud Dataproc cluster should be
+  created.
+* gcs_bucket - Google Cloud Storage bucket to use for result of Hadoop job.
+  See https://cloud.google.com/storage/docs/creating-buckets for creating a
+  bucket.
+"""
+
+import datetime
+import os
+
+from airflow import models
+from airflow.contrib.operators import dataproc_operator
+from airflow.operators import BashOperator
+from airflow.utils import trigger_rule
+
+# Output file for Cloud Dataproc job.
+output_file = os.path.join(
+    models.Variable.get('gcs_bucket'), 'wordcount',
+    datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep
+# Path to Hadoop wordcount example available on every Dataproc cluster.
+WORDCOUNT_JAR = (
+    'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'
+)
+
+# Path to input file for Hadoop job.
+input_file = '/home/airflow/gcs/data/rose.txt’
+
+# Arguments to pass to Cloud Dataproc job.
+wordcount_args = ['wordcount', input_file, output_file]
+
+yesterday = datetime.datetime.combine(
+    datetime.datetime.today() - datetime.timedelta(1),
+    datetime.datetime.min.time())
+
+default_dag_args = {
+    # Setting start date as yesterday starts the DAG immediately when it is
+    # detected in the Cloud Storage bucket.
+    'start_date': yesterday,
+    # To email on failure or retry set 'email' arg to your email and enable
+    # emailing here.
+    'email_on_failure': False,
+    'email_on_retry': False,
+    # If a task fails, retry it once after waiting at least 5 minutes
+    'retries': 1,
+    'retry_delay': datetime.timedelta(minutes=5),
+    'project_id': models.Variable.get('gcp_project')
+}
+
+with models.DAG(
+        'Composer_sample_quickstart',
+        # Continue to run DAG once per day
+        schedule_interval=datetime.timedelta(days=1),
+        default_args=default_dag_args) as dag:
+
+    # Check if the input file exists.
+    check_file_existence =  BashOperator(
+        task_id=’check_file_existence’,
+        bash_command=’if [ ! -f \“{}\” ]; then exit 1;  fi’.format(input_file)) 
+
+   # Create a Cloud Dataproc cluster.
+    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
+        task_id='create_dataproc_cluster',
+        # Give the cluster a unique name by appending the date scheduled.
+        # See https://airflow.apache.org/code.html#default-variables
+        cluster_name='quickstart-cluster-{{ ds_nodash }}',
+        num_workers=2,
+        zone=models.Variable.get('gce_zone'),
+        master_machine_type='n1-standard-1',
+        worker_machine_type='n1-standard-1')
+
+
+   # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
+    # master node.
+    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
+        task_id='run_dataproc_hadoop',
+        main_jar=WORDCOUNT_JAR,
+        cluster_name='quickstart-cluster-{{ ds_nodash }}',
+        arguments=wordcount_args)
+
+   # Delete Cloud Dataproc cluster.
+    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
+        task_id='delete_dataproc_cluster',
+        cluster_name='quickstart-cluster-{{ ds_nodash }}',
+        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
+        # even if the Dataproc job fails.
+        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)
+
+   # Define DAG dependencies.
+    check_file_existence >> create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
+
diff --git a/speech/microphone/transcribe_streaming_infinite.py b/speech/microphone/transcribe_streaming_infinite.py
@@ -22,36 +22,35 @@
     pip install pyaudio
     pip install termcolor
 
+pyaudio also requires installing "portaudio"
+
 Example usage:
     python transcribe_streaming_infinite.py
 """
 
-# [START speech_transcribe_infinite_streaming]
+# [START speech_transcribe_infinite_streaming_imports]
 
 import time
 import re
-import sys
 
 # uses result_end_time currently only avaialble in v1p1beta, will be in v1 soon
 from google.cloud import speech_v1p1beta1 as speech
 import pyaudio
 from six.moves import queue
-
-# Audio recording parameters
-STREAMING_LIMIT = 10000
-SAMPLE_RATE = 16000
-CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms
-
-RED = '\033[0;31m'
-GREEN = '\033[0;32m'
-YELLOW = '\033[0;33m'
-
+# [END speech_transcribe_infinite_streaming_imports]
 
 def get_current_time():
     """Return Current Time in MS."""
 
     return int(round(time.time() * 1000))
 
+# [START speech_transcribe_infinite_streaming_globals]
+
+# Audio recording parameters
+STREAMING_LIMIT = 10000
+SAMPLE_RATE = 16000
+CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms
+
 
 class ResumableMicrophoneStream:
     """Opens a recording stream as a generator yielding the audio chunks."""
@@ -85,6 +84,8 @@ def __init__(self, rate, chunk_size):
             stream_callback=self._fill_buffer,
         )
 
+# [END speech_transcribe_infinite_streaming_globals]
+
     def __enter__(self):
 
         self.closed = False
@@ -105,6 +106,10 @@ def _fill_buffer(self, in_data, *args, **kwargs):
 
         self._buff.put(in_data)
         return None, pyaudio.paContinue
+    
+# [END speech_transcribe_infinite_streaming_init]
+
+# [START speech_transcribe_infinite_streaming_generator]
 
     def generator(self):
         """Stream Audio from microphone to API and to local buffer"""
@@ -160,6 +165,9 @@ def generator(self):
 
             yield b''.join(data)
 
+# [END speech_transcribe_infinite_streaming_generator]
+
+# [START speech_transcribe_infinite_streaming_output]
 
 def listen_print_loop(responses, stream):
     """Iterates through server responses and prints them.
@@ -212,28 +220,26 @@ def listen_print_loop(responses, stream):
 
         if result.is_final:
 
-            sys.stdout.write(GREEN)
-            sys.stdout.write('\033[K')
-            sys.stdout.write(str(corrected_time) + ': ' + transcript + '\n')
+            print(str(corrected_time) + ': ' + transcript + '\n')
 
             stream.is_final_end_time = stream.result_end_time
             stream.last_transcript_was_final = True
 
             # Exit recognition if any of the transcribed phrases could be
             # one of our keywords.
             if re.search(r'\b(exit|quit)\b', transcript, re.I):
-                sys.stdout.write(YELLOW)
-                sys.stdout.write('Exiting...\n')
+                print('Exiting...\n')
                 stream.closed = True
                 break
 
         else:
-            sys.stdout.write(RED)
-            sys.stdout.write('\033[K')
-            sys.stdout.write(str(corrected_time) + ': ' + transcript + '\r')
+            print(str(corrected_time) + ': ' + "PROCESSING:" + transcript + '\r')
 
             stream.last_transcript_was_final = False
 
+# [END speech_transcribe_infinite_streaming_output]
+
+# [START speech_transcribe_infinite_streaming_main]
 
 def main():
     """start bidirectional streaming from microphone input to speech API"""
@@ -250,17 +256,14 @@ def main():
 
     mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
     print(mic_manager.chunk_size)
-    sys.stdout.write(YELLOW)
-    sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
-    sys.stdout.write('End (ms)       Transcript Results/Status\n')
-    sys.stdout.write('=====================================================\n')
+    print('\nListening, say "Quit" or "Exit" to stop.\n\n')
+    print('End (ms)       Transcript Results/Status\n')
+    print('=====================================================\n')
 
     with mic_manager as stream:
 
         while not stream.closed:
-            sys.stdout.write(YELLOW)
-            sys.stdout.write('\n' + str(
-                STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')
+            print('\n' + str(STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')
 
             stream.audio_input = []
             audio_generator = stream.generator()
@@ -283,12 +286,12 @@ def main():
             stream.restart_counter = stream.restart_counter + 1
 
             if not stream.last_transcript_was_final:
-                sys.stdout.write('\n')
+                print('\n')
             stream.new_stream = True
 
 
 if __name__ == '__main__':
 
     main()
 
-# [END speech_transcribe_infinite_streaming]
+# [END speech_transcribe_infinite_streaming_main]