
Commit 13401f8

automl initial commit
1 parent d797d75 commit 13401f8

21 files changed: +2581 −0 lines changed
Lines changed: 278 additions & 0 deletions
@@ -0,0 +1,278 @@
#!/usr/bin/env python

# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates how to perform basic operations on datasets
with the Google AutoML Natural Language API.

For more information, see the tutorial page at
https://cloud.google.com/natural-language/automl/docs/
"""

# [START automl_natural_language_import]
import argparse
import os

from google.cloud import automl_v1beta1 as automl
# [END automl_natural_language_import]


# [START automl_natural_language_create_dataset]
def create_dataset(project_id, compute_region, dataset_name, multilabel=False):
    """Create a dataset.

    Args:
        project_id: Id of the project.
        compute_region: Region name.
        dataset_name: Name of the dataset.
        multilabel: Type of the classification problem.
            False - MULTICLASS, True - MULTILABEL. Default is False.
    """
    client = automl.AutoMlClient()

    # A resource that represents a Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # The classification type is assigned based on the multilabel value.
    classification_type = 'MULTICLASS'
    if multilabel:
        classification_type = 'MULTILABEL'

    # Specify the text classification type for the dataset.
    dataset_metadata = {
        'classification_type': classification_type
    }

    # Set the dataset name and metadata.
    my_dataset = {
        'display_name': dataset_name,
        'text_classification_dataset_metadata': dataset_metadata
    }

    # Create a dataset with the dataset metadata in the region.
    dataset = client.create_dataset(project_location, my_dataset)

    # Display the dataset information.
    print('Dataset name: {}'.format(dataset.name))
    print('Dataset id: {}'.format(dataset.name.split('/')[-1]))
    print('Dataset display name: {}'.format(dataset.display_name))
    print('Text classification dataset metadata:')
    print('\t{}'.format(dataset.text_classification_dataset_metadata))
    print('Dataset example count: {}'.format(dataset.example_count))
    print('Dataset create time:')
    print('\tseconds: {}'.format(dataset.create_time.seconds))
    print('\tnanos: {}'.format(dataset.create_time.nanos))
# [END automl_natural_language_create_dataset]
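
# The dataset.name printed above is the full resource path; splitting on '/'
# recovers the bare id. A sketch of the expected pattern (an assumption
# consistent with client.dataset_path() used elsewhere in this file):
#
#   projects/<project-id>/locations/<compute-region>/datasets/<dataset-id>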

# [START automl_natural_language_list_datasets]
def list_datasets(project_id, compute_region, filter_):
    """List all datasets.

    Args:
        project_id: Id of the project.
        compute_region: Region name.
        filter_: Filter expression.
    """
    client = automl.AutoMlClient()

    # A resource that represents a Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # List all the datasets available in the region by applying the filter.
    response = client.list_datasets(project_location, filter_)

    print('List of datasets:')
    for dataset in response:
        # Display the dataset information.
        print('Dataset name: {}'.format(dataset.name))
        print('Dataset id: {}'.format(dataset.name.split('/')[-1]))
        print('Dataset display name: {}'.format(dataset.display_name))
        print('Text classification dataset metadata:')
        print('\t{}'.format(dataset.text_classification_dataset_metadata))
        print('Dataset example count: {}'.format(dataset.example_count))
        print('Dataset create time:')
        print('\tseconds: {}'.format(dataset.create_time.seconds))
        print('\tnanos: {}'.format(dataset.create_time.nanos))
# [END automl_natural_language_list_datasets]

# [START automl_natural_language_get_dataset]
def get_dataset(project_id, compute_region, dataset_id):
    """Get the dataset.

    Args:
        project_id: Id of the project.
        compute_region: Region name.
        dataset_id: Id of the dataset.
    """
    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id)

    # Get the complete details of the dataset.
    dataset = client.get_dataset(dataset_full_id)

    # Display the dataset information.
    print('Dataset name: {}'.format(dataset.name))
    print('Dataset id: {}'.format(dataset.name.split('/')[-1]))
    print('Dataset display name: {}'.format(dataset.display_name))
    print('Text classification dataset metadata:')
    print('\t{}'.format(dataset.text_classification_dataset_metadata))
    print('Dataset example count: {}'.format(dataset.example_count))
    print('Dataset create time:')
    print('\tseconds: {}'.format(dataset.create_time.seconds))
    print('\tnanos: {}'.format(dataset.create_time.nanos))
# [END automl_natural_language_get_dataset]

# [START automl_natural_language_import_data]
def import_data(project_id, compute_region, dataset_id, path):
    """Import labeled items.

    Args:
        project_id: Id of the project.
        compute_region: Region name.
        dataset_id: Id of the dataset into which the training content is to
            be imported.
        path: Comma-separated list of Google Cloud Storage URIs.
            Target files must be in AutoML Natural Language CSV format.
    """
    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id)

    # Get the multiple Google Cloud Storage URIs.
    input_uris = path.split(',')
    input_config = {'gcs_source': {
        'input_uris': input_uris
    }}

    # Import the data from the input URIs.
    response = client.import_data(dataset_full_id, input_config)

    print('Processing import...')
    # Synchronous check of operation status.
    print('Data imported. {}'.format(response.result()))
# [END automl_natural_language_import_data]
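
# A sketch of the AutoML Natural Language CSV layout referenced in the
# docstring above (an assumption based on the product docs; the bucket and
# file names are illustrative): each row pairs the text content, inline or
# as a gs:// URI to a text file, with one or more labels, e.g.
#
#   "The service was slow and the food was cold.",Negative
#   gs://my-bucket/texts/review_0001.txt,Positive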

# [START automl_natural_language_export_data]
def export_data(project_id, compute_region, dataset_id, output_uri):
    """Export a dataset to a Google Cloud Storage bucket.

    Args:
        project_id: Id of the project.
        compute_region: Region name.
        dataset_id: Id of the dataset to be exported.
        output_uri: Google Cloud Storage URI for the export directory.
    """
    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id)

    # Set the output URI.
    output_config = {'gcs_destination': {
        'output_uri_prefix': output_uri
    }}

    # Export the data to the output URI.
    response = client.export_data(dataset_full_id, output_config)

    print('Processing export...')
    # Synchronous check of operation status.
    print('Data exported. {}'.format(response.result()))
# [END automl_natural_language_export_data]

# [START automl_natural_language_delete_dataset]
def delete_dataset(project_id, compute_region, dataset_id):
    """Delete a dataset.

    Args:
        project_id: Id of the project.
        compute_region: Region name.
        dataset_id: Id of the dataset.
    """
    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id)

    # Delete a dataset.
    response = client.delete_dataset(dataset_full_id)

    # Synchronous check of operation status.
    print('Dataset deleted. {}'.format(response.result()))
# [END automl_natural_language_delete_dataset]

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    create_dataset_parser = subparsers.add_parser(
        'create_dataset', help=create_dataset.__doc__)
    create_dataset_parser.add_argument('dataset_name')
    create_dataset_parser.add_argument(
        'multilabel', nargs='?', choices=['False', 'True'], default='False')

    list_datasets_parser = subparsers.add_parser(
        'list_datasets', help=list_datasets.__doc__)
    list_datasets_parser.add_argument(
        'filter_', nargs='?', default='text_classification_dataset_metadata:*')

    get_dataset_parser = subparsers.add_parser(
        'get_dataset', help=get_dataset.__doc__)
    get_dataset_parser.add_argument('dataset_id')

    import_data_parser = subparsers.add_parser(
        'import_data', help=import_data.__doc__)
    import_data_parser.add_argument('dataset_id')
    import_data_parser.add_argument('path')

    export_data_parser = subparsers.add_parser(
        'export_data', help=export_data.__doc__)
    export_data_parser.add_argument('dataset_id')
    export_data_parser.add_argument('output_uri')

    delete_dataset_parser = subparsers.add_parser(
        'delete_dataset', help=delete_dataset.__doc__)
    delete_dataset_parser.add_argument('dataset_id')

    project_id = os.environ['PROJECT_ID']
    compute_region = os.environ['REGION_NAME']

    args = parser.parse_args()

    if args.command == 'create_dataset':
        multilabel = args.multilabel == 'True'
        create_dataset(
            project_id, compute_region, args.dataset_name, multilabel)
    if args.command == 'list_datasets':
        list_datasets(project_id, compute_region, args.filter_)
    if args.command == 'get_dataset':
        get_dataset(project_id, compute_region, args.dataset_id)
    if args.command == 'import_data':
        import_data(project_id, compute_region, args.dataset_id, args.path)
    if args.command == 'export_data':
        export_data(
            project_id, compute_region, args.dataset_id, args.output_uri)
    if args.command == 'delete_dataset':
        delete_dataset(project_id, compute_region, args.dataset_id)
