Skip to content

Commit 0f85ad0

Browse files
tswastJon Wayne Parrott
authored andcommitted
Add DML samples for BigQuery. (#546)
Add BigQuery DML sample for inserts. This sample reads a SQL file (for example: one that was output from mysqldump) and executes each line as a query. At least in my configuration of mysqldump, each insert statement was on a single line, so I was able to write data from MySQL to BigQuery with this sample.
1 parent de3cf80 commit 0f85ad0

File tree

9 files changed

+356
-0
lines changed

9 files changed

+356
-0
lines changed

bigquery/dml/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
sample_db_export.sql

bigquery/dml/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# BigQuery DML Samples
2+
3+
<!-- auto-doc-link -->
4+
These samples are used on the following documentation page:
5+
6+
> https://cloud.google.combigquery/docs/loading-data-sql-dml
7+
8+
<!-- end-auto-doc-link -->
9+
10+
To create a test database, run the `populate_db.py` script.
11+
12+
```
13+
python populate_db.py 100 localhost root 'mysql-password' sample_db
14+
```
15+

bigquery/dml/insert_sql.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2016 Google Inc. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""Sample that runs a file containing INSERT SQL statements in Big Query.
18+
19+
This could be used to run the INSERT statements in a mysqldump output such as
20+
21+
mysqldump --user=root \
22+
--password='secret-password' \
23+
--host=127.0.0.1 \
24+
--no-create-info sample_db \
25+
--skip-add-locks > sample_db_export.sql
26+
27+
To run, first create tables with the same names and columns as the sample
28+
database. Then run this script.
29+
30+
python insert_sql.py my-project my_dataset sample_db_export.sql
31+
"""
32+
33+
# [START insert_sql]
34+
import argparse
35+
36+
from gcloud import bigquery
37+
38+
39+
def insert_sql(project, default_dataset, sql_path):
40+
"""Run all the SQL statements in a SQL file."""
41+
42+
client = bigquery.Client(project=project)
43+
44+
with open(sql_path) as f:
45+
for line in f:
46+
line = line.strip()
47+
48+
if not line.startswith('INSERT'):
49+
continue
50+
51+
print('Running query: {}{}'.format(
52+
line[:60],
53+
'...' if len(line) > 60 else ''))
54+
query = client.run_sync_query(line)
55+
56+
# Set use_legacy_sql to False to enable standard SQL syntax.
57+
# This is required to use the Data Manipulation Language features.
58+
#
59+
# For more information about enabling standard SQL, see:
60+
# https://cloud.google.com/bigquery/sql-reference/enabling-standard-sql
61+
query.use_legacy_sql = False
62+
query.default_dataset = client.dataset(default_dataset)
63+
query.run()
64+
65+
66+
if __name__ == "__main__":
67+
parser = argparse.ArgumentParser(
68+
description=__doc__,
69+
formatter_class=argparse.RawDescriptionHelpFormatter)
70+
parser.add_argument('project', help='Google Cloud project name')
71+
parser.add_argument(
72+
'default_dataset', help='Default BigQuery dataset name')
73+
parser.add_argument('sql_path', help='Path to SQL file')
74+
75+
args = parser.parse_args()
76+
77+
insert_sql(args.project, args.default_dataset, args.sql_path)
78+
# [END insert_sql]

bigquery/dml/insert_sql_test.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2016 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os.path
16+
17+
from insert_sql import insert_sql
18+
19+
20+
def test_insert_sql(cloud_config, capsys):
21+
sql_path = os.path.join(
22+
os.path.dirname(__file__),
23+
'resources',
24+
'insert_sql_test.sql')
25+
26+
insert_sql(cloud_config.project, 'test_dataset', sql_path)
27+
28+
out, _ = capsys.readouterr()
29+
30+
assert (
31+
'INSERT INTO `test_table` (`Name`) VALUES (\'hello world\')'
32+
in out)

bigquery/dml/populate_db.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2016 Google Inc. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""Command-line tool to simulate user actions and write to SQL database.
18+
"""
19+
20+
from __future__ import division
21+
22+
import argparse
23+
import datetime
24+
import random
25+
import uuid
26+
27+
from six.moves.urllib import parse
28+
import sqlalchemy
29+
from sqlalchemy.ext import declarative
30+
import sqlalchemy.orm
31+
32+
33+
SECONDS_IN_DAY = 24 * 60 * 60
34+
SECONDS_IN_2016 = 366 * SECONDS_IN_DAY
35+
36+
# Unix timestamp for the beginning of 2016.
37+
# http://stackoverflow.com/a/19801806/101923
38+
TIMESTAMP_2016 = (
39+
datetime.datetime(2016, 1, 1, 0, 0, 0) -
40+
datetime.datetime.fromtimestamp(0)).total_seconds()
41+
42+
43+
Base = declarative.declarative_base()
44+
45+
46+
class User(Base):
47+
__tablename__ = 'Users'
48+
49+
id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
50+
date_joined = sqlalchemy.Column(sqlalchemy.DateTime)
51+
52+
53+
class UserSession(Base):
54+
__tablename__ = 'UserSessions'
55+
56+
id = sqlalchemy.Column(sqlalchemy.String(length=36), primary_key=True)
57+
user_id = sqlalchemy.Column(
58+
sqlalchemy.Integer, sqlalchemy.ForeignKey('Users.id'))
59+
login_time = sqlalchemy.Column(sqlalchemy.DateTime)
60+
logout_time = sqlalchemy.Column(sqlalchemy.DateTime)
61+
ip_address = sqlalchemy.Column(sqlalchemy.String(length=40))
62+
63+
64+
def generate_users(session, num_users):
65+
users = []
66+
67+
for userid in range(1, num_users + 1):
68+
year_portion = random.random()
69+
date_joined = datetime.datetime.fromtimestamp(
70+
TIMESTAMP_2016 + SECONDS_IN_2016 * year_portion)
71+
user = User(id=userid, date_joined=date_joined)
72+
users.append(user)
73+
session.add(user)
74+
75+
session.commit()
76+
return users
77+
78+
79+
def random_ip():
80+
"""Choose a random example IP address.
81+
82+
Examples are chosen from the test networks described in
83+
https://tools.ietf.org/html/rfc5737
84+
"""
85+
network = random.choice([
86+
'192.0.2', # RFC-5737 TEST-NET-1
87+
'198.51.100', # RFC-5737 TEST-NET-2
88+
'203.0.113', # RFC-5737 TEST-NET-3
89+
])
90+
ip_address = '{}.{}'.format(network, random.randrange(256))
91+
return ip_address
92+
93+
94+
def simulate_user_session(session, user, previous_user_session=None):
95+
"""Simulates a single session (login to logout) of a user's history."""
96+
login_time = user.date_joined
97+
98+
if previous_user_session is not None:
99+
login_time = (
100+
previous_user_session.logout_time +
101+
datetime.timedelta(
102+
days=1, seconds=random.randrange(SECONDS_IN_DAY)))
103+
104+
session_id = str(uuid.uuid4())
105+
user_session = UserSession(
106+
id=session_id,
107+
user_id=user.id,
108+
login_time=login_time,
109+
ip_address=random_ip())
110+
user_session.logout_time = (
111+
login_time +
112+
datetime.timedelta(seconds=(1 + random.randrange(59))))
113+
session.commit()
114+
session.add(user_session)
115+
return user_session
116+
117+
118+
def simulate_user_history(session, user):
119+
"""Simulates the entire history of activity for a single user."""
120+
total_sessions = random.randrange(10)
121+
previous_user_session = None
122+
123+
for _ in range(total_sessions):
124+
user_session = simulate_user_session(
125+
session, user, previous_user_session)
126+
previous_user_session = user_session
127+
128+
129+
def run_simulation(session, users):
130+
"""Simulates app activity for all users."""
131+
132+
for n, user in enumerate(users):
133+
if n % 100 == 0 and n != 0:
134+
print('Simulated data for {} users'.format(n))
135+
136+
simulate_user_history(session, user)
137+
138+
print('COMPLETE: Simulated data for {} users'.format(len(users)))
139+
140+
141+
def populate_db(session, total_users=3):
142+
"""Populate database with total_users simulated users and their actions."""
143+
users = generate_users(session, total_users)
144+
run_simulation(session, users)
145+
146+
147+
def create_session(engine):
148+
Base.metadata.drop_all(engine)
149+
Base.metadata.create_all(engine)
150+
Session = sqlalchemy.orm.sessionmaker(bind=engine)
151+
return Session()
152+
153+
154+
def main(total_users, host, user, password, db_name):
155+
engine = sqlalchemy.create_engine(
156+
'mysql+pymysql://{user}:{password}@{host}/{db_name}'.format(
157+
user=user,
158+
password=parse.quote_plus(password),
159+
host=host,
160+
db_name=db_name))
161+
session = create_session(engine)
162+
163+
try:
164+
populate_db(session, total_users)
165+
finally:
166+
session.close()
167+
168+
169+
if __name__ == '__main__':
170+
parser = argparse.ArgumentParser(
171+
description=__doc__,
172+
formatter_class=argparse.RawDescriptionHelpFormatter)
173+
parser.add_argument(
174+
'total_users', help='How many simulated users to create.', type=int)
175+
parser.add_argument('host', help='Host of the database to write to.')
176+
parser.add_argument('user', help='User to connect to the database.')
177+
parser.add_argument('password', help='Password for the database user.')
178+
parser.add_argument('db', help='Name of the database to write to.')
179+
180+
args = parser.parse_args()
181+
182+
main(args.total_users, args.host, args.user, args.password, args.db)

bigquery/dml/populate_db_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2016 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import sqlalchemy
16+
17+
from populate_db import create_session, populate_db
18+
19+
20+
def test_populate_db_populates_users():
21+
engine = sqlalchemy.create_engine('sqlite://')
22+
session = create_session(engine)
23+
24+
try:
25+
populate_db(session, total_users=10)
26+
27+
connection = session.connection().connection
28+
cursor = connection.cursor()
29+
cursor.execute('SELECT COUNT(*) FROM Users')
30+
assert cursor.fetchone()[0] == 10
31+
cursor.execute('SELECT COUNT(*) FROM UserSessions')
32+
assert cursor.fetchone()[0] >= 10
33+
finally:
34+
session.close()

bigquery/dml/requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
flake8==3.0.4
2+
gcloud==0.18.1
3+
PyMySQL==0.7.7
4+
six==1.10.0
5+
SQLAlchemy==1.0.15
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-- This file is used to test ../insert_sql.py.
2+
-- These are comments.
3+
-- Each query to be executed should be on a single line.
4+
5+
/* Another ignored line. */
6+
INSERT INTO `test_table` (`Name`) VALUES ('hello world')

scripts/resources/docs-links.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,9 @@
361361
"/bigquery/docs/data": [
362362
"bigquery/api/sync_query.py"
363363
],
364+
"bigquery/docs/loading-data-sql-dml": [
365+
"bigquery/dml/insert_sql.py"
366+
],
364367
"/appengine/docs/python/memcache/examples": [
365368
"appengine/memcache/snippets/snippets.py",
366369
"appengine/memcache/guestbook/main.py"

0 commit comments

Comments
 (0)