Improve examples #390


Merged: 10 commits, Apr 15, 2021
6 changes: 3 additions & 3 deletions examples/cython/sycl_buffer/README.md
@@ -21,9 +21,9 @@ CC=clang CXX=dpcpp python setup.py build_ext --inplace
#2 Running

```
# SYCL_BE=PI_OPENCL sets SYCL backend to OpenCL to avoid a
# SYCL_DEVICE_FILTER=opencl sets SYCL backend to OpenCL to avoid a
# transient issue with MKL's using the default Level-0 backend
(idp) [08:16:12 ansatnuc04 simple]$ SYCL_BE=PI_OPENCL ipython
(idp) [08:16:12 ansatnuc04 simple]$ SYCL_DEVICE_FILTER=opencl ipython
Python 3.7.7 (default, Jul 14 2020, 22:02:37)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.17.0 -- An enhanced Interactive Python. Type '?' for help.
@@ -67,7 +67,7 @@ Times for NumPy
Running run.py:

```
(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_BE=PI_OPENCL python run.py
(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_DEVICE_FILTER=opencl python run.py
Result computed by NumPy
[ 0.27170187 -23.36798583 7.31326489 -1.95121928]
Result computed by SYCL extension
9 changes: 7 additions & 2 deletions examples/cython/sycl_buffer/_buffer_example.pyx
@@ -24,14 +24,19 @@ cdef extern from "use_sycl_buffer.h":
int c_columnwise_total(c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil
int c_columnwise_total_no_mkl(c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil

def columnwise_total(double[:, ::1] v, method='mkl'):
def columnwise_total(double[:, ::1] v, method='mkl', queue=None):
cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d')
cdef double[::1] res_memslice = res_array
cdef int ret_status
cdef c_dpctl.SyclQueue q
cdef c_dpctl.DPCTLSyclQueueRef q_ref

q = c_dpctl.get_current_queue()
if (queue is None):
q = c_dpctl.SyclQueue()
elif isinstance(queue, dpctl.SyclQueue):
q = <c_dpctl.SyclQueue> queue
else:
q = c_dpctl.SyclQueue(queue)
q_ref = q.get_queue_ref()

if method == 'mkl':
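For context, a minimal sketch of how the extended `columnwise_total` signature can be called from Python. It assumes the extension is built and importable as `syclbuffer` and that an `opencl:cpu` device is available on the machine (both assumptions, not part of this diff):

```
import numpy as np

import dpctl
import syclbuffer as sb

X = np.random.randn(100, 4)

# queue=None: the extension constructs a queue for the default-selected device
print(sb.columnwise_total(X))

# an explicit dpctl.SyclQueue is used as-is
q = dpctl.SyclQueue("opencl:cpu")
print(sb.columnwise_total(X, queue=q))

# anything else is forwarded to the dpctl.SyclQueue constructor,
# so a filter selector string also works
print(sb.columnwise_total(X, queue="opencl:cpu"))
```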
22 changes: 11 additions & 11 deletions examples/cython/sycl_buffer/bench.py
@@ -24,19 +24,19 @@
print("=" * 10 + " Executing warm-up " + "=" * 10)
print("NumPy result: ", X.sum(axis=0))

dpctl.set_global_queue("opencl:cpu")
q = dpctl.SyclQueue("opencl:cpu")
print(
"SYCL({}) result: {}".format(
dpctl.get_current_queue().sycl_device.name,
sb.columnwise_total(X),
q.sycl_device.name,
sb.columnwise_total(X, queue=q),
)
)

dpctl.set_default_queue("opencl:gpu")
q = dpctl.SyclQueue("opencl:gpu")
print(
"SYCL({}) result: {}".format(
dpctl.get_current_queue().sycl_device.name,
sb.columnwise_total(X),
q.sycl_device.name,
sb.columnwise_total(X, queue=q),
)
)

@@ -45,9 +45,9 @@
print("Times for 'opencl:cpu'")
print(
timeit.repeat(
stmt="sb.columnwise_total(X)",
setup='dpctl.set_global_queue("opencl:cpu"); '
"sb.columnwise_total(X)", # ensure JIT compilation is not counted
stmt="sb.columnwise_total(X, queue=q)",
setup='q = dpctl.SyclQueue("opencl:cpu"); '
"sb.columnwise_total(X, queue=q)", # ensure JIT compilation is not counted
number=100,
globals=globals(),
)
@@ -56,8 +56,8 @@
print("Times for 'opencl:gpu'")
print(
timeit.repeat(
stmt="sb.columnwise_total(X)",
setup='dpctl.set_default_queue("opencl:gpu"); sb.columnwise_total(X)',
stmt="sb.columnwise_total(X, queue=q)",
setup='q = dpctl.SyclQueue("opencl:gpu"); sb.columnwise_total(X, queue=q)',
number=100,
globals=globals(),
)
17 changes: 9 additions & 8 deletions examples/cython/sycl_buffer/run.py
@@ -16,23 +16,24 @@

import syclbuffer as sb
import numpy as np
import dpctl

X = np.random.randn(100, 4)

print("Result computed by NumPy")
print(X.sum(axis=0))
print("Result computed by SYCL extension")
print("Result computed by SYCL extension using default offloading target")
print(sb.columnwise_total(X))


print("")

# controlling where to offload
import dpctl

with dpctl.device_context("opencl:gpu"):
print("Running on: ", dpctl.get_current_queue().sycl_device.name)
print(sb.columnwise_total(X))
q = dpctl.SyclQueue("opencl:gpu")
print("Running on: ", q.sycl_device.name)
print(sb.columnwise_total(X, queue=q))

with dpctl.device_context("opencl:cpu"):
print("Running on: ", dpctl.get_current_queue().sycl_device.name)
print(sb.columnwise_total(X))
q = dpctl.SyclQueue("opencl:cpu")
print("Running on: ", q.sycl_device.name)
print(sb.columnwise_total(X, queue=q))
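The `opencl:gpu` and `opencl:cpu` selectors above assume both devices exist on the machine. A hedged variant of the same script that skips missing devices, assuming the installed dpctl exposes `dpctl.SyclQueueCreationError` for failed queue construction:

```
import numpy as np

import dpctl
import syclbuffer as sb

X = np.random.randn(100, 4)

for filter_string in ("opencl:gpu", "opencl:cpu"):
    try:
        q = dpctl.SyclQueue(filter_string)
    except dpctl.SyclQueueCreationError:
        print("Skipping '{}': no such device available".format(filter_string))
        continue
    print("Running on: ", q.sycl_device.name)
    print(sb.columnwise_total(X, queue=q))
```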
4 changes: 2 additions & 2 deletions examples/cython/sycl_direct_linkage/README.md
@@ -26,7 +26,7 @@ To illustrate the queue creation overhead in each call, compare execution of default queue,
which is Intel Gen9 GPU on OpenCL backend:

```
(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_DEVICE_FILTER=opencl:gpu python bench.py
========== Executing warm-up ==========
NumPy result: [1. 1. 1. ... 1. 1. 1.]
SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
@@ -37,7 +37,7 @@ Times for NumPy
[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
```

vs. timing when `dpctl`'s current queue is being reused:
vs. timing when `dpctl`'s queue is being reused:

```
(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
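To see the per-call queue construction cost the README describes in isolation, one can time `dpctl.SyclQueue` creation directly. A rough sketch; the `opencl:gpu` selector is an assumption about available hardware:

```
import timeit

import dpctl

# cost of constructing a SyclQueue, paid on every call in sycl_direct_linkage
t = timeit.repeat(
    stmt='dpctl.SyclQueue("opencl:gpu")',
    repeat=3,
    number=100,
    globals=globals(),
)
print("100 queue constructions, best of 3: {:.4f} s".format(min(t)))
```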
23 changes: 17 additions & 6 deletions examples/cython/usm_memory/blackscholes.pyx
@@ -29,7 +29,18 @@ cdef extern from "sycl_blackscholes.hpp":
cdef void cpp_blackscholes[T](c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T* callput) except +
cdef void cpp_populate_params[T](c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T pl, T ph, T sl, T sh, T tl, T th, T rl, T rh, T vl, T vh, int seed) except +

def black_scholes_price(floating[:, ::1] option_params):
cdef c_dpctl.SyclQueue from_queue_keyword(queue):
if (queue is None):
return c_dpctl.SyclQueue()
elif isinstance(queue, dpctl.SyclQueue):
return <c_dpctl.SyclQueue> queue
else:
return c_dpctl.SyclQueue(queue)
# use default
return c_dpctl.SyclQueue()


def black_scholes_price(floating[:, ::1] option_params, queue=None):
cdef size_t n_opts = option_params.shape[0]
cdef size_t n_params = option_params.shape[1]
cdef size_t n_bytes = 0
@@ -49,19 +60,19 @@ def black_scholes_price(floating[:, ::1] option_params):
"Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)."
).format(n_params))

q = c_dpctl.get_current_queue()
q = from_queue_keyword(queue)
q_ptr = q.get_queue_ref()
if (floating is double):
n_bytes = 2*n_opts * sizeof(double)
mobj = c_dpctl_mem.MemoryUSMShared(n_bytes)
mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q)
callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='d')
call_put_prices = callput_arr
dp1 = &option_params[0,0]
dp2 = &call_put_prices[0,0];
cpp_blackscholes[double](q_ptr, n_opts, dp1, dp2)
elif (floating is float):
n_bytes = 2*n_opts * sizeof(float)
mobj = c_dpctl_mem.MemoryUSMShared(n_bytes)
mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q)
callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='f')
call_put_prices = callput_arr
fp1 = &option_params[0,0]
@@ -70,7 +81,7 @@ def black_scholes_price(floating[:, ::1] option_params):

return callput_arr

def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed):
def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed, queue=None):
cdef size_t n_opts = option_params.shape[0]
cdef size_t n_params = option_params.shape[1]

@@ -85,7 +96,7 @@ def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl,
"Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)."
).format(n_params))

q = c_dpctl.get_current_queue()
q = from_queue_keyword(queue)
q_ptr = q.get_queue_ref()
if (floating is double):
dp = &option_params[0,0]
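The `MemoryUSMShared(n_bytes, queue=q)` plus `np.ndarray(..., buffer=mobj)` pairing above is what lets the extension return a NumPy array backed by USM-shared memory bound to the caller's queue. A small standalone sketch of that pattern; the `dpctl.memory` import path is assumed from the `dpctl_mem` alias used in run.py, and a default SYCL device is assumed to be present:

```
import numpy as np

import dpctl
import dpctl.memory as dpctl_mem

q = dpctl.SyclQueue()  # default-selected device
n_opts = 8
nbytes = 2 * n_opts * np.dtype("d").itemsize

# USM-shared allocation bound to the queue's context
mobj = dpctl_mem.MemoryUSMShared(nbytes, queue=q)

# NumPy array viewing that allocation: host code can read/write it directly,
# while kernels submitted to the same queue can use the underlying pointer
callput = np.ndarray((n_opts, 2), buffer=mobj, dtype="d")
callput[:] = 0.0
print(callput.shape, callput.dtype)
```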
54 changes: 32 additions & 22 deletions examples/cython/usm_memory/run.py
@@ -21,12 +21,16 @@
from reference_black_scholes import ref_python_black_scholes


def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype(dtype).itemsize)
# usm_mem2 = dpctl_mem.MemoryUSMDevice(n_opts * 5 * np.dtype(dtype).itemsize)
def gen_option_params(
n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype, queue=None
):
nbytes = n_opts * 5 * np.dtype(dtype).itemsize
usm_mem = dpctl_mem.MemoryUSMShared(nbytes, queue=queue)
params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype)
seed = 1234
bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed)
bs.populate_params(
params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed, queue=queue
)
return params


@@ -47,38 +51,44 @@ def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
# compute prices in CPython
X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d")

print(np.allclose(Xgpu, X_ref, atol=1e-5))
print(
"Correctness check: allclose(Xgpu, Xref) == ", np.allclose(Xgpu, X_ref, atol=1e-5)
)

n_opts = 3 * 10 ** 6

# compute on CPU sycl device
import timeit

for _ in range(3):
cpu_q = dpctl.SyclQueue("opencl:cpu:0")
opts1 = gen_option_params(
n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d", queue=cpu_q
)

gpu_q = dpctl.SyclQueue("level_zero:gpu:0")
opts2 = gen_option_params(
n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d", queue=gpu_q
)

dpctl.set_global_queue("opencl:cpu:0")
print("Using : {}".format(dpctl.get_current_queue().sycl_device.name))
cpu_times = []
gpu_times = []
for _ in range(5):

t0 = timeit.default_timer()
opts1 = gen_option_params(
n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d"
)
X1 = bs.black_scholes_price(opts1)
X1 = bs.black_scholes_price(opts1, queue=cpu_q)
t1 = timeit.default_timer()

print("Elapsed: {}".format(t1 - t0))
cpu_times.append(t1 - t0)

# compute on GPU sycl device
dpctl.set_global_queue("level_zero:gpu:0")
print("Using : {}".format(dpctl.get_current_queue().sycl_device.name))

t0 = timeit.default_timer()
opts2 = gen_option_params(
n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d"
)
X2 = bs.black_scholes_price(opts2)
X2 = bs.black_scholes_price(opts2, queue=gpu_q)
t1 = timeit.default_timer()
print("Elapsed: {}".format(t1 - t0))
gpu_times.append(t1 - t0)

print("Using : {}".format(cpu_q.sycl_device.name))
print("Wall times : {}".format(cpu_times))

print(np.abs(opts1 - opts2).max())
print(np.abs(X2 - X1).max())
print("Using : {}".format(gpu_q.sycl_device.name))
print("Wall times : {}".format(gpu_times))
83 changes: 83 additions & 0 deletions examples/python/_runner.py
@@ -0,0 +1,83 @@
# Data Parallel Control (dpctl)
#
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import inspect


def has_nondefault_params(sgn):
for v in sgn.parameters.values():
if v.default is inspect._empty:
return True
return False


def run_examples(example_description, glbls_dict):
parser = argparse.ArgumentParser(
description=example_description,
)
parser.add_argument(
"-r",
"--run",
type=str,
help="Functions to execute. Use --run all to run all of them.",
)
parser.add_argument(
"-l", "--list", action="store_true", help="List available function names to run"
)
parser.add_argument(
"-q", "--quiet", action="store_true", help="Do not echo example name."
)
args = parser.parse_args()

if args.list or not args.run:
fns = []
for n in glbls_dict:
if inspect.isfunction(glbls_dict.get(n)):
fns.append(n)
if fns:
print("Available examples:")
print(", ".join(fns))
else:
print("No examples are availble.")
exit(0)
if args.run == "all":
fns = []
for n in glbls_dict:
if inspect.isfunction(glbls_dict.get(n)):
fns.append(n)
args.run = fns
else:
args.run = args.run.split()

if args.run:
for fn in args.run:
if fn in glbls_dict:
clbl = glbls_dict.get(fn)
sgn = inspect.signature(clbl)
print("")
if has_nondefault_params(sgn):
if not args.quiet:
print(f"INFO: Skip exectution of {fn} as it requires arguments")
else:
if not args.quiet:
print(f"INFO: Executing example {fn}")
clbl()
if not args.quiet:
print("INFO: ===========================")

else:
raise ValueError("No function to run was specified")
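A hypothetical example module showing how `run_examples` might be wired up; the function and description below are illustrative, not part of this PR:

```
# hypothetical_example.py -- illustrative only, not part of this PR
import dpctl

from _runner import run_examples


def print_device_names():
    "Print the names of all SYCL devices dpctl can enumerate."
    for d in dpctl.get_devices():
        print(d.name)


if __name__ == "__main__":
    # lists functions with -l/--list, runs them with -r/--run (or --run all)
    run_examples("Examples of enumerating SYCL devices with dpctl.", globals())
```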