NVIDIA
diff --git a/‎continuous_integration/scripts/render-template.py
Lines changed: 27 additions & 17 deletions b/‎continuous_integration/scripts/render-template.py
Lines changed: 27 additions & 17 deletions
diff --git a/‎cuda_bindings/benchmarks/kernels.py
Lines changed: 2 additions & 2 deletions b/‎cuda_bindings/benchmarks/kernels.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎cuda_bindings/benchmarks/perf_test_utils.py
Lines changed: 30 additions & 17 deletions b/‎cuda_bindings/benchmarks/perf_test_utils.py
Lines changed: 30 additions & 17 deletions
diff --git a/‎cuda_bindings/benchmarks/test_cupy.py
Lines changed: 22 additions & 10 deletions b/‎cuda_bindings/benchmarks/test_cupy.py
Lines changed: 22 additions & 10 deletions
@@ -3,9 +3,9 @@
 import argparse
 import json
 from jinja2 import Environment, FileSystemLoader
-import os
 import re
 
+
 # TODO: make this work for arbitrary context, i.e. implement replace_using_context()
 def replace_placeholder(source_str, variable_name, variable_value):
     # Escaping any regex special characters in variable_name
@@ -14,39 +14,49 @@ def replace_placeholder(source_str, variable_name, variable_value):
     # Using regular expression to replace <<variable_name>> with actual variable_value
     # \s* means any amount of whitespace (including none)
     # pattern = rf'\$\{{\s*\{{\s*{variable_name_escaped}\s*\}}\s*\}}'
-    pattern = rf'<<\s*{variable_name_escaped}\s*>>'
+    pattern = rf"<<\s*{variable_name_escaped}\s*>>"
     return re.sub(pattern, variable_value.strip(), source_str)
 
+
 # Setup command-line argument parsing
-parser = argparse.ArgumentParser(description='Render a Jinja2 template using a JSON context.')
-parser.add_argument('template_file', type=str, help='Path to the Jinja2 template file (with .j2 extension).')
-parser.add_argument('json_file', type=str, help='Path to the JSON file to use as the rendering context.')
-parser.add_argument('output_file', type=str, help='Path to the output file.')
+parser = argparse.ArgumentParser(
+    description="Render a Jinja2 template using a JSON context."
+)
+parser.add_argument(
+    "template_file",
+    type=str,
+    help="Path to the Jinja2 template file (with .j2 extension).",
+)
+parser.add_argument(
+    "json_file", type=str, help="Path to the JSON file to use as the rendering context."
+)
+parser.add_argument("output_file", type=str, help="Path to the output file.")
 
 args = parser.parse_args()
 
 # Load JSON file as the rendering context
-with open(args.json_file, 'r') as file:
+with open(args.json_file, "r") as file:
     context = json.load(file)
 
 # Setup Jinja2 environment and load the template
 env = Environment(
-    loader=FileSystemLoader(searchpath='./'),
-    variable_start_string='<<',
-    variable_end_string='>>',
-    block_start_string='<%',
-    block_end_string='%>',
-    comment_start_string='<#',
-    comment_end_string='#>')
-env.filters['replace_placeholder'] = replace_placeholder
+    loader=FileSystemLoader(searchpath="./"),
+    variable_start_string="<<",
+    variable_end_string=">>",
+    block_start_string="<%",
+    block_end_string="%>",
+    comment_start_string="<#",
+    comment_end_string="#>",
+)
+env.filters["replace_placeholder"] = replace_placeholder
 
 template = env.get_template(args.template_file)
 
 # Render the template with the context
 rendered_content = template.render(context)
 # print(rendered_content)
 
-with open(args.output_file, 'w') as file:
+with open(args.output_file, "w") as file:
     file.write(rendered_content)
 
-print(f'Template rendered successfully. Output saved to {args.output_file}')
+print(f"Template rendered successfully. Output saved to {args.output_file}")
@@ -5,7 +5,7 @@
 # this software. Any use, reproduction, disclosure, or distribution of
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
-kernel_string = '''\
+kernel_string = """\
 #define ITEM_PARAM(x, T) T x
 #define REP1(x, T)   , ITEM_PARAM(x, T)	
 #define REP2(x, T)   REP1(x##0, T)   REP1(x##1, T)
@@ -160,4 +160,4 @@
     // Do not touch param to prevent compiler from copying
     // the whole structure from const bank to lmem.
 }
-'''
+"""
@@ -9,23 +9,25 @@
 from cuda import cuda, cudart, nvrtc
 import numpy as np
 
+
 def ASSERT_DRV(err):
     if isinstance(err, cuda.CUresult):
         if err != cuda.CUresult.CUDA_SUCCESS:
-            raise RuntimeError('Cuda Error: {}'.format(err))
+            raise RuntimeError("Cuda Error: {}".format(err))
     elif isinstance(err, cudart.cudaError_t):
         if err != cudart.cudaError_t.cudaSuccess:
-            raise RuntimeError('Cudart Error: {}'.format(err))
+            raise RuntimeError("Cudart Error: {}".format(err))
     elif isinstance(err, nvrtc.nvrtcResult):
         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError('Nvrtc Error: {}'.format(err))
+            raise RuntimeError("Nvrtc Error: {}".format(err))
     else:
-        raise RuntimeError('Unknown error type: {}'.format(err))
+        raise RuntimeError("Unknown error type: {}".format(err))
+
 
 @pytest.fixture
 def init_cuda():
     # Initialize
-    err, = cuda.cuInit(0)
+    (err,) = cuda.cuInit(0)
     ASSERT_DRV(err)
     err, device = cuda.cuDeviceGet(0)
     ASSERT_DRV(err)
@@ -38,31 +40,42 @@ def init_cuda():
 
     yield device, ctx, stream
 
-    err, = cuda.cuStreamDestroy(stream)
+    (err,) = cuda.cuStreamDestroy(stream)
     ASSERT_DRV(err)
-    err, = cuda.cuCtxDestroy(ctx)
+    (err,) = cuda.cuCtxDestroy(ctx)
     ASSERT_DRV(err)
 
+
 @pytest.fixture
 def load_module():
     module = None
+
     def _load_module(kernel_string, device):
         nonlocal module
         # Get module
-        err, major = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)
+        err, major = cuda.cuDeviceGetAttribute(
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device
+        )
         ASSERT_DRV(err)
-        err, minor = cuda.cuDeviceGetAttribute(cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)
+        err, minor = cuda.cuDeviceGetAttribute(
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device
+        )
         ASSERT_DRV(err)
 
-        err, prog = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b'kernelString.cu', 0, [], [])
+        err, prog = nvrtc.nvrtcCreateProgram(
+            str.encode(kernel_string), b"kernelString.cu", 0, [], []
+        )
         ASSERT_DRV(err)
-        opts = [b'--fmad=false', bytes('--gpu-architecture=sm_' + str(major) + str(minor), 'ascii')]
-        err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)
+        opts = [
+            b"--fmad=false",
+            bytes("--gpu-architecture=sm_" + str(major) + str(minor), "ascii"),
+        ]
+        (err,) = nvrtc.nvrtcCompileProgram(prog, 2, opts)
 
         err_log, logSize = nvrtc.nvrtcGetProgramLogSize(prog)
         ASSERT_DRV(err_log)
-        log = b' ' * logSize
-        err_log, = nvrtc.nvrtcGetProgramLog(prog, log)
+        log = b" " * logSize
+        (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log)
         ASSERT_DRV(err_log)
         result = log.decode()
         if len(result) > 1:
@@ -71,8 +84,8 @@ def _load_module(kernel_string, device):
         ASSERT_DRV(err)
         err, cubinSize = nvrtc.nvrtcGetCUBINSize(prog)
         ASSERT_DRV(err)
-        cubin = b' ' * cubinSize
-        err, = nvrtc.nvrtcGetCUBIN(prog, cubin)
+        cubin = b" " * cubinSize
+        (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin)
         ASSERT_DRV(err)
         cubin = np.char.array(cubin)
         err, module = cuda.cuModuleLoadData(cubin)
@@ -82,5 +95,5 @@ def _load_module(kernel_string, device):
 
     yield _load_module
 
-    err, = cuda.cuModuleUnload(module)
+    (err,) = cuda.cuModuleUnload(module)
     ASSERT_DRV(err)
@@ -13,34 +13,38 @@
 if not skip_tests:
     try:
         import cupy
+
         skip_tests = False
     except ImportError:
         skip_tests = True
 
 from .kernels import kernel_string
 
+
 def launch(kernel, args=()):
     kernel((1,), (1,), args)
 
+
 # Measure launch latency with no parameters
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_empty_kernel(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('empty_kernel')
+    kernel = module.get_function("empty_kernel")
 
     stream = cupy.cuda.stream.Stream(non_blocking=True)
 
     with stream:
         benchmark(launch, kernel)
         stream.synchronize()
 
+
 # Measure launch latency with a single parameter
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel')
+    kernel = module.get_function("small_kernel")
     cupy.cuda.set_allocator()
     arg = cupy.cuda.alloc(ctypes.sizeof(ctypes.c_float))
 
@@ -50,12 +54,13 @@ def test_launch_latency_small_kernel(benchmark):
         benchmark(launch, kernel, (arg,))
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_args(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_args')
+    kernel = module.get_function("small_kernel_512_args")
     cupy.cuda.set_allocator()
 
     args = []
@@ -69,12 +74,13 @@ def test_launch_latency_small_kernel_512_args(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_bools(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_bools')
+    kernel = module.get_function("small_kernel_512_bools")
     cupy.cuda.set_allocator()
 
     args = [True] * 512
@@ -86,12 +92,13 @@ def test_launch_latency_small_kernel_512_bools(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_doubles(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_doubles')
+    kernel = module.get_function("small_kernel_512_doubles")
     cupy.cuda.set_allocator()
 
     args = [1.2345] * 512
@@ -103,12 +110,13 @@ def test_launch_latency_small_kernel_512_doubles(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_ints(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_ints')
+    kernel = module.get_function("small_kernel_512_ints")
     cupy.cuda.set_allocator()
 
     args = [123] * 512
@@ -120,12 +128,13 @@ def test_launch_latency_small_kernel_512_ints(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_bytes(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_chars')
+    kernel = module.get_function("small_kernel_512_chars")
     cupy.cuda.set_allocator()
 
     args = [127] * 512
@@ -137,12 +146,13 @@ def test_launch_latency_small_kernel_512_bytes(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_512_longlongs(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_512_longlongs')
+    kernel = module.get_function("small_kernel_512_longlongs")
     cupy.cuda.set_allocator()
 
     args = [9223372036854775806] * 512
@@ -154,12 +164,13 @@ def test_launch_latency_small_kernel_512_longlongs(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_256_args(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_256_args')
+    kernel = module.get_function("small_kernel_256_args")
     cupy.cuda.set_allocator()
 
     args = []
@@ -173,12 +184,13 @@ def test_launch_latency_small_kernel_256_args(benchmark):
         benchmark(launch, kernel, args)
         stream.synchronize()
 
+
 # Measure launch latency with many parameters using builtin parameter packing
 @pytest.mark.skipif(skip_tests, reason="cupy is not installed")
 @pytest.mark.benchmark(group="cupy")
 def test_launch_latency_small_kernel_16_args(benchmark):
     module = cupy.RawModule(code=kernel_string)
-    kernel = module.get_function('small_kernel_16_args')
+    kernel = module.get_function("small_kernel_16_args")
     cupy.cuda.set_allocator()
 
     args = []