@@ -284,261 +284,3 @@ def __del__(self):
284
284
"""Close profiler's stream."""
285
285
if self .output_file :
286
286
self .output_file .close ()
287
-
288
-
289
class PyTorchProfiler(BaseProfiler):
    """Profiler wrapping ``torch.autograd.profiler`` to measure the cost of
    operators executed inside selected Lightning functions, on CPU and GPU."""

    # Function names that are profiled by default; any other action passes through.
    PROFILED_FUNCTIONS = ("training_step_and_backward", "validation_step", "test_step")
    # Valid values for ``sort_by_key`` (columns of the autograd profiler table).
    AVAILABLE_SORT_KEYS = (
        "cpu_time",
        "cuda_time",
        "cpu_time_total",
        "cuda_time_total",
        "cpu_memory_usage",
        "cuda_memory_usage",
        "self_cpu_memory_usage",
        "self_cuda_memory_usage",
        "count",
    )

    def __init__(
        self,
        output_filename: Optional[str] = None,
        enabled: bool = True,
        use_cuda: bool = False,
        record_shapes: bool = False,
        profile_memory: bool = False,
        group_by_input_shapes: bool = False,
        with_stack: bool = False,
        use_kineto: bool = False,
        use_cpu: bool = True,
        emit_nvtx: bool = False,
        export_to_chrome: bool = False,
        path_to_export_trace: Optional[str] = None,
        row_limit: int = 20,
        sort_by_key: Optional[str] = None,
        profiled_functions: Optional[List] = None,
        local_rank: Optional[int] = None,
    ):
        """
        This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
        different operators inside your model - both on the CPU and GPU

        Args:

            output_filename: optionally save profile results to file instead of printing
                to std out when training is finished. When using ``ddp``,
                each rank will stream the profiled operation to their own file
                with the extension ``_{rank}.txt``

            enabled: Setting this to False makes this context manager a no-op.

            use_cuda: Enables timing of CUDA events as well using the cudaEvent API.
                Adds approximately 4us of overhead to each tensor operation.

            record_shapes: If shapes recording is set, information about input dimensions will be collected.

            profile_memory: Whether to report memory usage, default: False (Introduced in PyTorch 1.6.0)

            group_by_input_shapes: Include operator input shapes and group calls by shape.

            with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0)

            use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0)

            use_cpu: use_kineto=True and can be used to lower the overhead
                for GPU-only profiling (Introduced in PyTorch 1.8.0)

            emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
                Run::

                    nvprof --profile-from-start off -o trace_name.prof -- <regular command here>

                To visualize, you can either use::

                    nvvp trace_name.prof
                    torch.autograd.profiler.load_nvprof(path)

            export_to_chrome: Whether to export the sequence of profiled operators for Chrome.
                It will generate a ``.json`` file which can be read by Chrome.

            path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``.
                By default, traces are saved in the directory the run is launched from.

            row_limit: Limit the number of rows in a table, ``0`` is a special value that
                removes the limit completely.

            sort_by_key: Key used to sort the profiled table; must be one of ``AVAILABLE_SORT_KEYS``.

            profiled_functions: list of function names which will be profiled (a context
                manager is created around each). Any other action is passed through.

            local_rank: When running in distributed setting, local_rank is used for each process
                to write to their own file if ``output_filename`` is provided.
        """
        self.profiled_actions = {}
        self.enabled = enabled
        self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS
        self.use_cuda = use_cuda
        self.record_shapes = record_shapes
        self.profile_memory = profile_memory
        # default sort key follows the device actually being timed
        self.sort_by_key = sort_by_key or ("cuda_time_total" if self.use_cuda else "cpu_time_total")
        self.with_stack = with_stack
        # grouping by input shape only makes sense when shapes were recorded
        self.group_by_input_shapes = group_by_input_shapes and record_shapes
        self.use_kineto = use_kineto
        self.use_cpu = use_cpu
        self.row_limit = row_limit
        self.emit_nvtx = emit_nvtx
        self.export_to_chrome = export_to_chrome
        self.path_to_export_trace = path_to_export_trace

        if export_to_chrome and path_to_export_trace is None:
            rank_zero_warn(
                "The exported trace would be saved locally as `path_to_export_trace` is empty."
                " Note: Each function will generate its own traced file."
            )

        if self.sort_by_key not in self.AVAILABLE_SORT_KEYS:
            raise MisconfigurationException(
                f"Found sort_by_key: {sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. "
            )

        self.context_names = {}
        self.running_stack = []
        self.profiler = None

        self.output_fname = output_filename
        self.output_file = None
        # if the rank is already known, set up the output stream immediately and
        # turn any later `on_train_start` call into the base-class no-op
        if local_rank is not None:
            self.on_train_start(local_rank=local_rank)
            self.on_train_start = super().on_train_start

    def on_train_start(self, local_rank: Optional[int] = None):
        """Finish initialization once the process's local rank is known."""
        self.local_rank = local_rank

        # when logging to `log.info`, only perform profiling on rank 0
        if local_rank != 0 and self.output_fname is None:
            self.wrap_functions_into_rank_zero_only()

        if self.output_fname:
            if local_rank is not None:
                if '.txt' not in self.output_fname:
                    raise MisconfigurationException("Log file should be .txt file.")

                # give each rank its own file: `profile.txt` -> `profile_{rank}.txt`
                self.output_fname = self.output_fname.replace(".txt", f"_{self.local_rank}.txt")

            fs = get_filesystem(self.output_fname)
            self.output_file = fs.open(self.output_fname, "w")

        streaming_out = [self.output_file.write] if self.output_file else [log.info]
        # the base class sets up the output streams, hence the deferred __init__
        super().__init__(output_streams=streaming_out)

    def wrap_functions_into_rank_zero_only(self):
        """Restrict all public profiling entry points to rank 0."""
        self.start = rank_zero_only(self.start)
        self.stop = rank_zero_only(self.stop)
        self.summary = rank_zero_only(self.summary)
        self.describe = rank_zero_only(self.describe)

    def start(self, action_name: str) -> None:
        if action_name not in self.profiled_functions:
            return

        # only one autograd profiler may run at a time: pause the current one
        # before starting a nested action
        if len(self.running_stack) > 0:
            self._stop(self.running_stack[-1])
        self.running_stack.append(action_name)

        self.context_names[action_name] = "/".join(self.running_stack)

        self._start(action_name)

    def _start(self, action_name: str) -> None:
        if self.emit_nvtx:
            # the cuda profiler context must stay open (enter=False) while
            # emit_nvtx records inside it
            self._create_profiler(action_name, torch.cuda.profiler.profile, enter=False)
            self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
        else:
            self._create_profiler(action_name, torch.autograd.profiler.profile)

    def _create_profiler(self, action_name, profiler, enter=True):
        # forward only those attributes of self that match the profiler's
        # constructor signature (e.g. use_cuda, record_shapes, ...)
        init_args = inspect.signature(profiler.__init__).parameters
        profiler_args = {k: v for k, v in vars(self).items() if k in init_args}
        pr = profiler(**profiler_args)
        if enter:
            pr = pr.__enter__()
        self.profiler = pr

    def _stop(self, action_name: str) -> None:
        if self.profiler is None:
            return

        self.profiler.__exit__(exc_type=None, exc_val=None, exc_tb=None)

        function_events = self.profiler.function_events
        self.profiler = None
        # credit the recorded events to every action currently on the stack,
        # so parent actions include the cost of their nested children
        for name in self.running_stack:
            if name not in self.profiled_actions:
                self.profiled_actions[name] = function_events
            else:
                self.profiled_actions[name] += function_events

    def stop(self, action_name: str) -> None:
        if action_name not in self.profiled_functions:
            return

        if len(self.running_stack) == 0 or self.running_stack[-1] != action_name:
            raise ValueError(  # pragma: no-cover
                f"Attempting to stop recording an action ({action_name}) which was never started."
            )
        self._stop(action_name)
        self.running_stack.pop()
        # restore running profiler
        if len(self.running_stack) > 0:
            self._start(self.running_stack[-1])

    def summary(self) -> str:
        """Build the textual profiler report (and export Chrome traces if requested)."""
        recorded_stats = {}
        output_string = ''
        local_rank = '0' if self.local_rank is None else self.local_rank

        if not self.enabled:
            return output_string

        for action_name, function_events in self.profiled_actions.items():

            # next line is a workaround for a pytorch issue (fixed on master, still present
            # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
            # parent event for detach`
            function_events.populate_cpu_children = lambda: None

            if self.export_to_chrome:
                filename = f"{action_name}_{local_rank}_trace.json"
                path_to_trace = filename if self.path_to_export_trace is None \
                    else os.path.join(self.path_to_export_trace, filename)
                function_events.export_chrome_trace(path_to_trace)

            if self.emit_nvtx:
                # nvtx results are collected externally (nvprof); nothing to report here
                return output_string

            else:
                data = function_events.key_averages(group_by_input_shapes=self.group_by_input_shapes)
                table = data.table(sort_by=self.sort_by_key, row_limit=self.row_limit)
                recorded_stats[action_name] = table

        # log to standard out
        output_string = f"{os.linesep}Profiler Report{os.linesep}"
        for action, stats in recorded_stats.items():
            output_string += (f"{os.linesep}Profile stats for: {action} rank: {local_rank} {os.linesep}{stats}")

        return output_string

    def describe(self):
        """Logs a profile report after the conclusion of the training run."""
        super().describe()
        if self.output_file:
            self.output_file.flush()

    def __del__(self):
        """Close profiler's stream."""
        if self.output_file:
            self.output_file.close()
0 commit comments