Commit 21dd980

add comment pointing to Sequence Parallel optimization example
ghstack-source-id: 6fa0dcd
Pull Request resolved: #438
1 parent b0ed7f0 commit 21dd980

1 file changed: 3 additions, 0 deletions


torchtitan/parallelisms/parallelize_llama.py

Lines changed: 3 additions & 0 deletions
@@ -362,6 +362,9 @@ def apply_tp(model, world_mesh, parallel_dims, job_config: JobConfig):
     )
 
     # Apply tensor + sequence parallelism to every transformer block
+    # NOTE: At the cost of model code change, we can accelerate Sequence Parallel
+    # by folding (and unfolding) the batch dimension and the sequence dimension.
+    # Examples can be found at https://github.com/pytorch/torchtitan/pull/437
     for layer_id, transformer_block in model.layers.items():
         layer_plan = {
             "attention": prepare_module_input(
