File tree Expand file tree Collapse file tree 5 files changed +15
-5
lines changed Expand file tree Collapse file tree 5 files changed +15
-5
lines changed Original file line number Diff line number Diff line change @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
33
33
steps = 1000
34
34
data_parallel_degree = -1
35
35
tensor_parallel_degree = 1
36
- pipeline_parallel_degree = 1
37
36
fp8_linear = " "
38
37
compile = false
39
38
dataset = " c4"
40
39
40
+ [experimental ]
41
+ pipeline_parallel_degree = 1
42
+
41
43
[checkpoint ]
42
44
enable_checkpoint = false
43
45
folder = " checkpoint"
Original file line number Diff line number Diff line change @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
33
33
steps = 1000
34
34
data_parallel_degree = -1
35
35
tensor_parallel_degree = 8 # 8-way TP
36
- pipeline_parallel_degree = 1
37
36
fp8_linear = " "
38
37
compile = false
39
38
dataset = " c4"
40
39
40
+ [experimental ]
41
+ pipeline_parallel_degree = 1
42
+
41
43
[checkpoint ]
42
44
enable_checkpoint = false
43
45
folder = " checkpoint"
Original file line number Diff line number Diff line change @@ -32,11 +32,13 @@ max_norm = 1.0 # grad norm clipping
32
32
steps = 1000
33
33
data_parallel_degree = -1
34
34
tensor_parallel_degree = 1 # dp-only would be sufficient for 7B
35
- pipeline_parallel_degree = 1
36
35
fp8_linear = " "
37
36
compile = false
38
37
dataset = " c4"
39
38
39
+ [experimental ]
40
+ pipeline_parallel_degree = 1
41
+
40
42
[checkpoint ]
41
43
enable_checkpoint = false
42
44
folder = " checkpoint"
Original file line number Diff line number Diff line change @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
33
33
steps = 1000
34
34
data_parallel_degree = -1
35
35
tensor_parallel_degree = 8 # 8-way TP
36
- pipeline_parallel_degree = 1
37
36
fp8_linear = " "
38
37
compile = false
39
38
dataset = " c4"
40
39
40
+ [experimental ]
41
+ pipeline_parallel_degree = 1
42
+
41
43
[checkpoint ]
42
44
enable_checkpoint = false
43
45
folder = " checkpoint"
Original file line number Diff line number Diff line change @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
33
33
steps = 1000
34
34
data_parallel_degree = -1
35
35
tensor_parallel_degree = 1
36
- pipeline_parallel_degree = 1
37
36
fp8_linear = " "
38
37
compile = false
39
38
dataset = " c4"
40
39
40
+ [experimental ]
41
+ pipeline_parallel_degree = 1
42
+
41
43
[checkpoint ]
42
44
enable_checkpoint = false
43
45
folder = " checkpoint"
You can’t perform that action at this time.
0 commit comments