Skip to content

Commit e3911c9

Browse files
moonli and facebook-github-bot
authored and committed
enable worker slice reshaping in controller to avoid hot node in message casting
Summary: A perf test revealed that a casting node with a large fanout could still hit the single-thread bottleneck. We recently introduced slice reshaping, which lets us decouple the mesh Slice from the distribution Slice; this approach was verified in the perf test. This diff enables reshaping in the controller. Reviewed By: highker Differential Revision: D74748665 fbshipit-source-id: 5b0283f9ae8a6e40faebbd2296bf097f1ba6352d
1 parent 217e904 commit e3911c9

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

controller/src/lib.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,16 @@ use monarch_messages::worker::WorkerActor;
5959
use monarch_messages::worker::WorkerMessage;
6060
use ndslice::Selection;
6161
use ndslice::Slice;
62+
use ndslice::reshape::Limit;
63+
use ndslice::reshape::ReshapeSliceExt;
6264
use ndslice::selection::dsl;
6365
use ndslice::shape::Range;
6466
use serde::Deserialize;
6567
use serde::Serialize;
6668
use tokio::sync::OnceCell;
6769

70+
/// Limit handed to `view_limit` (as `Limit::from(CASTING_FANOUT_SIZE)`) when the
/// controller reshapes the flat world slice that is used as the destination of a
/// `CastMessage`.
///
/// NOTE(review): presumably this bounds how many peers any single node forwards
/// a cast to, so that a large world does not create a hot node — confirm against
/// the `ndslice::reshape` documentation.
const CASTING_FANOUT_SIZE: usize = 8;
71+
6872
/// A controller for the workers that will be leveraged by the client to do the actual
6973
/// compute tasks. This acts as a proxy managing comms with the workers and handling things like history,
7074
/// data dependency, worker lifecycles etc for the client abstracting it away.
@@ -387,7 +391,14 @@ impl ControllerMessageHandler for ControllerActor {
387391
message: Serialized,
388392
) -> Result<(), anyhow::Error> {
389393
let selection = match ranks {
390-
Ranks::Slice(slice) => slice_to_selection(slice),
394+
Ranks::Slice(slice) => {
395+
if slice.len() == self.world_size {
396+
// All ranks are selected.
397+
Selection::True
398+
} else {
399+
slice_to_selection(slice)
400+
}
401+
}
391402
Ranks::SliceList(slices) => slices.into_iter().fold(dsl::false_(), |sel, slice| {
392403
dsl::union(sel, slice_to_selection(slice))
393404
}),
@@ -399,12 +410,17 @@ impl ControllerMessageHandler for ControllerActor {
399410
),
400411
message,
401412
);
413+
414+
let slice = Slice::new(0usize, vec![self.world_size], vec![1])
415+
.unwrap()
416+
.view_limit(Limit::from(CASTING_FANOUT_SIZE));
417+
402418
self.comm_actor_ref.port::<CastMessage>().send(
403419
this,
404420
CastMessage {
405421
dest: Uslice {
406422
// TODO: pass both slice and selection from client side
407-
slice: Slice::new(0usize, vec![self.world_size], vec![1]).unwrap(),
423+
slice,
408424
selection,
409425
},
410426
message,

0 commit comments

Comments
 (0)