Skip to content

Commit 7c48266

Browse files
dhowells authored and kuba-moo committed
rxrpc: Implement RACK/TLP to deal with transmission stalls [RFC8985]
When an rxrpc call is in its transmission phase and is sending a lot of packets, stalls occasionally occur that cause severe performance degradation (eg. increasing the transmission time for a 256MiB payload from 0.7s to 2.5s over a 10G link). rxrpc already implements TCP-style congestion control [RFC5681] and this helps mitigate the effects, but occasionally we're missing a time event that deals with a missing ACK, leading to a stall until the RTO expires. Fix this by implementing RACK/TLP in rxrpc. Signed-off-by: David Howells <[email protected]> cc: Marc Dionne <[email protected]> cc: [email protected] Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 4ee4c2f commit 7c48266

File tree

9 files changed

+1041
-236
lines changed

9 files changed

+1041
-236
lines changed

include/trace/events/rxrpc.h

Lines changed: 323 additions & 19 deletions
Large diffs are not rendered by default.

net/rxrpc/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ rxrpc-y := \
1616
conn_object.o \
1717
conn_service.o \
1818
input.o \
19+
input_rack.o \
1920
insecure.o \
2021
io_thread.o \
2122
key.o \

net/rxrpc/ar-internal.h

Lines changed: 102 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,18 @@ enum rxrpc_ca_state {
621621
NR__RXRPC_CA_STATES
622622
} __mode(byte);
623623

624+
/*
 * What the call's RACK timer is currently being used for.  Under the RACK-TLP
 * protocol [RFC8985], the single transmission timer (call->rack_timo_at) may
 * serve only one of these purposes at any given moment.
 */
enum rxrpc_rack_timer_mode {
	RXRPC_CALL_RACKTIMER_OFF		= 0,	/* Timer not running */
	RXRPC_CALL_RACKTIMER_RACK_REORDER	= 1,	/* RACK reordering timer */
	RXRPC_CALL_RACKTIMER_TLP_PTO		= 2,	/* TLP probe timeout */
	RXRPC_CALL_RACKTIMER_RTO		= 3,	/* Retransmission timeout */
} __mode(byte);
635+
624636
/*
625637
* RxRPC call definition
626638
* - matched by { connection, call_id }
@@ -638,8 +650,7 @@ struct rxrpc_call {
638650
struct mutex user_mutex; /* User access mutex */
639651
struct sockaddr_rxrpc dest_srx; /* Destination address */
640652
ktime_t delay_ack_at; /* When DELAY ACK needs to happen */
641-
ktime_t ack_lost_at; /* When ACK is figured as lost */
642-
ktime_t resend_at; /* When next resend needs to happen */
653+
ktime_t rack_timo_at; /* When the RACK/TLP timer expires (stale comment carried over from removed ack_lost_at — verify) */
643654
ktime_t ping_at; /* When next to send a ping */
644655
ktime_t keepalive_at; /* When next to send a keepalive ping */
645656
ktime_t expect_rx_by; /* When we expect to get a packet by */
@@ -695,8 +706,12 @@ struct rxrpc_call {
695706
rxrpc_seq_t tx_bottom; /* First packet in buffer */
696707
rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */
697708
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
709+
rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */
698710
u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */
699-
u8 tx_winsize; /* Maximum size of Tx window */
711+
u16 tx_nr_sent; /* Number of packets sent, but unacked */
712+
u16 tx_nr_lost; /* Number of packets marked lost */
713+
u16 tx_nr_resent; /* Number of packets resent, but unacked */
714+
u16 tx_winsize; /* Maximum size of Tx window */
700715
#define RXRPC_TX_MAX_WINDOW 128
701716
u8 tx_jumbo_max; /* Maximum subpkts peer will accept */
702717
ktime_t tx_last_sent; /* Last time a transmission occurred */
@@ -725,6 +740,25 @@ struct rxrpc_call {
725740
u16 cong_cumul_acks; /* Cumulative ACK count */
726741
ktime_t cong_tstamp; /* Last time cwnd was changed */
727742

743+
/* RACK-TLP [RFC8985] state. */
744+
ktime_t rack_xmit_ts; /* Latest transmission timestamp */
745+
ktime_t rack_rtt; /* RTT of most recently ACK'd segment */
746+
ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */
747+
ktime_t rack_reo_wnd; /* Reordering window */
748+
unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */
749+
int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */
750+
rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */
751+
rxrpc_seq_t rack_end_seq; /* Highest sequence seen */
752+
rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */
753+
bool rack_dsack_round_none; /* T if dsack_round is "None" */
754+
bool rack_reordering_seen; /* T if detected reordering event */
755+
enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */
756+
bool tlp_is_retrans; /* T if unacked TLP retransmission */
757+
rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */
758+
rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */
759+
unsigned int tlp_rtt_taken; /* Last time RTT taken */
760+
ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */
761+
728762
/* Receive-phase ACK management (ACKs we send). */
729763
u8 ackr_reason; /* reason to ACK */
730764
u16 ackr_sack_base; /* Starting slot in SACK table ring */
@@ -783,6 +817,9 @@ struct rxrpc_ack_summary {
783817
bool retrans_timeo:1; /* T if reTx due to timeout happened */
784818
bool need_retransmit:1; /* T if we need transmission */
785819
bool rtt_sample_avail:1; /* T if RTT sample available */
820+
bool in_fast_or_rto_recovery:1;
821+
bool exiting_fast_or_rto_recovery:1;
822+
bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */
786823
u8 /*enum rxrpc_congest_change*/ change;
787824
};
788825

@@ -864,6 +901,7 @@ struct rxrpc_txqueue {
864901
unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */
865902
unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */
866903
unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */
904+
unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */
867905

868906
/* The arrays we want to pack into as few cache lines as possible. */
869907
struct {
@@ -883,7 +921,9 @@ struct rxrpc_send_data_req {
883921
struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */
884922
rxrpc_seq_t seq; /* Sequence of first data */
885923
int n; /* Number of DATA packets to glue into jumbo */
924+
bool retrans; /* T if this is a retransmission */
886925
bool did_send; /* T if did actually send */
926+
bool tlp_probe; /* T if this is a TLP probe */
887927
int /* enum rxrpc_txdata_trace */ trace;
888928
};
889929

@@ -943,8 +983,9 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
943983
enum rxrpc_propose_ack_trace why);
944984
void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
945985
enum rxrpc_propose_ack_trace);
946-
void rxrpc_resend(struct rxrpc_call *call, rxrpc_serial_t ack_serial, bool ping_response);
947-
986+
void rxrpc_resend_tlp(struct rxrpc_call *call);
987+
void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
988+
enum rxrpc_txdata_trace trace);
948989
bool rxrpc_input_call_event(struct rxrpc_call *call);
949990

950991
/*
@@ -1123,6 +1164,32 @@ void rxrpc_congestion_degrade(struct rxrpc_call *);
11231164
void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
11241165
void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
11251166

1167+
/*
1168+
* input_rack.c
1169+
*/
1170+
void rxrpc_input_rack_one(struct rxrpc_call *call,
1171+
struct rxrpc_ack_summary *summary,
1172+
struct rxrpc_txqueue *tq,
1173+
unsigned int ix);
1174+
void rxrpc_input_rack(struct rxrpc_call *call,
1175+
struct rxrpc_ack_summary *summary,
1176+
struct rxrpc_txqueue *tq,
1177+
unsigned long new_acks);
1178+
void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
1179+
struct rxrpc_ack_summary *summary);
1180+
ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now);
1181+
void rxrpc_tlp_send_probe(struct rxrpc_call *call);
1182+
void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary);
1183+
void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by);
1184+
1185+
/* Initialise TLP state [RFC8958 7.1]. */
1186+
static inline void rxrpc_tlp_init(struct rxrpc_call *call)
1187+
{
1188+
call->tlp_serial = 0;
1189+
call->tlp_seq = call->acks_hard_ack;
1190+
call->tlp_is_retrans = false;
1191+
}
1192+
11261193
/*
11271194
* io_thread.c
11281195
*/
@@ -1402,13 +1469,43 @@ static inline u32 latest(u32 seq1, u32 seq2)
14021469
return after(seq1, seq2) ? seq1 : seq2;
14031470
}
14041471

1472+
/*
 * Test whether a DATA sequence number falls within the span covered by a Tx
 * queue segment.  Each segment holds RXRPC_NR_TXQUEUE consecutive packets
 * starting at tq->qbase.  The original test compared the low bits of seq
 * directly against qbase, which matches at most one sequence number per
 * window and misclassifies every other member of the segment.
 */
static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq)
{
	/* rxrpc_seq_t is unsigned, so the difference wraps correctly even if
	 * seq has wrapped around past qbase.
	 */
	return seq - tq->qbase < RXRPC_NR_TXQUEUE;
}
1476+
14051477
static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
14061478
{
14071479
rxrpc_get_skb(skb, rxrpc_skb_get_call_rx);
14081480
__skb_queue_tail(&call->rx_queue, skb);
14091481
rxrpc_poke_call(call, rxrpc_call_poke_rx_packet);
14101482
}
14111483

1484+
/*
1485+
* Calculate how much space there is for transmitting more DATA packets.
1486+
*/
1487+
static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call)
1488+
{
1489+
int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra);
1490+
int transmitted = call->tx_top - call->tx_bottom;
1491+
1492+
return max(winsize - transmitted, 0);
1493+
}
1494+
1495+
static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call)
1496+
{
1497+
return call->acks_nr_sacks + call->tx_nr_lost;
1498+
}
1499+
1500+
/*
1501+
* Calculate the number of transmitted DATA packets assumed to be in flight
1502+
* [approx RFC6675].
1503+
*/
1504+
static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call)
1505+
{
1506+
return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent;
1507+
}
1508+
14121509
/*
14131510
* debug tracing
14141511
*/

0 commit comments

Comments
 (0)