Skip to content

Commit 365ad35

Browse files
Jon Paul Maloy authored and davem330 committed
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking and trying to send again when the congestion has abated, or just returning to the user with -EAGAIN and let him re-try later. This mechanism is prone to starvation, because the wakeup algorithm is non-atomic. During the time the link issues a wakeup signal, until the socket wakes up and re-attempts sending, other senders may have come in between and occupied the free buffer space in the link. This in turn may lead to a socket having to make many send attempts before it is successful. In extremely loaded systems we have observed latency times of several seconds before a low-priority socket is able to send out a message. In this commit, we simplify this mechanism and reduce the risk of the described scenario happening. When a message is attempted sent via a congested link, we now let it be added to the link's backlog queue anyway, thus permitting an oversubscription of one message per source socket. We still create a wakeup item and return an error code, hence instructing the sender to block or stop sending. Only when enough space has been freed up in the link's backlog queue do we issue a wakeup event that allows the sender to continue with the next message, if any. The fact that a socket now can consider a message sent even when the link returns a congestion code means that the sending socket code can be simplified. Also, since this is a good opportunity to get rid of the obsolete 'mtu change' condition in the three socket send functions, we now choose to refactor those functions completely. Signed-off-by: Parthasarathy Bhuvaragan <[email protected]> Acked-by: Ying Xue <[email protected]> Signed-off-by: Jon Maloy <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 4d8642d commit 365ad35

File tree

5 files changed

+194
-251
lines changed

5 files changed

+194
-251
lines changed

net/tipc/bcast.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
174174
* and to identified node local sockets
175175
* @net: the applicable net namespace
176176
* @list: chain of buffers containing message
177-
* Consumes the buffer chain, except when returning -ELINKCONG
177+
* Consumes the buffer chain.
178178
* Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
179179
*/
180180
int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
@@ -197,7 +197,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
197197
tipc_bcast_unlock(net);
198198

199199
/* Don't send to local node if adding to link failed */
200-
if (unlikely(rc)) {
200+
if (unlikely(rc && (rc != -ELINKCONG))) {
201201
__skb_queue_purge(&rcvq);
202202
return rc;
203203
}
@@ -206,7 +206,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
206206
tipc_bcbase_xmit(net, &xmitq);
207207
tipc_sk_mcast_rcv(net, &rcvq, &inputq);
208208
__skb_queue_purge(list);
209-
return 0;
209+
return rc;
210210
}
211211

212212
/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link

net/tipc/link.c

Lines changed: 32 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -776,60 +776,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
776776

777777
/**
778778
* link_schedule_user - schedule a message sender for wakeup after congestion
779-
* @link: congested link
780-
* @list: message that was attempted sent
779+
* @l: congested link
780+
* @hdr: header of message that is being sent
781781
* Create pseudo msg to send back to user when congestion abates
782-
* Does not consume buffer list
783782
*/
784-
static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
783+
static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
785784
{
786-
struct tipc_msg *msg = buf_msg(skb_peek(list));
787-
int imp = msg_importance(msg);
788-
u32 oport = msg_origport(msg);
789-
u32 addr = tipc_own_addr(link->net);
785+
u32 dnode = tipc_own_addr(l->net);
786+
u32 dport = msg_origport(hdr);
790787
struct sk_buff *skb;
791788

792-
/* This really cannot happen... */
793-
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
794-
pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
795-
return -ENOBUFS;
796-
}
797-
/* Non-blocking sender: */
798-
if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
799-
return -ELINKCONG;
800-
801789
/* Create and schedule wakeup pseudo message */
802790
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
803-
addr, addr, oport, 0, 0);
791+
dnode, l->addr, dport, 0, 0);
804792
if (!skb)
805793
return -ENOBUFS;
806-
TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
807-
TIPC_SKB_CB(skb)->chain_imp = imp;
808-
skb_queue_tail(&link->wakeupq, skb);
809-
link->stats.link_congs++;
794+
msg_set_dest_droppable(buf_msg(skb), true);
795+
TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
796+
skb_queue_tail(&l->wakeupq, skb);
797+
l->stats.link_congs++;
810798
return -ELINKCONG;
811799
}
812800

813801
/**
814802
* link_prepare_wakeup - prepare users for wakeup after congestion
815-
* @link: congested link
816-
* Move a number of waiting users, as permitted by available space in
817-
* the send queue, from link wait queue to node wait queue for wakeup
803+
* @l: congested link
804+
* Wake up a number of waiting users, as permitted by available space
805+
* in the send queue
818806
*/
819807
void link_prepare_wakeup(struct tipc_link *l)
820808
{
821-
int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
822-
int imp, lim;
823809
struct sk_buff *skb, *tmp;
810+
int imp, i = 0;
824811

825812
skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
826813
imp = TIPC_SKB_CB(skb)->chain_imp;
827-
lim = l->backlog[imp].limit;
828-
pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
829-
if ((pnd[imp] + l->backlog[imp].len) >= lim)
814+
if (l->backlog[imp].len < l->backlog[imp].limit) {
815+
skb_unlink(skb, &l->wakeupq);
816+
skb_queue_tail(l->inputq, skb);
817+
} else if (i++ > 10) {
830818
break;
831-
skb_unlink(skb, &l->wakeupq);
832-
skb_queue_tail(l->inputq, skb);
819+
}
833820
}
834821
}
835822

@@ -869,8 +856,7 @@ void tipc_link_reset(struct tipc_link *l)
869856
* @list: chain of buffers containing message
870857
* @xmitq: returned list of packets to be sent by caller
871858
*
872-
* Consumes the buffer chain, except when returning -ELINKCONG,
873-
* since the caller then may want to make more send attempts.
859+
* Consumes the buffer chain.
874860
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
875861
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
876862
*/
@@ -879,7 +865,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
879865
{
880866
struct tipc_msg *hdr = buf_msg(skb_peek(list));
881867
unsigned int maxwin = l->window;
882-
unsigned int i, imp = msg_importance(hdr);
868+
int imp = msg_importance(hdr);
883869
unsigned int mtu = l->mtu;
884870
u16 ack = l->rcv_nxt - 1;
885871
u16 seqno = l->snd_nxt;
@@ -888,19 +874,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
888874
struct sk_buff_head *backlogq = &l->backlogq;
889875
struct sk_buff *skb, *_skb, *bskb;
890876
int pkt_cnt = skb_queue_len(list);
877+
int rc = 0;
891878

892-
/* Match msg importance against this and all higher backlog limits: */
893-
if (!skb_queue_empty(backlogq)) {
894-
for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
895-
if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
896-
return link_schedule_user(l, list);
897-
}
898-
}
899879
if (unlikely(msg_size(hdr) > mtu)) {
900880
skb_queue_purge(list);
901881
return -EMSGSIZE;
902882
}
903883

884+
/* Allow oversubscription of one data msg per source at congestion */
885+
if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
886+
if (imp == TIPC_SYSTEM_IMPORTANCE) {
887+
pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
888+
return -ENOBUFS;
889+
}
890+
rc = link_schedule_user(l, hdr);
891+
}
892+
904893
if (pkt_cnt > 1) {
905894
l->stats.sent_fragmented++;
906895
l->stats.sent_fragments += pkt_cnt;
@@ -946,7 +935,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
946935
skb_queue_splice_tail_init(list, backlogq);
947936
}
948937
l->snd_nxt = seqno;
949-
return 0;
938+
return rc;
950939
}
951940

952941
void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)

net/tipc/msg.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,6 @@ struct tipc_skb_cb {
9898
u32 bytes_read;
9999
struct sk_buff *tail;
100100
bool validated;
101-
bool wakeup_pending;
102-
u16 chain_sz;
103101
u16 chain_imp;
104102
u16 ackers;
105103
};

net/tipc/node.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1167,7 +1167,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node)
11671167
* @list: chain of buffers containing message
11681168
* @dnode: address of destination node
11691169
* @selector: a number used for deterministic link selection
1170-
* Consumes the buffer chain, except when returning -ELINKCONG
1170+
* Consumes the buffer chain.
11711171
* Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
11721172
*/
11731173
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
@@ -1206,10 +1206,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
12061206
spin_unlock_bh(&le->lock);
12071207
tipc_node_read_unlock(n);
12081208

1209-
if (likely(rc == 0))
1210-
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
1211-
else if (rc == -ENOBUFS)
1209+
if (unlikely(rc == -ENOBUFS))
12121210
tipc_node_link_down(n, bearer_id, false);
1211+
else
1212+
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
12131213

12141214
tipc_node_put(n);
12151215

@@ -1221,20 +1221,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
12211221
* messages, which will not be rejected
12221222
* The only exception is datagram messages rerouted after secondary
12231223
* lookup, which are rare and safe to dispose of anyway.
1224-
* TODO: Return real return value, and let callers use
1225-
* tipc_wait_for_sendpkt() where applicable
12261224
*/
12271225
int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
12281226
u32 selector)
12291227
{
12301228
struct sk_buff_head head;
1231-
int rc;
12321229

12331230
skb_queue_head_init(&head);
12341231
__skb_queue_tail(&head, skb);
1235-
rc = tipc_node_xmit(net, &head, dnode, selector);
1236-
if (rc == -ELINKCONG)
1237-
kfree_skb(skb);
1232+
tipc_node_xmit(net, &head, dnode, selector);
12381233
return 0;
12391234
}
12401235

0 commit comments

Comments (0)