@@ -1234,27 +1234,86 @@ static void packet_free_pending(struct packet_sock *po)
1234
1234
free_percpu (po -> tx_ring .pending_refcnt );
1235
1235
}
1236
1236
1237
- static bool packet_rcv_has_room (struct packet_sock * po , struct sk_buff * skb )
1237
/* Receive-room levels reported by __packet_rcv_has_room().
 *
 * ROOM_POW_OFF is used as a right-shift amount: the "normal" threshold is
 * (capacity >> ROOM_POW_OFF), i.e. one quarter of the receive buffer or of
 * the rx ring length.
 *
 * ROOM_NONE   - no space available
 * ROOM_LOW    - some space, but not more than the threshold above
 * ROOM_NORMAL - more than the threshold is available
 */
+ #define ROOM_POW_OFF 2
1238
+ #define ROOM_NONE 0x0
1239
+ #define ROOM_LOW 0x1
1240
+ #define ROOM_NORMAL 0x2
1241
+
1242
+ static bool __tpacket_has_room (struct packet_sock * po , int pow_off )
1243
+ {
1244
+ int idx , len ;
1245
+
1246
+ len = po -> rx_ring .frame_max + 1 ;
1247
+ idx = po -> rx_ring .head ;
1248
+ if (pow_off )
1249
+ idx += len >> pow_off ;
1250
+ if (idx >= len )
1251
+ idx -= len ;
1252
+ return packet_lookup_frame (po , & po -> rx_ring , idx , TP_STATUS_KERNEL );
1253
+ }
1254
+
1255
+ static bool __tpacket_v3_has_room (struct packet_sock * po , int pow_off )
1256
+ {
1257
+ int idx , len ;
1258
+
1259
+ len = po -> rx_ring .prb_bdqc .knum_blocks ;
1260
+ idx = po -> rx_ring .prb_bdqc .kactive_blk_num ;
1261
+ if (pow_off )
1262
+ idx += len >> pow_off ;
1263
+ if (idx >= len )
1264
+ idx -= len ;
1265
+ return prb_lookup_block (po , & po -> rx_ring , idx , TP_STATUS_KERNEL );
1266
+ }
1267
+
1268
+ static int __packet_rcv_has_room (struct packet_sock * po , struct sk_buff * skb )
1238
1269
{
1239
1270
struct sock * sk = & po -> sk ;
1271
+ int ret = ROOM_NONE ;
1272
+
1273
+ if (po -> prot_hook .func != tpacket_rcv ) {
1274
+ int avail = sk -> sk_rcvbuf - atomic_read (& sk -> sk_rmem_alloc )
1275
+ - (skb ? skb -> truesize : 0 );
1276
+ if (avail > (sk -> sk_rcvbuf >> ROOM_POW_OFF ))
1277
+ return ROOM_NORMAL ;
1278
+ else if (avail > 0 )
1279
+ return ROOM_LOW ;
1280
+ else
1281
+ return ROOM_NONE ;
1282
+ }
1283
+
1284
+ if (po -> tp_version == TPACKET_V3 ) {
1285
+ if (__tpacket_v3_has_room (po , ROOM_POW_OFF ))
1286
+ ret = ROOM_NORMAL ;
1287
+ else if (__tpacket_v3_has_room (po , 0 ))
1288
+ ret = ROOM_LOW ;
1289
+ } else {
1290
+ if (__tpacket_has_room (po , ROOM_POW_OFF ))
1291
+ ret = ROOM_NORMAL ;
1292
+ else if (__tpacket_has_room (po , 0 ))
1293
+ ret = ROOM_LOW ;
1294
+ }
1295
+
1296
+ return ret ;
1297
+ }
1298
+
1299
+ static int packet_rcv_has_room (struct packet_sock * po , struct sk_buff * skb )
1300
+ {
1301
+ int ret ;
1240
1302
bool has_room ;
1241
1303
1242
- if (po -> prot_hook .func != tpacket_rcv )
1243
- return (atomic_read (& sk -> sk_rmem_alloc ) + skb -> truesize )
1244
- <= sk -> sk_rcvbuf ;
1304
+ if (po -> prot_hook .func == tpacket_rcv ) {
1305
+ spin_lock (& po -> sk .sk_receive_queue .lock );
1306
+ ret = __packet_rcv_has_room (po , skb );
1307
+ spin_unlock (& po -> sk .sk_receive_queue .lock );
1308
+ } else {
1309
+ ret = __packet_rcv_has_room (po , skb );
1310
+ }
1245
1311
1246
- spin_lock (& sk -> sk_receive_queue .lock );
1247
- if (po -> tp_version == TPACKET_V3 )
1248
- has_room = prb_lookup_block (po , & po -> rx_ring ,
1249
- po -> rx_ring .prb_bdqc .kactive_blk_num ,
1250
- TP_STATUS_KERNEL );
1251
- else
1252
- has_room = packet_lookup_frame (po , & po -> rx_ring ,
1253
- po -> rx_ring .head ,
1254
- TP_STATUS_KERNEL );
1255
- spin_unlock (& sk -> sk_receive_queue .lock );
1312
+ has_room = ret == ROOM_NORMAL ;
1313
+ if (po -> pressure == has_room )
1314
+ xchg (& po -> pressure , !has_room );
1256
1315
1257
- return has_room ;
1316
+ return ret ;
1258
1317
}
1259
1318
1260
1319
static void packet_sock_destruct (struct sock * sk )
@@ -1282,6 +1341,20 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1282
1341
return x ;
1283
1342
}
1284
1343
1344
+ static bool fanout_flow_is_huge (struct packet_sock * po , struct sk_buff * skb )
1345
+ {
1346
+ u32 rxhash ;
1347
+ int i , count = 0 ;
1348
+
1349
+ rxhash = skb_get_hash (skb );
1350
+ for (i = 0 ; i < ROLLOVER_HLEN ; i ++ )
1351
+ if (po -> rollover -> history [i ] == rxhash )
1352
+ count ++ ;
1353
+
1354
+ po -> rollover -> history [prandom_u32 () % ROLLOVER_HLEN ] = rxhash ;
1355
+ return count > (ROLLOVER_HLEN >> 1 );
1356
+ }
1357
+
1285
1358
static unsigned int fanout_demux_hash (struct packet_fanout * f ,
1286
1359
struct sk_buff * skb ,
1287
1360
unsigned int num )
@@ -1318,22 +1391,39 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1318
1391
1319
1392
/* Pick the fanout member that should receive @skb, rolling over to
 * another socket when the preferred one is short on room.
 *
 * @f:        fanout group
 * @skb:      packet being delivered
 * @idx:      index in f->arr of the preferred (origin) socket
 * @try_self: probe the origin socket first and keep it if it has
 *            ROOM_NORMAL, or ROOM_LOW while the flow is not huge
 * @num:      number of sockets in f->arr
 *
 * After an unsuccessful self-try the origin socket is excluded from the
 * scan, since it was just probed. Without @try_self (plain rollover mode)
 * no socket is excluded, so the socket at @idx is a candidate like any
 * other.
 *
 * A candidate is accepted only if its pressure flag is clear and it
 * reports ROOM_NORMAL. The scan starts at the index remembered in the
 * origin's rollover state, which is updated when a different socket is
 * chosen; rollover counters (num, num_huge, num_failed) are kept there
 * too.
 *
 * The rollover state may be absent: the fanout_add() hunk visible in this
 * file allocates it only for PACKET_FANOUT_FLAG_ROLLOVER, while this
 * function is also called for PACKET_FANOUT_ROLLOVER mode. In that case
 * the scan starts at index 0 and no cursor or statistics are recorded.
 * NOTE(review): the self-try path still relies on po->rollover being
 * allocated (fanout_flow_is_huge() dereferences it); this presumes every
 * member of a group using the flag joined with that flag - confirm
 * against fanout_add().
 *
 * Returns the chosen index, or @idx if no socket qualified.
 */
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		/* origin was just probed and rejected: do not retry it */
		po_skip = po;
	}

	i = j = po->rollover ? min_t(int, po->rollover->sock, num - 1) : 0;
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (po->rollover) {
				if (i != j)
					po->rollover->sock = i;
				atomic_long_inc(&po->rollover->num);
				/* origin still had some room: the packet was
				 * moved because its flow is huge
				 */
				if (room == ROOM_LOW)
					atomic_long_inc(&po->rollover->num_huge);
			}
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	if (po->rollover)
		atomic_long_inc(&po->rollover->num_failed);
	return idx;
}
1339
1429
@@ -1386,17 +1476,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1386
1476
idx = fanout_demux_qm (f , skb , num );
1387
1477
break ;
1388
1478
case PACKET_FANOUT_ROLLOVER :
1389
- idx = fanout_demux_rollover (f , skb , 0 , ( unsigned int ) -1 , num );
1479
+ idx = fanout_demux_rollover (f , skb , 0 , false , num );
1390
1480
break ;
1391
1481
}
1392
1482
1393
- po = pkt_sk (f -> arr [idx ]);
1394
- if (fanout_has_flag (f , PACKET_FANOUT_FLAG_ROLLOVER ) &&
1395
- unlikely (!packet_rcv_has_room (po , skb ))) {
1396
- idx = fanout_demux_rollover (f , skb , idx , idx , num );
1397
- po = pkt_sk (f -> arr [idx ]);
1398
- }
1483
+ if (fanout_has_flag (f , PACKET_FANOUT_FLAG_ROLLOVER ))
1484
+ idx = fanout_demux_rollover (f , skb , idx , true, num );
1399
1485
1486
+ po = pkt_sk (f -> arr [idx ]);
1400
1487
return po -> prot_hook .func (skb , dev , & po -> prot_hook , orig_dev );
1401
1488
}
1402
1489
@@ -1467,6 +1554,15 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1467
1554
if (po -> fanout )
1468
1555
return - EALREADY ;
1469
1556
1557
+ if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER ) {
1558
+ po -> rollover = kzalloc (sizeof (* po -> rollover ), GFP_KERNEL );
1559
+ if (!po -> rollover )
1560
+ return - ENOMEM ;
1561
+ atomic_long_set (& po -> rollover -> num , 0 );
1562
+ atomic_long_set (& po -> rollover -> num_huge , 0 );
1563
+ atomic_long_set (& po -> rollover -> num_failed , 0 );
1564
+ }
1565
+
1470
1566
mutex_lock (& fanout_mutex );
1471
1567
match = NULL ;
1472
1568
list_for_each_entry (f , & fanout_list , list ) {
@@ -1515,6 +1611,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1515
1611
}
1516
1612
out :
1517
1613
mutex_unlock (& fanout_mutex );
1614
+ if (err ) {
1615
+ kfree (po -> rollover );
1616
+ po -> rollover = NULL ;
1617
+ }
1518
1618
return err ;
1519
1619
}
1520
1620
@@ -1536,6 +1636,8 @@ static void fanout_release(struct sock *sk)
1536
1636
kfree (f );
1537
1637
}
1538
1638
mutex_unlock (& fanout_mutex );
1639
+
1640
+ kfree (po -> rollover );
1539
1641
}
1540
1642
1541
1643
static const struct proto_ops packet_ops ;
@@ -2865,6 +2967,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
2865
2967
2866
2968
spin_lock_init (& po -> bind_lock );
2867
2969
mutex_init (& po -> pg_vec_lock );
2970
+ po -> rollover = NULL ;
2868
2971
po -> prot_hook .func = packet_rcv ;
2869
2972
2870
2973
if (sock -> type == SOCK_PACKET )
@@ -2942,6 +3045,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2942
3045
if (skb == NULL )
2943
3046
goto out ;
2944
3047
3048
+ if (pkt_sk (sk )-> pressure )
3049
+ packet_rcv_has_room (pkt_sk (sk ), NULL );
3050
+
2945
3051
if (pkt_sk (sk )-> has_vnet_hdr ) {
2946
3052
struct virtio_net_hdr vnet_hdr = { 0 };
2947
3053
@@ -3485,6 +3591,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3485
3591
struct packet_sock * po = pkt_sk (sk );
3486
3592
void * data = & val ;
3487
3593
union tpacket_stats_u st ;
3594
+ struct tpacket_rollover_stats rstats ;
3488
3595
3489
3596
if (level != SOL_PACKET )
3490
3597
return - ENOPROTOOPT ;
@@ -3560,6 +3667,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3560
3667
((u32 )po -> fanout -> flags << 24 )) :
3561
3668
0 );
3562
3669
break ;
3670
+ case PACKET_ROLLOVER_STATS :
3671
+ if (!po -> rollover )
3672
+ return - EINVAL ;
3673
+ rstats .tp_all = atomic_long_read (& po -> rollover -> num );
3674
+ rstats .tp_huge = atomic_long_read (& po -> rollover -> num_huge );
3675
+ rstats .tp_failed = atomic_long_read (& po -> rollover -> num_failed );
3676
+ data = & rstats ;
3677
+ lv = sizeof (rstats );
3678
+ break ;
3563
3679
case PACKET_TX_HAS_OFF :
3564
3680
val = po -> tp_tx_has_off ;
3565
3681
break ;
@@ -3697,6 +3813,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
3697
3813
TP_STATUS_KERNEL ))
3698
3814
mask |= POLLIN | POLLRDNORM ;
3699
3815
}
3816
+ if (po -> pressure && __packet_rcv_has_room (po , NULL ) == ROOM_NORMAL )
3817
+ xchg (& po -> pressure , 0 );
3700
3818
spin_unlock_bh (& sk -> sk_receive_queue .lock );
3701
3819
spin_lock_bh (& sk -> sk_write_queue .lock );
3702
3820
if (po -> tx_ring .pg_vec ) {
0 commit comments