@@ -115,6 +115,8 @@ struct tap_filter {
  */
 #define MAX_TAP_QUEUES 1024
 
+#define TUN_FLOW_EXPIRE (3 * HZ)
+
 /* A tun_file connects an open character device to a tuntap netdevice. It
  * also contains all socket related strctures (except sock_fprog and tap_filter)
  * to serve as one transmit queue for tuntap device. The sock_fprog and
@@ -138,6 +140,18 @@ struct tun_file {
         u16 queue_index;
 };
 
+struct tun_flow_entry {
+        struct hlist_node hash_link;
+        struct rcu_head rcu;
+        struct tun_struct *tun;
+
+        u32 rxhash;
+        int queue_index;
+        unsigned long updated;
+};
+
+#define TUN_NUM_FLOW_ENTRIES 1024
+
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
  * device, socket fileter, sndbuf and vnet header size were restore when the
  * file were attached to a persist device.
@@ -163,8 +177,164 @@ struct tun_struct {
 #ifdef TUN_DEBUG
         int debug;
 #endif
+        spinlock_t lock;
+        struct kmem_cache *flow_cache;
+        struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
+        struct timer_list flow_gc_timer;
+        unsigned long ageing_time;
 };
 
+static inline u32 tun_hashfn(u32 rxhash)
+{
+        return rxhash & 0x3ff;
+}
+
+static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
+{
+        struct tun_flow_entry *e;
+        struct hlist_node *n;
+
+        hlist_for_each_entry_rcu(e, n, head, hash_link) {
+                if (e->rxhash == rxhash)
+                        return e;
+        }
+        return NULL;
+}
+
+static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
+                                              struct hlist_head *head,
+                                              u32 rxhash, u16 queue_index)
+{
+        struct tun_flow_entry *e = kmem_cache_alloc(tun->flow_cache,
+                                                    GFP_ATOMIC);
+        if (e) {
+                tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
+                          rxhash, queue_index);
+                e->updated = jiffies;
+                e->rxhash = rxhash;
+                e->queue_index = queue_index;
+                e->tun = tun;
+                hlist_add_head_rcu(&e->hash_link, head);
+        }
+        return e;
+}
+
+static void tun_flow_free(struct rcu_head *head)
+{
+        struct tun_flow_entry *e
+                = container_of(head, struct tun_flow_entry, rcu);
+        kmem_cache_free(e->tun->flow_cache, e);
+}
+
+static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
+{
+        tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
+                  e->rxhash, e->queue_index);
+        hlist_del_rcu(&e->hash_link);
+        call_rcu(&e->rcu, tun_flow_free);
+}
+
+static void tun_flow_flush(struct tun_struct *tun)
+{
+        int i;
+
+        spin_lock_bh(&tun->lock);
+        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+                struct tun_flow_entry *e;
+                struct hlist_node *h, *n;
+
+                hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link)
+                        tun_flow_delete(tun, e);
+        }
+        spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
+{
+        int i;
+
+        spin_lock_bh(&tun->lock);
+        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+                struct tun_flow_entry *e;
+                struct hlist_node *h, *n;
+
+                hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
+                        if (e->queue_index == queue_index)
+                                tun_flow_delete(tun, e);
+                }
+        }
+        spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_cleanup(unsigned long data)
+{
+        struct tun_struct *tun = (struct tun_struct *)data;
+        unsigned long delay = tun->ageing_time;
+        unsigned long next_timer = jiffies + delay;
+        unsigned long count = 0;
+        int i;
+
+        tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
+
+        spin_lock_bh(&tun->lock);
+        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+                struct tun_flow_entry *e;
+                struct hlist_node *h, *n;
+
+                hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
+                        unsigned long this_timer;
+                        count++;
+                        this_timer = e->updated + delay;
+                        if (time_before_eq(this_timer, jiffies))
+                                tun_flow_delete(tun, e);
+                        else if (time_before(this_timer, next_timer))
+                                next_timer = this_timer;
+                }
+        }
+
+        if (count)
+                mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
+        spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_update(struct tun_struct *tun, struct sk_buff *skb,
+                            u16 queue_index)
+{
+        struct hlist_head *head;
+        struct tun_flow_entry *e;
+        unsigned long delay = tun->ageing_time;
+        u32 rxhash = skb_get_rxhash(skb);
+
+        if (!rxhash)
+                return;
+        else
+                head = &tun->flows[tun_hashfn(rxhash)];
+
+        rcu_read_lock();
+
+        if (tun->numqueues == 1)
+                goto unlock;
+
+        e = tun_flow_find(head, rxhash);
+        if (likely(e)) {
+                /* TODO: keep queueing to old queue until it's empty? */
+                e->queue_index = queue_index;
+                e->updated = jiffies;
+        } else {
+                spin_lock_bh(&tun->lock);
+                if (!tun_flow_find(head, rxhash))
+                        tun_flow_create(tun, head, rxhash, queue_index);
+
+                if (!timer_pending(&tun->flow_gc_timer))
+                        mod_timer(&tun->flow_gc_timer,
+                                  round_jiffies_up(jiffies + delay));
+                spin_unlock_bh(&tun->lock);
+        }
+
+unlock:
+        rcu_read_unlock();
+}
+
 /* We try to identify a flow through its rxhash first. The reason that
  * we do not check rxq no. is becuase some cards(e.g 82599), chooses
  * the rxq based on the txq where the last packet of the flow comes. As
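Illustration, not part of the patch: tun_hashfn() above picks a bucket with
"rxhash & 0x3ff", which relies on the table having TUN_NUM_FLOW_ENTRIES =
1024 = 2^10 buckets, so masking with 0x3ff (= 1023) is the same as taking
rxhash modulo the table size, without a divide. A minimal userspace model of
that assumption:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_FLOW_ENTRIES 1024                   /* must stay a power of two */

static inline uint32_t hashfn(uint32_t rxhash)
{
        return rxhash & (NUM_FLOW_ENTRIES - 1); /* same mask as 0x3ff */
}

int main(void)
{
        uint32_t h;

        /* the mask and the modulo agree for every hash value */
        for (h = 0; h < 100000; h++)
                assert(hashfn(h) == h % NUM_FLOW_ENTRIES);

        printf("hash 0xdeadbeef -> bucket %u\n", (unsigned)hashfn(0xdeadbeef));
        return 0;
}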
@@ -175,6 +345,7 @@ struct tun_struct {
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
         struct tun_struct *tun = netdev_priv(dev);
+        struct tun_flow_entry *e;
         u32 txq = 0;
         u32 numqueues = 0;
 
@@ -183,8 +354,12 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 
         txq = skb_get_rxhash(skb);
         if (txq) {
-                /* use multiply and shift instead of expensive divide */
-                txq = ((u64)txq * numqueues) >> 32;
+                e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+                if (e)
+                        txq = e->queue_index;
+                else
+                        /* use multiply and shift instead of expensive divide */
+                        txq = ((u64)txq * numqueues) >> 32;
         } else if (likely(skb_rx_queue_recorded(skb))) {
                 txq = skb_get_rx_queue(skb);
                 while (unlikely(txq >= numqueues))
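Illustration, not part of the patch: when no flow entry exists, the fallback
above spreads the 32-bit hash over numqueues with ((u64)txq * numqueues) >> 32.
For a 32-bit hash the 64-bit product stays below numqueues * 2^32, so its top
32 bits always land in [0, numqueues); the mapping is as uniform as a modulo
for a well-mixed hash but avoids the divide. A small userspace check of that
claim:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t scale(uint32_t hash, uint32_t nqueues)
{
        /* same multiply-and-shift as the fallback in tun_select_queue() */
        return (uint32_t)(((uint64_t)hash * nqueues) >> 32);
}

int main(void)
{
        const uint32_t samples[] = { 0, 1, 0x7fffffff, 0xdeadbeef, 0xffffffff };
        const uint32_t nqueues = 8;       /* arbitrary queue count for the demo */
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                uint32_t q = scale(samples[i], nqueues);
                assert(q < nqueues);      /* never out of range */
                printf("hash 0x%08x -> queue %u\n", (unsigned)samples[i], q);
        }
        return 0;
}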
@@ -234,6 +409,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
         sock_put(&tfile->sk);
 
         synchronize_net();
+        tun_flow_delete_by_queue(tun, tun->numqueues + 1);
         /* Drop read queue */
         skb_queue_purge(&tfile->sk.sk_receive_queue);
         tun_set_real_num_queues(tun);
@@ -631,6 +807,37 @@ static const struct net_device_ops tap_netdev_ops = {
 #endif
 };
 
+static int tun_flow_init(struct tun_struct *tun)
+{
+        int i;
+
+        tun->flow_cache = kmem_cache_create("tun_flow_cache",
+                                            sizeof(struct tun_flow_entry), 0, 0,
+                                            NULL);
+        if (!tun->flow_cache)
+                return -ENOMEM;
+
+        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
+                INIT_HLIST_HEAD(&tun->flows[i]);
+
+        tun->ageing_time = TUN_FLOW_EXPIRE;
+        setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
+        mod_timer(&tun->flow_gc_timer,
+                  round_jiffies_up(jiffies + tun->ageing_time));
+
+        return 0;
+}
+
+static void tun_flow_uninit(struct tun_struct *tun)
+{
+        del_timer_sync(&tun->flow_gc_timer);
+        tun_flow_flush(tun);
+
+        /* Wait for completion of call_rcu()'s */
+        rcu_barrier();
+        kmem_cache_destroy(tun->flow_cache);
+}
+
 /* Initialize net device. */
 static void tun_net_init(struct net_device *dev)
 {
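Illustration, not from the patch: tun_flow_init() above arms the per-device
garbage-collect timer with TUN_FLOW_EXPIRE (3 * HZ), and tun_flow_cleanup()
from the earlier hunk drops entries whose "updated + ageing_time" has passed,
using time_before_eq() so the test stays correct even when the jiffies counter
wraps. A userspace sketch of that wraparound-safe comparison:

#include <stdio.h>

typedef unsigned long jiffies_t;

/* sketch of the kernel's time_before_eq(a, b): "a is at or before b",
 * done as a signed comparison of the wrapped difference */
static int before_eq(jiffies_t a, jiffies_t b)
{
        return (long)(a - b) <= 0;
}

static int flow_expired(jiffies_t updated, jiffies_t delay, jiffies_t now)
{
        return before_eq(updated + delay, now); /* shape of the test in tun_flow_cleanup() */
}

int main(void)
{
        const jiffies_t hz = 1000;                 /* assume HZ == 1000 for the demo */
        const jiffies_t near_wrap = (jiffies_t)-2; /* counter about to wrap */

        /* 10 ticks after the update: not expired, even though updated + delay wrapped */
        printf("%d\n", flow_expired(near_wrap, 3 * hz, near_wrap + 10));
        /* 4 * HZ after the update: past the 3 * HZ ageing time, expired */
        printf("%d\n", flow_expired(near_wrap, 3 * hz, near_wrap + 4 * hz));
        return 0;
}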
@@ -973,6 +1180,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
         tun->dev->stats.rx_packets++;
         tun->dev->stats.rx_bytes += len;
 
+        tun_flow_update(tun, skb, tfile->queue_index);
         return total_len;
 }
 
@@ -1150,6 +1358,14 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
         return ret;
 }
 
+static void tun_free_netdev(struct net_device *dev)
+{
+        struct tun_struct *tun = netdev_priv(dev);
+
+        tun_flow_uninit(tun);
+        free_netdev(dev);
+}
+
 static void tun_setup(struct net_device *dev)
 {
         struct tun_struct *tun = netdev_priv(dev);
@@ -1158,7 +1374,7 @@ static void tun_setup(struct net_device *dev)
         tun->group = INVALID_GID;
 
         dev->ethtool_ops = &tun_ethtool_ops;
-        dev->destructor = free_netdev;
+        dev->destructor = tun_free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -1381,10 +1597,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
         tun->filter_attached = false;
         tun->sndbuf = tfile->socket.sk->sk_sndbuf;
 
+        spin_lock_init(&tun->lock);
+
         security_tun_dev_post_create(&tfile->sk);
 
         tun_net_init(dev);
 
+        if (tun_flow_init(tun))
+                goto err_free_dev;
+
         dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
                            TUN_USER_FEATURES;
         dev->features = dev->hw_features;