Skip to content

Commit 0e3125c

Browse files
nhorman authored and davem330 committed
packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Version 4 of this patch. Change notes: 1) Removed extra memset. Didn't think kcalloc added a GFP_ZERO the way kzalloc did :) Summary: It was shown to me recently that systems under high load were driven very deep into swap when tcpdump was run. The reason this happened was because the AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space application to specify how many entries an AF_PACKET socket will have and how large each entry will be. It seems the default setting for tcpdump is to set the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5 allocation. That's difficult under good circumstances, and horrid under memory pressure. I thought it would be good to make that a bit more usable. I was going to do a simple conversion of the ring buffer from contiguous pages to iovecs, but unfortunately, the metadata which AF_PACKET places in these buffers can easily span a page boundary, and given that these buffers get mapped into user space, and the data layout doesn't easily allow for a change to padding between frames to avoid that, a simple iovec change is just going to break user space ABI consistency. So I've done this, I've added a three-tiered mechanism to the af_packet set_ring socket option. It attempts to allocate memory in the following order: 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without digging into swap 2) Using vmalloc 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as needed to get the memory The effect is that we don't disturb the system as much when we're under load, while still being able to conduct tcpdumps effectively. Tested successfully by me. Signed-off-by: Neil Horman <[email protected]> Acked-by: Eric Dumazet <[email protected]> Acked-by: Maciej Żenczykowski <[email protected]> Reported-by: Maciej Żenczykowski <[email protected]> Signed-off-by: David S. 
Miller <[email protected]>
1 parent 020f01e commit 0e3125c

File tree

1 file changed

+69
-16
lines changed

1 file changed

+69
-16
lines changed

net/packet/af_packet.c

Lines changed: 69 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
#include <linux/kernel.h>
6262
#include <linux/kmod.h>
6363
#include <linux/slab.h>
64+
#include <linux/vmalloc.h>
6465
#include <net/net_namespace.h>
6566
#include <net/ip.h>
6667
#include <net/protocol.h>
@@ -163,8 +164,14 @@ struct packet_mreq_max {
163164
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
164165
int closing, int tx_ring);
165166

167+
#define PGV_FROM_VMALLOC 1
168+
struct pgv {
169+
char *buffer;
170+
unsigned char flags;
171+
};
172+
166173
struct packet_ring_buffer {
167-
char **pg_vec;
174+
struct pgv *pg_vec;
168175
unsigned int head;
169176
unsigned int frames_per_block;
170177
unsigned int frame_size;
@@ -283,7 +290,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
283290
pg_vec_pos = position / rb->frames_per_block;
284291
frame_offset = position % rb->frames_per_block;
285292

286-
h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
293+
h.raw = rb->pg_vec[pg_vec_pos].buffer +
294+
(frame_offset * rb->frame_size);
287295

288296
if (status != __packet_get_status(po, h.raw))
289297
return NULL;
@@ -2325,37 +2333,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
23252333
.close = packet_mm_close,
23262334
};
23272335

2328-
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2336+
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
2337+
unsigned int len)
23292338
{
23302339
int i;
23312340

23322341
for (i = 0; i < len; i++) {
2333-
if (likely(pg_vec[i]))
2334-
free_pages((unsigned long) pg_vec[i], order);
2342+
if (likely(pg_vec[i].buffer)) {
2343+
if (pg_vec[i].flags & PGV_FROM_VMALLOC)
2344+
vfree(pg_vec[i].buffer);
2345+
else
2346+
free_pages((unsigned long)pg_vec[i].buffer,
2347+
order);
2348+
pg_vec[i].buffer = NULL;
2349+
}
23352350
}
23362351
kfree(pg_vec);
23372352
}
23382353

2339-
static inline char *alloc_one_pg_vec_page(unsigned long order)
2354+
static inline char *alloc_one_pg_vec_page(unsigned long order,
2355+
unsigned char *flags)
23402356
{
2341-
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2357+
char *buffer = NULL;
2358+
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
2359+
__GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
2360+
2361+
buffer = (char *) __get_free_pages(gfp_flags, order);
2362+
2363+
if (buffer)
2364+
return buffer;
2365+
2366+
/*
2367+
* __get_free_pages failed, fall back to vmalloc
2368+
*/
2369+
*flags |= PGV_FROM_VMALLOC;
2370+
buffer = vmalloc((1 << order) * PAGE_SIZE);
23422371

2343-
return (char *) __get_free_pages(gfp_flags, order);
2372+
if (buffer)
2373+
return buffer;
2374+
2375+
/*
2376+
* vmalloc failed, lets dig into swap here
2377+
*/
2378+
*flags = 0;
2379+
gfp_flags &= ~__GFP_NORETRY;
2380+
buffer = (char *)__get_free_pages(gfp_flags, order);
2381+
if (buffer)
2382+
return buffer;
2383+
2384+
/*
2385+
* complete and utter failure
2386+
*/
2387+
return NULL;
23442388
}
23452389

2346-
static char **alloc_pg_vec(struct tpacket_req *req, int order)
2390+
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
23472391
{
23482392
unsigned int block_nr = req->tp_block_nr;
2349-
char **pg_vec;
2393+
struct pgv *pg_vec;
23502394
int i;
23512395

2352-
pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2396+
pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
23532397
if (unlikely(!pg_vec))
23542398
goto out;
23552399

23562400
for (i = 0; i < block_nr; i++) {
2357-
pg_vec[i] = alloc_one_pg_vec_page(order);
2358-
if (unlikely(!pg_vec[i]))
2401+
pg_vec[i].buffer = alloc_one_pg_vec_page(order,
2402+
&pg_vec[i].flags);
2403+
if (unlikely(!pg_vec[i].buffer))
23592404
goto out_free_pgvec;
23602405
}
23612406

@@ -2364,14 +2409,15 @@ static char **alloc_pg_vec(struct tpacket_req *req, int order)
23642409

23652410
out_free_pgvec:
23662411
free_pg_vec(pg_vec, order, block_nr);
2412+
kfree(pg_vec);
23672413
pg_vec = NULL;
23682414
goto out;
23692415
}
23702416

23712417
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
23722418
int closing, int tx_ring)
23732419
{
2374-
char **pg_vec = NULL;
2420+
struct pgv *pg_vec = NULL;
23752421
struct packet_sock *po = pkt_sk(sk);
23762422
int was_running, order = 0;
23772423
struct packet_ring_buffer *rb;
@@ -2533,15 +2579,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
25332579
continue;
25342580

25352581
for (i = 0; i < rb->pg_vec_len; i++) {
2536-
struct page *page = virt_to_page(rb->pg_vec[i]);
2582+
struct page *page;
2583+
void *kaddr = rb->pg_vec[i].buffer;
25372584
int pg_num;
25382585

25392586
for (pg_num = 0; pg_num < rb->pg_vec_pages;
2540-
pg_num++, page++) {
2587+
pg_num++) {
2588+
if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
2589+
page = vmalloc_to_page(kaddr);
2590+
else
2591+
page = virt_to_page(kaddr);
2592+
25412593
err = vm_insert_page(vma, start, page);
25422594
if (unlikely(err))
25432595
goto out;
25442596
start += PAGE_SIZE;
2597+
kaddr += PAGE_SIZE;
25452598
}
25462599
}
25472600
}

0 commit comments

Comments
 (0)