All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation
@ 2010-10-25 19:06 nhorman
  2010-10-25 20:17 ` Francois Romieu
                   ` (4 more replies)
  0 siblings, 5 replies; 17+ messages in thread
From: nhorman @ 2010-10-25 19:06 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, jpirko, Neil Horman

From: Neil Horman <nhorman@tuxdriver.com>

It was shown to me recently that systems under high load were driven very deep
into swap when tcpdump was run.  The reason this happened was because the
AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
application to specify how many entries an AF_PACKET socket will have and how
large each entry will be.  It seems the default setting for tcpdump is to set
the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
allocation.  Thats difficult under good circumstances, and horrid under memory
pressure.

I thought it would be good to make that a bit more usable.  I was going to do a
simple conversion of the ring buffer from contigous pages to iovecs, but
unfortunately, the metadata which AF_PACKET places in these buffers can easily
span a page boundary, and given that these buffers get mapped into user space,
and the data layout doesn't easily allow for a change to padding between frames
to avoid that, a simple iovec change is just going to break user space ABI
consistency.

So instead I've done this.  This patch does the aforementioned change,
allocating an array of pages instead of one contiguous chunk, and then vmaps the
array into a contiguous memory space, so that it can still be accessed in the
same way it was before.  This allows for a consisten user and kernel space
behavior for memory mapped AF_PACKET sockets, which at the same time relieving
the memory pressure placed on a system when tcpdump defaults are used.

Tested successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 net/packet/af_packet.c |   94 +++++++++++++++++++++++++++++++++++++----------
 1 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27..fad3891 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -163,8 +163,14 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+struct pkt_page_list {
+	unsigned int num_pages;
+	char	*vmap_area;
+	unsigned long pages[0];
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pkt_page_list	**pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos]->vmap_area +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
@@ -2322,41 +2329,86 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	=	packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_one_pkt_list(struct pkt_page_list *pglist)
+{
+	int i;
+
+	if ((pglist->num_pages > 1) && (pglist->vmap_area))
+		vunmap(pglist->vmap_area);
+	for (i = 0; i < pglist->num_pages; i++)
+		free_pages(pglist->pages[i], 0);
+}
+
+static void free_pg_vec(struct pkt_page_list **pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
-	for (i = 0; i < len; i++) {
+	for (i = 0; i < len; i++)
 		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
-	}
+			free_one_pkt_list(pg_vec[i]);
+
 	kfree(pg_vec);
 }
 
-static inline char *alloc_one_pg_vec_page(unsigned long order)
+static inline struct pkt_page_list *alloc_one_pg_page_list(unsigned long order)
 {
+	int i;
+	struct pkt_page_list *newlist;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
 
-	return (char *) __get_free_pages(gfp_flags, order);
+
+	newlist = kzalloc(sizeof(struct pkt_page_list)+
+			  (sizeof(void *)*(1<<order)), GFP_KERNEL);
+	if (!newlist)
+		return NULL;
+	newlist->num_pages = (1<<order);
+
+	for (i = 0; i < newlist->num_pages; i++) {
+		newlist->pages[i] = __get_free_pages(gfp_flags, 0);
+		if (!newlist->pages[i])
+			goto out_free;
+	}
+
+	return newlist;
+out_free:
+	free_one_pkt_list(newlist);
+	kfree(newlist);
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pkt_page_list **alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
-	int i;
+	struct pkt_page_list **pg_vec;
+	struct page **pages;
+	int i, j;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
-	if (unlikely(!pg_vec))
+	pg_vec = kzalloc(block_nr * sizeof(struct pkt_page_list *), GFP_KERNEL);
+	pages = kzalloc(block_nr * sizeof(struct page *), GFP_KERNEL);
+
+	if (unlikely(!pg_vec || !pages))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
-			goto out_free_pgvec;
+		pg_vec[i] = alloc_one_pg_page_list(order);
+
+		if (order > 0) {
+			for (j = 0; j < pg_vec[i]->num_pages; j++)
+				pages[j] = virt_to_page(pg_vec[i]->pages[j]);
+
+			if (unlikely(!pg_vec[i]))
+				goto out_free_pgvec;
+			pg_vec[i]->vmap_area = vmap(pages, pg_vec[i]->num_pages,
+						 VM_MAP, PAGE_KERNEL);
+			if (!pg_vec[i]->vmap_area)
+				goto out_free_pgvec;
+		} else
+			pg_vec[i]->vmap_area = (void *)pg_vec[i]->pages[0];
 	}
 
 out:
+	kfree(pages);
 	return pg_vec;
 
 out_free_pgvec:
@@ -2368,7 +2420,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pkt_page_list **pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2530,11 +2582,13 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct pkt_page_list *plist = rb->pg_vec[i];
 			int pg_num;
+			struct page *page;
 
-			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+			for (pg_num = 0; pg_num < plist->num_pages;
+					pg_num++) {
+				page = virt_to_page(plist->pages[pg_num]);
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation
  2010-10-25 19:06 [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation nhorman
@ 2010-10-25 20:17 ` Francois Romieu
  2010-10-25 20:38 ` Eric Dumazet
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 17+ messages in thread
From: Francois Romieu @ 2010-10-25 20:17 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, eric.dumazet, jpirko

nhorman@tuxdriver.com <nhorman@tuxdriver.com> :
[...]
> +		if (order > 0) {
> +			for (j = 0; j < pg_vec[i]->num_pages; j++)
> +				pages[j] = virt_to_page(pg_vec[i]->pages[j]);
> +
> +			if (unlikely(!pg_vec[i]))
> +				goto out_free_pgvec;

Check pg_vec[i] sooner ?

-- 
Ueimor

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation
  2010-10-25 19:06 [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation nhorman
  2010-10-25 20:17 ` Francois Romieu
@ 2010-10-25 20:38 ` Eric Dumazet
  2010-11-09 17:46 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2) nhorman
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 17+ messages in thread
From: Eric Dumazet @ 2010-10-25 20:38 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, jpirko

Le lundi 25 octobre 2010 à 15:06 -0400, nhorman@tuxdriver.com a écrit :
> From: Neil Horman <nhorman@tuxdriver.com>
> 
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
> 
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
> 
> So instead I've done this.  This patch does the aforementioned change,
> allocating an array of pages instead of one contiguous chunk, and then vmaps the
> array into a contiguous memory space, so that it can still be accessed in the
> same way it was before.  This allows for a consisten user and kernel space
> behavior for memory mapped AF_PACKET sockets, which at the same time relieving
> the memory pressure placed on a system when tcpdump defaults are used.
> 
> Tested successfully by me.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ---

Strange because last time I took a look at this stuff, libpcap was doing
several tries, reducing page orders until it got no allocation
failures...

(It tries to get high order pages, maybe to reduce TLB pressure...)

I remember adding __GFP_NOWARN to avoid a kernel message, while tcpdump
was actually working...




^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-10-25 19:06 [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation nhorman
  2010-10-25 20:17 ` Francois Romieu
  2010-10-25 20:38 ` Eric Dumazet
@ 2010-11-09 17:46 ` nhorman
  2010-11-09 18:02   ` Eric Dumazet
  2010-11-09 21:07   ` Maciej Żenczykowski
  2010-11-10 18:20 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v3) nhorman
  2010-11-10 19:09 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) nhorman
  4 siblings, 2 replies; 17+ messages in thread
From: nhorman @ 2010-11-09 17:46 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, zenczykowski, Neil Horman

From: Neil Horman <nhorman@tuxdriver.com>

Version 2 of this patch.  Sorry its been awhile, but I've had other issues come
up.  I've re-written this patch taking into account the change notes from the
last round

Change notes:
1) Rewrote the patch to not do all my previous stupid vmaps on single page
arrays.  Instead we just use vmalloc now.

2) Checked into the behavior of tcpdump on high order allocations in the failing
case.  Tcpdump (more specifically I think libpcap) does attempt to minimize the
allocation order on the ring buffer, unfortunately the lowest it will push the
ordrer too given a sufficiently large buffer is order 5, so its still a very
large contiguous allocation, and that says nothing of other malicious
applications that might try to do more.

Summary:
It was shown to me recently that systems under high load were driven very deep
into swap when tcpdump was run.  The reason this happened was because the
AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
application to specify how many entries an AF_PACKET socket will have and how
large each entry will be.  It seems the default setting for tcpdump is to set
the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
allocation.  Thats difficult under good circumstances, and horrid under memory
pressure.

I thought it would be good to make that a bit more usable.  I was going to do a
simple conversion of the ring buffer from contigous pages to iovecs, but
unfortunately, the metadata which AF_PACKET places in these buffers can easily
span a page boundary, and given that these buffers get mapped into user space,
and the data layout doesn't easily allow for a change to padding between frames
to avoid that, a simple iovec change is just going to break user space ABI
consistency.

So I've done this, I've added a three tiered mechanism to the af_packet set_ring
socket option.  It attempts to allocate memory in the following order:

1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
digging into swap

2) Using vmalloc

3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
needed to get the memory

The effect is that we don't disturb the system as much when we're under load,
while still being able to conduct tcpdumps effectively.

Tested successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 net/packet/af_packet.c |   84 ++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27..d1148ac 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -163,8 +163,14 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+	unsigned char flags;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
@@ -2322,37 +2329,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	=	packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
-static inline char *alloc_one_pg_vec_page(unsigned long order)
+static inline char *alloc_one_pg_vec_page(unsigned long order,
+					  unsigned char *flags)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	*flags |= PGV_FROM_VMALLOC;
+	buffer = vmalloc((1 << order) * PAGE_SIZE);
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	*flags = 0;
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kzalloc(block_nr * sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
+							 &pg_vec[i].flags);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
@@ -2361,6 +2405,7 @@ out:
 
 out_free_pgvec:
 	free_pg_vec(pg_vec, order, block_nr);
+	kfree(pg_vec);
 	pg_vec = NULL;
 	goto out;
 }
@@ -2368,7 +2413,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2530,15 +2575,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+					pg_num++) {
+				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
+					page = vmalloc_to_page(kaddr);
+				else
+					page = virt_to_page(kaddr);
+
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-11-09 17:46 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2) nhorman
@ 2010-11-09 18:02   ` Eric Dumazet
  2010-11-09 18:38     ` Neil Horman
  2010-11-09 21:07   ` Maciej Żenczykowski
  1 sibling, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2010-11-09 18:02 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, zenczykowski

Le mardi 09 novembre 2010 à 12:46 -0500, nhorman@tuxdriver.com a écrit :
> From: Neil Horman <nhorman@tuxdriver.com>
> 
> Version 2 of this patch.  Sorry its been awhile, but I've had other issues come
> up.  I've re-written this patch taking into account the change notes from the
> last round
> 
> Change notes:
> 1) Rewrote the patch to not do all my previous stupid vmaps on single page
> arrays.  Instead we just use vmalloc now.
> 
> 2) Checked into the behavior of tcpdump on high order allocations in the failing
> case.  Tcpdump (more specifically I think libpcap) does attempt to minimize the
> allocation order on the ring buffer, unfortunately the lowest it will push the
> ordrer too given a sufficiently large buffer is order 5, so its still a very
> large contiguous allocation, and that says nothing of other malicious
> applications that might try to do more.
> 
> Summary:
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
> 
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
> 
> So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> socket option.  It attempts to allocate memory in the following order:
> 
> 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> digging into swap
> 
> 2) Using vmalloc
> 
> 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> needed to get the memory
> 
> The effect is that we don't disturb the system as much when we're under load,
> while still being able to conduct tcpdumps effectively.
> 
> Tested successfully by me.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>  net/packet/af_packet.c |   84 ++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 68 insertions(+), 16 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 3616f27..d1148ac 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -163,8 +163,14 @@ struct packet_mreq_max {
>  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>  		int closing, int tx_ring);
>  
> +#define PGV_FROM_VMALLOC 1
> +struct pgv {
> +	char *buffer;
> +	unsigned char flags;
> +};
> +
>  struct packet_ring_buffer {
> -	char			**pg_vec;
> +	struct pgv		*pg_vec;
>  	unsigned int		head;
>  	unsigned int		frames_per_block;
>  	unsigned int		frame_size;
> @@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
>  	pg_vec_pos = position / rb->frames_per_block;
>  	frame_offset = position % rb->frames_per_block;
>  
> -	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
> +	h.raw = rb->pg_vec[pg_vec_pos].buffer +
> +		(frame_offset * rb->frame_size);
>  
>  	if (status != __packet_get_status(po, h.raw))
>  		return NULL;
> @@ -2322,37 +2329,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
>  	.close	=	packet_mm_close,
>  };
>  
> -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
> +static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
> +			unsigned int len)
>  {
>  	int i;
>  
>  	for (i = 0; i < len; i++) {
> -		if (likely(pg_vec[i]))
> -			free_pages((unsigned long) pg_vec[i], order);
> +		if (likely(pg_vec[i].buffer)) {
> +			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
> +				vfree(pg_vec[i].buffer);
> +			else
> +				free_pages((unsigned long)pg_vec[i].buffer,
> +					   order);
> +			pg_vec[i].buffer = NULL;
> +		}
>  	}
>  	kfree(pg_vec);
>  }
>  
> -static inline char *alloc_one_pg_vec_page(unsigned long order)
> +static inline char *alloc_one_pg_vec_page(unsigned long order,
> +					  unsigned char *flags)
>  {
> -	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
> +	char *buffer = NULL;
> +	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
> +			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +	buffer = (char *) __get_free_pages(gfp_flags, order);
> +
> +	if (buffer)
> +		return buffer;
> +
> +	/*
> +	 * __get_free_pages failed, fall back to vmalloc
> +	 */
> +	*flags |= PGV_FROM_VMALLOC;
> +	buffer = vmalloc((1 << order) * PAGE_SIZE);
>  
> -	return (char *) __get_free_pages(gfp_flags, order);
> +	if (buffer)
> +		return buffer;
> +
> +	/*
> +	 * vmalloc failed, lets dig into swap here
> +	 */
> +	*flags = 0;
> +	gfp_flags &= ~__GFP_NORETRY;
> +	buffer = (char *)__get_free_pages(gfp_flags, order);
> +	if (buffer)
> +		return buffer;
> +
> +	/*
> +	 * complete and utter failure
> +	 */
> +	return NULL;
>  }
>  
> -static char **alloc_pg_vec(struct tpacket_req *req, int order)
> +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
>  {
>  	unsigned int block_nr = req->tp_block_nr;
> -	char **pg_vec;
> +	struct pgv *pg_vec;
>  	int i;
>  
> -	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> +	pg_vec = kzalloc(block_nr * sizeof(struct pgv), GFP_KERNEL);

While we are at it, we could check block_nr being a sane value here ;)

Nice stuff Neil !



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-11-09 18:02   ` Eric Dumazet
@ 2010-11-09 18:38     ` Neil Horman
  2010-11-09 19:20       ` Eric Dumazet
  0 siblings, 1 reply; 17+ messages in thread
From: Neil Horman @ 2010-11-09 18:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem, zenczykowski

On Tue, Nov 09, 2010 at 07:02:32PM +0100, Eric Dumazet wrote:
> Le mardi 09 novembre 2010 à 12:46 -0500, nhorman@tuxdriver.com a écrit :
> > From: Neil Horman <nhorman@tuxdriver.com>
> > 
> > Version 2 of this patch.  Sorry its been awhile, but I've had other issues come
> > up.  I've re-written this patch taking into account the change notes from the
> > last round
> > 
> > Change notes:
> > 1) Rewrote the patch to not do all my previous stupid vmaps on single page
> > arrays.  Instead we just use vmalloc now.
> > 
> > 2) Checked into the behavior of tcpdump on high order allocations in the failing
> > case.  Tcpdump (more specifically I think libpcap) does attempt to minimize the
> > allocation order on the ring buffer, unfortunately the lowest it will push the
> > ordrer too given a sufficiently large buffer is order 5, so its still a very
> > large contiguous allocation, and that says nothing of other malicious
> > applications that might try to do more.
> > 
> > Summary:
> > It was shown to me recently that systems under high load were driven very deep
> > into swap when tcpdump was run.  The reason this happened was because the
> > AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> > application to specify how many entries an AF_PACKET socket will have and how
> > large each entry will be.  It seems the default setting for tcpdump is to set
> > the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> > allocation.  Thats difficult under good circumstances, and horrid under memory
> > pressure.
> > 
> > I thought it would be good to make that a bit more usable.  I was going to do a
> > simple conversion of the ring buffer from contigous pages to iovecs, but
> > unfortunately, the metadata which AF_PACKET places in these buffers can easily
> > span a page boundary, and given that these buffers get mapped into user space,
> > and the data layout doesn't easily allow for a change to padding between frames
> > to avoid that, a simple iovec change is just going to break user space ABI
> > consistency.
> > 
> > So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> > socket option.  It attempts to allocate memory in the following order:
> > 
> > 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> > digging into swap
> > 
> > 2) Using vmalloc
> > 
> > 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> > needed to get the memory
> > 
> > The effect is that we don't disturb the system as much when we're under load,
> > while still being able to conduct tcpdumps effectively.
> > 
> > Tested successfully by me.
> > 
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > ---
> >  net/packet/af_packet.c |   84 ++++++++++++++++++++++++++++++++++++++---------
> >  1 files changed, 68 insertions(+), 16 deletions(-)
> > 
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index 3616f27..d1148ac 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -163,8 +163,14 @@ struct packet_mreq_max {
> >  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
> >  		int closing, int tx_ring);
> >  
> > +#define PGV_FROM_VMALLOC 1
> > +struct pgv {
> > +	char *buffer;
> > +	unsigned char flags;
> > +};
> > +
> >  struct packet_ring_buffer {
> > -	char			**pg_vec;
> > +	struct pgv		*pg_vec;
> >  	unsigned int		head;
> >  	unsigned int		frames_per_block;
> >  	unsigned int		frame_size;
> > @@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
> >  	pg_vec_pos = position / rb->frames_per_block;
> >  	frame_offset = position % rb->frames_per_block;
> >  
> > -	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
> > +	h.raw = rb->pg_vec[pg_vec_pos].buffer +
> > +		(frame_offset * rb->frame_size);
> >  
> >  	if (status != __packet_get_status(po, h.raw))
> >  		return NULL;
> > @@ -2322,37 +2329,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
> >  	.close	=	packet_mm_close,
> >  };
> >  
> > -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
> > +static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
> > +			unsigned int len)
> >  {
> >  	int i;
> >  
> >  	for (i = 0; i < len; i++) {
> > -		if (likely(pg_vec[i]))
> > -			free_pages((unsigned long) pg_vec[i], order);
> > +		if (likely(pg_vec[i].buffer)) {
> > +			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
> > +				vfree(pg_vec[i].buffer);
> > +			else
> > +				free_pages((unsigned long)pg_vec[i].buffer,
> > +					   order);
> > +			pg_vec[i].buffer = NULL;
> > +		}
> >  	}
> >  	kfree(pg_vec);
> >  }
> >  
> > -static inline char *alloc_one_pg_vec_page(unsigned long order)
> > +static inline char *alloc_one_pg_vec_page(unsigned long order,
> > +					  unsigned char *flags)
> >  {
> > -	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
> > +	char *buffer = NULL;
> > +	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
> > +			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> > +
> > +	buffer = (char *) __get_free_pages(gfp_flags, order);
> > +
> > +	if (buffer)
> > +		return buffer;
> > +
> > +	/*
> > +	 * __get_free_pages failed, fall back to vmalloc
> > +	 */
> > +	*flags |= PGV_FROM_VMALLOC;
> > +	buffer = vmalloc((1 << order) * PAGE_SIZE);
> >  
> > -	return (char *) __get_free_pages(gfp_flags, order);
> > +	if (buffer)
> > +		return buffer;
> > +
> > +	/*
> > +	 * vmalloc failed, lets dig into swap here
> > +	 */
> > +	*flags = 0;
> > +	gfp_flags &= ~__GFP_NORETRY;
> > +	buffer = (char *)__get_free_pages(gfp_flags, order);
> > +	if (buffer)
> > +		return buffer;
> > +
> > +	/*
> > +	 * complete and utter failure
> > +	 */
> > +	return NULL;
> >  }
> >  
> > -static char **alloc_pg_vec(struct tpacket_req *req, int order)
> > +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
> >  {
> >  	unsigned int block_nr = req->tp_block_nr;
> > -	char **pg_vec;
> > +	struct pgv *pg_vec;
> >  	int i;
> >  
> > -	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> > +	pg_vec = kzalloc(block_nr * sizeof(struct pgv), GFP_KERNEL);
> 
> While we are at it, we could check block_nr being a sane value here ;)
> 
This is true.  What do you think a reasonable sane value is?  libpcap seems to
limit itself to 32 order 5 entries in the ring, but that seems a bit arbitrary.
Perhaps we could check and limit allocations to being no more than order 8
(1Mb), and a total allocation of no more than perhaps max(32Mb, 1% of all ram)?
Just throwing it out there, open to any suggestions here

> Nice stuff Neil !
> 
Thanks!
Neil

> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-11-09 18:38     ` Neil Horman
@ 2010-11-09 19:20       ` Eric Dumazet
  2010-11-09 20:57         ` Neil Horman
  0 siblings, 1 reply; 17+ messages in thread
From: Eric Dumazet @ 2010-11-09 19:20 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, davem, zenczykowski

Le mardi 09 novembre 2010 à 13:38 -0500, Neil Horman a écrit :
> On Tue, Nov 09, 2010 at 07:02:32PM +0100, Eric Dumazet wrote:
> > Le mardi 09 novembre 2010 à 12:46 -0500, nhorman@tuxdriver.com a écrit :
> ic char **alloc_pg_vec(struct tpacket_req *req, int order)
> > > +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
> > >  {
> > >  	unsigned int block_nr = req->tp_block_nr;
> > > -	char **pg_vec;
> > > +	struct pgv *pg_vec;
> > >  	int i;
> > >  
> > > -	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> > > +	pg_vec = kzalloc(block_nr * sizeof(struct pgv), GFP_KERNEL);
> > 
> > While we are at it, we could check block_nr being a sane value here ;)
> > 
> This is true.  What do you think a reasonable sane value is?  libpcap seems to
> limit itself to 32 order 5 entries in the ring, but that seems a bit arbitrary.
> Perhaps we could check and limit allocations to being no more than order 8
> (1Mb), and a total allocation of no more than perhaps max(32Mb, 1% of all ram)?
> Just throwing it out there, open to any suggestions here

I was refering to a malicious/buggy program giving a big tp_block_nr so
that (block_nr * sizeof(struct pgv)) overflows the u32

One way to deal with that is to use

	kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);

I am not sure consistency checks done in packet_set_ring() are enough to
properly detect such errors.





^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-11-09 19:20       ` Eric Dumazet
@ 2010-11-09 20:57         ` Neil Horman
  0 siblings, 0 replies; 17+ messages in thread
From: Neil Horman @ 2010-11-09 20:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem, zenczykowski

On Tue, Nov 09, 2010 at 08:20:38PM +0100, Eric Dumazet wrote:
> Le mardi 09 novembre 2010 à 13:38 -0500, Neil Horman a écrit :
> > On Tue, Nov 09, 2010 at 07:02:32PM +0100, Eric Dumazet wrote:
> > > Le mardi 09 novembre 2010 à 12:46 -0500, nhorman@tuxdriver.com a écrit :
> > ic char **alloc_pg_vec(struct tpacket_req *req, int order)
> > > > +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
> > > >  {
> > > >  	unsigned int block_nr = req->tp_block_nr;
> > > > -	char **pg_vec;
> > > > +	struct pgv *pg_vec;
> > > >  	int i;
> > > >  
> > > > -	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> > > > +	pg_vec = kzalloc(block_nr * sizeof(struct pgv), GFP_KERNEL);
> > > 
> > > While we are at it, we could check block_nr being a sane value here ;)
> > > 
> > This is true.  What do you think a reasonable sane value is?  libpcap seems to
> > limit itself to 32 order 5 entries in the ring, but that seems a bit arbitrary.
> > Perhaps we could check and limit allocations to being no more than order 8
> > (1Mb), and a total allocation of no more than perhaps max(32Mb, 1% of all ram)?
> > Just throwing it out there, open to any suggestions here
> 
> I was refering to a malicious/buggy program giving a big tp_block_nr so
> that (block_nr * sizeof(struct pgv)) overflows the u32
> 
> One way to deal with that is to use
> 
> 	kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
> 
> I am not sure consistency checks done in packet_set_ring() are enough to
> properly detect such errors.
Ah, I get you, ok.  Yeah, I'll respin this with that taken into account.
Thanks!
Neil

> 
> 
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-11-09 17:46 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2) nhorman
  2010-11-09 18:02   ` Eric Dumazet
@ 2010-11-09 21:07   ` Maciej Żenczykowski
  2010-11-09 21:20     ` Neil Horman
  1 sibling, 1 reply; 17+ messages in thread
From: Maciej Żenczykowski @ 2010-11-09 21:07 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, eric.dumazet

> +#define PGV_FROM_VMALLOC 1

Why don't we always just use vmalloc, what's the benefit of get_user_pages?

> +       /*
> +        * vmalloc failed, lets dig into swap here
> +        */
> +       *flags = 0;

probably better to *flags &= ~PGV_FROM_VMALLOC;
(since some flags could have been set before this function was called)

> +       gfp_flags &= ~__GFP_NORETRY;
> +       buffer = (char *)__get_free_pages(gfp_flags, order);

wouldn't this still cause problems because you're now requiring linear
memory again?
Would it be better to just fail at this point?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2)
  2010-11-09 21:07   ` Maciej Żenczykowski
@ 2010-11-09 21:20     ` Neil Horman
  0 siblings, 0 replies; 17+ messages in thread
From: Neil Horman @ 2010-11-09 21:20 UTC (permalink / raw)
  To: Maciej Żenczykowski; +Cc: netdev, davem, eric.dumazet

On Tue, Nov 09, 2010 at 01:07:32PM -0800, Maciej Żenczykowski wrote:
> > +#define PGV_FROM_VMALLOC 1
> 
> Why don't we always just use vmalloc, what's the benefit of get_user_pages?
> 
Because of how vmalloc works.  It maps discontiguous pages into contiguous
address space.  But we only have 128MB of that address space to work with by
default, so its quite possible that we won't be able to alloc all the memory.

> > +       /*
> > +        * vmalloc failed, lets dig into swap here
> > +        */
> > +       *flags = 0;
> 
> probably better to *flags &= ~PGV_FROM_VMALLOC;
> (since some flags could have been set before this function was called)
> 
Well, if any other users of this field existed, I'd agree, but since we're the
only one, I think its ok, at least for now.

> > +       gfp_flags &= ~__GFP_NORETRY;
> > +       buffer = (char *)__get_free_pages(gfp_flags, order);
> 
> wouldn't this still cause problems because you're now requiring linear
> memory again?
yes, its a last ditch effort after the other two options have been tried.  Its
all thats left to do.

> Would it be better to just fail at this point?
Why?  If we can dig into swap and get the memory, we may as well try.  It would
be better if we didn't have to, but if the choice is between failing and making
the system slow down....

Neil

> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v3)
  2010-10-25 19:06 [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation nhorman
                   ` (2 preceding siblings ...)
  2010-11-09 17:46 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2) nhorman
@ 2010-11-10 18:20 ` nhorman
  2010-11-10 18:27   ` Eric Dumazet
  2010-11-10 19:09 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) nhorman
  4 siblings, 1 reply; 17+ messages in thread
From: nhorman @ 2010-11-10 18:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, zenczykowski, Neil Horman

From: Neil Horman <nhorman@tuxdriver.com>

Version 3 of this patch.

Change notes:
1) Updated pg_vec alloc mechanism to prevent user space from overflowing an int
when configuring the ring buffer.

Summary:
It was shown to me recently that systems under high load were driven very deep
into swap when tcpdump was run.  The reason this happened was because the
AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
application to specify how many entries an AF_PACKET socket will have and how
large each entry will be.  It seems the default setting for tcpdump is to set
the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
allocation.  Thats difficult under good circumstances, and horrid under memory
pressure.

I thought it would be good to make that a bit more usable.  I was going to do a
simple conversion of the ring buffer from contigous pages to iovecs, but
unfortunately, the metadata which AF_PACKET places in these buffers can easily
span a page boundary, and given that these buffers get mapped into user space,
and the data layout doesn't easily allow for a change to padding between frames
to avoid that, a simple iovec change is just going to break user space ABI
consistency.

So I've done this, I've added a three tiered mechanism to the af_packet set_ring
socket option.  It attempts to allocate memory in the following order:

1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
digging into swap

2) Using vmalloc

3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
needed to get the memory

The effect is that we don't disturb the system as much when we're under load,
while still being able to conduct tcpdumps effectively.

Tested successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 net/packet/af_packet.c |   86 +++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27..5218e67 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -163,8 +163,14 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+	unsigned char flags;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
@@ -2322,37 +2329,76 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	=	packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
-static inline char *alloc_one_pg_vec_page(unsigned long order)
+static inline char *alloc_one_pg_vec_page(unsigned long order,
+					  unsigned char *flags)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	*flags |= PGV_FROM_VMALLOC;
+	buffer = vmalloc((1 << order) * PAGE_SIZE);
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	*flags = 0;
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
+	memset(pg_vec, 0 , sizeof(struct pgv) * block_nr);
+
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
+							 &pg_vec[i].flags);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
@@ -2361,6 +2407,7 @@ out:
 
 out_free_pgvec:
 	free_pg_vec(pg_vec, order, block_nr);
+	kfree(pg_vec);
 	pg_vec = NULL;
 	goto out;
 }
@@ -2368,7 +2415,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2530,15 +2577,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+					pg_num++) {
+				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
+					page = vmalloc_to_page(kaddr);
+				else
+					page = virt_to_page(kaddr);
+
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v3)
  2010-11-10 18:20 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v3) nhorman
@ 2010-11-10 18:27   ` Eric Dumazet
  0 siblings, 0 replies; 17+ messages in thread
From: Eric Dumazet @ 2010-11-10 18:27 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, zenczykowski

Le mercredi 10 novembre 2010 à 13:20 -0500, nhorman@tuxdriver.com a
écrit :
> From: Neil Horman <nhorman@tuxdriver.com>
> 
> Version 3 of this patch.
> 
> Change notes:
> 1) Updated pg_vec alloc mechanism to prevent user space from overflowing an int
> when configuring the ring buffer.
> 
> Summary:
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
> 
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
> 
> So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> socket option.  It attempts to allocate memory in the following order:
> 
> 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> digging into swap
> 
> 2) Using vmalloc
> 
> 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> needed to get the memory
> 
> The effect is that we don't disturb the system as much when we're under load,
> while still being able to conduct tcpdumps effectively.
> 
> Tested successfully by me.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>  net/packet/af_packet.c |   86 +++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 70 insertions(+), 16 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 3616f27..5218e67 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -163,8 +163,14 @@ struct packet_mreq_max {
>  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>  		int closing, int tx_ring);
>  
> +#define PGV_FROM_VMALLOC 1
> +struct pgv {
> +	char *buffer;
> +	unsigned char flags;
> +};
> +
>  struct packet_ring_buffer {
> -	char			**pg_vec;
> +	struct pgv		*pg_vec;
>  	unsigned int		head;
>  	unsigned int		frames_per_block;
>  	unsigned int		frame_size;
> @@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
>  	pg_vec_pos = position / rb->frames_per_block;
>  	frame_offset = position % rb->frames_per_block;
>  
> -	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
> +	h.raw = rb->pg_vec[pg_vec_pos].buffer +
> +		(frame_offset * rb->frame_size);
>  
>  	if (status != __packet_get_status(po, h.raw))
>  		return NULL;
> @@ -2322,37 +2329,76 @@ static const struct vm_operations_struct packet_mmap_ops = {
>  	.close	=	packet_mm_close,
>  };
>  
> -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
> +static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
> +			unsigned int len)
>  {
>  	int i;
>  
>  	for (i = 0; i < len; i++) {
> -		if (likely(pg_vec[i]))
> -			free_pages((unsigned long) pg_vec[i], order);
> +		if (likely(pg_vec[i].buffer)) {
> +			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
> +				vfree(pg_vec[i].buffer);
> +			else
> +				free_pages((unsigned long)pg_vec[i].buffer,
> +					   order);
> +			pg_vec[i].buffer = NULL;
> +		}
>  	}
>  	kfree(pg_vec);
>  }
>  
> -static inline char *alloc_one_pg_vec_page(unsigned long order)
> +static inline char *alloc_one_pg_vec_page(unsigned long order,
> +					  unsigned char *flags)
>  {
> -	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
> +	char *buffer = NULL;
> +	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
> +			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +	buffer = (char *) __get_free_pages(gfp_flags, order);
> +
> +	if (buffer)
> +		return buffer;
> +
> +	/*
> +	 * __get_free_pages failed, fall back to vmalloc
> +	 */
> +	*flags |= PGV_FROM_VMALLOC;
> +	buffer = vmalloc((1 << order) * PAGE_SIZE);
>  
> -	return (char *) __get_free_pages(gfp_flags, order);
> +	if (buffer)
> +		return buffer;
> +
> +	/*
> +	 * vmalloc failed, lets dig into swap here
> +	 */
> +	*flags = 0;
> +	gfp_flags &= ~__GFP_NORETRY;
> +	buffer = (char *)__get_free_pages(gfp_flags, order);
> +	if (buffer)
> +		return buffer;
> +
> +	/*
> +	 * complete and utter failure
> +	 */
> +	return NULL;
>  }
>  
> -static char **alloc_pg_vec(struct tpacket_req *req, int order)
> +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
>  {
>  	unsigned int block_nr = req->tp_block_nr;
> -	char **pg_vec;
> +	struct pgv *pg_vec;
>  	int i;
>  
> -	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> +	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
>  	if (unlikely(!pg_vec))
>  		goto out;
>  
> +	memset(pg_vec, 0 , sizeof(struct pgv) * block_nr);
> +

Well, memset() is not needed here : kzalloc() or kcalloc() already
cleared your memory.

You added this bit, this was not in your previous patch ;)




^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
  2010-10-25 19:06 [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation nhorman
                   ` (3 preceding siblings ...)
  2010-11-10 18:20 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v3) nhorman
@ 2010-11-10 19:09 ` nhorman
  2010-11-11  6:29   ` Eric Dumazet
                     ` (2 more replies)
  4 siblings, 3 replies; 17+ messages in thread
From: nhorman @ 2010-11-10 19:09 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, zenczykowski, Neil Horman

From: Neil Horman <nhorman@tuxdriver.com>

Version 4 of this patch.

Change notes:
1) Removed extra memset.  Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)

Summary:
It was shown to me recently that systems under high load were driven very deep
into swap when tcpdump was run.  The reason this happened was because the
AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
application to specify how many entries an AF_PACKET socket will have and how
large each entry will be.  It seems the default setting for tcpdump is to set
the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
allocation.  Thats difficult under good circumstances, and horrid under memory
pressure.

I thought it would be good to make that a bit more usable.  I was going to do a
simple conversion of the ring buffer from contigous pages to iovecs, but
unfortunately, the metadata which AF_PACKET places in these buffers can easily
span a page boundary, and given that these buffers get mapped into user space,
and the data layout doesn't easily allow for a change to padding between frames
to avoid that, a simple iovec change is just going to break user space ABI
consistency.

So I've done this, I've added a three tiered mechanism to the af_packet set_ring
socket option.  It attempts to allocate memory in the following order:

1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
digging into swap

2) Using vmalloc

3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
needed to get the memory

The effect is that we don't disturb the system as much when we're under load,
while still being able to conduct tcpdumps effectively.

Tested successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 net/packet/af_packet.c |   84 ++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27..a372390 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -163,8 +163,14 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+	unsigned char flags;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
@@ -2322,37 +2329,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	=	packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
-static inline char *alloc_one_pg_vec_page(unsigned long order)
+static inline char *alloc_one_pg_vec_page(unsigned long order,
+					  unsigned char *flags)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	*flags |= PGV_FROM_VMALLOC;
+	buffer = vmalloc((1 << order) * PAGE_SIZE);
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	*flags = 0;
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
+							 &pg_vec[i].flags);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
@@ -2361,6 +2405,7 @@ out:
 
 out_free_pgvec:
 	free_pg_vec(pg_vec, order, block_nr);
+	kfree(pg_vec);
 	pg_vec = NULL;
 	goto out;
 }
@@ -2368,7 +2413,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2530,15 +2575,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+					pg_num++) {
+				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
+					page = vmalloc_to_page(kaddr);
+				else
+					page = virt_to_page(kaddr);
+
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
  2010-11-10 19:09 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) nhorman
@ 2010-11-11  6:29   ` Eric Dumazet
  2010-11-11  8:03   ` Maciej Żenczykowski
  2010-11-16 18:25   ` David Miller
  2 siblings, 0 replies; 17+ messages in thread
From: Eric Dumazet @ 2010-11-11  6:29 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, zenczykowski

Le mercredi 10 novembre 2010 à 14:09 -0500, nhorman@tuxdriver.com a
écrit :
> From: Neil Horman <nhorman@tuxdriver.com>
> 
> Version 4 of this patch.
> 
> Change notes:
> 1) Removed extra memset.  Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)
> 
> Summary:
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
> 
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
> 
> So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> socket option.  It attempts to allocate memory in the following order:
> 
> 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> digging into swap
> 
> 2) Using vmalloc
> 
> 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> needed to get the memory
> 
> The effect is that we don't disturb the system as much when we're under load,
> while still being able to conduct tcpdumps effectively.
> 
> Tested successfully by me.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Thanks Neil !



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
  2010-11-10 19:09 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) nhorman
  2010-11-11  6:29   ` Eric Dumazet
@ 2010-11-11  8:03   ` Maciej Żenczykowski
  2010-11-16 18:25   ` David Miller
  2 siblings, 0 replies; 17+ messages in thread
From: Maciej Żenczykowski @ 2010-11-11  8:03 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, eric.dumazet

On Wed, Nov 10, 2010 at 11:09,  <nhorman@tuxdriver.com> wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
>
> Version 4 of this patch.
>
> Change notes:
> 1) Removed extra memset.  Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)
>
> Summary:
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
>
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
>
> So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> socket option.  It attempts to allocate memory in the following order:
>
> 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> digging into swap
>
> 2) Using vmalloc
>
> 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> needed to get the memory
>
> The effect is that we don't disturb the system as much when we're under load,
> while still being able to conduct tcpdumps effectively.
>
> Tested successfully by me.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>  net/packet/af_packet.c |   84 ++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 68 insertions(+), 16 deletions(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 3616f27..a372390 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -163,8 +163,14 @@ struct packet_mreq_max {
>  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>                int closing, int tx_ring);
>
> +#define PGV_FROM_VMALLOC 1
> +struct pgv {
> +       char *buffer;
> +       unsigned char flags;
> +};
> +
>  struct packet_ring_buffer {
> -       char                    **pg_vec;
> +       struct pgv              *pg_vec;
>        unsigned int            head;
>        unsigned int            frames_per_block;
>        unsigned int            frame_size;
> @@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
>        pg_vec_pos = position / rb->frames_per_block;
>        frame_offset = position % rb->frames_per_block;
>
> -       h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
> +       h.raw = rb->pg_vec[pg_vec_pos].buffer +
> +               (frame_offset * rb->frame_size);
>
>        if (status != __packet_get_status(po, h.raw))
>                return NULL;
> @@ -2322,37 +2329,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
>        .close  =       packet_mm_close,
>  };
>
> -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
> +static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
> +                       unsigned int len)
>  {
>        int i;
>
>        for (i = 0; i < len; i++) {
> -               if (likely(pg_vec[i]))
> -                       free_pages((unsigned long) pg_vec[i], order);
> +               if (likely(pg_vec[i].buffer)) {
> +                       if (pg_vec[i].flags & PGV_FROM_VMALLOC)
> +                               vfree(pg_vec[i].buffer);
> +                       else
> +                               free_pages((unsigned long)pg_vec[i].buffer,
> +                                          order);
> +                       pg_vec[i].buffer = NULL;
> +               }
>        }
>        kfree(pg_vec);
>  }
>
> -static inline char *alloc_one_pg_vec_page(unsigned long order)
> +static inline char *alloc_one_pg_vec_page(unsigned long order,
> +                                         unsigned char *flags)
>  {
> -       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
> +       char *buffer = NULL;
> +       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
> +                         __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +       buffer = (char *) __get_free_pages(gfp_flags, order);
> +
> +       if (buffer)
> +               return buffer;
> +
> +       /*
> +        * __get_free_pages failed, fall back to vmalloc
> +        */
> +       *flags |= PGV_FROM_VMALLOC;
> +       buffer = vmalloc((1 << order) * PAGE_SIZE);
>
> -       return (char *) __get_free_pages(gfp_flags, order);
> +       if (buffer)
> +               return buffer;
> +
> +       /*
> +        * vmalloc failed, lets dig into swap here
> +        */
> +       *flags = 0;
> +       gfp_flags &= ~__GFP_NORETRY;
> +       buffer = (char *)__get_free_pages(gfp_flags, order);
> +       if (buffer)
> +               return buffer;
> +
> +       /*
> +        * complete and utter failure
> +        */
> +       return NULL;
>  }
>
> -static char **alloc_pg_vec(struct tpacket_req *req, int order)
> +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
>  {
>        unsigned int block_nr = req->tp_block_nr;
> -       char **pg_vec;
> +       struct pgv *pg_vec;
>        int i;
>
> -       pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> +       pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
>        if (unlikely(!pg_vec))
>                goto out;
>
>        for (i = 0; i < block_nr; i++) {
> -               pg_vec[i] = alloc_one_pg_vec_page(order);
> -               if (unlikely(!pg_vec[i]))
> +               pg_vec[i].buffer = alloc_one_pg_vec_page(order,
> +                                                        &pg_vec[i].flags);
> +               if (unlikely(!pg_vec[i].buffer))
>                        goto out_free_pgvec;
>        }
>
> @@ -2361,6 +2405,7 @@ out:
>
>  out_free_pgvec:
>        free_pg_vec(pg_vec, order, block_nr);
> +       kfree(pg_vec);
>        pg_vec = NULL;
>        goto out;
>  }
> @@ -2368,7 +2413,7 @@ out_free_pgvec:
>  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>                int closing, int tx_ring)
>  {
> -       char **pg_vec = NULL;
> +       struct pgv *pg_vec = NULL;
>        struct packet_sock *po = pkt_sk(sk);
>        int was_running, order = 0;
>        struct packet_ring_buffer *rb;
> @@ -2530,15 +2575,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
>                        continue;
>
>                for (i = 0; i < rb->pg_vec_len; i++) {
> -                       struct page *page = virt_to_page(rb->pg_vec[i]);
> +                       struct page *page;
> +                       void *kaddr = rb->pg_vec[i].buffer;
>                        int pg_num;
>
>                        for (pg_num = 0; pg_num < rb->pg_vec_pages;
> -                                       pg_num++, page++) {
> +                                       pg_num++) {
> +                               if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
> +                                       page = vmalloc_to_page(kaddr);
> +                               else
> +                                       page = virt_to_page(kaddr);
> +
>                                err = vm_insert_page(vma, start, page);
>                                if (unlikely(err))
>                                        goto out;
>                                start += PAGE_SIZE;
> +                               kaddr += PAGE_SIZE;
>                        }
>                }
>        }
> --
> 1.7.2.3
>
>
Acked-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>

Nice to see this.

-- Maciej

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
  2010-11-10 19:09 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) nhorman
  2010-11-11  6:29   ` Eric Dumazet
  2010-11-11  8:03   ` Maciej Żenczykowski
@ 2010-11-16 18:25   ` David Miller
  2010-11-16 21:30     ` Neil Horman
  2 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2010-11-16 18:25 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, eric.dumazet, zenczykowski

From: nhorman@tuxdriver.com
Date: Wed, 10 Nov 2010 14:09:54 -0500

> Version 4 of this patch.

Applied, thanks a lot for doing this.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
  2010-11-16 18:25   ` David Miller
@ 2010-11-16 21:30     ` Neil Horman
  0 siblings, 0 replies; 17+ messages in thread
From: Neil Horman @ 2010-11-16 21:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, eric.dumazet, zenczykowski

On Tue, Nov 16, 2010 at 10:25:37AM -0800, David Miller wrote:
> From: nhorman@tuxdriver.com
> Date: Wed, 10 Nov 2010 14:09:54 -0500
> 
> > Version 4 of this patch.
> 
> Applied, thanks a lot for doing this.
No problem, thanks!
Neil

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-11-16 21:33 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-25 19:06 [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation nhorman
2010-10-25 20:17 ` Francois Romieu
2010-10-25 20:38 ` Eric Dumazet
2010-11-09 17:46 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v2) nhorman
2010-11-09 18:02   ` Eric Dumazet
2010-11-09 18:38     ` Neil Horman
2010-11-09 19:20       ` Eric Dumazet
2010-11-09 20:57         ` Neil Horman
2010-11-09 21:07   ` Maciej Żenczykowski
2010-11-09 21:20     ` Neil Horman
2010-11-10 18:20 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v3) nhorman
2010-11-10 18:27   ` Eric Dumazet
2010-11-10 19:09 ` [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) nhorman
2010-11-11  6:29   ` Eric Dumazet
2010-11-11  8:03   ` Maciej Żenczykowski
2010-11-16 18:25   ` David Miller
2010-11-16 21:30     ` Neil Horman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.