All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
@ 2013-04-15 11:40 Octavian Purdila
  2013-05-10 20:40 ` Andrew Morton
  2013-06-12 18:14 ` Kent Overstreet
  0 siblings, 2 replies; 11+ messages in thread
From: Octavian Purdila @ 2013-04-15 11:40 UTC (permalink / raw)
  To: akpm, linux-kernel, linux-aio, linux-s390
  Cc: koverstreet, bcrl, schwidefsky, kirill.shutemov, zab,
	Octavian Purdila, Andi Kleen

When using a large number of threads performing AIO operations the
IOCTX list may get a significant number of entries which will cause
significant overhead. For example, when running this fio script:

rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
blocksize=1024; numjobs=512; thread; loops=100

on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
30% CPU time spent by lookup_ioctx:

 32.51%  [guest.kernel]  [g] lookup_ioctx
  9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
  4.40%  [guest.kernel]  [g] lock_release
  4.19%  [guest.kernel]  [g] sched_clock_local
  3.86%  [guest.kernel]  [g] local_clock
  3.68%  [guest.kernel]  [g] native_sched_clock
  3.08%  [guest.kernel]  [g] sched_clock_cpu
  2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
  2.60%  [guest.kernel]  [g] memcpy
  2.33%  [guest.kernel]  [g] lock_acquired
  2.25%  [guest.kernel]  [g] lock_acquire
  1.84%  [guest.kernel]  [g] do_io_submit

This patch converts the ioctx list to a radix tree. For a performance
comparison the above FIO script was run on a 2 sockets 8 core
machine. These are the results (average and %rsd of 10 runs) for the
original list based implementation and for the radix tree based
implementation:

cores         1         2         4         8         16        32
list       109376 ms  69119 ms  35682 ms  22671 ms  19724 ms  16408 ms
%rsd         0.69%      1.15%     1.17%     1.21%     1.71%     1.43%
radix       73651 ms  41748 ms  23028 ms  16766 ms  15232 ms   13787 ms
%rsd         1.19%      0.98%     0.69%     1.13%    0.72%      0.75%
% of radix
relative    66.12%     65.59%    66.63%    72.31%   77.26%     83.66%
to list

To consider the impact of the patch on the typical case of having
only one ctx per process the following FIO script was run:

rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
blocksize=1024; numjobs=1; thread; loops=100

on the same system and the results are the following:

list        58892 ms
%rsd         0.91%
radix       59404 ms
%rsd         0.81%
% of radix
relative    100.87%
to list

Cc: Andi Kleen <ak@linux.intel.com>

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
---
Changes since v2:
 * rebased on top of next/akpm
 * add %rsd to measurements 

Changes since v1:
 * add performance comparison for the typical case of having only one
   ctx per process
 * use ARRAY_SIZE and drop tracking idx as it is not needed

 arch/s390/mm/pgtable.c   |    4 +--
 fs/aio.c                 |   76 ++++++++++++++++++++++++++++------------------
 include/linux/mm_types.h |    3 +-
 kernel/fork.c            |    2 +-
 4 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2accf71..54186e7 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -894,7 +894,7 @@ int s390_enable_sie(void)
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 #ifdef CONFIG_AIO
-	    !hlist_empty(&tsk->mm->ioctx_list) ||
+	    tsk->mm->ioctx_rtree.rnode ||
 #endif
 	    tsk->mm != tsk->active_mm) {
 		task_unlock(tsk);
@@ -921,7 +921,7 @@ int s390_enable_sie(void)
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 #ifdef CONFIG_AIO
-	    !hlist_empty(&tsk->mm->ioctx_list) ||
+	    tsk->mm->ioctx_rtree.rnode ||
 #endif
 	    tsk->mm != tsk->active_mm) {
 		mmput(mm);
diff --git a/fs/aio.c b/fs/aio.c
index 5b7ed78..4ceed70 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -38,6 +38,7 @@
 #include <linux/blkdev.h>
 #include <linux/compat.h>
 #include <linux/percpu-refcount.h>
+#include <linux/radix-tree.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -69,9 +70,7 @@ struct kioctx_cpu {
 struct kioctx {
 	struct percpu_ref	users;
 
-	/* This needs improving */
 	unsigned long		user_id;
-	struct hlist_node	list;
 
 	struct __percpu kioctx_cpu *cpu;
 
@@ -457,10 +456,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
-	/* now link into global list. */
+	/* now insert into the radix tree */
+	err = radix_tree_preload(GFP_KERNEL);
+	if (err)
+		goto out_cleanup;
 	spin_lock(&mm->ioctx_lock);
-	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+	err = radix_tree_insert(&mm->ioctx_rtree, ctx->user_id, ctx);
 	spin_unlock(&mm->ioctx_lock);
+	radix_tree_preload_end();
+	if (err) {
+		WARN_ONCE(1, "aio: insert into ioctx tree failed: %d", err);
+		goto out_cleanup;
+	}
 
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
@@ -501,8 +508,8 @@ static void kill_ioctx_rcu(struct rcu_head *head)
 static void kill_ioctx(struct kioctx *ctx)
 {
 	if (percpu_ref_kill(&ctx->users)) {
-		hlist_del_rcu(&ctx->list);
-		/* Between hlist_del_rcu() and dropping the initial ref */
+		radix_tree_delete(&current->mm->ioctx_rtree, ctx->user_id);
+		/* Between radix_tree_delete() and dropping the initial ref */
 		synchronize_rcu();
 
 		/*
@@ -542,25 +549,38 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx *ctx;
-	struct hlist_node *n;
+	struct kioctx *ctx[16];
+	unsigned long idx = 0;
+	int count;
 
-	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
-		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
-		 */
-		ctx->mmap_size = 0;
-
-		if (percpu_ref_kill(&ctx->users)) {
-			hlist_del_rcu(&ctx->list);
-			call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+	do {
+		int i;
+
+		count = radix_tree_gang_lookup(&mm->ioctx_rtree, (void **)ctx,
+					       idx, sizeof(ctx)/sizeof(void *));
+		for (i = 0; i < count; i++) {
+			void *ret;
+
+			BUG_ON(ctx[i]->user_id < idx);
+			idx = ctx[i]->user_id;
+
+			/*
+			 * We don't need to bother with munmap() here -
+			 * exit_mmap(mm) is coming and it'll unmap everything.
+			 * Since aio_free_ring() uses non-zero ->mmap_size
+			 * as indicator that it needs to unmap the area,
+			 * just set it to 0; aio_free_ring() is the only
+			 * place that uses ->mmap_size, so it's safe.
+			 */
+			ctx[i]->mmap_size = 0;
+
+			if (percpu_ref_kill(&ctx[i]->users)) {
+				ret = radix_tree_delete(&mm->ioctx_rtree, idx);
+				BUG_ON(!ret || ret != ctx[i]);
+				call_rcu(&ctx[i]->rcu_head, kill_ioctx_rcu);
+			}
 		}
-	}
+	} while (count);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -665,12 +685,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
-			percpu_ref_get(&ctx->users);
-			ret = ctx;
-			break;
-		}
+	ctx = radix_tree_lookup(&mm->ioctx_rtree, ctx_id);
+	if (ctx) {
+		percpu_ref_get(&ctx->users);
+		ret = ctx;
 	}
 
 	rcu_read_unlock();
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fb425aa..1976fa9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/threads.h>
 #include <linux/list.h>
+#include <linux/radix-tree.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
@@ -383,7 +384,7 @@ struct mm_struct {
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
-	struct hlist_head	ioctx_list;
+	struct radix_tree_root	ioctx_rtree;
 #endif
 #ifdef CONFIG_MM_OWNER
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 6fcc2e2..62e0d0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -522,7 +522,7 @@ static void mm_init_aio(struct mm_struct *mm)
 {
 #ifdef CONFIG_AIO
 	spin_lock_init(&mm->ioctx_lock);
-	INIT_HLIST_HEAD(&mm->ioctx_list);
+	INIT_RADIX_TREE(&mm->ioctx_rtree, GFP_KERNEL);
 #endif
 }
 
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-04-15 11:40 [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree Octavian Purdila
@ 2013-05-10 20:40 ` Andrew Morton
  2013-05-10 21:15   ` Kent Overstreet
  2013-06-12 18:14 ` Kent Overstreet
  1 sibling, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2013-05-10 20:40 UTC (permalink / raw)
  To: Octavian Purdila
  Cc: linux-kernel, linux-aio, linux-s390, koverstreet, bcrl,
	schwidefsky, kirill.shutemov, zab, Andi Kleen

On Mon, 15 Apr 2013 14:40:55 +0300 Octavian Purdila <octavian.purdila@intel.com> wrote:

> When using a large number of threads performing AIO operations the
> IOCTX list may get a significant number of entries which will cause
> significant overhead. For example, when running this fio script:
> 
> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> blocksize=1024; numjobs=512; thread; loops=100
> 
> on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> 30% CPU time spent by lookup_ioctx:
> 
>  32.51%  [guest.kernel]  [g] lookup_ioctx
>   9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
>   4.40%  [guest.kernel]  [g] lock_release
>   4.19%  [guest.kernel]  [g] sched_clock_local
>   3.86%  [guest.kernel]  [g] local_clock
>   3.68%  [guest.kernel]  [g] native_sched_clock
>   3.08%  [guest.kernel]  [g] sched_clock_cpu
>   2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
>   2.60%  [guest.kernel]  [g] memcpy
>   2.33%  [guest.kernel]  [g] lock_acquired
>   2.25%  [guest.kernel]  [g] lock_acquire
>   1.84%  [guest.kernel]  [g] do_io_submit
> 
> This patchs converts the ioctx list to a radix tree.

The patch looks nice.  One thing we should pay attention to is the
memory consumption.  radix-trees can be far less space-efficient than
lists, and as the tree key comes from mmap() it can be pretty sparsely
distributed.

So could you please have a think about this, see if we can cook up some
worst-case numbers and decide if they are problematic?


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-05-10 20:40 ` Andrew Morton
@ 2013-05-10 21:15   ` Kent Overstreet
  2013-05-13 21:01     ` Octavian Purdila
  0 siblings, 1 reply; 11+ messages in thread
From: Kent Overstreet @ 2013-05-10 21:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Octavian Purdila, linux-kernel, linux-aio, linux-s390, bcrl,
	schwidefsky, kirill.shutemov, zab, Andi Kleen

On Fri, May 10, 2013 at 01:40:15PM -0700, Andrew Morton wrote:
> On Mon, 15 Apr 2013 14:40:55 +0300 Octavian Purdila <octavian.purdila@intel.com> wrote:
> 
> > When using a large number of threads performing AIO operations the
> > IOCTX list may get a significant number of entries which will cause
> > significant overhead. For example, when running this fio script:
> > 
> > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=512; thread; loops=100
> > 
> > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> > 30% CPU time spent by lookup_ioctx:
> > 
> >  32.51%  [guest.kernel]  [g] lookup_ioctx
> >   9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
> >   4.40%  [guest.kernel]  [g] lock_release
> >   4.19%  [guest.kernel]  [g] sched_clock_local
> >   3.86%  [guest.kernel]  [g] local_clock
> >   3.68%  [guest.kernel]  [g] native_sched_clock
> >   3.08%  [guest.kernel]  [g] sched_clock_cpu
> >   2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
> >   2.60%  [guest.kernel]  [g] memcpy
> >   2.33%  [guest.kernel]  [g] lock_acquired
> >   2.25%  [guest.kernel]  [g] lock_acquire
> >   1.84%  [guest.kernel]  [g] do_io_submit
> > 
> > This patchs converts the ioctx list to a radix tree.
> 
> The patch looks nice.  One thing we should pay attention to is the
> memory consumption.  radix-trees can be far less space-efficient than
> lists, and as the tree key comes from mmap() it can be pretty sparsely
> distributed.
> 
> So could you please have a think about this, see if we can cook up some
> worst-case numbers and decide if they are problematic?

Because the overhead of an ioctx is so high (ringbuffer is some number
of pages) it shouldn't matter much - but I wouldn't mind seeing a bit of
arithmetic.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-05-10 21:15   ` Kent Overstreet
@ 2013-05-13 21:01     ` Octavian Purdila
  0 siblings, 0 replies; 11+ messages in thread
From: Octavian Purdila @ 2013-05-13 21:01 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Andrew Morton, linux-kernel, linux-aio, linux-s390, bcrl,
	schwidefsky, Kirill A. Shutemov, Zach Brown, Andi Kleen

On Sat, May 11, 2013 at 12:15 AM, Kent Overstreet
<koverstreet@google.com> wrote:
> On Fri, May 10, 2013 at 01:40:15PM -0700, Andrew Morton wrote:
>> On Mon, 15 Apr 2013 14:40:55 +0300 Octavian Purdila <octavian.purdila@intel.com> wrote:
>>
>> > When using a large number of threads performing AIO operations the
>> > IOCTX list may get a significant number of entries which will cause
>> > significant overhead. For example, when running this fio script:
>> >
>> > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>> > blocksize=1024; numjobs=512; thread; loops=100
>> >
>> > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
>> > 30% CPU time spent by lookup_ioctx:
>> >
>> >  32.51%  [guest.kernel]  [g] lookup_ioctx
>> >   9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
>> >   4.40%  [guest.kernel]  [g] lock_release
>> >   4.19%  [guest.kernel]  [g] sched_clock_local
>> >   3.86%  [guest.kernel]  [g] local_clock
>> >   3.68%  [guest.kernel]  [g] native_sched_clock
>> >   3.08%  [guest.kernel]  [g] sched_clock_cpu
>> >   2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
>> >   2.60%  [guest.kernel]  [g] memcpy
>> >   2.33%  [guest.kernel]  [g] lock_acquired
>> >   2.25%  [guest.kernel]  [g] lock_acquire
>> >   1.84%  [guest.kernel]  [g] do_io_submit
>> >
>> > This patchs converts the ioctx list to a radix tree.
>>
>> The patch looks nice.  One thing we should pay attention to is the
>> memory consumption.  radix-trees can be far less space-efficient than
>> lists, and as the tree key comes from mmap() it can be pretty sparsely
>> distributed.
>>
>> So could you please have a think about this, see if we can cook up some
>> worst-case numbers and decide if they are problematic?
>
> Because the overhead of an ioctx is so high (ringbuffer is some number
> of pages) it shouldn't matter much - but I wouldn't mind seeing a bit of
> arithmatic.


The worst case seems to be when we are running on 64 bits and
CONFIG_BASE_SMALL is enable. In that case, if the aio_rings are mapped
dispersed enough (4 bits "space" between mappings, e.g. at 0, 0x400,
0x4000, 0x40000, etc.) then we will have to allocate a full depth
branch, in this case 13 nodes: (64-PAGE_SIZE_BITS)/4.

That adds up to 7384 just for the radix tree, per aio context, with a
radix_tree_node size of 568. Besides that, the minimum memory
allocated for an aio context seems to be 4928 bytes (one page +
sizeof(kioctx)), if I did not miss anything. That looks like a 300%
increase in memory consumption.

However, I am not sure if the worst case is so relevant in practice.
As the number of entries grows the overhead reduces, as some of nodes
can be shared. Also, when the address space is that pathologically
sparse, shouldn't the radix tree used for memory management suffer
from the same problem?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-04-15 11:40 [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree Octavian Purdila
  2013-05-10 20:40 ` Andrew Morton
@ 2013-06-12 18:14 ` Kent Overstreet
  2013-06-12 18:24   ` Benjamin LaHaise
  1 sibling, 1 reply; 11+ messages in thread
From: Kent Overstreet @ 2013-06-12 18:14 UTC (permalink / raw)
  To: Octavian Purdila
  Cc: akpm, linux-kernel, linux-aio, linux-s390, bcrl, schwidefsky,
	kirill.shutemov, zab, Andi Kleen

On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> When using a large number of threads performing AIO operations the
> IOCTX list may get a significant number of entries which will cause
> significant overhead. For example, when running this fio script:
> 
> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> blocksize=1024; numjobs=512; thread; loops=100
> 
> on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> 30% CPU time spent by lookup_ioctx:
> 
>  32.51%  [guest.kernel]  [g] lookup_ioctx
>   9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
>   4.40%  [guest.kernel]  [g] lock_release
>   4.19%  [guest.kernel]  [g] sched_clock_local
>   3.86%  [guest.kernel]  [g] local_clock
>   3.68%  [guest.kernel]  [g] native_sched_clock
>   3.08%  [guest.kernel]  [g] sched_clock_cpu
>   2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
>   2.60%  [guest.kernel]  [g] memcpy
>   2.33%  [guest.kernel]  [g] lock_acquired
>   2.25%  [guest.kernel]  [g] lock_acquire
>   1.84%  [guest.kernel]  [g] do_io_submit
> 
> This patchs converts the ioctx list to a radix tree. For a performance
> comparison the above FIO script was run on a 2 sockets 8 core
> machine. This are the results (average and %rsd of 10 runs) for the
> original list based implementation and for the radix tree based
> implementation:
> 
> cores         1         2         4         8         16        32
> list       109376 ms  69119 ms  35682 ms  22671 ms  19724 ms  16408 ms
> %rsd         0.69%      1.15%     1.17%     1.21%     1.71%     1.43%
> radix       73651 ms  41748 ms  23028 ms  16766 ms  15232 ms   13787 ms
> %rsd         1.19%      0.98%     0.69%     1.13%    0.72%      0.75%
> % of radix
> relative    66.12%     65.59%    66.63%    72.31%   77.26%     83.66%
> to list
> 
> To consider the impact of the patch on the typical case of having
> only one ctx per process the following FIO script was run:
> 
> rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> blocksize=1024; numjobs=1; thread; loops=100
> 
> on the same system and the results are the following:
> 
> list        58892 ms
> %rsd         0.91%
> radix       59404 ms
> %rsd         0.81%
> % of radix
> relative    100.87%
> to list

So, I was just doing some benchmarking/profiling to get ready to send
out the aio patches I've got for 3.11 - and it looks like your patch is
causing a ~1.5% throughput regression in my testing :/

I'm just benchmarking random 4k reads with fio, with a single job.
Looking at the profile it appears to all be radix_tree_lookup() - that's
more expensive than I'd expect for a tree with one element.

It's a shame we don't have resizable RCU hash tables, that's really what
we want for this. Actually, I think I might know how to make that work
by using cuckoo hashing...

Might also be worth trying a single element cache of the most recently
used ioctx. Anyways, I don't want to nack your patch over this (the
overhead this is fixing can be quite a bit worse) but I'd like to try
and see if we can fix or reduce the regression in the single ioctx case.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-06-12 18:14 ` Kent Overstreet
@ 2013-06-12 18:24   ` Benjamin LaHaise
  2013-06-12 19:40     ` Zach Brown
  0 siblings, 1 reply; 11+ messages in thread
From: Benjamin LaHaise @ 2013-06-12 18:24 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Octavian Purdila, akpm, linux-kernel, linux-aio, linux-s390,
	schwidefsky, kirill.shutemov, zab, Andi Kleen

On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote:
> On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> > When using a large number of threads performing AIO operations the
> > IOCTX list may get a significant number of entries which will cause
> > significant overhead. For example, when running this fio script:
> > 
> > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=512; thread; loops=100
> > 
> > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> > 30% CPU time spent by lookup_ioctx:
> > 
> >  32.51%  [guest.kernel]  [g] lookup_ioctx
> >   9.19%  [guest.kernel]  [g] __lock_acquire.isra.28
> >   4.40%  [guest.kernel]  [g] lock_release
> >   4.19%  [guest.kernel]  [g] sched_clock_local
> >   3.86%  [guest.kernel]  [g] local_clock
> >   3.68%  [guest.kernel]  [g] native_sched_clock
> >   3.08%  [guest.kernel]  [g] sched_clock_cpu
> >   2.64%  [guest.kernel]  [g] lock_release_holdtime.part.11
> >   2.60%  [guest.kernel]  [g] memcpy
> >   2.33%  [guest.kernel]  [g] lock_acquired
> >   2.25%  [guest.kernel]  [g] lock_acquire
> >   1.84%  [guest.kernel]  [g] do_io_submit
> > 
> > This patchs converts the ioctx list to a radix tree. For a performance
> > comparison the above FIO script was run on a 2 sockets 8 core
> > machine. This are the results (average and %rsd of 10 runs) for the
> > original list based implementation and for the radix tree based
> > implementation:
> > 
> > cores         1         2         4         8         16        32
> > list       109376 ms  69119 ms  35682 ms  22671 ms  19724 ms  16408 ms
> > %rsd         0.69%      1.15%     1.17%     1.21%     1.71%     1.43%
> > radix       73651 ms  41748 ms  23028 ms  16766 ms  15232 ms   13787 ms
> > %rsd         1.19%      0.98%     0.69%     1.13%    0.72%      0.75%
> > % of radix
> > relative    66.12%     65.59%    66.63%    72.31%   77.26%     83.66%
> > to list
> > 
> > To consider the impact of the patch on the typical case of having
> > only one ctx per process the following FIO script was run:
> > 
> > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=1; thread; loops=100
> > 
> > on the same system and the results are the following:
> > 
> > list        58892 ms
> > %rsd         0.91%
> > radix       59404 ms
> > %rsd         0.81%
> > % of radix
> > relative    100.87%
> > to list
> 
> So, I was just doing some benchmarking/profiling to get ready to send
> out the aio patches I've got for 3.11 - and it looks like your patch is
> causing a ~1.5% throughput regression in my testing :/
... <snip>

I've got an alternate approach for fixing this wart in lookup_ioctx()...  
Instead of using an rbtree, just use the reserved id in the ring buffer 
header to index an array pointing to the ioctx.  It's not finished yet, 
and it needs to be tidied up, but is most of the way there.

		-ben
-- 
"Thought is the essence of where you are now."

 fs/aio.c                 |   80 ++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/mm_types.h |    5 ++
 kernel/fork.c            |    4 ++
 3 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 7fe5bde..96665db 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,6 +108,8 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
+
+	unsigned		id;
 };
 
 /*------ sysctl variables----*/
@@ -207,7 +209,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
-	ring->id = ctx->user_id;
+	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -351,6 +353,7 @@ static void put_ioctx(struct kioctx *ctx)
 static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
+	struct aio_ring *ring;
 	struct kioctx *ctx;
 	int err = -ENOMEM;
 
@@ -394,15 +397,61 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	/* now link into global list. */
 	spin_lock(&mm->ioctx_lock);
+	if (mm->ioctx_nr == 0) {
+		mm->ioctx_nr++;
+		mm->ioctx_table[0] = ctx;
+		ctx->id = 0;
+	} else {
+		unsigned i;
+		if (mm->ioctx_nr == mm->ioctx_max) {
+			unsigned new_nr;
+			struct kioctx **table;
+again:
+			new_nr = mm->ioctx_max * 4;
+			spin_unlock(&mm->ioctx_lock);
+			table = kcalloc(new_nr, sizeof(*table), GFP_KERNEL);
+			if (!table) {
+				err = -ENOMEM;
+				goto out_cleanup_noerr;
+			}
+			spin_lock(&mm->ioctx_lock);
+			if (mm->ioctx_max < new_nr) {
+				struct kioctx **old = mm->ioctx_table;
+				memcpy(table, mm->ioctx_table,
+				       mm->ioctx_max * sizeof(struct kioctx *));
+				smp_wmb();
+				mm->ioctx_table = table;
+				smp_wmb();
+				mm->ioctx_max = new_nr;
+				//FIXME: kfree_rcu(old, old);
+			} else
+				kfree(table);
+			if (mm->ioctx_nr == mm->ioctx_max)
+				goto again;
+		}
+		for (i=0; i<mm->ioctx_max; i++) {
+			if (!mm->ioctx_table[i]) {
+				ctx->id = i;
+				mm->ioctx_table[i] = ctx;
+				break;
+			}
+		}
+		WARN_ON(1);
+	}
 	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
 	spin_unlock(&mm->ioctx_lock);
 
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->id = ctx->id;
+	kunmap_atomic(ring);
+
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
 out_cleanup:
 	err = -EAGAIN;
+out_cleanup_noerr:
 	aio_free_ring(ctx);
 out_freectx:
 	kmem_cache_free(kioctx_cachep, ctx);
@@ -434,7 +483,14 @@ static void kill_ioctx_rcu(struct rcu_head *head)
 static void kill_ioctx(struct kioctx *ctx)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
+		struct mm_struct *mm = current->mm;
+
+		spin_lock(&mm->ioctx_lock);
+		WARN_ON(ctx != mm->ioctx_table[ctx->id]);
+		mm->ioctx_table[ctx->id] = NULL;
 		hlist_del_rcu(&ctx->list);
+		spin_unlock(&mm->ioctx_lock);
+
 		/* Between hlist_del_rcu() and dropping the initial ref */
 		synchronize_rcu();
 
@@ -496,6 +552,8 @@ void exit_aio(struct mm_struct *mm)
 		ctx->mmap_size = 0;
 
 		if (!atomic_xchg(&ctx->dead, 1)) {
+			WARN_ON(ctx != mm->ioctx_table[ctx->id]);
+			mm->ioctx_table[ctx->id] = NULL;
 			hlist_del_rcu(&ctx->list);
 			call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
 		}
@@ -557,19 +615,25 @@ EXPORT_SYMBOL(aio_put_req);
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
+	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
+	unsigned id;
+
+	if (get_user(id, &ring->id))
+		return NULL;
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
-			atomic_inc(&ctx->users);
-			ret = ctx;
-			break;
-		}
-	}
+	if (id >= mm->ioctx_max)
+		goto out;
 
+	ctx = mm->ioctx_table[id];
+	if (ctx->user_id == ctx_id) {
+		atomic_inc(&ctx->users);
+		ret = ctx;
+	}
+out:
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..6958e55 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -322,6 +322,7 @@ struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
 
+struct kioctx;
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -387,6 +388,10 @@ struct mm_struct {
 #ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
 	struct hlist_head	ioctx_list;
+	unsigned		ioctx_nr;
+	unsigned		ioctx_max;
+	struct kioctx		**ioctx_table;
+	struct kioctx		*ioctx_table_inline[1];
 #endif
 #ifdef CONFIG_MM_OWNER
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a..5ef351c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,10 @@ static void mm_init_aio(struct mm_struct *mm)
 #ifdef CONFIG_AIO
 	spin_lock_init(&mm->ioctx_lock);
 	INIT_HLIST_HEAD(&mm->ioctx_list);
+	mm->ioctx_nr = 0;
+	mm->ioctx_max = 1;
+	mm->ioctx_table = mm->ioctx_table_inline;
+	mm->ioctx_table_inline[0] = NULL;
 #endif
 }
 

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-06-12 18:24   ` Benjamin LaHaise
@ 2013-06-12 19:40     ` Zach Brown
  2013-06-14 14:20       ` Octavian Purdila
  0 siblings, 1 reply; 11+ messages in thread
From: Zach Brown @ 2013-06-12 19:40 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Kent Overstreet, Octavian Purdila, akpm, linux-kernel, linux-aio,
	linux-s390, schwidefsky, kirill.shutemov, Andi Kleen

> I've got an alternate approach for fixing this wart in lookup_ioctx()...  
> Instead of using an rbtree, just use the reserved id in the ring buffer 
> header to index an array pointing the ioctx.  It's not finished yet, and 
> it needs to be tidied up, but is most of the way there.

Yeah, that might work.

Note that the patch wasn't using an rbtree, it was storing the pointer
value in a *radix* which is why single lookups took so long.  Presumably
radix was used for RCU lookups.

Your ring->id trick lets us use RCU with small ints instead of the
context pointer.  It might be worth using idr instead of rolling manual
array code.  It'd still be much faster than the list, but get rid of the
large alloc, array walking, memcpy(), etc.

- z

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-06-12 19:40     ` Zach Brown
@ 2013-06-14 14:20       ` Octavian Purdila
  2013-06-18 19:05         ` Octavian Purdila
  0 siblings, 1 reply; 11+ messages in thread
From: Octavian Purdila @ 2013-06-14 14:20 UTC (permalink / raw)
  To: Zach Brown
  Cc: Benjamin LaHaise, Kent Overstreet, Andrew Morton, linux-kernel,
	linux-aio, linux-s390, schwidefsky, Kirill A. Shutemov,
	Andi Kleen

On Wed, Jun 12, 2013 at 10:40 PM, Zach Brown <zab@redhat.com> wrote:
>> I've got an alternate approach for fixing this wart in lookup_ioctx()...
>> Instead of using an rbtree, just use the reserved id in the ring buffer
>> header to index an array pointing the ioctx.  It's not finished yet, and
>> it needs to be tidied up, but is most of the way there.
>
> Yeah, that might work.
>
> Note that the patch wasn't using an rbtree, it was storing the pointer
> value in a *radix* which is why single lookups took so long.  Presumably
> radix was used for RCU lookups.
>
> Your ring->id trick lets us use RCU with small ints instead of the
> context pointer.  It might be worth using idr instead of rolling manual
> array code.  It'd still be much faster than the list, but get rid of the
> large alloc, array walking, memcpy(), etc.
>

I picked up Ben's patch and incorporated Zach's idea and the first
results look promising, as expected. I am going to do a full test with
the same workload I've used for rbtree and come back with the results
and the patch in a day or so.

Thanks everybody !

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-06-14 14:20       ` Octavian Purdila
@ 2013-06-18 19:05         ` Octavian Purdila
  2013-06-18 19:08           ` Benjamin LaHaise
  0 siblings, 1 reply; 11+ messages in thread
From: Octavian Purdila @ 2013-06-18 19:05 UTC (permalink / raw)
  To: Zach Brown
  Cc: Benjamin LaHaise, Kent Overstreet, Andrew Morton, linux-kernel,
	linux-aio, linux-s390, schwidefsky, Kirill A. Shutemov,
	Andi Kleen

On Fri, Jun 14, 2013 at 5:20 PM, Octavian Purdila
<octavian.purdila@intel.com> wrote:
> On Wed, Jun 12, 2013 at 10:40 PM, Zach Brown <zab@redhat.com> wrote:
>>> I've got an alternate approach for fixing this wart in lookup_ioctx()...
>>> Instead of using an rbtree, just use the reserved id in the ring buffer
>>> header to index an array pointing the ioctx.  It's not finished yet, and
>>> it needs to be tidied up, but is most of the way there.
>>
>> Yeah, that might work.
>>
>> Note that the patch wasn't using an rbtree, it was storing the pointer
>> value in a *radix* which is why single lookups took so long.  Presumably
>> radix was used for RCU lookups.
>>
>> Your ring->id trick lets us use RCU with small ints instead of the
>> context pointer.  It might be worth using idr instead of rolling manual
>> array code.  It'd still be much faster than the list, but get rid of the
>> large alloc, array walking, memcpy(), etc.
>>
>
> I picked up Ben's patch and incorporated Zach's idea and the first
> results look promising, as expected. I am going to do a full test with
> the same workload I've used for rbtree and come back with the results
> and the patch in a day or so.
>

Unfortunately, I still see performance degradation for the one ioctx
case even when using IDR. I am using the same fio benchmark as before.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-06-18 19:05         ` Octavian Purdila
@ 2013-06-18 19:08           ` Benjamin LaHaise
  2013-06-18 19:32             ` Octavian Purdila
  0 siblings, 1 reply; 11+ messages in thread
From: Benjamin LaHaise @ 2013-06-18 19:08 UTC (permalink / raw)
  To: Octavian Purdila
  Cc: Zach Brown, Kent Overstreet, Andrew Morton, linux-kernel,
	linux-aio, linux-s390, schwidefsky, Kirill A. Shutemov,
	Andi Kleen

On Tue, Jun 18, 2013 at 10:05:22PM +0300, Octavian Purdila wrote:
> On Fri, Jun 14, 2013 at 5:20 PM, Octavian Purdila
> <octavian.purdila@intel.com> wrote:
> > I picked up Ben's patch and incorporated Zach's idea and the first
> > results look promising, as expected. I am going to do a full test with
> > the same workload I've used for rbtree and come back with the results
> > and the patch in a day or so.
> >
> 
> Unfortunately, I still see performance degradation for the one ioctx
> case even when using IDR. I am using the same fio benchmark as before.

How much of a regression?

		-ben
-- 
"Thought is the essence of where you are now."

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree
  2013-06-18 19:08           ` Benjamin LaHaise
@ 2013-06-18 19:32             ` Octavian Purdila
  0 siblings, 0 replies; 11+ messages in thread
From: Octavian Purdila @ 2013-06-18 19:32 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Zach Brown, Kent Overstreet, Andrew Morton, linux-kernel,
	linux-aio, linux-s390, schwidefsky, Kirill A. Shutemov,
	Andi Kleen

[-- Attachment #1: Type: text/plain, Size: 1524 bytes --]

On Tue, Jun 18, 2013 at 10:08 PM, Benjamin LaHaise <bcrl@kvack.org> wrote:
> On Tue, Jun 18, 2013 at 10:05:22PM +0300, Octavian Purdila wrote:
>> On Fri, Jun 14, 2013 at 5:20 PM, Octavian Purdila
>> <octavian.purdila@intel.com> wrote:
>> > I picked up Ben's patch and incorporated Zach's idea and the first
>> > results look promising, as expected. I am going to do a full test with
>> > the same workload I've used for rbtree and come back with the results
>> > and the patch in a day or so.
>> >
>>
>> Unfortunately, I still see performance degradation for the one ioctx
>> case even when using IDR. I am using the same fio benchmark as before.
>
> How much of a regression?
>

Hi Ben,

It is even worse than radix, I've seen between 1% and 4%
degradation. I have tried to profile it and so far the culprit seems
to be the hint code in idr_find. perf top -e dTLB-load-misses shows:

       │     static inline void *idr_find(struct idr *idr, int id)
       │     {
       │             struct idr_layer *hint = rcu_dereference_raw(idr->hint);
       │
       │             if (hint && (id & ~IDR_MASK) == hint->prefix)
       │                     return rcu_dereference_raw(hint->ary[id &
IDR_MASK]);
 49.42 │ d0:   movzbl %r12b,%edx
       │       add    $0x4,%rdx
       │       mov    0x8(%rax,%rdx,8),%rax


I should say that I am using kvm for testing this. Hopefully this is
not a side effect of virtualization. I have attached the patch I am
testing with.

Thanks,
Tavi

[-- Attachment #2: 0001-aio-use-ring-id-and-IDR-to-speed-up-lookups.patch --]
[-- Type: application/octet-stream, Size: 8008 bytes --]

From 82454842b92cb79024395d3d2872c138c61058d8 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <octavian.purdila@intel.com>
Date: Mon, 17 Jun 2013 17:11:16 +0300
Subject: [PATCH] aio: use ring->id and IDR to speed up lookups

---
 arch/s390/mm/pgtable.c   |    5 ++--
 fs/aio.c                 |   75 +++++++++++++++++++++++++++++++---------------
 include/linux/idr.h      |   11 +++++++
 include/linux/mm_types.h |    3 +-
 kernel/fork.c            |    2 +-
 5 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index a938b54..085f317 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -17,6 +17,7 @@
 #include <linux/quicklist.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
+#include <linux/idr.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1028,7 +1029,7 @@ int s390_enable_sie(void)
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 #ifdef CONFIG_AIO
-	    !hlist_empty(&tsk->mm->ioctx_list) ||
+	    !idr_empty(&tsk->mm->ioctx_idr) ||
 #endif
 	    tsk->mm != tsk->active_mm) {
 		task_unlock(tsk);
@@ -1055,7 +1056,7 @@ int s390_enable_sie(void)
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 #ifdef CONFIG_AIO
-	    !hlist_empty(&tsk->mm->ioctx_list) ||
+	    !idr_empty(&tsk->mm->ioctx_idr) ||
 #endif
 	    tsk->mm != tsk->active_mm) {
 		mmput(mm);
diff --git a/fs/aio.c b/fs/aio.c
index 2bbcacf..d4298d0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -63,9 +63,7 @@ struct kioctx {
 	atomic_t		users;
 	atomic_t		dead;
 
-	/* This needs improving */
-	unsigned long		user_id;
-	struct hlist_node	list;
+	unsigned		id;
 
 	/*
 	 * This is what userspace passed to io_setup(), it's not used for
@@ -199,12 +197,11 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (populate)
 		mm_populate(ctx->mmap_base, populate);
 
-	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
-	ring->id = ctx->user_id;
+	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -343,6 +340,7 @@ static void put_ioctx(struct kioctx *ctx)
 static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
+	struct aio_ring *ring;
 	struct kioctx *ctx;
 	int err = -ENOMEM;
 
@@ -379,22 +377,43 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (aio_nr + nr_events > aio_max_nr ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
+		err = -EAGAIN;
 		goto out_cleanup;
 	}
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
-	/* now link into global list. */
+	/* Allocate an id for the ioctx, save it in the aio_ring, and
+	 * link it with the current context. This will allow us to
+	 * quickly locate the kioctx based on the user handle which
+	 * currently is a pointer to aio_ring.
+	 *
+	 * NOTE: a simpler approach would be to use the ctx->id as the
+	 * user handle. However, libaio uses the aio_ring directly to
+	 * check the head/tail pointers and thus relies on the handle
+	 * to be a pointer to the aio_ring.
+	 */
+	idr_preload(GFP_KERNEL);
 	spin_lock(&mm->ioctx_lock);
-	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+	ctx->id = idr_alloc(&mm->ioctx_idr, ctx, 0, 0, GFP_NOWAIT);
 	spin_unlock(&mm->ioctx_lock);
+	idr_preload_end();
+	if (ctx->id < 0)
+		goto out_undo_aio_nr;
+
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->id = ctx->id;
+	kunmap_atomic(ring);
 
-	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
-		 ctx, ctx->user_id, mm, ctx->nr_events);
+	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x id=%d\n",
+		 ctx, ctx->mmap_base, mm, ctx->nr_events, ctx->id);
 	return ctx;
 
+out_undo_aio_nr:
+	spin_lock(&aio_nr_lock);
+	aio_nr -= ctx->max_reqs;
+	spin_unlock(&aio_nr_lock);
 out_cleanup:
-	err = -EAGAIN;
 	aio_free_ring(ctx);
 out_freectx:
 	kmem_cache_free(kioctx_cachep, ctx);
@@ -423,10 +442,12 @@ static void kill_ioctx_rcu(struct rcu_head *head)
  *	when the processes owning a context have all exited to encourage
  *	the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct kioctx *ctx, struct mm_struct *mm)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
-		hlist_del_rcu(&ctx->list);
+		spin_lock(&mm->ioctx_lock);
+		idr_remove(&mm->ioctx_idr, ctx->id);
+		spin_unlock(&mm->ioctx_lock);
 
 		/*
 		 * It'd be more correct to do this in free_ioctx(), after all
@@ -475,9 +496,9 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
 void exit_aio(struct mm_struct *mm)
 {
 	struct kioctx *ctx;
-	struct hlist_node *n;
+	unsigned id;
 
-	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
+	idr_for_each_entry(&mm->ioctx_idr, ctx, id) {
 		if (1 != atomic_read(&ctx->users))
 			printk(KERN_DEBUG
 				"exit_aio:ioctx still alive: %d %d %d\n",
@@ -494,7 +515,7 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(ctx);
+		kill_ioctx(ctx, mm);
 	}
 }
 
@@ -553,20 +574,26 @@ EXPORT_SYMBOL(aio_put_req);
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
+	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
+	unsigned id;
 
-	rcu_read_lock();
+	if (get_user(id, &ring->id))
+		return NULL;
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
+	rcu_read_lock();
+	ctx = idr_find(&mm->ioctx_idr, id);
+	if (ctx) {
+		if (ctx->mmap_base == ctx_id) {
 			atomic_inc(&ctx->users);
 			ret = ctx;
-			break;
-		}
+		} else
+			WARN_ONCE(1, "aio: invalid ctx_id %ld: mmap_base = %ld, id = %u",
+				  ctx_id, ctx->mmap_base, id);
 	}
-
 	rcu_read_unlock();
+
 	return ret;
 }
 
@@ -850,9 +877,9 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	ioctx = ioctx_alloc(nr_events);
 	ret = PTR_ERR(ioctx);
 	if (!IS_ERR(ioctx)) {
-		ret = put_user(ioctx->user_id, ctxp);
+		ret = put_user(ioctx->mmap_base, ctxp);
 		if (ret)
-			kill_ioctx(ioctx);
+			kill_ioctx(ioctx, current->mm);
 		put_ioctx(ioctx);
 	}
 
@@ -870,7 +897,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		kill_ioctx(ioctx);
+		kill_ioctx(ioctx, current->mm);
 		put_ioctx(ioctx);
 		return 0;
 	}
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 871a213..c28b9ab 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -132,6 +132,17 @@ static inline void *idr_find(struct idr *idr, int id)
 #define idr_for_each_entry(idp, entry, id)			\
 	for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
 
+
+/**
+ * idr_empty - returns true if there are no elements in the given idr
+ * @idp: idr handle
+ */
+static inline bool idr_empty(struct idr *idp)
+{
+	int id = 0;
+	return idr_get_next(idp, &id) == NULL;
+}
+
 /*
  * Don't use the following functions.  These exist only to suppress
  * deprecated warnings on EXPORT_SYMBOL()s.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..f97eb1e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -13,6 +13,7 @@
 #include <linux/page-debug-flags.h>
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
+#include <linux/idr.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -386,7 +387,7 @@ struct mm_struct {
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
-	struct hlist_head	ioctx_list;
+	struct idr		ioctx_idr;
 #endif
 #ifdef CONFIG_MM_OWNER
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a..82a84a2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -524,7 +524,7 @@ static void mm_init_aio(struct mm_struct *mm)
 {
 #ifdef CONFIG_AIO
 	spin_lock_init(&mm->ioctx_lock);
-	INIT_HLIST_HEAD(&mm->ioctx_list);
+	idr_init(&mm->ioctx_idr);
 #endif
 }
 
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2013-06-18 19:32 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-04-15 11:40 [PATCH v3 next/akpm] aio: convert the ioctx list to radix tree Octavian Purdila
2013-05-10 20:40 ` Andrew Morton
2013-05-10 21:15   ` Kent Overstreet
2013-05-13 21:01     ` Octavian Purdila
2013-06-12 18:14 ` Kent Overstreet
2013-06-12 18:24   ` Benjamin LaHaise
2013-06-12 19:40     ` Zach Brown
2013-06-14 14:20       ` Octavian Purdila
2013-06-18 19:05         ` Octavian Purdila
2013-06-18 19:08           ` Benjamin LaHaise
2013-06-18 19:32             ` Octavian Purdila

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.