All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] fs/aio: Add support to aio ring pages migration
@ 2013-07-08 10:38 ` Gu Zheng
  0 siblings, 0 replies; 4+ messages in thread
From: Gu Zheng @ 2013-07-08 10:38 UTC (permalink / raw)
  To: Andrew Morton, Mel Gorman, Al Viro, Benjamin
  Cc: tangchen, KAMEZAWA Hiroyuki, linux-fsdevel, linux-kernel, linux-mm

As the aio job will pin the ring pages, that will lead to mem migrated
failed. In order to fix this problem we use an anon inode to manage the aio ring
pages, and  setup the migratepage callback in the anon inode's address space, so
that when mem migrating the aio ring pages will be moved to other mem node safely.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c                |  120 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/migrate.h |    3 +
 mm/migrate.c            |    2 +-
 3 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 9b5ca11..d10f956 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -35,6 +35,9 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>

 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -110,6 +113,7 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;

 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
 };

 /*------ sysctl variables----*/
@@ -138,15 +142,78 @@ __initcall(aio_setup);

 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
+	struct file *aio_ring_file = ctx->aio_ring_file;

-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+				page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}

 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
+
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
+			current->pid, aio_ring_file->f_inode->i_nlink,
+			aio_ring_file->f_path.dentry->d_count,
+			d_unhashed(aio_ring_file->f_path.dentry),
+			atomic_read(&aio_ring_file->f_inode->i_count));
+		fput(aio_ring_file);
+		ctx->aio_ring_file = NULL;
+	}
+}
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
 }

+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx = mapping->private_data;
+	unsigned long flags;
+	unsigned idx = old->index;
+	int rc;
+
+	/*Writeback must be complete*/
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return rc;
+}
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty = aio_set_page_dirty,
+	.migratepage	= aio_migratepage,
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
@@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;

 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */

 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;

+	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 	if (nr_pages < 0)
 		return -EINVAL;

-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct
io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);

-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx)

 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
+	up_write(&mm->mmap_sem);
+
+	mm_populate(ctx->mmap_base, populate);

 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
-	up_write(&mm->mmap_sem);
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);

 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);

 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
@@ -399,6 +494,8 @@ out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
 out_freectx:
+	if (ctx->aio_ring_file)
+		fput(ctx->aio_ring_file);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
@@ -852,6 +949,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t
__user *, ctxp)
 	ioctx = ioctx_alloc(nr_events);
 	ret = PTR_ERR(ioctx);
 	if (!IS_ERR(ioctx)) {
+		ctx = ioctx->user_id;
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
 			kill_ioctx(ioctx);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc..c407d88 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
+extern int migrate_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		struct buffer_head *head, enum migrate_mode mode);
 #else

 static inline void putback_lru_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c244..1da0092 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct
buffer_head *head,
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-static int migrate_page_move_mapping(struct address_space *mapping,
+int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
 		struct buffer_head *head, enum migrate_mode mode)
 {
-- 
1.7.7


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/2] fs/aio: Add support to aio ring pages migration
@ 2013-07-08 10:38 ` Gu Zheng
  0 siblings, 0 replies; 4+ messages in thread
From: Gu Zheng @ 2013-07-08 10:38 UTC (permalink / raw)
  To: Andrew Morton, Mel Gorman, Al Viro, Benjamin
  Cc: tangchen, KAMEZAWA Hiroyuki, linux-fsdevel, linux-kernel, linux-mm

As the aio job will pin the ring pages, that will lead to mem migrated
failed. In order to fix this problem we use an anon inode to manage the aio ring
pages, and  setup the migratepage callback in the anon inode's address space, so
that when mem migrating the aio ring pages will be moved to other mem node safely.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c                |  120 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/migrate.h |    3 +
 mm/migrate.c            |    2 +-
 3 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 9b5ca11..d10f956 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -35,6 +35,9 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>

 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -110,6 +113,7 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;

 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
 };

 /*------ sysctl variables----*/
@@ -138,15 +142,78 @@ __initcall(aio_setup);

 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
+	struct file *aio_ring_file = ctx->aio_ring_file;

-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+				page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}

 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
+
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
+			current->pid, aio_ring_file->f_inode->i_nlink,
+			aio_ring_file->f_path.dentry->d_count,
+			d_unhashed(aio_ring_file->f_path.dentry),
+			atomic_read(&aio_ring_file->f_inode->i_count));
+		fput(aio_ring_file);
+		ctx->aio_ring_file = NULL;
+	}
+}
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
 }

+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx = mapping->private_data;
+	unsigned long flags;
+	unsigned idx = old->index;
+	int rc;
+
+	/*Writeback must be complete*/
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return rc;
+}
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty = aio_set_page_dirty,
+	.migratepage	= aio_migratepage,
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
@@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;

 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */

 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;

+	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 	if (nr_pages < 0)
 		return -EINVAL;

-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct
io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);

-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx)

 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
+	up_write(&mm->mmap_sem);
+
+	mm_populate(ctx->mmap_base, populate);

 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
-	up_write(&mm->mmap_sem);
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);

 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);

 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
@@ -399,6 +494,8 @@ out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
 out_freectx:
+	if (ctx->aio_ring_file)
+		fput(ctx->aio_ring_file);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
@@ -852,6 +949,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t
__user *, ctxp)
 	ioctx = ioctx_alloc(nr_events);
 	ret = PTR_ERR(ioctx);
 	if (!IS_ERR(ioctx)) {
+		ctx = ioctx->user_id;
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
 			kill_ioctx(ioctx);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc..c407d88 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
+extern int migrate_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		struct buffer_head *head, enum migrate_mode mode);
 #else

 static inline void putback_lru_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c244..1da0092 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct
buffer_head *head,
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-static int migrate_page_move_mapping(struct address_space *mapping,
+int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
 		struct buffer_head *head, enum migrate_mode mode)
 {
-- 
1.7.7

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] fs/aio: Add support to aio ring pages migration
  2013-07-08 10:38 ` Gu Zheng
@ 2013-07-11 10:12   ` Gu Zheng
  -1 siblings, 0 replies; 4+ messages in thread
From: Gu Zheng @ 2013-07-11 10:12 UTC (permalink / raw)
  To: Andrew Morton, Mel Gorman, Al Viro, Benjamin
  Cc: tangchen, KAMEZAWA Hiroyuki, linux-fsdevel, linux-kernel, linux-mm

ping...

On 07/08/2013 06:38 PM, Gu Zheng wrote:

> As the aio job will pin the ring pages, that will lead to mem migrated
> failed. In order to fix this problem we use an anon inode to manage the aio ring
> pages, and  setup the migratepage callback in the anon inode's address space, so
> that when mem migrating the aio ring pages will be moved to other mem node safely.
> 
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
> ---
>  fs/aio.c                |  120 ++++++++++++++++++++++++++++++++++++++++++----
>  include/linux/migrate.h |    3 +
>  mm/migrate.c            |    2 +-
>  3 files changed, 113 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index 9b5ca11..d10f956 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -35,6 +35,9 @@
>  #include <linux/eventfd.h>
>  #include <linux/blkdev.h>
>  #include <linux/compat.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/migrate.h>
> +#include <linux/ramfs.h>
> 
>  #include <asm/kmap_types.h>
>  #include <asm/uaccess.h>
> @@ -110,6 +113,7 @@ struct kioctx {
>  	} ____cacheline_aligned_in_smp;
> 
>  	struct page		*internal_pages[AIO_RING_PAGES];
> +	struct file		*aio_ring_file;
>  };
> 
>  /*------ sysctl variables----*/
> @@ -138,15 +142,78 @@ __initcall(aio_setup);
> 
>  static void aio_free_ring(struct kioctx *ctx)
>  {
> -	long i;
> +	int i;
> +	struct file *aio_ring_file = ctx->aio_ring_file;
> 
> -	for (i = 0; i < ctx->nr_pages; i++)
> +	for (i = 0; i < ctx->nr_pages; i++) {
> +		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
> +				page_count(ctx->ring_pages[i]));
>  		put_page(ctx->ring_pages[i]);
> +	}
> 
>  	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
>  		kfree(ctx->ring_pages);
> +
> +	if (aio_ring_file) {
> +		truncate_setsize(aio_ring_file->f_inode, 0);
> +		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
> +			current->pid, aio_ring_file->f_inode->i_nlink,
> +			aio_ring_file->f_path.dentry->d_count,
> +			d_unhashed(aio_ring_file->f_path.dentry),
> +			atomic_read(&aio_ring_file->f_inode->i_count));
> +		fput(aio_ring_file);
> +		ctx->aio_ring_file = NULL;
> +	}
> +}
> +
> +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	vma->vm_ops = &generic_file_vm_ops;
> +	return 0;
> +}
> +
> +static const struct file_operations aio_ring_fops = {
> +	.mmap = aio_ring_mmap,
> +};
> +
> +static int aio_set_page_dirty(struct page *page)
> +{
> +	return 0;
>  }
> 
> +static int aio_migratepage(struct address_space *mapping, struct page *new,
> +			struct page *old, enum migrate_mode mode)
> +{
> +	struct kioctx *ctx = mapping->private_data;
> +	unsigned long flags;
> +	unsigned idx = old->index;
> +	int rc;
> +
> +	/*Writeback must be complete*/
> +	BUG_ON(PageWriteback(old));
> +	put_page(old);
> +
> +	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
> +	if (rc != MIGRATEPAGE_SUCCESS) {
> +		get_page(old);
> +		return rc;
> +	}
> +
> +	get_page(new);
> +
> +	spin_lock_irqsave(&ctx->completion_lock, flags);
> +	migrate_page_copy(new, old);
> +	ctx->ring_pages[idx] = new;
> +	spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +
> +	return rc;
> +}
> +
> +static const struct address_space_operations aio_ctx_aops = {
> +	.set_page_dirty = aio_set_page_dirty,
> +	.migratepage	= aio_migratepage,
> +};
> +
>  static int aio_setup_ring(struct kioctx *ctx)
>  {
>  	struct aio_ring *ring;
> @@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx)
>  	struct mm_struct *mm = current->mm;
>  	unsigned long size, populate;
>  	int nr_pages;
> +	int i;
> +	struct file *file;
> 
>  	/* Compensate for the ring buffer's head/tail overlap entry */
>  	nr_events += 2;	/* 1 is required, 2 for good luck */
> 
>  	size = sizeof(struct aio_ring);
>  	size += sizeof(struct io_event) * nr_events;
> -	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
> 
> +	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
>  	if (nr_pages < 0)
>  		return -EINVAL;
> 
> -	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct
> io_event);
> +	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
> +	if (IS_ERR(file)) {
> +		ctx->aio_ring_file = NULL;
> +		return -EAGAIN;
> +	}
> +
> +	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
> +	file->f_inode->i_mapping->private_data = ctx;
> +	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		struct page *page;
> +		page = find_or_create_page(file->f_inode->i_mapping,
> +					   i, GFP_HIGHUSER | __GFP_ZERO);
> +		if (!page)
> +			break;
> +		pr_debug("pid(%d) page[%d]->count=%d\n",
> +			 current->pid, i, page_count(page));
> +		SetPageUptodate(page);
> +		SetPageDirty(page);
> +		unlock_page(page);
> +	}
> +	ctx->aio_ring_file = file;
> +	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
> +			/ sizeof(struct io_event);
> 
> -	ctx->nr_events = 0;
>  	ctx->ring_pages = ctx->internal_pages;
>  	if (nr_pages > AIO_RING_PAGES) {
>  		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
> @@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx)
> 
>  	ctx->mmap_size = nr_pages * PAGE_SIZE;
>  	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
> +
>  	down_write(&mm->mmap_sem);
> -	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
> -				       PROT_READ|PROT_WRITE,
> -				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
> +	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
> +				       PROT_READ | PROT_WRITE,
> +				       MAP_SHARED | MAP_POPULATE, 0, &populate);
>  	if (IS_ERR((void *)ctx->mmap_base)) {
>  		up_write(&mm->mmap_sem);
>  		ctx->mmap_size = 0;
>  		aio_free_ring(ctx);
>  		return -EAGAIN;
>  	}
> +	up_write(&mm->mmap_sem);
> +
> +	mm_populate(ctx->mmap_base, populate);
> 
>  	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
>  	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
>  				       1, 0, ctx->ring_pages, NULL);
> -	up_write(&mm->mmap_sem);
> +	for (i = 0; i < ctx->nr_pages; i++)
> +		put_page(ctx->ring_pages[i]);
> 
>  	if (unlikely(ctx->nr_pages != nr_pages)) {
>  		aio_free_ring(ctx);
>  		return -EAGAIN;
>  	}
> -	if (populate)
> -		mm_populate(ctx->mmap_base, populate);
> 
>  	ctx->user_id = ctx->mmap_base;
>  	ctx->nr_events = nr_events; /* trusted copy */
> @@ -399,6 +494,8 @@ out_cleanup:
>  	err = -EAGAIN;
>  	aio_free_ring(ctx);
>  out_freectx:
> +	if (ctx->aio_ring_file)
> +		fput(ctx->aio_ring_file);
>  	kmem_cache_free(kioctx_cachep, ctx);
>  	pr_debug("error allocating ioctx %d\n", err);
>  	return ERR_PTR(err);
> @@ -852,6 +949,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t
> __user *, ctxp)
>  	ioctx = ioctx_alloc(nr_events);
>  	ret = PTR_ERR(ioctx);
>  	if (!IS_ERR(ioctx)) {
> +		ctx = ioctx->user_id;
>  		ret = put_user(ioctx->user_id, ctxp);
>  		if (ret)
>  			kill_ioctx(ioctx);
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index a405d3dc..c407d88 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
>  extern void migrate_page_copy(struct page *newpage, struct page *page);
>  extern int migrate_huge_page_move_mapping(struct address_space *mapping,
>  				  struct page *newpage, struct page *page);
> +extern int migrate_page_move_mapping(struct address_space *mapping,
> +		struct page *newpage, struct page *page,
> +		struct buffer_head *head, enum migrate_mode mode);
>  #else
> 
>  static inline void putback_lru_pages(struct list_head *l) {}
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6f0c244..1da0092 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct
> buffer_head *head,
>   * 2 for pages with a mapping
>   * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
>   */
> -static int migrate_page_move_mapping(struct address_space *mapping,
> +int migrate_page_move_mapping(struct address_space *mapping,
>  		struct page *newpage, struct page *page,
>  		struct buffer_head *head, enum migrate_mode mode)
>  {



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] fs/aio: Add support to aio ring pages migration
@ 2013-07-11 10:12   ` Gu Zheng
  0 siblings, 0 replies; 4+ messages in thread
From: Gu Zheng @ 2013-07-11 10:12 UTC (permalink / raw)
  To: Andrew Morton, Mel Gorman, Al Viro, Benjamin
  Cc: tangchen, KAMEZAWA Hiroyuki, linux-fsdevel, linux-kernel, linux-mm

ping...

On 07/08/2013 06:38 PM, Gu Zheng wrote:

> As the aio job will pin the ring pages, that will lead to mem migrated
> failed. In order to fix this problem we use an anon inode to manage the aio ring
> pages, and  setup the migratepage callback in the anon inode's address space, so
> that when mem migrating the aio ring pages will be moved to other mem node safely.
> 
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
> ---
>  fs/aio.c                |  120 ++++++++++++++++++++++++++++++++++++++++++----
>  include/linux/migrate.h |    3 +
>  mm/migrate.c            |    2 +-
>  3 files changed, 113 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index 9b5ca11..d10f956 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -35,6 +35,9 @@
>  #include <linux/eventfd.h>
>  #include <linux/blkdev.h>
>  #include <linux/compat.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/migrate.h>
> +#include <linux/ramfs.h>
> 
>  #include <asm/kmap_types.h>
>  #include <asm/uaccess.h>
> @@ -110,6 +113,7 @@ struct kioctx {
>  	} ____cacheline_aligned_in_smp;
> 
>  	struct page		*internal_pages[AIO_RING_PAGES];
> +	struct file		*aio_ring_file;
>  };
> 
>  /*------ sysctl variables----*/
> @@ -138,15 +142,78 @@ __initcall(aio_setup);
> 
>  static void aio_free_ring(struct kioctx *ctx)
>  {
> -	long i;
> +	int i;
> +	struct file *aio_ring_file = ctx->aio_ring_file;
> 
> -	for (i = 0; i < ctx->nr_pages; i++)
> +	for (i = 0; i < ctx->nr_pages; i++) {
> +		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
> +				page_count(ctx->ring_pages[i]));
>  		put_page(ctx->ring_pages[i]);
> +	}
> 
>  	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
>  		kfree(ctx->ring_pages);
> +
> +	if (aio_ring_file) {
> +		truncate_setsize(aio_ring_file->f_inode, 0);
> +		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
> +			current->pid, aio_ring_file->f_inode->i_nlink,
> +			aio_ring_file->f_path.dentry->d_count,
> +			d_unhashed(aio_ring_file->f_path.dentry),
> +			atomic_read(&aio_ring_file->f_inode->i_count));
> +		fput(aio_ring_file);
> +		ctx->aio_ring_file = NULL;
> +	}
> +}
> +
> +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	vma->vm_ops = &generic_file_vm_ops;
> +	return 0;
> +}
> +
> +static const struct file_operations aio_ring_fops = {
> +	.mmap = aio_ring_mmap,
> +};
> +
> +static int aio_set_page_dirty(struct page *page)
> +{
> +	return 0;
>  }
> 
> +static int aio_migratepage(struct address_space *mapping, struct page *new,
> +			struct page *old, enum migrate_mode mode)
> +{
> +	struct kioctx *ctx = mapping->private_data;
> +	unsigned long flags;
> +	unsigned idx = old->index;
> +	int rc;
> +
> +	/*Writeback must be complete*/
> +	BUG_ON(PageWriteback(old));
> +	put_page(old);
> +
> +	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
> +	if (rc != MIGRATEPAGE_SUCCESS) {
> +		get_page(old);
> +		return rc;
> +	}
> +
> +	get_page(new);
> +
> +	spin_lock_irqsave(&ctx->completion_lock, flags);
> +	migrate_page_copy(new, old);
> +	ctx->ring_pages[idx] = new;
> +	spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +
> +	return rc;
> +}
> +
> +static const struct address_space_operations aio_ctx_aops = {
> +	.set_page_dirty = aio_set_page_dirty,
> +	.migratepage	= aio_migratepage,
> +};
> +
>  static int aio_setup_ring(struct kioctx *ctx)
>  {
>  	struct aio_ring *ring;
> @@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx)
>  	struct mm_struct *mm = current->mm;
>  	unsigned long size, populate;
>  	int nr_pages;
> +	int i;
> +	struct file *file;
> 
>  	/* Compensate for the ring buffer's head/tail overlap entry */
>  	nr_events += 2;	/* 1 is required, 2 for good luck */
> 
>  	size = sizeof(struct aio_ring);
>  	size += sizeof(struct io_event) * nr_events;
> -	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
> 
> +	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
>  	if (nr_pages < 0)
>  		return -EINVAL;
> 
> -	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct
> io_event);
> +	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
> +	if (IS_ERR(file)) {
> +		ctx->aio_ring_file = NULL;
> +		return -EAGAIN;
> +	}
> +
> +	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
> +	file->f_inode->i_mapping->private_data = ctx;
> +	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		struct page *page;
> +		page = find_or_create_page(file->f_inode->i_mapping,
> +					   i, GFP_HIGHUSER | __GFP_ZERO);
> +		if (!page)
> +			break;
> +		pr_debug("pid(%d) page[%d]->count=%d\n",
> +			 current->pid, i, page_count(page));
> +		SetPageUptodate(page);
> +		SetPageDirty(page);
> +		unlock_page(page);
> +	}
> +	ctx->aio_ring_file = file;
> +	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
> +			/ sizeof(struct io_event);
> 
> -	ctx->nr_events = 0;
>  	ctx->ring_pages = ctx->internal_pages;
>  	if (nr_pages > AIO_RING_PAGES) {
>  		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
> @@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx)
> 
>  	ctx->mmap_size = nr_pages * PAGE_SIZE;
>  	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
> +
>  	down_write(&mm->mmap_sem);
> -	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
> -				       PROT_READ|PROT_WRITE,
> -				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
> +	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
> +				       PROT_READ | PROT_WRITE,
> +				       MAP_SHARED | MAP_POPULATE, 0, &populate);
>  	if (IS_ERR((void *)ctx->mmap_base)) {
>  		up_write(&mm->mmap_sem);
>  		ctx->mmap_size = 0;
>  		aio_free_ring(ctx);
>  		return -EAGAIN;
>  	}
> +	up_write(&mm->mmap_sem);
> +
> +	mm_populate(ctx->mmap_base, populate);
> 
>  	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
>  	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
>  				       1, 0, ctx->ring_pages, NULL);
> -	up_write(&mm->mmap_sem);
> +	for (i = 0; i < ctx->nr_pages; i++)
> +		put_page(ctx->ring_pages[i]);
> 
>  	if (unlikely(ctx->nr_pages != nr_pages)) {
>  		aio_free_ring(ctx);
>  		return -EAGAIN;
>  	}
> -	if (populate)
> -		mm_populate(ctx->mmap_base, populate);
> 
>  	ctx->user_id = ctx->mmap_base;
>  	ctx->nr_events = nr_events; /* trusted copy */
> @@ -399,6 +494,8 @@ out_cleanup:
>  	err = -EAGAIN;
>  	aio_free_ring(ctx);
>  out_freectx:
> +	if (ctx->aio_ring_file)
> +		fput(ctx->aio_ring_file);
>  	kmem_cache_free(kioctx_cachep, ctx);
>  	pr_debug("error allocating ioctx %d\n", err);
>  	return ERR_PTR(err);
> @@ -852,6 +949,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t
> __user *, ctxp)
>  	ioctx = ioctx_alloc(nr_events);
>  	ret = PTR_ERR(ioctx);
>  	if (!IS_ERR(ioctx)) {
> +		ctx = ioctx->user_id;
>  		ret = put_user(ioctx->user_id, ctxp);
>  		if (ret)
>  			kill_ioctx(ioctx);
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index a405d3dc..c407d88 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
>  extern void migrate_page_copy(struct page *newpage, struct page *page);
>  extern int migrate_huge_page_move_mapping(struct address_space *mapping,
>  				  struct page *newpage, struct page *page);
> +extern int migrate_page_move_mapping(struct address_space *mapping,
> +		struct page *newpage, struct page *page,
> +		struct buffer_head *head, enum migrate_mode mode);
>  #else
> 
>  static inline void putback_lru_pages(struct list_head *l) {}
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6f0c244..1da0092 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct
> buffer_head *head,
>   * 2 for pages with a mapping
>   * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
>   */
> -static int migrate_page_move_mapping(struct address_space *mapping,
> +int migrate_page_move_mapping(struct address_space *mapping,
>  		struct page *newpage, struct page *page,
>  		struct buffer_head *head, enum migrate_mode mode)
>  {


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-07-11 10:15 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-08 10:38 [PATCH 2/2] fs/aio: Add support to aio ring pages migration Gu Zheng
2013-07-08 10:38 ` Gu Zheng
2013-07-11 10:12 ` Gu Zheng
2013-07-11 10:12   ` Gu Zheng

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.