Fix refcounting in hugetlbfs quota handling

* Fix refcounting in hugetlbfs quota handling
@ 2011-08-11  6:40 David Gibson
  2011-08-12  0:48 ` Linus Torvalds
  2011-08-12 22:20 ` [Qemu-devel] " Christoph Hellwig
  0 siblings, 2 replies; 11+ messages in thread
From: David Gibson @ 2011-08-11  6:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Avi Kivity, Marcelo Tosatti, Jan Kiszka, qemu-devel, agraf, kvm,
	Paul Mackerras, linux-kernel

Linus, please apply

hugetlbfs tracks the current usage of hugepages per hugetlbfs
mountpoint.  To correctly track this when hugepages are released, it
must find the right hugetlbfs super_block from the struct page
available in free_huge_page().

It does this by storing a pointer to the hugepage's struct
address_space in the page_private data.  The hugetlb_{get,put}_quota
functions go from this mapping to the inode and thence to the
super_block.

However, this usage is buggy, because nothing ensures that the
address_space is not freed before all the hugepages that belonged to
it are.  In practice that will usually be the case, but if extra page
references have been taken by e.g. drivers or kvm doing
get_user_pages() then the file, inode and address space may be
destroyed before all the pages.

In addition, the quota functions use the mapping only to get the inode
then the super_block.  However, most of the callers already have the
inode anyway and have to get the mapping from there.

This patch, therefore, stores a pointer to the inode instead of the
address_space in the page private data for hugepages.  More
importantly it correctly adjusts the reference count on the inodes
when they're added to the page private data.  This ensures that the
inode (and therefore the super block) will not be freed before we use
it from free_huge_page.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================

--- working-2.6.orig/fs/hugetlbfs/inode.c	2011-08-10 16:45:47.864758406 +1000
+++ working-2.6/fs/hugetlbfs/inode.c	2011-08-10 17:22:21.899638039 +1000
@@ -874,10 +874,10 @@ out_free:
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
+int hugetlb_get_quota(struct inode *inode, long delta)
 {
 	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
 
 	if (sbinfo->free_blocks > -1) {
 		spin_lock(&sbinfo->stat_lock);
@@ -891,9 +891,9 @@ int hugetlb_get_quota(struct address_spa
 	return ret;
 }
 
-void hugetlb_put_quota(struct address_space *mapping, long delta)
+void hugetlb_put_quota(struct inode *inode, long delta)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
 
 	if (sbinfo->free_blocks > -1) {
 		spin_lock(&sbinfo->stat_lock);
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h	2011-08-10 16:58:27.952527484 +1000
+++ working-2.6/include/linux/hugetlb.h	2011-08-10 17:22:08.723572707 +1000
@@ -171,8 +171,8 @@ extern const struct file_operations huge
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
+int hugetlb_get_quota(struct inode *inode, long delta);
+void hugetlb_put_quota(struct inode *inode, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c	2011-08-10 16:44:12.212284092 +1000
+++ working-2.6/mm/hugetlb.c	2011-08-10 17:21:49.603477888 +1000
@@ -533,10 +533,12 @@ static void free_huge_page(struct page *
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct inode *inode;
 
-	mapping = (struct address_space *) page_private(page);
+	inode = (struct inode *) page_private(page);
 	set_page_private(page, 0);
+	iput(inode);
+
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
@@ -551,8 +553,8 @@ static void free_huge_page(struct page *
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	if (inode)
+		hugetlb_put_quota(inode, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1035,7 +1037,7 @@ static struct page *alloc_huge_page(stru
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugetlb_get_quota(inode, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1045,12 +1047,12 @@ static struct page *alloc_huge_page(stru
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugetlb_put_quota(inode, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long) igrab(inode));
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2086,7 +2088,8 @@ static void hugetlb_vm_op_close(struct v
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugetlb_put_quota(vma->vm_file->f_mapping->host,
+					  reserve);
 		}
 	}
 }
@@ -2884,7 +2887,7 @@ int hugetlb_reserve_pages(struct inode *
 		return chg;
 
 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	if (hugetlb_get_quota(inode, chg))
 		return -ENOSPC;
 
 	/*
@@ -2893,7 +2896,7 @@ int hugetlb_reserve_pages(struct inode *
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugetlb_put_quota(inode, chg);
 		return ret;
 	}
 
@@ -2922,7 +2925,7 @@ void hugetlb_unreserve_pages(struct inod
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugetlb_put_quota(inode, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 11+ messages in thread