All of lore.kernel.org
 help / color / mirror / Atom feed
From: Adam Litke <agl@us.ibm.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@infradead.org>,
	William Lee Irwin III <wli@holomorphy.com>,
	Christoph Hellwig <hch@infradead.org>,
	Ken Chen <kenchen@google.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: pagetable_ops: Hugetlb character device example
Date: Wed, 21 Mar 2007 14:43:48 -0500	[thread overview]
Message-ID: <1174506228.21684.41.camel@localhost.localdomain> (raw)
In-Reply-To: <20070319200502.17168.17175.stgit@localhost.localdomain>

The main reason I am advocating a set of pagetable_operations is to
enable the development of a new hugetlb interface.  During the hugetlb
BoF at OLS last year, we talked about a character device that would
behave like /dev/zero.  Many people said they just wanted to create
MAP_PRIVATE hugetlb mappings without all the fuss of the hugetlbfs
filesystem.  /dev/zero is a familiar interface for getting anonymous
memory, so bringing that model to huge pages would make programming for
anonymous huge pages easier.

The pagetable_operations API opens up possibilities to do some
additional (and completely sane) things.  For example, I have a patch
that alters the character device code below to make use of a hugetlb
ZERO_PAGE.  This eliminates almost all the up-front fault time, allowing
pages to be COW'ed only when first written to.  We cannot do things like
this with hugetlbfs anymore because we have a set of complex semantics
to preserve.

The following patch is an example of what a simple pagetable_operations
consumer could look like.  It does depend on some other cleanups I am
working on (removal of is_file_hugepages(), separation of
fs/hugetlbfs/inode.c from mm/hugetlb.c, etc.), so it is unlikely to apply
to any trees you may have.  I do think it makes a useful illustration of
what legitimate things can be done with a pagetable_operations interface.

commit be72df1c616fb662693a8d4410ce3058f20c71f3
Author: Adam Litke <agl@us.ibm.com>
Date:   Tue Feb 13 14:18:21 2007 -0800

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fc11063..c5e755b 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_IPMI_HANDLER)	+= ipmi/
 
 obj-$(CONFIG_HANGCHECK_TIMER)	+= hangcheck-timer.o
 obj-$(CONFIG_TCG_TPM)		+= tpm/
+obj-$(CONFIG_HUGETLB_PAGE)	+= page.o
 
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
diff --git a/drivers/char/page.c b/drivers/char/page.c
new file mode 100644
index 0000000..e903028
--- /dev/null
+++ b/drivers/char/page.c
@@ -0,0 +1,190 @@
+/*
+ * Example character device that hands out private huge page mappings
+ * through the pagetable_operations interface.
+ */
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Device nodes created by chr_dev_init().  Only minor and name are consumed
+ * in this file; mode is recorded but not used here.
+ */
+static const struct {
+	unsigned int    minor;
+	const char      *name;
+	umode_t         mode;
+} devlist[] = {
+	{1, "page-huge", S_IRUGO | S_IWUGO},
+};
+
+/*
+ * ->nopage stub that must never run; it BUG()s if it is ever called.
+ * NOTE(review): presumably faults on these VMAs are dispatched through
+ * pagetable_ops->fault instead -- confirm against the core mm changes.
+ */
+static struct page *page_nopage(struct vm_area_struct *vma,
+			unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+static struct vm_operations_struct page_vm_ops = {
+	.nopage	= page_nopage,
+};
+
+/*
+ * Fault handler installed as pagetable_ops->fault.
+ *
+ * When no entry is present at @address, a huge page is allocated, cleared
+ * and installed.  When an entry is present, a write fault on a non-writable
+ * entry is handed to hugetlb_cow(); anything else leaves the entry alone.
+ * All faults are serialized by a function-local mutex.
+ *
+ * Returns VM_FAULT_OOM if the page table entry or the huge page cannot be
+ * allocated, the result of hugetlb_cow() for a COW fault, and
+ * VM_FAULT_MINOR otherwise.
+ */
+static int page_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry, new_entry;
+	/* Default result for every path that neither fails nor COWs */
+	int ret = VM_FAULT_MINOR;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	mutex_lock(&hugetlb_instantiation_mutex);
+	entry = *ptep;
+	if (pte_none(entry)) {
+		struct page *page;
+
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			/* Do not return with the mutex still held */
+			ret = VM_FAULT_OOM;
+			goto out_mutex;
+		}
+		clear_huge_page(page, address);
+
+		spin_lock(&mm->page_table_lock);
+		if (!pte_none(*ptep)) {
+			/* Entry appeared meanwhile: drop the unused page */
+			put_page(page);
+			goto out;
+		}
+		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+		new_entry = make_huge_pte(vma, page, 0);
+		set_huge_pte_at(mm, address, ptep, new_entry);
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+out:
+	spin_unlock(&mm->page_table_lock);
+out_mutex:
+	mutex_unlock(&hugetlb_instantiation_mutex);
+	return ret;
+}
+
+/*
+ * Page table operations for these mappings: the generic hugetlb helpers are
+ * reused for everything except the fault path.
+ */
+static struct pagetable_operations_struct page_pagetable_ops = {
+	.copy_vma		= copy_hugetlb_page_range,
+	.pin_pages		= follow_hugetlb_page,
+	.unmap_page_range	= unmap_hugepage_range,
+	.change_protection	= hugetlb_change_protection,
+	.free_pgtable_range	= hugetlb_free_pgd_range,
+	.fault			= page_fault,
+};
+
+/*
+ * mmap handler: accepts only private mappings at file offset zero whose
+ * start and end are huge page aligned and which span at least one huge
+ * page; anything else gets -EINVAL.  Accepted VMAs are flagged
+ * VM_HUGETLB | VM_RESERVED and wired to the ops tables above.
+ */
+static int page_mmap(struct file * file, struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED)
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_start & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+	vma->vm_ops = &page_vm_ops;
+	vma->pagetable_ops = &page_pagetable_ops;
+
+	return 0;
+}
+
+/* File operations registered for the "page" character device. */
+const struct file_operations page_file_operations = {
+	.mmap			= page_mmap,
+	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.prepare_unmapped_area	= prepare_hugepage_range,
+};
+
+static struct class *page_class;
+
+/*
+ * Register the "page" character device and create one class device per
+ * devlist[] entry.
+ *
+ * Returns 0 on success, or a negative errno when the chrdev registration or
+ * the class creation fails.
+ */
+static int __init chr_dev_init(void)
+{
+	int major, i;
+
+	printk("Initializing page devices...");
+	major = register_chrdev(0, "page", &page_file_operations);
+	if (major <= 0) {
+		printk("failed\n");
+		/* Never build device numbers from an invalid major */
+		return major ? major : -ENODEV;
+	}
+	printk("(%i:0)\n", major);
+
+	page_class = class_create(THIS_MODULE, "page");
+	if (IS_ERR(page_class)) {
+		unregister_chrdev(major, "page");
+		return PTR_ERR(page_class);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(devlist); i++)
+		class_device_create(page_class, NULL,
+			MKDEV(major, devlist[i].minor),
+			NULL, devlist[i].name);
+
+	return 0;
+}
+fs_initcall(chr_dev_init);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4fc0bca..edd4944 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -590,6 +590,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	BUG_ON(!has_pt_op(vma, fault));
 
+	BUG_ON(!has_pt_op(vma,fault));
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center


WARNING: multiple messages have this Message-ID (diff)
From: Adam Litke <agl@us.ibm.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@infradead.org>,
	William Lee Irwin III <wli@holomorphy.com>,
	Christoph Hellwig <hch@infradead.org>,
	Ken Chen <kenchen@google.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: pagetable_ops: Hugetlb character device example
Date: Wed, 21 Mar 2007 14:43:48 -0500	[thread overview]
Message-ID: <1174506228.21684.41.camel@localhost.localdomain> (raw)
In-Reply-To: <20070319200502.17168.17175.stgit@localhost.localdomain>

The main reason I am advocating a set of pagetable_operations is to
enable the development of a new hugetlb interface.  During the hugetlb
BoF at OLS last year, we talked about a character device that would
behave like /dev/zero.  Many people said they just wanted to create
MAP_PRIVATE hugetlb mappings without all the fuss of the hugetlbfs
filesystem.  /dev/zero is a familiar interface for getting anonymous
memory, so bringing that model to huge pages would make programming for
anonymous huge pages easier.

The pagetable_operations API opens up possibilities to do some
additional (and completely sane) things.  For example, I have a patch
that alters the character device code below to make use of a hugetlb
ZERO_PAGE.  This eliminates almost all the up-front fault time, allowing
pages to be COW'ed only when first written to.  We cannot do things like
this with hugetlbfs anymore because we have a set of complex semantics
to preserve.

The following patch is an example of what a simple pagetable_operations
consumer could look like.  It does depend on some other cleanups I am
working on (removal of is_file_hugepages(), separation of
fs/hugetlbfs/inode.c from mm/hugetlb.c, etc.), so it is unlikely to apply
to any trees you may have.  I do think it makes a useful illustration of
what legitimate things can be done with a pagetable_operations interface.

commit be72df1c616fb662693a8d4410ce3058f20c71f3
Author: Adam Litke <agl@us.ibm.com>
Date:   Tue Feb 13 14:18:21 2007 -0800

diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fc11063..c5e755b 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_IPMI_HANDLER)	+= ipmi/
 
 obj-$(CONFIG_HANGCHECK_TIMER)	+= hangcheck-timer.o
 obj-$(CONFIG_TCG_TPM)		+= tpm/
+obj-$(CONFIG_HUGETLB_PAGE)	+= page.o
 
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
diff --git a/drivers/char/page.c b/drivers/char/page.c
new file mode 100644
index 0000000..e903028
--- /dev/null
+++ b/drivers/char/page.c
@@ -0,0 +1,190 @@
+/*
+ * Example character device that hands out private huge page mappings
+ * through the pagetable_operations interface.
+ */
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Device nodes created by chr_dev_init().  Only minor and name are consumed
+ * in this file; mode is recorded but not used here.
+ */
+static const struct {
+	unsigned int    minor;
+	const char      *name;
+	umode_t         mode;
+} devlist[] = {
+	{1, "page-huge", S_IRUGO | S_IWUGO},
+};
+
+/*
+ * ->nopage stub that must never run; it BUG()s if it is ever called.
+ * NOTE(review): presumably faults on these VMAs are dispatched through
+ * pagetable_ops->fault instead -- confirm against the core mm changes.
+ */
+static struct page *page_nopage(struct vm_area_struct *vma,
+			unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+static struct vm_operations_struct page_vm_ops = {
+	.nopage	= page_nopage,
+};
+
+/*
+ * Fault handler installed as pagetable_ops->fault.
+ *
+ * When no entry is present at @address, a huge page is allocated, cleared
+ * and installed.  When an entry is present, a write fault on a non-writable
+ * entry is handed to hugetlb_cow(); anything else leaves the entry alone.
+ * All faults are serialized by a function-local mutex.
+ *
+ * Returns VM_FAULT_OOM if the page table entry or the huge page cannot be
+ * allocated, the result of hugetlb_cow() for a COW fault, and
+ * VM_FAULT_MINOR otherwise.
+ */
+static int page_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry, new_entry;
+	/* Default result for every path that neither fails nor COWs */
+	int ret = VM_FAULT_MINOR;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	mutex_lock(&hugetlb_instantiation_mutex);
+	entry = *ptep;
+	if (pte_none(entry)) {
+		struct page *page;
+
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			/* Do not return with the mutex still held */
+			ret = VM_FAULT_OOM;
+			goto out_mutex;
+		}
+		clear_huge_page(page, address);
+
+		spin_lock(&mm->page_table_lock);
+		if (!pte_none(*ptep)) {
+			/* Entry appeared meanwhile: drop the unused page */
+			put_page(page);
+			goto out;
+		}
+		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+		new_entry = make_huge_pte(vma, page, 0);
+		set_huge_pte_at(mm, address, ptep, new_entry);
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+out:
+	spin_unlock(&mm->page_table_lock);
+out_mutex:
+	mutex_unlock(&hugetlb_instantiation_mutex);
+	return ret;
+}
+
+/*
+ * Page table operations for these mappings: the generic hugetlb helpers are
+ * reused for everything except the fault path.
+ */
+static struct pagetable_operations_struct page_pagetable_ops = {
+	.copy_vma		= copy_hugetlb_page_range,
+	.pin_pages		= follow_hugetlb_page,
+	.unmap_page_range	= unmap_hugepage_range,
+	.change_protection	= hugetlb_change_protection,
+	.free_pgtable_range	= hugetlb_free_pgd_range,
+	.fault			= page_fault,
+};
+
+/*
+ * mmap handler: accepts only private mappings at file offset zero whose
+ * start and end are huge page aligned and which span at least one huge
+ * page; anything else gets -EINVAL.  Accepted VMAs are flagged
+ * VM_HUGETLB | VM_RESERVED and wired to the ops tables above.
+ */
+static int page_mmap(struct file * file, struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED)
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_start & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end & ~HPAGE_MASK)
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+	vma->vm_ops = &page_vm_ops;
+	vma->pagetable_ops = &page_pagetable_ops;
+
+	return 0;
+}
+
+/* File operations registered for the "page" character device. */
+const struct file_operations page_file_operations = {
+	.mmap			= page_mmap,
+	.get_unmapped_area	= hugetlb_get_unmapped_area,
+	.prepare_unmapped_area	= prepare_hugepage_range,
+};
+
+static struct class *page_class;
+
+/*
+ * Register the "page" character device and create one class device per
+ * devlist[] entry.
+ *
+ * Returns 0 on success, or a negative errno when the chrdev registration or
+ * the class creation fails.
+ */
+static int __init chr_dev_init(void)
+{
+	int major, i;
+
+	printk("Initializing page devices...");
+	major = register_chrdev(0, "page", &page_file_operations);
+	if (major <= 0) {
+		printk("failed\n");
+		/* Never build device numbers from an invalid major */
+		return major ? major : -ENODEV;
+	}
+	printk("(%i:0)\n", major);
+
+	page_class = class_create(THIS_MODULE, "page");
+	if (IS_ERR(page_class)) {
+		unregister_chrdev(major, "page");
+		return PTR_ERR(page_class);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(devlist); i++)
+		class_device_create(page_class, NULL,
+			MKDEV(major, devlist[i].minor),
+			NULL, devlist[i].name);
+
+	return 0;
+}
+fs_initcall(chr_dev_init);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4fc0bca..edd4944 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -590,6 +590,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	BUG_ON(!has_pt_op(vma, fault));
 
+	BUG_ON(!has_pt_op(vma,fault));
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

  parent reply	other threads:[~2007-03-21 19:43 UTC|newest]

Thread overview: 80+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-03-19 20:05 [PATCH 0/7] [RFC] hugetlb: pagetable_operations API (V2) Adam Litke
2007-03-19 20:05 ` Adam Litke
2007-03-19 20:05 ` [PATCH 1/7] Introduce the pagetable_operations and associated helper macros Adam Litke
2007-03-19 20:05   ` Adam Litke
2007-03-20 23:24   ` Dave Hansen
2007-03-20 23:24     ` Dave Hansen
2007-03-21 14:50     ` Adam Litke
2007-03-21 14:50       ` Adam Litke
2007-03-21 15:05       ` Arjan van de Ven
2007-03-21 15:05         ` Arjan van de Ven
2007-03-21  4:18   ` Nick Piggin
2007-03-21  4:18     ` Nick Piggin
2007-03-21  4:52     ` William Lee Irwin III
2007-03-21  4:52       ` William Lee Irwin III
2007-03-21  5:07       ` Nick Piggin
2007-03-21  5:07         ` Nick Piggin
2007-03-21  5:41         ` William Lee Irwin III
2007-03-21  5:41           ` William Lee Irwin III
2007-03-21  6:51           ` Nick Piggin
2007-03-21  6:51             ` Nick Piggin
2007-03-21  7:36             ` Nick Piggin
2007-03-21  7:36               ` Nick Piggin
2007-03-21 10:46             ` William Lee Irwin III
2007-03-21 10:46               ` William Lee Irwin III
2007-03-21 15:17     ` Adam Litke
2007-03-21 15:17       ` Adam Litke
2007-03-21 16:00       ` Christoph Hellwig
2007-03-21 16:00         ` Christoph Hellwig
2007-03-21 23:03         ` Nick Piggin
2007-03-21 23:03           ` Nick Piggin
2007-03-21 23:02       ` Nick Piggin
2007-03-21 23:02         ` Nick Piggin
2007-03-21 23:32         ` William Lee Irwin III
2007-03-21 23:32           ` William Lee Irwin III
2007-03-19 20:05 ` [PATCH 2/7] copy_vma for hugetlbfs Adam Litke
2007-03-19 20:05   ` Adam Litke
2007-03-19 20:05 ` [PATCH 3/7] pin_pages for hugetlb Adam Litke
2007-03-19 20:05   ` Adam Litke
2007-03-19 20:05 ` [PATCH 4/7] unmap_page_range " Adam Litke
2007-03-19 20:05   ` Adam Litke
2007-03-20 23:27   ` Dave Hansen
2007-03-20 23:27     ` Dave Hansen
2007-03-19 20:05 ` [PATCH 5/7] change_protection " Adam Litke
2007-03-19 20:05   ` Adam Litke
2007-03-19 20:06 ` [PATCH 6/7] free_pgtable_range " Adam Litke
2007-03-19 20:06   ` Adam Litke
2007-03-19 20:06 ` [PATCH 7/7] hugetlbfs fault handler Adam Litke
2007-03-19 20:06   ` Adam Litke
2007-03-20 23:50 ` [PATCH 0/7] [RFC] hugetlb: pagetable_operations API (V2) Dave Hansen
2007-03-20 23:50   ` Dave Hansen
2007-03-21  1:17 ` William Lee Irwin III
2007-03-21  1:17   ` William Lee Irwin III
2007-03-21 15:55 ` Hugh Dickins
2007-03-21 15:55   ` Hugh Dickins
2007-03-21 16:01   ` Christoph Hellwig
2007-03-21 16:01     ` Christoph Hellwig
2007-03-21 16:23   ` William Lee Irwin III
2007-03-21 17:08     ` Hugh Dickins
2007-03-21 17:42       ` William Lee Irwin III
2007-03-21 19:43 ` Adam Litke [this message]
2007-03-21 19:43   ` pagetable_ops: Hugetlb character device example Adam Litke
2007-03-21 19:51   ` Valdis.Kletnieks
2007-03-21 20:26     ` Adam Litke
2007-03-21 20:26       ` Adam Litke
2007-03-21 22:26     ` William Lee Irwin III
2007-03-21 22:26       ` William Lee Irwin III
2007-03-21 22:53       ` Matt Mackall
2007-03-21 22:53         ` Matt Mackall
2007-03-21 23:35         ` William Lee Irwin III
2007-03-21 23:35           ` William Lee Irwin III
2007-03-22  0:31           ` Matt Mackall
2007-03-22  0:31             ` Matt Mackall
2007-03-22 10:38   ` Christoph Hellwig
2007-03-22 10:38     ` Christoph Hellwig
2007-03-22 15:42     ` Mel Gorman
2007-03-22 15:42       ` Mel Gorman
2007-03-22 18:15       ` Christoph Hellwig
2007-03-22 18:15         ` Christoph Hellwig
2007-03-23 14:57         ` Mel Gorman
2007-03-23 14:57           ` Mel Gorman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1174506228.21684.41.camel@localhost.localdomain \
    --to=agl@us.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=arjan@infradead.org \
    --cc=hch@infradead.org \
    --cc=kenchen@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=wli@holomorphy.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.