linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Huge TLB pages always physically continious?
@ 2002-11-01 22:56 Ingo Oeser
  2002-11-01 23:23 ` Andrew Morton
  0 siblings, 1 reply; 4+ messages in thread
From: Ingo Oeser @ 2002-11-01 22:56 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2087 bytes --]

Hi there,

are huge TLB pages always physically continous in memory?

What does follow_hugetlb_page do exactly? I simply don't
understand what the code does.

I would like to build up a simplified get_user_pages_sgl() to
build a scatter gather list from user space adresses.

If I want to coalesce physically continous pages (if they are
also virtually continious) anyway, can I write up a simplified
follow_hugetlb_page_sgl() function which handles the huge page
really as only one page?

Motivation:

Currently doing scatter gather DMA of user pages requires THREE
runs over the pages and I would like to save at least the second
one and possibly shorten the third one.

The three steps required:

   1) get_user_pages() to obtain the pages and lock them in page_cache
   2) translate the vector of pointers to struct page to a vector
      of struct scatterlist
   3) pci_map_sg() a decent amount[1], DMA it, wait for completion 
      or abortion, pci_unmap_sg() it and start again with the remainder

Step 2) could be eliminated completely and also the allocation of
the temporary vector of struct page.

Step 3) could be shortened, if I coalesce physically contiguous
ranges into a single scatterlist entry with just a ->length
bigger than PAGE_SIZE. I know that this is only worth it on
architectures, where physical address == bus address.

As each step is a for() loop and should be considered running on
more than 1MB worth of memory, I see significant improvements.

Without supporting huge TLB pages, I only add 700 bytes to the
kernel while simply copying get_user_pages() into a function,
which takes an vector of struct scatterlist instead of struct
page.

This sounds like a promising tradeoff for a first time implementation.

Patch attached. No users yet, but they will follow. First
candidate is the v4l DMA stuff.

Regards

Ingo Oeser

[1] How much can I safely map on the strange architectures, where
   this is limited? AFAIK there is no value or function telling
   me how far I can go.
-- 
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth

[-- Attachment #2: get_user_pages_sgl.patch --]
[-- Type: text/plain, Size: 2791 bytes --]

diff -Naur linux-2.5.44/kernel/ksyms.c linux-2.5.44-ioe/kernel/ksyms.c
--- linux-2.5.44/kernel/ksyms.c	Sat Oct 19 06:01:08 2002
+++ linux-2.5.44-ioe/kernel/ksyms.c	Fri Nov  1 23:12:48 2002
@@ -136,6 +136,7 @@
 EXPORT_SYMBOL(page_address);
 #endif
 EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_user_pages_sgl);
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(def_blk_fops);
diff -Naur linux-2.5.44/mm/memory.c linux-2.5.44-ioe/mm/memory.c
--- linux-2.5.44/mm/memory.c	Sat Oct 19 06:01:52 2002
+++ linux-2.5.44-ioe/mm/memory.c	Fri Nov  1 23:48:42 2002
@@ -49,6 +49,7 @@
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/scatterlist.h>
 
 #include <linux/swapops.h>
 
@@ -514,6 +515,85 @@
 }
 
 
+int get_user_pages_sgl(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write,
+		struct scatterlist **sgl)
+{
+	int i;
+	unsigned int flags;
+
+	/* Without this structure, it makes no sense to call this */
+	BUG_ON(!sgl);
+
+	/* 
+	 * Require read or write permissions.
+	 */
+	flags = write ? VM_WRITE : VM_READ;
+	i = 0;
+
+	do {
+		struct vm_area_struct *	vma;
+
+		vma = find_extend_vma(mm, start);
+
+		if (!vma || (vma->vm_flags & VM_IO)
+				|| !(flags & vma->vm_flags))
+			return i ? : -EFAULT;
+
+		/* Doesn't work with huge pages! */
+		BUG_ON(is_vm_hugetlb_page(vma));
+		
+		spin_lock(&mm->page_table_lock);
+		do {
+			struct page *map;
+			while (!(map = follow_page(mm, start, write))) {
+				spin_unlock(&mm->page_table_lock);
+				switch (handle_mm_fault(mm,vma,start,write)) {
+				case VM_FAULT_MINOR:
+					tsk->min_flt++;
+					break;
+				case VM_FAULT_MAJOR:
+					tsk->maj_flt++;
+					break;
+				case VM_FAULT_SIGBUS:
+					return i ? i : -EFAULT;
+				case VM_FAULT_OOM:
+					return i ? i : -ENOMEM;
+				default:
+					BUG();
+				}
+				spin_lock(&mm->page_table_lock);
+			}
+			sgl[i]->page = get_page_map(map);
+			if (!sgl[i]->page) {
+				spin_unlock(&mm->page_table_lock);
+				while (i--)
+					page_cache_release(sgl[i]->page);
+				i = -EFAULT;
+				goto out;
+			}
+			if (!PageReserved(sgl[i]->page))
+				page_cache_get(sgl[i]->page);
+			
+			/* TODO: Do coalescing of physically continious pages
+			 * here
+			 */
+			sgl[i]->offset=0;
+			sgl[i]->length=PAGE_SIZE;
+
+			i++;
+			start += PAGE_SIZE;
+			len--;
+		} while(len && start < vma->vm_end);
+		spin_unlock(&mm->page_table_lock);
+	} while(len);
+	
+	/* This might be pointless, if start is always aligned to pages */
+	sgl[0]->offset=start & ~PAGE_MASK;
+	sgl[0]->length=PAGE_SIZE - (start & ~PAGE_MASK);
+out:
+	return i;
+}
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Huge TLB pages always physically continious?
  2002-11-01 22:56 Huge TLB pages always physically continious? Ingo Oeser
@ 2002-11-01 23:23 ` Andrew Morton
  2002-11-02  8:29   ` Jeff Garzik
  2002-11-04 13:49   ` get_user_pages rewrite (was: Huge TLB pages always physically continious?) Ingo Oeser
  0 siblings, 2 replies; 4+ messages in thread
From: Andrew Morton @ 2002-11-01 23:23 UTC (permalink / raw)
  To: Ingo Oeser; +Cc: linux-mm, linux-kernel

Ingo Oeser wrote:
> 
> Hi there,
> 
> are huge TLB pages always physically continous in memory?

Yes.

> What does follow_hugetlb_page do exactly? I simply don't
> understand what the code does.

It allows get_user_pages() to work correctly across hugepage
regions.  It walks a chunk of memory which is covered by
hugepages and installs (at *pages) the list of 4k-pages which
are covered by the hugepage.  So

 |--------------------------------------------------|  <- hugepage
 |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|  <- 4k pages

 get_user_pages(   ^here                   ^to here)

 will install the spanned 4k pages into the caller's pages[]
 array.
 
> I would like to build up a simplified get_user_pages_sgl() to
> build a scatter gather list from user space adresses.
> 
> If I want to coalesce physically continous pages (if they are
> also virtually continious) anyway, can I write up a simplified
> follow_hugetlb_page_sgl() function which handles the huge page
> really as only one page?

I suggest that you restructure get_user_pages thusly:

1: Write a simplified get_user_page().  Most callers of get_user_pages()
   only want a single page anyway, and don't need to concoct all those
   arguments.

2: Split get_user_pages up into a pagetable walker and a callback function.
   So it walks the pages, calling back to the caller's callback function
   for each page with

	(*callback)(struct page *page, <other stuff>, void *callerdata);

   You'll need to extend follow_hugetlb_page() to take the callback
   info and to perform the callbacks for its pages as well.

3: Reimplement the current get_user_pages() using the core engine from 2
   (ie: write the callback for it)

4: Implement your sg engine using the walker+callback arrangement.  This
   way, you can do your coalescing on-the-fly, and you only take one
   pass across the pages list and you do not need to know about hugepages
   at all.   Sure you'll do a *little* more work than you need to,  but
   not having that special case is nicer.

5: Fix up the ia64 follow_hugetlb_page too.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Huge TLB pages always physically continious?
  2002-11-01 23:23 ` Andrew Morton
@ 2002-11-02  8:29   ` Jeff Garzik
  2002-11-04 13:49   ` get_user_pages rewrite (was: Huge TLB pages always physically continious?) Ingo Oeser
  1 sibling, 0 replies; 4+ messages in thread
From: Jeff Garzik @ 2002-11-02  8:29 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Oeser, linux-mm, linux-kernel

Now that hugetlbfs is merged, can we remove the hugetlb syscalls? 
 Pretty please?  ;-)

What I've heard in the background is that all the Big Users(tm) of 
hugetlbs greatly prefer the existing syscalls (a.k.a. hugetlbfs) to 
adding support to new ones in the various userland portability layers in 
use...

    Jeff





^ permalink raw reply	[flat|nested] 4+ messages in thread

* get_user_pages rewrite (was: Huge TLB pages always physically continious?)
  2002-11-01 23:23 ` Andrew Morton
  2002-11-02  8:29   ` Jeff Garzik
@ 2002-11-04 13:49   ` Ingo Oeser
  1 sibling, 0 replies; 4+ messages in thread
From: Ingo Oeser @ 2002-11-04 13:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-kernel

Hi Andrew,

On Fri, Nov 01, 2002 at 03:23:02PM -0800, Andrew Morton wrote:
> I suggest that you restructure get_user_pages thusly:

Ok, first step follows.

> 
> 1: Write a simplified get_user_page().  Most callers of get_user_pages()
>    only want a single page anyway, and don't need to concoct all those
>    arguments.
> 
> 2: Split get_user_pages up into a pagetable walker and a callback function.
>    So it walks the pages, calling back to the caller's callback function
>    for each page with
> 
> 	(*callback)(struct page *page, <other stuff>, void *callerdata);
> 
>    You'll need to extend follow_hugetlb_page() to take the callback
>    info and to perform the callbacks for its pages as well.
> 
> 3: Reimplement the current get_user_pages() using the core engine from 2
>    (ie: write the callback for it)
 
CONFIG_HUGETLB_PAGES is currently broken by this patch, but will
be fixed NOW.

Patch implementing 1-3 of your 5 points is attached.

This also enables get_user_pages() callers to do more cleanup, if
things failed, which wasn't ensured before (namely pagecache
references were not always released on errors).

The OBSOLETE_PAGE_WALKER define is there to catch callers, which
use the vmas-pointer of get_user_pages.

There are 3 kinds of callers in the kernel:

   a) Caller needs a VMA and only ONE page of it
      -> will be converted to get_one_user_page() and must do
         find_extend_vma() itself. Since it needs a vma anyway,
         it can also look for it, right? ;-)

   b) Caller doesn't need VMA, but will take care of pages
      passed. 
      -> Will be converted to NEW variant get_user_pages(), which
         will not support the "vmas" parameter anymore.
         
         But since we pass the VMA anyway for unlocking the
         page_table_lock of the MM it belongs to, any user
         outside of the kernel can use its own custom_page_walker_t
         and account them, too.

   c) Caller does need neither VMA nor pages
      -> is converted to use walk_user_pages()

         This caller was mm/memory.c:make_pages_present and is
         covered by the patch attached already.

My problems are now with the handling of follow_hugetlb_pages().

1) follow_hugetlb_pages() doesn't need the page_table_lock, but the
   custom_page_walker_t expects it to be locked and will unlock
   it on error. But since everything inside follow_hugetlb_pages()
   is happening quite fast (it cannot sleep), there should be not
   much contention.

2) It looks beneficial to implement follow_one_hugetlb_page(),
   which doesn't need the page_table_lock and will only look up
   one small page. But it has to be implemented for every
   hugetlb_page implementation.

   Is this acceptable, because it covers the common case?


Thanks for your input so far. It has been very helpful.

Patch compiles so far. Testing will be done, after I converted
all users and the generic design is ACKed ;-)

Regards

Ingo Oeser

diff -Naur --exclude=.err --exclude=.out --exclude=[.].* linux-2.5.44/include/linux/mm.h linux-2.5.44-ioe/include/linux/mm.h
--- linux-2.5.44/include/linux/mm.h	Sat Oct 19 06:01:08 2002
+++ linux-2.5.44-ioe/include/linux/mm.h	Mon Nov  4 13:53:35 2002
@@ -370,9 +370,47 @@
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 
+/*** Page walking API ***/
+
+/* &custom_page_walker - A custom page walk handler for walk_user_pages().
+ * vma:         The vma we walk pages of.
+ * page:        The page we found or an %ERR_PTR() value
+ * virt_addr:   The virtual address we are at while walking.
+ * customdata:  Anything you would like to pass additionally.
+ *
+ * Returns:
+ *      Negative values -> ERRNO values.
+ *      0               -> continue page walking.
+ *      1               -> abort page walking.
+ *
+ * If this functions gets a page, for which %IS_ERR(@page) is true, than it
+ * should do it's cleanup of customdata and return -PTR_ERR(@page).
+ *
+ * This function is called with @vma->vm_mm->page_table_lock held,
+ * if IS_ERR(@vma) is not true.
+ *
+ * But if IS_ERR(@vma) is true, IS_ERR(@page) is also true, since if we have no
+ * vma, then we also have no user space page.
+ *
+ * If it returns a negative value, then the page_table_lock must be dropped
+ * by this function, if it is held.
+ */
+typedef int (*custom_page_walker_t)(struct vm_area_struct *vma, 
+		struct page *page, unsigned long virt_addr, void *customdata);
+
 extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write);
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
-		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+struct page *get_one_user_page(struct task_struct *tsk, 
+		struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start, int write, int force);
+
+int walk_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int write, int force,
+		custom_page_walker_t walker, void *customdata);
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 
+		unsigned long start, int len, int write, int force, 
+		struct page **pages, struct vm_area_struct **vmas);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
diff -Naur --exclude=.err --exclude=.out --exclude=[.].* linux-2.5.44/mm/memory.c linux-2.5.44-ioe/mm/memory.c
--- linux-2.5.44/mm/memory.c	Sat Oct 19 06:01:52 2002
+++ linux-2.5.44-ioe/mm/memory.c	Mon Nov  4 14:32:39 2002
@@ -35,6 +35,10 @@
  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
  *		(Gerhard.Wichert@pdb.siemens.de)
  */
+/* 04.11.02  -  Page walker API added by Ingo Oeser 
+ * 		<ioe@informatik.tu-chemnitz.de>
+ * 		Thanks go to Andrew Morton for his intial idea and general help.
+ */
 
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
@@ -505,20 +509,203 @@
  * it?  This may become more complex in the future if we start dealing
  * with IO-aperture pages for direct-IO.
  */
-
 static inline struct page *get_page_map(struct page *page)
 {
 	if (!pfn_valid(page_to_pfn(page)))
-		return 0;
+		return ERR_PTR(EFAULT);
 	return page;
 }
 
+/* Simple page walk handler adding pages to a list of them */
+struct gup_add_pages {
+	unsigned int count;
+	unsigned int max_pages;
+	struct page **pages;
+};
 
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int len, int write, int force,
-		struct page **pages, struct vm_area_struct **vmas)
+static inline void gup_pages_cleanup(struct gup_add_pages *gup)
+{
+	while (gup->count--) {
+		page_cache_release(gup->pages[gup->count]);
+	}
+}
+
+/* Follows custom_page_walker API description */
+static int gup_add_pages(struct vm_area_struct *vma, struct page *page,
+	      unsigned long virt_addr, void *customdata)
+{
+
+	struct gup_add_pages *gup = customdata;
+
+	BUG_ON(!customdata);
+
+	if (!IS_ERR(page)) {
+		gup->pages[gup->count++] = page;
+		if (!PageReserved(page))
+			page_cache_get(page);
+
+		/* Abort if we cannot hold more pages */
+		return (gup->count == gup->max_pages) ? 1 : 0;
+	}
+
+	if (!IS_ERR(vma))
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	gup_pages_cleanup(gup);
+	return -PTR_ERR(page);
+}
+
+#define OBSOLETE_PAGE_WALKER
+#ifdef OBSOLETE_PAGE_WALKER
+/* Obsolete page walk handler adding pages and vmas to a list of them */
+struct gup_add_pv {
+	unsigned int page_count;
+	unsigned int max_pages;
+	struct page **pages;
+	unsigned int vma_count;
+	unsigned int max_vmas;
+	struct vm_area_struct **vmas;
+};
+
+static inline void gup_pv_cleanup(struct gup_add_pv *gup)
+{
+	while (gup->page_count--) {
+		page_cache_release(gup->pages[gup->page_count]);
+	}
+}
+
+/* Follows custom_page_walker API description */
+static int gup_add_pv(struct vm_area_struct *vma, struct page *page,
+	   unsigned long virt_addr, void *customdata)
+{
+
+	struct gup_add_pv *gup = customdata;
+	int ret = 0;
+
+	BUG_ON(!customdata);
+
+	if (!IS_ERR(page)) {
+		if (gup->vmas) {
+			/* Add vma only, if its a new one. Since we walk them
+			 * uniquely, this simple check is enough. -ioe 
+			 */
+			if (!gup->vma_count
+			    || gup->vmas[gup->vma_count - 1] != vma) {
+				gup->vmas[gup->vma_count++] = vma;
+
+				/* Abort scanning, if we cannot hold more */
+				if (gup->vma_count == gup->max_vmas)
+					ret = 1;
+			}
+		}
+
+		if (gup->pages) {
+			gup->pages[gup->page_count++] = page;
+			if (!PageReserved(page))
+				page_cache_get(page);
+
+			/* Abort scanning, if we cannot hold more */
+			if (gup->page_count == gup->max_pages)
+				ret = 1;
+		}
+		return ret;
+	}
+
+	if (!IS_ERR(vma))
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	gup_pv_cleanup(gup);
+	return -PTR_ERR(page);
+}
+#endif				/* OBSOLETE_PAGE_WALKER */
+
+/* Try to fault in the page at START. Returns valid page or ERR_PTR().
+ *
+ * called with mm->page_table_lock held 
+ */
+static struct page *single_page_walk(struct task_struct *tsk,
+				     struct mm_struct *mm,
+				     struct vm_area_struct *vma,
+				     unsigned long start, int write)
+{
+	struct page *map;
+
+	while (!(map = follow_page(mm, start, write))) {
+		int fault;
+
+		spin_unlock(&mm->page_table_lock);
+		fault = handle_mm_fault(mm, vma, start, write);
+		spin_lock(&mm->page_table_lock);
+
+		switch (fault) {
+		case VM_FAULT_MINOR:
+			tsk->min_flt++;
+			break;
+		case VM_FAULT_MAJOR:
+			tsk->maj_flt++;
+			break;
+		case VM_FAULT_SIGBUS:
+			return ERR_PTR(EFAULT);
+		case VM_FAULT_OOM:
+			return ERR_PTR(ENOMEM);
+		default:
+			/* FIXME Is this unlock better or worse here? -ioe */
+			spin_unlock(&mm->page_table_lock);
+			BUG();
+		}
+	}
+	return get_page_map(map);
+}
+
+/* VMA contains already "start".
+ * (e.g. find_vma_extend(mm,start) has been called sucessfully already 
+ */
+struct page *get_one_user_page(struct task_struct *tsk,
+			   struct mm_struct *mm, struct vm_area_struct *vma,
+			   unsigned long start, int write, int force)
+{
+	unsigned int flags;
+	struct page *page;
+
+	/* 
+	 * Require read or write permissions.
+	 * If 'force' is set, we only require the "MAY" flags.
+	 */
+	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+	if (!vma || (vma->vm_flags & VM_IO) || !(flags & vma->vm_flags))
+		return ERR_PTR(EFAULT);
+
+	/* FIXME: These are not handled properly, yet. -ioe */
+	/*
+	   if (is_vm_hugetlb_page(vma)) {
+	   int len = 1;
+	   int i;
+	   i = follow_hugetlb_page(mm, vma, &page, NULL,
+	   &start, &len, 0);
+	   return (i == 1) ? page : ERR_PTR(EFAULT);
+	   }
+	 */
+
+	spin_lock(&mm->page_table_lock);
+	page = single_page_walk(tsk, mm, vma, start, write);
+
+	if (!(IS_ERR(page) || PageReserved(page)))
+		page_cache_get(page);
+
+	spin_unlock(&mm->page_table_lock);
+	return page;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+#error This code is not suitable for huge pages yet.
+#endif
+
+/* Returns 0 or negative errno value */
+int walk_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int write, int force,
+		custom_page_walker_t walker, void *customdata)
 {
-	int i;
+	int ret;
 	unsigned int flags;
 
 	/* 
@@ -527,65 +714,100 @@
 	 */
 	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
-	i = 0;
 
 	do {
-		struct vm_area_struct *	vma;
+		struct vm_area_struct *vma;
 
 		vma = find_extend_vma(mm, start);
 
-		if (!vma || (pages && (vma->vm_flags & VM_IO))
-				|| !(flags & vma->vm_flags))
-			return i ? : -EFAULT;
-
-		if (is_vm_hugetlb_page(vma)) {
-			i = follow_hugetlb_page(mm, vma, pages, vmas,
-						&start, &len, i);
-			continue;
-		}
+		if (!vma || (vma->vm_flags & VM_IO)
+		    || !(flags & vma->vm_flags))
+			return walker(ERR_PTR(EFAULT), ERR_PTR(EFAULT), start,
+				      customdata);
+
+		/* FIXME: These are not handled, yet. -ioe */
+		/*
+		   if (is_vm_hugetlb_page(vma)) {
+		   int i=0;
+		   i = follow_hugetlb_page(mm, vma, pages, vmas,
+		   &start, &len, i);
+		   continue;
+		   }
+		 */
 		spin_lock(&mm->page_table_lock);
 		do {
-			struct page *map;
-			while (!(map = follow_page(mm, start, write))) {
+			struct page *page;
+			page = single_page_walk(tsk, mm, vma, start, write);
+			ret = walker(vma, page, start, customdata);
+			switch (ret) {
+				/* Common case -> continue walking. */
+			case 0:
+				break;
+
+				/* We are satisfied with our walking. */
+			case 1:
+				ret = 0;
 				spin_unlock(&mm->page_table_lock);
-				switch (handle_mm_fault(mm,vma,start,write)) {
-				case VM_FAULT_MINOR:
-					tsk->min_flt++;
-					break;
-				case VM_FAULT_MAJOR:
-					tsk->maj_flt++;
-					break;
-				case VM_FAULT_SIGBUS:
-					return i ? i : -EFAULT;
-				case VM_FAULT_OOM:
-					return i ? i : -ENOMEM;
-				default:
-					BUG();
-				}
-				spin_lock(&mm->page_table_lock);
-			}
-			if (pages) {
-				pages[i] = get_page_map(map);
-				if (!pages[i]) {
-					spin_unlock(&mm->page_table_lock);
-					while (i--)
-						page_cache_release(pages[i]);
-					i = -EFAULT;
-					goto out;
-				}
-				if (!PageReserved(pages[i]))
-					page_cache_get(pages[i]);
+				/* Fall trough now */
+
+				/* Bail out because of error. */
+			default:
+				/* Error cases do unlock, 
+				 * if necessary. -ioe */
+				goto out;
 			}
-			if (vmas)
-				vmas[i] = vma;
-			i++;
 			start += PAGE_SIZE;
-			len--;
-		} while(len && start < vma->vm_end);
+		} while (start < vma->vm_end);
 		spin_unlock(&mm->page_table_lock);
-	} while(len);
-out:
-	return i;
+	} while (1);
+      out:
+	return ret;
+}
+
+/* Ugly for now, but the defines and the union will go later. -ioe */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+	       unsigned long start, int len, int write, int force,
+	       struct page **pages, struct vm_area_struct **vmas)
+{
+	int ret;
+	custom_page_walker_t walker = gup_add_pages;
+	union {
+		struct gup_add_pages pg;
+#ifdef OBSOLETE_PAGE_WALKER
+		struct gup_add_pv pv;
+#endif
+	} gup_u;
+
+	memset(&gup_u, 0, sizeof (gup_u));
+
+#ifdef OBSOLETE_PAGE_WALKER
+	if (vmas) {
+		gup_u.pv.vmas = vmas;
+		gup_u.pv.max_vmas = len;
+		walker = gup_add_pv;
+		printk("Obsolete argument \"vmas\" used!"
+		       " Please send this report to linux-mm@vger.kernel.org"
+		       " or fix the caller. Stack trace follows...\n");
+		WARN_ON(vmas);
+	}
+#else
+	/* FIXME: Or should we simply ignore it? -ioe */
+	BUG_ON(vmas);
+#endif
+
+	/* Warn on non-sense calls, but process them. -ioe */
+	WARN_ON(!vmas && !pages);
+
+	if (pages) {
+		gup_u.pg.max_pages = len;
+		gup_u.pg.pages = pages;
+	}
+
+	ret = walk_user_pages(tsk, mm, start, write, force, walker, &gup_u);
+	if (ret == 0) {
+		ret = gup_u.pg.count;
+	}
+	return ret;
 }
 
 static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
@@ -1294,10 +1516,30 @@
 	return pmd_offset(pgd, address);
 }
 
+
+/* A page walker, which just counts down how many pages it got */
+static int
+gup_mk_present(struct vm_area_struct *vma, struct page *page,
+	       unsigned long virt_addr, void *customdata)
+{
+
+	int *todo = customdata;
+
+	if (!IS_ERR(page)) {
+		(*todo)--;
+		/* Abort if have made all required pages present */
+		return (*todo) ? 0 : 1;
+	}
+
+	if (!IS_ERR(vma))
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	return -PTR_ERR(page);
+}
+
 int make_pages_present(unsigned long addr, unsigned long end)
 {
 	int ret, len, write;
-	struct vm_area_struct * vma;
+	struct vm_area_struct *vma;
 
 	vma = find_vma(current->mm, addr);
 	write = (vma->vm_flags & VM_WRITE) != 0;
@@ -1305,10 +1547,18 @@
 		BUG();
 	if (end > vma->vm_end)
 		BUG();
-	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
-	ret = get_user_pages(current, current->mm, addr,
-			len, write, 0, NULL, NULL);
-	return ret == len ? 0 : -1;
+	len = (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
+
+	/* This is necessary for gup_mk_present to work and
+	 * also a slight optimization. -ioe 
+	 */
+	if (len == 0)
+		return 0;
+
+	ret = walk_user_pages(current, current->mm, addr,
+			      write, 0, gup_mk_present, &len);
+
+	return (ret == 0 && len == 0) ? 0 : -1;
 }
 
 /* 

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2002-11-04 13:44 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-11-01 22:56 Huge TLB pages always physically continious? Ingo Oeser
2002-11-01 23:23 ` Andrew Morton
2002-11-02  8:29   ` Jeff Garzik
2002-11-04 13:49   ` get_user_pages rewrite (was: Huge TLB pages always physically continious?) Ingo Oeser

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).