From: Andi Kleen <ak@suse.de>
To: Andi Kleen <ak@suse.de>
Cc: linux-kernel@vger.kernel.org, akpm@osdl.org
Subject: [PATCH] NUMA API for Linux 9/ Add simple lazy i386/x86-64 hugetlbfs policy support
Date: Tue, 6 Apr 2004 15:40:12 +0200
Message-ID: <20040406154012.2750827d.ak@suse.de>
In-Reply-To: <20040406153322.5d6e986e.ak@suse.de>

Add NUMA policy support to i386/x86-64 hugetlbfs and switch it 
over to lazy allocation instead of prefaulting.

The NUMA policy support makes huge page allocation honor the
current policy of the allocating process.

It also switches hugetlbfs to lazy allocation, because otherwise
mbind() cannot work after mmap(): the memory would already be allocated.
This doesn't do any prereservation; when a process runs out of 
huge pages it will get a SIGBUS.
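
To illustrate, here is a minimal userspace sketch (not part of this
patch) that binds a hugetlbfs mapping to a node before first touch.
It assumes 2MB huge pages, a hugetlbfs mount at /mnt/huge, and the
mbind() wrapper from <numaif.h>; the node number is made up.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <numaif.h>	/* mbind(), MPOL_BIND */

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assuming 2MB huge pages */

int main(void)
{
	unsigned long nodemask = 1UL << 1;	/* hypothetical node 1 */
	char *p;
	int fd;

	fd = open("/mnt/huge/test", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* With prefaulting this call comes too late: the huge page is
	   already allocated at mmap() time.  With lazy allocation no
	   page exists yet, so the policy applies at first touch. */
	if (mbind(p, HPAGE_SIZE, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8, 0) < 0)
		perror("mbind");

	p[0] = 1;	/* faults in a huge page from node 1, or gets
			   SIGBUS if that node has no free huge pages */

	munmap(p, HPAGE_SIZE);
	close(fd);
	return 0;
}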

There are currently various proposals on linux-kernel to add preallocation
for this; once one of these patches turns out to be good, it would be
best to replace this patch with it (and port the mpol_* changes over).
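
As background for the mm/memory.c hunk below: the generic
arch_hugetlb_fault() is marked __attribute__((weak)), so an
architecture overrides it just by linking a strong definition of the
same symbol; no #ifdef is needed. A standalone sketch of that linker
behavior (not kernel code, names made up):

#include <stdio.h>

/* Weak fallback: used only when no strong definition of the same
   symbol is linked in, analogous to the generic stub returning
   VM_FAULT_SIGBUS. */
int __attribute__((weak)) arch_hook(void)
{
	return -1;
}

/* Linking another object that defines a non-weak arch_hook()
   silently replaces this weak one at link time. */

int main(void)
{
	printf("arch_hook() = %d\n", arch_hook());	/* prints -1 alone */
	return 0;
}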

diff -u linux-2.6.5-numa/arch/i386/mm/hugetlbpage.c-o linux-2.6.5-numa/arch/i386/mm/hugetlbpage.c
--- linux-2.6.5-numa/arch/i386/mm/hugetlbpage.c-o	2004-04-06 13:11:59.000000000 +0200
+++ linux-2.6.5-numa/arch/i386/mm/hugetlbpage.c	2004-04-06 13:36:12.000000000 +0200
@@ -15,14 +15,17 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sysctl.h>
+#include <linux/mempolicy.h>
 #include <asm/mman.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long    htlbpagemem;
+/* AK: this should be all moved into the pgdat */
+
+static long    htlbpagemem[MAX_NUMNODES];
 int     htlbpage_max;
-static long    htlbzone_pages;
+static long    htlbzone_pages[MAX_NUMNODES];
 
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
@@ -33,14 +36,15 @@
 		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
-	int nid = numa_node_id();
+	int nid = mpol_first_node(vma, addr); 
 	struct page *page = NULL;
 
 	if (list_empty(&hugepage_freelists[nid])) {
 		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
+			if (mpol_node_valid(nid, vma, addr) && 
+			    !list_empty(&hugepage_freelists[nid]))
 				break;
 	}
 	if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) {
@@ -61,18 +65,18 @@
 
 static void free_huge_page(struct page *page);
 
-static struct page *alloc_hugetlb_page(void)
+static struct page *alloc_hugetlb_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	int i;
 	struct page *page;
 
 	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&htlbpage_lock);
 		return NULL;
 	}
-	htlbpagemem--;
+	htlbpagemem[page_zone(page)->zone_pgdat->node_id]--;
 	spin_unlock(&htlbpage_lock);
 	set_page_count(page, 1);
 	page->lru.prev = (void *)free_huge_page;
@@ -284,7 +288,7 @@
 
 	spin_lock(&htlbpage_lock);
 	enqueue_huge_page(page);
-	htlbpagemem++;
+	htlbpagemem[page_zone(page)->zone_pgdat->node_id]++;
 	spin_unlock(&htlbpage_lock);
 }
 
@@ -329,41 +333,49 @@
 	spin_unlock(&mm->page_table_lock);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+/* page_table_lock held on entry, dropped before returning. */
+static int 
+hugetlb_alloc_fault(struct mm_struct *mm, struct vm_area_struct *vma, 
+			       unsigned long addr, int write_access)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
+	int ret;
+	pte_t *pte;
+	struct page *page = NULL;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 
+	pte = huge_pte_alloc(mm, addr); 
 		if (!pte) {
-			ret = -ENOMEM;
+		ret = VM_FAULT_OOM;
 			goto out;
 		}
-		if (!pte_none(*pte))
-			continue;
+
+		/* Handle race */
+		if (!pte_none(*pte)) { 
+			ret = VM_FAULT_MINOR;
+			goto flush; 
+		}
 
 		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
 			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
 		page = find_get_page(mapping, idx);
 		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
+		/* Should do this at prefault time, but that gets us into
+		   trouble with freeing right now. */
+		ret = hugetlb_get_quota(mapping);
+		if (ret) {
+			ret = VM_FAULT_OOM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+		
+			page = alloc_hugetlb_page(vma, addr);
 			if (!page) {
 				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
+			
+			/* Instead of OOMing here could just transparently use
+			   small pages. */
+			
+				ret = VM_FAULT_OOM;
 				goto out;
 			}
 			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
@@ -371,23 +383,64 @@
 			if (ret) {
 				hugetlb_put_quota(mapping);
 				free_huge_page(page);
+				ret = VM_FAULT_SIGBUS;
 				goto out;
 			}
-		}
+		ret = VM_FAULT_MAJOR; 
+	} else
+		ret = VM_FAULT_MINOR;
+		
 		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
+
+ flush:
+	/* Don't need to flush other CPUs. They will just do a page
+	   fault and flush it lazily. */
+	__flush_tlb_one(addr);
+	
+ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
 
+int arch_hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 
+		       unsigned long address, int write_access)
+{ 
+	pmd_t *pmd;
+	pgd_t *pgd;
+
+	if (write_access && !(vma->vm_flags & VM_WRITE))
+		return VM_FAULT_SIGBUS;
+
+	spin_lock(&mm->page_table_lock);	
+	pgd = pgd_offset(mm, address); 
+	if (pgd_none(*pgd)) 
+		return hugetlb_alloc_fault(mm, vma, address, write_access); 
+
+	pmd = pmd_offset(pgd, address);
+	if (pmd_none(*pmd))
+		return hugetlb_alloc_fault(mm, vma, address, write_access); 
+
+	BUG_ON(!pmd_large(*pmd)); 
+
+	/* must have been a race. Flush the TLB. NX not supported yet. */ 
+
+	__flush_tlb_one(address); 
+	spin_unlock(&mm->page_table_lock);
+	return VM_FAULT_MINOR;
+} 
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	return 0;
+}
+
 static void update_and_free_page(struct page *page)
 {
 	int j;
 	struct page *map;
 
 	map = page;
-	htlbzone_pages--;
+	htlbzone_pages[page_zone(page)->zone_pgdat->node_id]--;
 	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
 		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -404,6 +457,7 @@
 	struct list_head *p;
 	struct page *page, *map;
 
+	page = NULL;
 	map = NULL;
 	spin_lock(&htlbpage_lock);
 	/* all lowmem is on node 0 */
@@ -411,7 +465,7 @@
 		if (map) {
 			list_del(&map->list);
 			update_and_free_page(map);
-			htlbpagemem--;
+ 			htlbpagemem[page_zone(map)->zone_pgdat->node_id]--;
 			map = NULL;
 			if (++count == 0)
 				break;
@@ -423,49 +477,61 @@
 	if (map) {
 		list_del(&map->list);
 		update_and_free_page(map);
-		htlbpagemem--;
+		htlbpagemem[page_zone(map)->zone_pgdat->node_id]--;
 		count++;
 	}
 	spin_unlock(&htlbpage_lock);
 	return count;
 }
 
+static long all_huge_pages(void)
+{ 
+	long pages = 0;
+	int i;
+	for (i = 0; i < numnodes; i++) 
+		pages += htlbzone_pages[i];
+	return pages;
+} 
+
 static int set_hugetlb_mem_size(int count)
 {
 	int lcount;
 	struct page *page;
-
 	if (count < 0)
 		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
+	else { 
+		lcount = count - all_huge_pages();
+	}
 
 	if (lcount == 0)
-		return (int)htlbzone_pages;
+		return (int)all_huge_pages();
 	if (lcount > 0) {	/* Increase the mem size. */
 		while (lcount--) {
+			int node;
 			page = alloc_fresh_huge_page();
 			if (page == NULL)
 				break;
 			spin_lock(&htlbpage_lock);
 			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
+			node = page_zone(page)->zone_pgdat->node_id;
+			htlbpagemem[node]++;
+			htlbzone_pages[node]++;
 			spin_unlock(&htlbpage_lock);
 		}
-		return (int) htlbzone_pages;
+		goto out;
 	}
 	/* Shrink the memory size. */
 	lcount = try_to_free_low(lcount);
 	while (lcount++) {
-		page = alloc_hugetlb_page();
+		page = alloc_hugetlb_page(NULL, 0);
 		if (page == NULL)
 			break;
 		spin_lock(&htlbpage_lock);
 		update_and_free_page(page);
 		spin_unlock(&htlbpage_lock);
 	}
-	return (int) htlbzone_pages;
+ out:
+	return (int)all_huge_pages();
 }
 
 int hugetlb_sysctl_handler(ctl_table *table, int write,
@@ -498,33 +564,60 @@
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
 	for (i = 0; i < htlbpage_max; ++i) {
+		int nid; 
 		page = alloc_fresh_huge_page();
 		if (!page)
 			break;
 		spin_lock(&htlbpage_lock);
 		enqueue_huge_page(page);
+		nid = page_zone(page)->zone_pgdat->node_id;
+		htlbpagemem[nid]++;
+		htlbzone_pages[nid]++;
 		spin_unlock(&htlbpage_lock);
 	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
+	htlbpage_max = i;
+	printk("Initial HugeTLB pages allocated: %d\n", i);
 	return 0;
 }
 module_init(hugetlb_init);
 
 int hugetlb_report_meminfo(char *buf)
 {
+	int i;
+	long pages = 0, mem = 0;
+	for (i = 0; i < numnodes; i++) {
+		pages += htlbzone_pages[i];
+		mem += htlbpagemem[i];
+	}
+
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
+			pages,
+			mem,
 			HPAGE_SIZE/1024);
 }
 
+int hugetlb_report_node_meminfo(int node, char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			htlbzone_pages[node],
+			htlbpagemem[node],
+			HPAGE_SIZE/1024);
+}
+
+/* Not accurate with policy */
 int is_hugepage_mem_enough(size_t size)
 {
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
+	long pm = 0;
+	int i;
+	for (i = 0; i < numnodes; i++)
+		pm += htlbpagemem[i];
+	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= pm;
 }
 
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
diff -u linux-2.6.5-numa/include/linux/mm.h-o linux-2.6.5-numa/include/linux/mm.h
--- linux-2.6.5-numa/include/linux/mm.h-o	2004-04-06 13:12:23.000000000 +0200
+++ linux-2.6.5-numa/include/linux/mm.h	2004-04-06 13:36:12.000000000 +0200
@@ -643,6 +660,9 @@
 extern int remap_page_range(struct vm_area_struct *vma, unsigned long from,
 		unsigned long to, unsigned long size, pgprot_t prot);
 
+extern int arch_hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 
+			      unsigned long address, int write_access);
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
diff -u linux-2.6.5-numa/mm/memory.c-o linux-2.6.5-numa/mm/memory.c
--- linux-2.6.5-numa/mm/memory.c-o	2004-04-06 13:12:24.000000000 +0200
+++ linux-2.6.5-numa/mm/memory.c	2004-04-06 13:36:12.000000000 +0200
@@ -1604,6 +1633,15 @@
 	return VM_FAULT_MINOR;
 }
 
+
+/* Can be overridden by the architecture */
+int __attribute__((weak)) arch_hugetlb_fault(struct mm_struct *mm, 
+					     struct vm_area_struct *vma, 
+					     unsigned long address, int write_access)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 /*
  * By the time we get here, we already hold the mm semaphore
  */
@@ -1619,7 +1657,7 @@
 	inc_page_state(pgfault);
 
 	if (is_vm_hugetlb_page(vma))
-		return VM_FAULT_SIGBUS;	/* mapping truncation does this. */
+		return arch_hugetlb_fault(mm, vma, address, write_access);
 
 	/*
 	 * We need the page table lock to synchronize with kswapd


Thread overview: 18+ messages
2004-04-06 13:33 NUMA API for Linux Andi Kleen
2004-04-06 13:34 ` [PATCH] NUMA API for Linux 1/ Core NUMA API code Andi Kleen
2004-04-06 13:35 ` NUMA API for Linux 2/ Add x86-64 support Andi Kleen
2004-04-06 13:35 ` [PATCH] NUMA API for Linux 3/ Add i386 support Andi Kleen
2004-04-06 23:23   ` Andrew Morton
2004-04-06 13:36 ` [PATCH] NUMA API for Linux 4/ Add IA64 support Andi Kleen
2004-04-06 13:37 ` [PATCH] NUMA API for Linux 5/ Add VMA hooks for policy Andi Kleen
2004-05-05 16:05   ` Paul Jackson
2004-05-05 16:39     ` Andi Kleen
2004-05-05 16:47       ` Paul Jackson
2004-05-06  6:00         ` Andi Kleen
2004-04-06 13:37 ` [PATCH] NUMA API for Linux 6/ Add shared memory support Andi Kleen
2004-04-06 13:38 ` [PATCH] NUMA API for Linux 7/ Add statistics Andi Kleen
2004-04-06 13:39 ` [PATCH] NUMA API for Linux 8/ Add policy support to anonymous memory Andi Kleen
2004-04-06 13:40 ` Andi Kleen [this message]
2004-04-06 13:40 ` [PATCH] NUMA API for Linux 10/ Bitmap bugfix Andi Kleen
2004-04-06 23:35 ` NUMA API for Linux Paul Jackson
2004-04-08 20:12 ` Pavel Machek
