* [PATCH] Optimize out pte_chain take two
@ 2002-07-09 19:04 Dave McCracken
  2002-07-10  0:21 ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: Dave McCracken @ 2002-07-09 19:04 UTC (permalink / raw)
  To: Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 403 bytes --]


Here's a version of my pte_chain removal patch that does not use anonymous
unions, so it'll compile with gcc 2.95.  Once again, it's based on Rik's
rmap-2.5.25-akpmtested.
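
For illustration, a standalone sketch of the workaround (pte_t and struct
pte_chain below are simplified stand-ins, not the kernel definitions): per
the note above, gcc 2.95 will not accept an anonymous union inside a
struct, so the union gets a name and the familiar field names are restored
with #defines.

/* Sketch of the named-union workaround described above */
typedef unsigned long pte_t;			/* stand-in, not the kernel type */
struct pte_chain { struct pte_chain *next; pte_t *ptep; };

struct page_sketch {
	union {					/* gcc 2.95 chokes if this union
						 * is left anonymous ... */
		struct pte_chain *_pte_chain;
		pte_t *_pte_direct;
	} _pte_union;				/* ... so it gets a name here */
};

/* #defines give back field-style access, e.g. page->pte__chain: */
#define pte__chain	_pte_union._pte_chain
#define pte_direct	_pte_union._pte_direct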

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

[-- Attachment #2: rmap-opt-2.5.25.diff --]
[-- Type: text/plain, Size: 8763 bytes --]

--- linux-2.5.25-rmap/./include/linux/mm.h	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./include/linux/mm.h	Tue Jul  9 13:41:11 2002
@@ -157,8 +157,11 @@
 					   updated asynchronously */
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by pagemap_lru_lock !! */
-	struct pte_chain * pte_chain;	/* Reverse pte mapping pointer.
+	union {
+		struct pte_chain * _pte_chain;	/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock */
+		pte_t		 * _pte_direct;
+	} _pte_union;
 	unsigned long private;		/* mapping-private opaque data */
 
 	/*
@@ -176,6 +179,9 @@
 					   not kmapped, ie. highmem) */
 #endif /* CONFIG_HIGMEM || WANT_PAGE_VIRTUAL */
 };
+
+#define	pte__chain	_pte_union._pte_chain
+#define	pte_direct	_pte_union._pte_direct
 
 /*
  * Methods to modify the page usage count.
--- linux-2.5.25-rmap/./include/linux/page-flags.h	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./include/linux/page-flags.h	Tue Jul  9 10:31:28 2002
@@ -66,6 +66,7 @@
 #define PG_writeback		13	/* Page is under writeback */
 #define PG_nosave		15	/* Used for system suspend/resume */
 #define PG_chainlock		16	/* lock bit for ->pte_chain */
+#define PG_direct		17	/* ->pte_chain points directly at pte */
 
 /*
  * Global page accounting.  One instance per CPU.
@@ -216,6 +217,12 @@
 #define TestSetPageNosave(page)	test_and_set_bit(PG_nosave, &(page)->flags)
 #define ClearPageNosave(page)		clear_bit(PG_nosave, &(page)->flags)
 #define TestClearPageNosave(page)	test_and_clear_bit(PG_nosave, &(page)->flags)
+
+#define PageDirect(page)	test_bit(PG_direct, &(page)->flags)
+#define SetPageDirect(page)	set_bit(PG_direct, &(page)->flags)
+#define TestSetPageDirect(page)	test_and_set_bit(PG_direct, &(page)->flags)
+#define ClearPageDirect(page)		clear_bit(PG_direct, &(page)->flags)
+#define TestClearPageDirect(page)	test_and_clear_bit(PG_direct, &(page)->flags)
 
 /*
  * inlines for acquisition and release of PG_chainlock
--- linux-2.5.25-rmap/./mm/page_alloc.c	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./mm/page_alloc.c	Tue Jul  9 13:43:46 2002
@@ -92,7 +92,7 @@
 	BUG_ON(PageLRU(page));
 	BUG_ON(PageActive(page));
 	BUG_ON(PageWriteback(page));
-	BUG_ON(page->pte_chain != NULL);
+	BUG_ON(page->pte__chain != NULL);
 	if (PageDirty(page))
 		ClearPageDirty(page);
 	BUG_ON(page_count(page) != 0);
--- linux-2.5.25-rmap/./mm/vmscan.c	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./mm/vmscan.c	Tue Jul  9 13:42:38 2002
@@ -48,7 +48,7 @@
 	struct address_space *mapping = page->mapping;
 
 	/* Page is in somebody's page tables. */
-	if (page->pte_chain)
+	if (page->pte__chain)
 		return 1;
 
 	/* XXX: does this happen ? */
@@ -151,7 +151,7 @@
 		 *
 		 * XXX: implement swap clustering ?
 		 */
-		if (page->pte_chain && !page->mapping && !PagePrivate(page)) {
+		if (page->pte__chain && !page->mapping && !PagePrivate(page)) {
 			page_cache_get(page);
 			pte_chain_unlock(page);
 			spin_unlock(&pagemap_lru_lock);
@@ -171,7 +171,7 @@
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
 		 */
-		if (page->pte_chain) {
+		if (page->pte__chain) {
 			switch (try_to_unmap(page)) {
 				case SWAP_ERROR:
 				case SWAP_FAIL:
@@ -348,7 +348,7 @@
 		entry = entry->prev;
 
 		pte_chain_lock(page);
-		if (page->pte_chain && page_referenced(page)) {
+		if (page->pte__chain && page_referenced(page)) {
 			list_del(&page->lru);
 			list_add(&page->lru, &active_list);
 			pte_chain_unlock(page);
--- linux-2.5.25-rmap/./mm/rmap.c	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./mm/rmap.c	Tue Jul  9 13:41:47 2002
@@ -13,7 +13,7 @@
 
 /*
  * Locking:
- * - the page->pte_chain is protected by the PG_chainlock bit,
+ * - the page->pte__chain is protected by the PG_chainlock bit,
  *   which nests within the pagemap_lru_lock, then the
  *   mm->page_table_lock, and then the page lock.
  * - because swapout locking is opposite to the locking order
@@ -71,10 +71,15 @@
 	if (TestClearPageReferenced(page))
 		referenced++;
 
-	/* Check all the page tables mapping this page. */
-	for (pc = page->pte_chain; pc; pc = pc->next) {
-		if (ptep_test_and_clear_young(pc->ptep))
+	if (PageDirect(page)) {
+		if (ptep_test_and_clear_young(page->pte_direct))
 			referenced++;
+	} else {
+		/* Check all the page tables mapping this page. */
+		for (pc = page->pte__chain; pc; pc = pc->next) {
+			if (ptep_test_and_clear_young(pc->ptep))
+				referenced++;
+		}
 	}
 	return referenced;
 }
@@ -108,22 +113,39 @@
 	pte_chain_lock(page);
 	{
 		struct pte_chain * pc;
-		for (pc = page->pte_chain; pc; pc = pc->next) {
-			if (pc->ptep == ptep)
+		if (PageDirect(page)) {
+			if (page->pte_direct == ptep)
 				BUG();
+		} else {
+			for (pc = page->pte__chain; pc; pc = pc->next) {
+				if (pc->ptep == ptep)
+					BUG();
+			}
 		}
 	}
 	pte_chain_unlock(page);
 #endif
 
-	pte_chain = pte_chain_alloc();
-
 	pte_chain_lock(page);
 
-	/* Hook up the pte_chain to the page. */
-	pte_chain->ptep = ptep;
-	pte_chain->next = page->pte_chain;
-	page->pte_chain = pte_chain;
+	if (PageDirect(page)) {
+		/* Convert a direct pointer into a pte_chain */
+		pte_chain = pte_chain_alloc();
+		pte_chain->ptep = page->pte_direct;
+		pte_chain->next = NULL;
+		page->pte__chain = pte_chain;
+		ClearPageDirect(page);
+	}
+	if (page->pte__chain) {
+		/* Hook up the pte_chain to the page. */
+		pte_chain = pte_chain_alloc();
+		pte_chain->ptep = ptep;
+		pte_chain->next = page->pte__chain;
+		page->pte__chain = pte_chain;
+	} else {
+		page->pte_direct = ptep;
+		SetPageDirect(page);
+	}
 
 	pte_chain_unlock(page);
 }
@@ -149,18 +171,38 @@
 		return;
 
 	pte_chain_lock(page);
-	for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
-		if (pc->ptep == ptep) {
-			pte_chain_free(pc, prev_pc, page);
+
+	if (PageDirect(page)) {
+		if (page->pte_direct == ptep) {
+			page->pte_direct = NULL;
+			ClearPageDirect(page);
 			goto out;
 		}
+	} else {
+		for (pc = page->pte__chain; pc; prev_pc = pc, pc = pc->next) {
+			if (pc->ptep == ptep) {
+				pte_chain_free(pc, prev_pc, page);
+				/* Check whether we can convert to direct */
+				pc = page->pte__chain;
+				if (!pc->next) {
+					page->pte_direct = pc->ptep;
+					SetPageDirect(page);
+					pte_chain_free(pc, NULL, NULL);
+				}
+				goto out;
+			}
+		}
 	}
 #ifdef DEBUG_RMAP
 	/* Not found. This should NEVER happen! */
 	printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
 	printk(KERN_ERR "page_remove_rmap: only found: ");
-	for (pc = page->pte_chain; pc; pc = pc->next)
-		printk("%p ", pc->ptep);
+	if (PageDirect(page)) {
+		printk("%p ", page->pte_direct);
+	} else {
+		for (pc = page->pte__chain; pc; pc = pc->next)
+			printk("%p ", pc->ptep);
+	}
 	printk("\n");
 	printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
 #endif
@@ -270,22 +312,42 @@
 	if (!page->mapping)
 		BUG();
 
-	for (pc = page->pte_chain; pc; pc = next_pc) {
-		next_pc = pc->next;
-		switch (try_to_unmap_one(page, pc->ptep)) {
+	if (PageDirect(page)) {
+		switch (ret = try_to_unmap_one(page, page->pte_direct)) {
 			case SWAP_SUCCESS:
-				/* Free the pte_chain struct. */
-				pte_chain_free(pc, prev_pc, page);
-				break;
+				page->pte_direct = NULL;
+				ClearPageDirect(page);
+				return ret;
 			case SWAP_AGAIN:
-				/* Skip this pte, remembering status. */
-				prev_pc = pc;
-				ret = SWAP_AGAIN;
-				continue;
 			case SWAP_FAIL:
-				return SWAP_FAIL;
 			case SWAP_ERROR:
-				return SWAP_ERROR;
+				return ret;
+		}
+	} else {
+		for (pc = page->pte__chain; pc; pc = next_pc) {
+			next_pc = pc->next;
+			switch (try_to_unmap_one(page, pc->ptep)) {
+				case SWAP_SUCCESS:
+					/* Free the pte_chain struct. */
+					pte_chain_free(pc, prev_pc, page);
+					break;
+				case SWAP_AGAIN:
+					/* Skip this pte, remembering status. */
+					prev_pc = pc;
+					ret = SWAP_AGAIN;
+					continue;
+				case SWAP_FAIL:
+					return SWAP_FAIL;
+				case SWAP_ERROR:
+					return SWAP_ERROR;
+			}
+		}
+		/* Check whether we can convert to direct pte pointer */
+		pc = page->pte__chain;
+		if (pc && !pc->next) {
+			page->pte_direct = pc->ptep;
+			SetPageDirect(page);
+			pte_chain_free(pc, NULL, NULL);
 		}
 	}
 
@@ -336,7 +398,7 @@
 	if (prev_pte_chain)
 		prev_pte_chain->next = pte_chain->next;
 	else if (page)
-		page->pte_chain = pte_chain->next;
+		page->pte__chain = pte_chain->next;
 
 	spin_lock(&pte_chain_freelist_lock);
 	pte_chain_push(pte_chain);


* Re: [PATCH] Optimize out pte_chain take two
  2002-07-09 19:04 [PATCH] Optimize out pte_chain take two Dave McCracken
@ 2002-07-10  0:21 ` Andrew Morton
  2002-07-10 14:33   ` [PATCH] Optimize out pte_chain take three Dave McCracken
  0 siblings, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2002-07-10  0:21 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Linux Memory Management

Dave McCracken wrote:
> 
> Here's a version of my pte_chain removal patch that does not use anonymous
> unions, so it'll compile with gcc 2.95.  Once again, it's based on Rik's
> rmap-2.5.25-akpmtested.

Seems sane and simple, thanks.

This bit is icky:

+       union {
+               struct pte_chain * _pte_chain;  /* Reverse pte mapping pointer.
                                         * protected by PG_chainlock */
+               pte_t            * _pte_direct;
+       } _pte_union;
...
+
+#define        pte__chain      _pte_union._pte_chain
+#define        pte_direct      _pte_union._pte_direct


You could instead make it just a void * and have:

static inline struct pte_chain *page_pte_chain(struct page *page)
{
#ifdef DEBUG_RMAP
	BUG_ON(PageDirect(page));
#endif
	return page->rmap_thingy;
}

static inline pte_t *page_pte_direct(struct page *page)
{
#ifdef DEBUG_RMAP
	BUG_ON(!PageDirect(page));
#endif
	return page->rmap_thingy;
}

static inline void
set_page_pte_chain(struct page *page, struct pte_chain *pte_chain)
{
#ifdef DEBUG_RMAP
	BUG_ON(PageDirect(page));
#endif
	page->rmap_thingy = pte_chain;
}

static inline void
set_page_pte_direct(struct page *page, pte_t *ptep)
{
#ifdef DEBUG_RMAP
	BUG_ON(!PageDirect(page));
#endif
	page->rmap_thingy = ptep;
}

I think it's neater.  But then, I'm a convicted C++ weenie.


* [PATCH] Optimize out pte_chain take three
  2002-07-10  0:21 ` Andrew Morton
@ 2002-07-10 14:33   ` Dave McCracken
  2002-07-10 15:18     ` Rik van Riel
  0 siblings, 1 reply; 39+ messages in thread
From: Dave McCracken @ 2002-07-10 14:33 UTC (permalink / raw)
  To: Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 1095 bytes --]


--On Tuesday, July 09, 2002 05:21:11 PM -0700 Andrew Morton
<akpm@zip.com.au> wrote:

> Seems sane and simple, thanks.
> 
> This bit is icky:
> 
> +       union {
> +               struct pte_chain * _pte_chain;  /* Reverse pte mapping pointer.
>                                          * protected by PG_chainlock */
> +               pte_t            * _pte_direct;
> +       } _pte_union;
> ...
> +
> +#define        pte__chain      _pte_union._pte_chain
> +#define        pte_direct      _pte_union._pte_direct
> 
> 
> You could instead make it just a void * and have:

I agree that it's icky, at least the #defines.  However, I don't like using
void *, either.  After thinking about it overnight, I think I prefer
exposing the union along the lines of pte.chain and pte.direct.  Attached
is a patch with that change, plus a couple of logic tweaks to fix a small
hole.

Dave

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

[-- Attachment #2: rmap-opt-2.5.25-3.diff --]
[-- Type: text/plain, Size: 8501 bytes --]

--- linux-2.5.25-rmap/./include/linux/mm.h	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./include/linux/mm.h	Wed Jul 10 09:10:04 2002
@@ -157,8 +157,11 @@
 					   updated asynchronously */
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by pagemap_lru_lock !! */
-	struct pte_chain * pte_chain;	/* Reverse pte mapping pointer.
+	union {
+		struct pte_chain * chain;	/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock */
+		pte_t		 * direct;
+	} pte;
 	unsigned long private;		/* mapping-private opaque data */
 
 	/*
--- linux-2.5.25-rmap/./include/linux/page-flags.h	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./include/linux/page-flags.h	Tue Jul  9 10:31:28 2002
@@ -66,6 +66,7 @@
 #define PG_writeback		13	/* Page is under writeback */
 #define PG_nosave		15	/* Used for system suspend/resume */
 #define PG_chainlock		16	/* lock bit for ->pte_chain */
+#define PG_direct		17	/* ->pte_chain points directly at pte */
 
 /*
  * Global page accounting.  One instance per CPU.
@@ -216,6 +217,12 @@
 #define TestSetPageNosave(page)	test_and_set_bit(PG_nosave, &(page)->flags)
 #define ClearPageNosave(page)		clear_bit(PG_nosave, &(page)->flags)
 #define TestClearPageNosave(page)	test_and_clear_bit(PG_nosave, &(page)->flags)
+
+#define PageDirect(page)	test_bit(PG_direct, &(page)->flags)
+#define SetPageDirect(page)	set_bit(PG_direct, &(page)->flags)
+#define TestSetPageDirect(page)	test_and_set_bit(PG_direct, &(page)->flags)
+#define ClearPageDirect(page)		clear_bit(PG_direct, &(page)->flags)
+#define TestClearPageDirect(page)	test_and_clear_bit(PG_direct, &(page)->flags)
 
 /*
  * inlines for acquisition and release of PG_chainlock
--- linux-2.5.25-rmap/./mm/page_alloc.c	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./mm/page_alloc.c	Wed Jul 10 09:11:47 2002
@@ -92,7 +92,7 @@
 	BUG_ON(PageLRU(page));
 	BUG_ON(PageActive(page));
 	BUG_ON(PageWriteback(page));
-	BUG_ON(page->pte_chain != NULL);
+	BUG_ON(page->pte.chain != NULL);
 	if (PageDirty(page))
 		ClearPageDirty(page);
 	BUG_ON(page_count(page) != 0);
--- linux-2.5.25-rmap/./mm/vmscan.c	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./mm/vmscan.c	Wed Jul 10 09:11:38 2002
@@ -48,7 +48,7 @@
 	struct address_space *mapping = page->mapping;
 
 	/* Page is in somebody's page tables. */
-	if (page->pte_chain)
+	if (page->pte.chain)
 		return 1;
 
 	/* XXX: does this happen ? */
@@ -151,7 +151,7 @@
 		 *
 		 * XXX: implement swap clustering ?
 		 */
-		if (page->pte_chain && !page->mapping && !PagePrivate(page)) {
+		if (page->pte.chain && !page->mapping && !PagePrivate(page)) {
 			page_cache_get(page);
 			pte_chain_unlock(page);
 			spin_unlock(&pagemap_lru_lock);
@@ -171,7 +171,7 @@
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
 		 */
-		if (page->pte_chain) {
+		if (page->pte.chain) {
 			switch (try_to_unmap(page)) {
 				case SWAP_ERROR:
 				case SWAP_FAIL:
@@ -348,7 +348,7 @@
 		entry = entry->prev;
 
 		pte_chain_lock(page);
-		if (page->pte_chain && page_referenced(page)) {
+		if (page->pte.chain && page_referenced(page)) {
 			list_del(&page->lru);
 			list_add(&page->lru, &active_list);
 			pte_chain_unlock(page);
--- linux-2.5.25-rmap/./mm/rmap.c	Mon Jul  8 15:37:35 2002
+++ linux-2.5.25-rmap-opt/./mm/rmap.c	Wed Jul 10 09:12:52 2002
@@ -13,7 +13,7 @@
 
 /*
  * Locking:
- * - the page->pte_chain is protected by the PG_chainlock bit,
+ * - the page->pte.chain is protected by the PG_chainlock bit,
  *   which nests within the pagemap_lru_lock, then the
  *   mm->page_table_lock, and then the page lock.
  * - because swapout locking is opposite to the locking order
@@ -71,10 +71,15 @@
 	if (TestClearPageReferenced(page))
 		referenced++;
 
-	/* Check all the page tables mapping this page. */
-	for (pc = page->pte_chain; pc; pc = pc->next) {
-		if (ptep_test_and_clear_young(pc->ptep))
+	if (PageDirect(page)) {
+		if (ptep_test_and_clear_young(page->pte.direct))
 			referenced++;
+	} else {
+		/* Check all the page tables mapping this page. */
+		for (pc = page->pte.chain; pc; pc = pc->next) {
+			if (ptep_test_and_clear_young(pc->ptep))
+				referenced++;
+		}
 	}
 	return referenced;
 }
@@ -108,22 +113,39 @@
 	pte_chain_lock(page);
 	{
 		struct pte_chain * pc;
-		for (pc = page->pte_chain; pc; pc = pc->next) {
-			if (pc->ptep == ptep)
+		if (PageDirect(page)) {
+			if (page->pte.direct == ptep)
 				BUG();
+		} else {
+			for (pc = page->pte.chain; pc; pc = pc->next) {
+				if (pc->ptep == ptep)
+					BUG();
+			}
 		}
 	}
 	pte_chain_unlock(page);
 #endif
 
-	pte_chain = pte_chain_alloc();
-
 	pte_chain_lock(page);
 
-	/* Hook up the pte_chain to the page. */
-	pte_chain->ptep = ptep;
-	pte_chain->next = page->pte_chain;
-	page->pte_chain = pte_chain;
+	if (PageDirect(page)) {
+		/* Convert a direct pointer into a pte_chain */
+		pte_chain = pte_chain_alloc();
+		pte_chain->ptep = page->pte.direct;
+		pte_chain->next = NULL;
+		page->pte.chain = pte_chain;
+		ClearPageDirect(page);
+	}
+	if (page->pte.chain) {
+		/* Hook up the pte_chain to the page. */
+		pte_chain = pte_chain_alloc();
+		pte_chain->ptep = ptep;
+		pte_chain->next = page->pte.chain;
+		page->pte.chain = pte_chain;
+	} else {
+		page->pte.direct = ptep;
+		SetPageDirect(page);
+	}
 
 	pte_chain_unlock(page);
 }
@@ -149,18 +171,38 @@
 		return;
 
 	pte_chain_lock(page);
-	for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
-		if (pc->ptep == ptep) {
-			pte_chain_free(pc, prev_pc, page);
+
+	if (PageDirect(page)) {
+		if (page->pte.direct == ptep) {
+			page->pte.direct = NULL;
+			ClearPageDirect(page);
 			goto out;
 		}
+	} else {
+		for (pc = page->pte.chain; pc; prev_pc = pc, pc = pc->next) {
+			if (pc->ptep == ptep) {
+				pte_chain_free(pc, prev_pc, page);
+				/* Check whether we can convert to direct */
+				pc = page->pte.chain;
+				if (!pc->next) {
+					page->pte.direct = pc->ptep;
+					SetPageDirect(page);
+					pte_chain_free(pc, NULL, NULL);
+				}
+				goto out;
+			}
+		}
 	}
 #ifdef DEBUG_RMAP
 	/* Not found. This should NEVER happen! */
 	printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
 	printk(KERN_ERR "page_remove_rmap: only found: ");
-	for (pc = page->pte_chain; pc; pc = pc->next)
-		printk("%p ", pc->ptep);
+	if (PageDirect(page)) {
+		printk("%p ", page->pte.direct);
+	} else {
+		for (pc = page->pte.chain; pc; pc = pc->next)
+			printk("%p ", pc->ptep);
+	}
 	printk("\n");
 	printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
 #endif
@@ -270,25 +312,41 @@
 	if (!page->mapping)
 		BUG();
 
-	for (pc = page->pte_chain; pc; pc = next_pc) {
-		next_pc = pc->next;
-		switch (try_to_unmap_one(page, pc->ptep)) {
-			case SWAP_SUCCESS:
-				/* Free the pte_chain struct. */
-				pte_chain_free(pc, prev_pc, page);
-				break;
-			case SWAP_AGAIN:
-				/* Skip this pte, remembering status. */
-				prev_pc = pc;
-				ret = SWAP_AGAIN;
-				continue;
-			case SWAP_FAIL:
-				return SWAP_FAIL;
-			case SWAP_ERROR:
-				return SWAP_ERROR;
+	if (PageDirect(page)) {
+		ret = try_to_unmap_one(page, page->pte.direct);
+		if (ret == SWAP_SUCCESS) {
+			page->pte.direct = NULL;
+			ClearPageDirect(page);
+		}
+	} else {
+		for (pc = page->pte.chain; pc; pc = next_pc) {
+			next_pc = pc->next;
+			switch (try_to_unmap_one(page, pc->ptep)) {
+				case SWAP_SUCCESS:
+					/* Free the pte_chain struct. */
+					pte_chain_free(pc, prev_pc, page);
+					break;
+				case SWAP_AGAIN:
+					/* Skip this pte, remembering status. */
+					prev_pc = pc;
+					ret = SWAP_AGAIN;
+					continue;
+				case SWAP_FAIL:
+					ret = SWAP_FAIL;
+					break;
+				case SWAP_ERROR:
+					ret = SWAP_ERROR;
+					break;
+			}
+		}
+		/* Check whether we can convert to direct pte pointer */
+		pc = page->pte.chain;
+		if (pc && !pc->next) {
+			page->pte.direct = pc->ptep;
+			SetPageDirect(page);
+			pte_chain_free(pc, NULL, NULL);
 		}
 	}
-
 	return ret;
 }
 
@@ -336,7 +394,7 @@
 	if (prev_pte_chain)
 		prev_pte_chain->next = pte_chain->next;
 	else if (page)
-		page->pte_chain = pte_chain->next;
+		page->pte.chain = pte_chain->next;
 
 	spin_lock(&pte_chain_freelist_lock);
 	pte_chain_push(pte_chain);


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 14:33   ` [PATCH] Optimize out pte_chain take three Dave McCracken
@ 2002-07-10 15:18     ` Rik van Riel
  2002-07-10 17:32       ` William Lee Irwin III
  0 siblings, 1 reply; 39+ messages in thread
From: Rik van Riel @ 2002-07-10 15:18 UTC (permalink / raw)
  To: Dave McCracken; +Cc: Linux Memory Management

On Wed, 10 Jul 2002, Dave McCracken wrote:

> I agree that it's icky, at least the #defines.  However, I don't like
> using void *, either.  After thinking about it overnight, I think I
> prefer exposing the union along the lines of pte.chain and pte.direct.

I like it.  This patch seems ready for merging, as soon as
we've gotten rmap in.

Speaking of getting rmap in ... we might need some arguments
to get this thing past Linus, anyone ? ;)

cheers,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/



* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 15:18     ` Rik van Riel
@ 2002-07-10 17:32       ` William Lee Irwin III
  2002-07-10 20:01         ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2002-07-10 17:32 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Dave McCracken, Linux Memory Management

On Wed, Jul 10, 2002 at 12:18:12PM -0300, Rik van Riel wrote:
> I like it.  This patch seems ready for merging, as soon as
> we've gotten rmap in.
> Speaking of getting rmap in ... we might need some arguments
> to get this thing past Linus, anyone ? ;)

In no particular order:

(1)  page replacement no longer goes around randomly unmapping things
(2)  referenced bits are more accurate because there aren't several ms
	or even seconds between finding the multiple pte's mapping a page
(3)  reduces page replacement from O(total virtually mapped) to O(physical)
(4)  enables defragmentation of physical memory
(5)  enables cooperative offlining of memory for friendly guest instance
	behavior in UML and/or LPAR settings
(6)  demonstrable benefit in performance of swapping which is common in
	end-user interactive workstation workloads (I don't like the word
	"desktop"). c.f. Craig Kulesa's post wrt. swapping performance
(7)  evidence from 2.4-based rmap trees indicates approximate parity
	with mainline in kernel compiles with appropriate locking bits
(8)  partitioning of physical memory can reduce the complexity of page
	replacement searches by scanning only the "interesting" zones
	implemented and merged in 2.4-based rmap
(9)  partitioning of physical memory can increase the parallelism of page
	replacement searches by independently processing different zones
	implemented, but not merged in 2.4-based rmap
(10) the reverse mappings may be used for efficiently keeping pte cache
	attributes coherent
(11) they may be used for virtual cache invalidation (with changes)
(12) the reverse mappings enable proper RSS limit enforcement
	implemented and merged in 2.4-based rmap

Hmm, anything else?


Cheers,
Bill

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 17:32       ` William Lee Irwin III
@ 2002-07-10 20:01         ` Andrew Morton
  2002-07-10 20:14           ` Rik van Riel
                             ` (3 more replies)
  0 siblings, 4 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-10 20:01 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

Devil's advocate here.  Sorry.

William Lee Irwin III wrote:
> 
> ...
> (1)  page replacement no longer goes around randomly unmapping things

Got any numbers to back that up?

> (2)  referenced bits are more accurate because there aren't several ms
>         or even seconds between finding the multiple pte's mapping a page

Got any numbers to show the benefit of this?

> (3)  reduces page replacement from O(total virtually mapped) to O(physical)

Numbers

> (4)  enables defragmentation of physical memory

Vapourware

> (5)  enables cooperative offlining of memory for friendly guest instance
>         behavior in UML and/or LPAR settings

Vapourware

> (6)  demonstrable benefit in performance of swapping which is common in
>         end-user interactive workstation workloads (I don't like the word
>         "desktop"). c.f. Craig Kulesa's post wrt. swapping performance

Demonstrable?  I see handwaving touchy-feely stuff.

> (7)  evidence from 2.4-based rmap trees indicates approximate parity
>         with mainline in kernel compiles with appropriate locking bits

err.  You mean that if you apply hotrodding to rmap but not mainline,
rmap achieves parity with mainline?

> (8)  partitioning of physical memory can reduce the complexity of page
>         replacement searches by scanning only the "interesting" zones
>         implemented and merged in 2.4-based rmap

But the page reclaim code is wildly inefficient in all kernels.
It's single-threaded via the spinlock.  Scaling that across
realistically two or less zones is insufficient.

Any numbers which demonstrate the benefit of this?

> (9)  partitioning of physical memory can increase the parallelism of page
>         replacement searches by independently processing different zones
>         implemented, but not merged in 2.4-based rmap

Numbers?

> (10) the reverse mappings may be used for efficiently keeping pte cache
>         attributes coherent

Do we need that?

> (11) they may be used for virtual cache invalidation (with changes)

Do we need that?

> (12) the reverse mappings enable proper RSS limit enforcement
>         implemented and merged in 2.4-based rmap
> 

Score one.


c'mon, guys.  It's all fluff.  We *have* to do better than this.
It's just software, and this is just engineering.  There's no
way we should be asking Linus to risk changing his VM based on
this sort of advocacy.

Bill, please throw away your list and come up with a new one.
Consisting of workloads and tests which we can run to evaluate
and optimise page replacement algorithms.

Alternatively, please try to enumerate the `operating regions'
for the page replacement code.  Then, we can identify measurable
tests which exercise them.  Then we can identify combinations of
those tests to model a `workload'.    We need to get this ball
rolling somehow.

btw, I told Rik I'd start on that definition today, but I'm having
trouble getting started.  Your insight would be muchly appreciated.

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:01         ` Andrew Morton
@ 2002-07-10 20:14           ` Rik van Riel
  2002-07-10 20:28             ` Andrew Morton
  2002-07-10 20:33             ` Martin J. Bligh
  2002-07-10 22:22           ` William Lee Irwin III
                             ` (2 subsequent siblings)
  3 siblings, 2 replies; 39+ messages in thread
From: Rik van Riel @ 2002-07-10 20:14 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Wed, 10 Jul 2002, Andrew Morton wrote:

> Vapourware

On request. Linus has indicated that he doesn't _want_
all of the rmap stuff at once, but just the mechanism.

> Bill, please throw away your list and come up with a new one.
> Consisting of workloads and tests which we can run to evaluate
> and optimise page replacement algorithms.

Agreed, we do want nice stuff building on rmap, but
Linus has indicated that he doesn't want it in the
first stage of merging rmap.

Any way out of this chicken&egg situation ?

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:14           ` Rik van Riel
@ 2002-07-10 20:28             ` Andrew Morton
  2002-07-10 20:38               ` Rik van Riel
  2002-07-10 20:33             ` Martin J. Bligh
  1 sibling, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2002-07-10 20:28 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

Rik van Riel wrote:
> 
> ...
> Any way out of this chicken&egg situation ?

Well yes.  We can say "this is a sound design, and we
believe we can make it work well.  Let's merge it and
get going on it".

That's perfectly legitimate and sensible.  But it would be
better to not have to go this route just through lack of tools
or effort.   Probably, this is what we'll end up doing :(   But
we'll be much, much better off long term if we have those tests.

How do the BSD and proprietary kernel developers evaluate
their VMs?

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:14           ` Rik van Riel
  2002-07-10 20:28             ` Andrew Morton
@ 2002-07-10 20:33             ` Martin J. Bligh
  1 sibling, 0 replies; 39+ messages in thread
From: Martin J. Bligh @ 2002-07-10 20:33 UTC (permalink / raw)
  To: Rik van Riel, Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

>> Bill, please throw away your list and come up with a new one.
>> Consisting of workloads and tests which we can run to evaluate
>> and optimise page replacement algorithms.
> 
> Agreed, we do want nice stuff building on rmap, but
> Linus has indicated that he doesn't want it in the
> first stage of merging rmap.
> 
> Any way out of this chicken&egg situation ?

Surely we can just get the benchmarking done on the full set of patches 
before any part of them gets accepted into mainline? I don't see what the 
problem is ....

M.


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:28             ` Andrew Morton
@ 2002-07-10 20:38               ` Rik van Riel
  2002-07-13 13:42                 ` Daniel Phillips
  0 siblings, 1 reply; 39+ messages in thread
From: Rik van Riel @ 2002-07-10 20:38 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Wed, 10 Jul 2002, Andrew Morton wrote:

> How do the BSD and proprietary kernel developers evaluate
> their VMs?

Proper statistics inside their VM, combined with some
basic workload testing.

Most importantly, they don't seem to hang themselves
on benchmarks. Benchmarks can be won by finetuning
whatever you have but measuring a new basis (still
without tuning) by benchmarking it just doesn't work.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:01         ` Andrew Morton
  2002-07-10 20:14           ` Rik van Riel
@ 2002-07-10 22:22           ` William Lee Irwin III
  2002-07-11  0:39             ` Andrew Morton
  2002-07-13 14:45             ` Daniel Phillips
  2002-07-13 13:22           ` Daniel Phillips
  2002-07-13 13:41           ` Daniel Phillips
  3 siblings, 2 replies; 39+ messages in thread
From: William Lee Irwin III @ 2002-07-10 22:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, Dave McCracken, Linux Memory Management

William Lee Irwin III wrote:
[11 other things]
>> (12) the reverse mappings enable proper RSS limit enforcement
>>         implemented and merged in 2.4-based rmap

On Wed, Jul 10, 2002 at 01:01:12PM -0700, Andrew Morton wrote:
> Score one.

Phenomenally harsh.


On Wed, Jul 10, 2002 at 01:01:12PM -0700, Andrew Morton wrote:
> c'mon, guys.  It's all fluff.  We *have* to do better than this.
> It's just software, and this is just engineering.  There's no
> way we should be asking Linus to risk changing his VM based on
> this sort of advocacy.

I did not give detailed results, but there is quantitative measurement
to back up claims that several of these improve performance. Part of
this "flop" is that the criteria are unclear, and another is my fault
for not benchmarking as much as I should.

What I gave was meant to be of this form, namely "algorithmic
improvement X" and "feature enablement Y". But this is clearly
not what you're after.


On Wed, Jul 10, 2002 at 01:01:12PM -0700, Andrew Morton wrote:
> Bill, please throw away your list and come up with a new one.
> Consisting of workloads and tests which we can run to evaluate
> and optimise page replacement algorithms.

Your criteria are quantitative. I can't immediately measure all
of them but can go about collecting missing data immediately and post
as I go, then. Perhaps I'll even have helpers. =)


On Wed, Jul 10, 2002 at 01:01:12PM -0700, Andrew Morton wrote:
> Alternatively, please try to enumerate the `operating regions'
> for the page replacement code.  Then, we can identify measurable
> tests which exercise them.  Then we can identify combinations of
> those tests to model a `workload'.    We need to get this ball
> rolling somehow.
> btw, I told Rik I'd start on that definition today, but I'm having
> trouble getting started.  Your insight would be muchly appreciated.

Excellent. I'll not waste any more time discussing these kinds of
benefits and focus on the ones considered relevant by maintainers.

I've already gone about asking for help benchmarking dmc's pte_chain
space optimization, and I envision the following list of TODO items
being things you're more interested in:

(1) measure the effect of rmap on page fault rate
(1.5) try to figure out how many of mainline's faults came from the
	virtual scan unmapping things
(2) measure the effect of rmap on scan rate
(3) measure the effect of rmap on cpu time consumed by scanning
(4) measure the effect of per-zone LRU lists on cpu time consumed by
	scanning
(5) measure the effect of per-zone LRU list locks on benchmarks
(6) maybe hack a simulator to compare the hardware referenced bits
	to the software one computed by rmap and mainline
(7) re-do(?) swap accuracy measurements in a more meaningful way

(5) may involve some pain to forward port and (6) is painful too.
And these involve writing lots of instrumentation code...

What other missing data are you after and which of these should
be chucked?

As far as operating regions for page replacement go I see 3 obvious ones:
(1) lots of writeback with no swap
(2) churning clean pages with no swap
(3) swapping

And each of these with several proportions of memory sharing.
Sound reasonable?


Thanks,
Bill

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 22:22           ` William Lee Irwin III
@ 2002-07-11  0:39             ` Andrew Morton
  2002-07-11  0:47               ` Rik van Riel
                                 ` (3 more replies)
  2002-07-13 14:45             ` Daniel Phillips
  1 sibling, 4 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-11  0:39 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

William Lee Irwin III wrote:
> 
> ..
> Phenomenally harsh.

No offence I hope.  Just venting three years' VM frustration.
 
>...
> On Wed, Jul 10, 2002 at 01:01:12PM -0700, Andrew Morton wrote:
> > Bill, please throw away your list and come up with a new one.
> > Consisting of workloads and tests which we can run to evaluate
> > and optimise page replacement algorithms.
> 
> Your criteria are quantitative. I can't immediately measure all
> of them but can go about collecting missing data immediately and post
> as I go, then. Perhaps I'll even have helpers. =)

A lot of it should be fairly simple.  We have tons of pagecache-intensive
workloads.  But we have gaps when it comes to the VM.  In the area of
page replacement.

Can we fill those gaps with a reasonable amount of effort?

example 1:  "run thirty processes which mmap a common 50000 page file and
touch its pages in a random-but-always-the-same pattern at fifty pages
per second.  Then run (dbench|tiobench|kernel build|slocate|foo).  Then
see how many pages the three processes actually managed to touch."

example 2: "run a process which mallocs 60% of physical memory and
touches it randomly at 1000 pages/sec.  run a second process
which mallocs 60% of physical memory and touches it randomly at
500 pages/sec.   Measure aggregate throughput for both processes".

example 3: "example 2, but do some pagecache stuff as well".

example 4: "run a process which mmaps 60%-worth of memory readonly,
another which mmaps 60%-worth of memory MAP_SHARED.  First process
touches 1000 pages/sec.  Second process modifies 1000 pages/sec.
Optimise for throughput"

Scriptable things.  Things which we can optimise for with a
reasonable expectation that this will improve real workloads.
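
For concreteness, a minimal standalone sketch of example 2 (the 60% and
1000 pages/sec figures come from the example; the sysconf()-based sizing,
fixed seed and one-second pacing are just one possible way to script it;
a second copy run with rate = 500 gives the slower process):

#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	long phys_pages = sysconf(_SC_PHYS_PAGES);
	long npages = phys_pages * 60 / 100;		/* 60% of physical memory */
	long rate = 1000;				/* pages touched per second */
	char *arena = malloc((size_t)npages * (size_t)page_size);

	if (npages <= 0 || !arena)
		return 1;
	srand(1);					/* repeatable "random" pattern */
	for (;;) {
		for (long i = 0; i < rate; i++) {
			long page = rand() % npages;
			arena[(size_t)page * page_size] = 1;	/* touch (dirty) one page */
		}
		sleep(1);				/* crude one-second pacing */
	}
}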

> On Wed, Jul 10, 2002 at 01:01:12PM -0700, Andrew Morton wrote:
> > Alternatively, please try to enumerate the `operating regions'
> > for the page replacement code.  Then, we can identify measurable
> > tests which exercise them.  Then we can identify combinations of
> > those tests to model a `workload'.    We need to get this ball
> > rolling somehow.
> > btw, I told Rik I'd start on that definition today, but I'm having
> > trouble getting started.  Your insight would be muchly appreciated.
> 
> Excellent. I'll not waste any more time discussing these kinds of
> benefits and focus on the ones considered relevant by maintainers.
> 
> I've already gone about asking for help benchmarking dmc's pte_chain
> space optimization, and I envision the following list of TODO items
> being things you're more interested in:
> 
> (1) measure the effect of rmap on page fault rate
> (1.5) try to figure out how many of mainline's faults came from the
>         virtual scan unmapping things
> (2) measure the effect of rmap on scan rate
> (3) measure the effect of rmap on cpu time consumed by scanning
> (4) measure the effect of per-zone LRU lists on cpu time consumed by
>         scanning
> (5) measure the effect of per-zone LRU list locks on benchmarks
> (6) maybe hack a simulator to compare the hardware referenced bits
>         to the software one computed by rmap and mainline
> (7) re-do(?) swap accuracy measurements in a more meaningful way
> 
> (5) may involve some pain to forward port and (6) is painful too.
> And these involve writing lots of instrumentation code...
> 
> What other missing data are you after and which of these should
> be chucked?

Well there are two phases to this.  One is to run workloads,
and the other is to analyse them.  I think workloads such as my
lame ones above are worth thinking about and setting up first,
don't you?    (And a lot of this will come from me picking your
brains ;).  I can do the coding, although my /bin/sh skills
are woeful.)

After that comes the analysis.  Looks like rmap will be merged in
the next few days for test-and-eval, so we don't need to go through
some great beforehand-justification exercise.  But we do need
a permanent toolkit and we do need a way of optimising the VM.

I would suggest that the toolkit consist of two things:

1: A set of scenarios and associated scripts/tools such as my
   examples above (except more real-worldy) and

2: Permanent in-kernel instrumentation which allows us (and
   remote testers) to understand what is happening in there.

 
> As far as operating regions for page replacement go I see 3 obvious ones:
> (1) lots of writeback with no swap
> (2) churning clean pages with no swap
> (3) swapping
> 
> And each of these with several proportions of memory sharing.
> Sound reasonable?

Sounds ideal.  Care to flesh these out into real use cases?

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  0:39             ` Andrew Morton
@ 2002-07-11  0:47               ` Rik van Riel
  2002-07-11  1:27                 ` Andrew Morton
  2002-07-13 14:10                 ` Daniel Phillips
  2002-07-11  1:51               ` William Lee Irwin III
                                 ` (2 subsequent siblings)
  3 siblings, 2 replies; 39+ messages in thread
From: Rik van Riel @ 2002-07-11  0:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Wed, 10 Jul 2002, Andrew Morton wrote:

> A lot of it should be fairly simple.  We have tons of pagecache-intensive
> workloads.  But we have gaps when it comes to the VM.  In the area of
> page replacement.

Umm, page replacement is about identifying the working set
and paging out those pages which are not in the working set.

None of the benchmark examples you give have anything like
a working set.

> Can we fill those gaps with a reasonable amount of effort?

[snip swap & pagecache IO throughput tests]

> After that comes the analysis.  Looks like rmap will be merged in
> the next few days for test-and-eval, so we don't need to go through
> some great beforehand-justification exercise.  But we do need
> a permanent toolkit and we do need a way of optimising the VM.

Absolutely agreed.

> I would suggest that the toolkit consist of two things:
>
> 1: A set of scenarios and associated scripts/tools such as my
>    examples above (except more real-worldy) and
>
> 2: Permanent in-kernel instrumentation which allows us (and
>    remote testers) to understand what is happening in there.

I'm willing to code up number 2 and get it into a mergeable
shape later this week.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/



* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  0:47               ` Rik van Riel
@ 2002-07-11  1:27                 ` Andrew Morton
  2002-07-13 14:10                 ` Daniel Phillips
  1 sibling, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-11  1:27 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

Rik van Riel wrote:
> 
> On Wed, 10 Jul 2002, Andrew Morton wrote:
> 
> ...
> None of the benchmark examples you give have anything like
> a working set.

That's why I'm asking you ;)

Help me out here.  Please suggest something.

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  0:39             ` Andrew Morton
  2002-07-11  0:47               ` Rik van Riel
@ 2002-07-11  1:51               ` William Lee Irwin III
  2002-07-11  2:28                 ` William Lee Irwin III
  2002-07-11 19:54                 ` Andrew Morton
  2002-07-13 14:08               ` Daniel Phillips
  2002-07-13 14:20               ` Daniel Phillips
  3 siblings, 2 replies; 39+ messages in thread
From: William Lee Irwin III @ 2002-07-11  1:51 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, Dave McCracken, Linux Memory Management

William Lee Irwin III wrote:
>> Phenomenally harsh.

On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> No offence I hope.  Just venting three year's VM frustration.

None taken. I believe it's a strong advisory to change direction,
so I have.


William Lee Irwin III wrote:
>> Your criteria are quantitative. I can't immediately measure all
>> of them but can go about collecting missing data immediately and post
>> as I go, then. Perhaps I'll even have helpers. =)

On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> A lot of it should be fairly simple.  We have tons of pagecache-intensive
> workloads.  But we have gaps when it comes to the VM.  In the area of
> page replacement.
> Can we fill those gaps with a reasonable amount of effort?


I'm not entirely sure, but I do have ideas of what I think would
exercise specific (sub)functions of the VM.


On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> example 1:  "run thirty processes which mmap a common 50000 page file and
> touch its pages in a random-but-always-the-same pattern at fifty pages
> per second.  Then run (dbench|tiobench|kernel build|slocate|foo).  Then
> see how many pages the three processes actually managed to touch."

Are we looking for "doesn't evict long-lived stuff" or "figures out
when the long-lived stuff finally died?" Maybe both would be good.


On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> example 2: "run a process which mallocs 60% of physical memory and
> touches it randomly at 1000 pages/sec.  run a second process
> which mallocs 60% of physical memory and touches it randomly at
> 500 pages/sec.   Measure aggregate throughput for both processes".
> example 3: "example 2, but do some pagecache stuff as well".

I think 2 and 3 should be merged; this should basically see how the
parallel pagecache stuff disturbs the prior results.


On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> example 4: "run a process which mmaps 60%-worth of memory readonly,
> another which mmaps 60%-worth of memory MAP_SHARED.  First process
> touches 1000 pages/sec.  Second process modifies 1000 pages/sec.
> Optimise for throughput"
> Scriptable things.  Things which we can optimise for with a
> reasonable expectation that this will improve real workloads.

Sequential/random access is another useful variable here.


William Lee Irwin III wrote:
>> I've already gone about asking for help benchmarking dmc's pte_chain
>> space optimization, and I envision the following list of TODO items
>> being things you're more interested in:
>> What other missing data are you after and which of these should
>> be chucked?

On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> Well there are two phases to this.  One is to run workloads,
> and the other is to analyse them.  I think workloads such as my
> lame ones above are worth thinking about and setting up first,
> don't you?    (And a lot of this will come from me picking your
> brains ;).  I can do the coding, although my /bin/sh skills
> are woeful.)
> After that comes the analysis.  Looks like rmap will be merged in
> the next few days for test-and-eval, so we don't need to go through
> some great beforehand-justification exercise.  But we do need
> a permanent toolkit and we do need a way of optimising the VM.

Okay, this is relatively high priority then.


On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> I would suggest that the toolkit consist of two things:
> 1: A set of scenarios and associated scripts/tools such as my
>    examples above (except more real-worldy) and
> 2: Permanent in-kernel instrumentation which allows us (and
>    remote testers) to understand what is happening in there.

At long last!


William Lee Irwin III wrote:
>> As far as operating regions for page replacement go I see 3 obvious ones:
>> (1) lots of writeback with no swap
>> (2) churning clean pages with no swap
>> (3) swapping
>> And each of these with several proportions of memory sharing.
>> Sound reasonable?

On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> Sounds ideal.  Care to flesh these out into real use cases?

(1) would be a system dedicated to data collection in real life.
	Lots of writeback without swap is also a common database
	scenario. At any rate, it's intended to measure how effective
	and/or efficient replacement is when writeback is required to
	satisfy allocations.


(2) would be a system dedicated to distributing data in real life.
	This would be a read-only database scenario or (ignoring
	single vs. multiple files) webserving. It's intended to
	measure how effective page replacement is when clean pages
	must be discarded to satisfy allocations.

	These two testcases would be constellations of sibling processes
	mmapping a shared file-backed memory block larger than memory
	(where it departs from database-land) and stomping all over it
	in various patterns. Random, forward sequential, backward
	sequential, mixtures of different patterns across processes,
	mixtures of different patterns over time, varying frequencies
	of access to different regions of the arena and checking to see
	that proportions of memory being reclaimed corresponds to
	frequency of access, and sudden shifts between (1) and (2) would
	all be good things to measure, but the combinatorial explosion
	of options here hurts.

	The measurable criterion is of course throughput shoveling data.
	But there are many useful bits to measure here, e.g. the amount
	of cpu consumed by replacement, how much actually gets scanned
	for that cpu cost, and how right or wrong the guesses were when
	variable frequency access is used.


(3) this is going to be a "slightly overburdened desktop" workload;
	here it's difficult to get a notion of how to appropriately
	simulate the workload, but I have a number of things in mind
	as to how to measure some useful things for one known case:

	(A) there are a number (300?) of tasks that start up, fault in
		some crud, and sleep forever -- in fact the majority
		They're just there in the background to chew up some
		space with per-task and per-mm pinned pages.
	(B) there are a number of tasks that are spawned when a timer
		goes off, then they stomp on a metric buttload of stuff
		and exit until spawned for the next interval, e.g.
		updatedb, and of course none of the data it uses is
		useful for anything else and is all used just once.
		This can be dcache, buffer cache, pagecache, or anything.
	(C) there are a number of tasks that get prodded and fault on
		minor amounts of stuff and have mostly clean mappings
		but occasionally they'll all sleep for a long time
		(e.g. like end-users sleeping through updatedb runs)
		Basically, get a pool of processes, let them sleep,
		shoot a random process with signals at random times,
		and when shot, a process stomps over some clean data
		with basically random access before going back to
		sleep.

	The goal is to throw (A) out the window, control how much is
	ever taken away from (C) and given to (B), and keep (C), who
	is the real user of the thing, from seeing horribly bad worst
	cases like after updatedb runs at 3AM. Or, perhaps, figuring
	out who the real users are is the whole problem...

	Okay, even though the updatedb stuff can (and must) be solved
	with dcache-specific stuff, the general problem remains as a
	different kind of memory can be allocated in the same way.
	Trying to fool the VM with different kinds of one-shot
	allocations is probably the best variable here; specific timings
	aren't really very interesting.

	The number that comes out of this is of course the peak
	pagefault rate of the tasks in class (C).

The other swapping case is Netscape vs. xmms:
	One huge big fat memory hog is so bloated its working set alone
	drives the machine to swapping. This is poked and prodded at
	repeatedly, at which time it of course faults in a bunch more
	garbage from its inordinately and unreasonably bloated beyond
	belief working set. Now poor little innocent xmms is running
	in what's essentially a fixed amount of memory and cpu but
	can't lose the cpu or the disk for too long or mp3's will skip.

	This test is meant to exercise the VM's control over how much cpu
	and -disk bandwidth- it chews at one time so that well-behaved
	users don't see unreasonable latencies as the VM swamps their
	needed resources.

	I suspect the way to automate it is to generate signals to wake
	the bloated mem hog at random intervals, and that mem hog then
	randomly stomps over a bunch of memory. The innocent victim
	keeps a fixed-size arena where it sequentially slurps in fresh
	files, generates gibberish from their contents into a dedicated
	piece of its fixed-size arena, and then squirts out the data at
	what it wants to be a fixed rate to some character device. So
	the write buffer will always be dirty and the read buffer clean,
	except it doesn't really mmap. Oh, and of course, it takes a
	substantial but not overwhelming amount of cpu (25-40% for
	ancient p200's or something?) to generate the stuff it writes.

	Then the metric used is the variability in the read and write
	rates and %cpu used of the victim.


Uh-oh, this stuff might take a while to write... any userspace helpers
around?
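
For illustration, a minimal sketch of one such sibling process for
scenarios (1) and (2), assuming a pre-created data file larger than RAM at
a placeholder path; reading keeps the pages clean (scenario 2), while
writing them instead gives the writeback case (scenario 1).  A real test
would fork many of these with the different access patterns described
above:

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/arena.dat", O_RDWR);	/* placeholder path, file > RAM */
	struct stat st;
	long page_size = sysconf(_SC_PAGESIZE);
	volatile char sink = 0;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	char *map = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	for (;;)					/* forward sequential pass, forever */
		for (off_t off = 0; off < st.st_size; off += page_size)
			sink += map[off];		/* reference one page; reads stay clean */
}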


Cheers,
Bill

* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  1:51               ` William Lee Irwin III
@ 2002-07-11  2:28                 ` William Lee Irwin III
  2002-07-11 19:54                 ` Andrew Morton
  1 sibling, 0 replies; 39+ messages in thread
From: William Lee Irwin III @ 2002-07-11  2:28 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel, Dave McCracken, Linux Memory Management

On Wed, Jul 10, 2002 at 06:51:02PM -0700, William Lee Irwin III wrote:
> I'm not entirely sure, but I do have ideas of what I think would
> exercise specific (sub)functions of the VM.

I thought of another:

(4) A group of memory dirtying processes dirties memory at a rate
	exceeding the I/O bandwidth of the system.

	The VM's goal is to do "thrash control" for dirty memory
	generation by basically picking a subset of the dirty
	memory generators fitting within its I/O budget, letting
	them run for a while, and then putting them all to sleep
	after that and moving on to a different subset of the
	dirty memory generators, and so alternating between them.

	The measurable criteria would be foremost the variance in dirtying
	rates of the processes over their lifetimes, and secondarily
	dirtying throughput. The pass/fail criteria would be the ability
	to recognize the situation and stay within the "I/O budget".
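
A sketch of one such dirty-memory generator (the path and the 256MB arena
size are placeholders; each pass reports the achieved dirtying rate, which
is the variance measurement described above).  Running several copies,
each on its own file, should comfortably exceed the disk's writeback
bandwidth:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <unistd.h>

#define ARENA_BYTES	(256UL * 1024 * 1024)		/* 256MB per generator */

int main(void)
{
	int fd = open("/tmp/dirty.dat", O_RDWR | O_CREAT, 0600);
	long page_size = sysconf(_SC_PAGESIZE);
	struct timeval t0, t1;

	if (fd < 0 || ftruncate(fd, ARENA_BYTES) < 0)
		return 1;
	char *map = mmap(NULL, ARENA_BYTES, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	for (unsigned long pass = 0; ; pass++) {
		gettimeofday(&t0, NULL);
		for (unsigned long off = 0; off < ARENA_BYTES; off += page_size)
			map[off] = (char)pass;		/* dirty one page */
		gettimeofday(&t1, NULL);
		double secs = (t1.tv_sec - t0.tv_sec) +
			      (t1.tv_usec - t0.tv_usec) / 1e6;
		printf("pid %d: pass %lu, %.0f pages/sec dirtied\n",
		       (int)getpid(), pass, ARENA_BYTES / page_size / secs);
	}
}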

Cheers,
Bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  1:51               ` William Lee Irwin III
  2002-07-11  2:28                 ` William Lee Irwin III
@ 2002-07-11 19:54                 ` Andrew Morton
  2002-07-11 20:05                   ` Rik van Riel
                                     ` (2 more replies)
  1 sibling, 3 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-11 19:54 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

William Lee Irwin III wrote:
> 
> ...
> On Wed, Jul 10, 2002 at 05:39:47PM -0700, Andrew Morton wrote:
> > example 2: "run a process which mallocs 60% of physical memory and
> > touches it randomly at 1000 pages/sec.  run a second process
> > which mallocs 60% of physical memory and touches it randomly at
> > 500 pages/sec.   Measure aggregate throughput for both processes".
> > example 3: "example 2, but do some pagecache stuff as well".
> 
> I think 2 and 3 should be merged, this should basically see how the
> parallel pagecache stuff disturbs the prior results.

What I was attempting to do here was to generate a working set: use
a multiply-mapped file and an access pattern to build up that working
set, apply memory pressure to cause eviction, and use throughput to
measure the effectiveness of the eviction choices.

Rik thought these are pagecache and swap thrashers, but that's not
the intent.  Think of the file as program text and the malloced
memory as, well, malloced memory.

The problem is the access pattern.  It shouldn't be random-uniform.
But what should it be?  random-gaussian?

So: map a large file, access it random-gaussian.  malloc some memory,
access it random-gaussian.  Apply eviction pressure. Measure throughput.
Optimise throughput.
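Something like the sketch below, perhaps (the file argument, the
128MB anonymous arena, the sigmas and the 60 second run are all
placeholder numbers; link with -lm):

/* wset.c -- mmap a large file and malloc an arena, touch both with a
 * gaussian-centred page access pattern, report touches/sec as the
 * throughput figure.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <fcntl.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/stat.h>

#define PAGE 4096

/* gaussian page index centred on the middle, clamped to [0, npages) */
static size_t gpick(size_t npages, double sigma)
{
	double u1 = (rand() + 1.0) / (RAND_MAX + 2.0);
	double u2 = (rand() + 1.0) / (RAND_MAX + 2.0);
	double g = sqrt(-2.0 * log(u1)) * cos(2.0 * 3.14159265358979 * u2);
	long idx = (long)(npages / 2 + g * sigma);

	if (idx < 0)
		idx = 0;
	if (idx >= (long)npages)
		idx = npages - 1;
	return idx;
}

int main(int argc, char **argv)
{
	size_t anon_pages = 32768;		/* 128MB of malloced memory */
	struct stat st;
	char *file, *anon;
	volatile char sink = 0;
	unsigned long touches = 0;
	time_t end = time(NULL) + 60;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);		/* the "program text" file */
	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size < PAGE)
		return 1;
	file = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	anon = malloc(anon_pages * PAGE);
	if (file == MAP_FAILED || !anon)
		return 1;
	while (time(NULL) < end) {
		sink += file[gpick(st.st_size / PAGE, 2000) * PAGE];
		anon[gpick(anon_pages, 1000) * PAGE]++;
		touches++;
	}
	printf("%lu touches/sec\n", touches / 60);
	return 0;
}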

Does this not capture what the VM is supposed to do?

What workload is rmap supposed to be good at?

> ...
> 
> (1) would be a system dedicated to data collection in real-life.
>         Lots of writeback without swap is also a common database
>         scenario. At any rate, it's intended to measure how effective
>         and/or efficient replacement is when writeback is required to
>         satisfy allocations.

Well, we know that lots of writeback trashes everything.  This is
especially irritating when your employer makes personal video
recorders.  It just sits there all day munching away at your
useful pagecache and swapping everything out.  Our kernels have
O_STREAMING because of this.   It simply removes as much pagecache
as it can, each time ->nrpages reaches 256.  It's rather effective.

> (2) would be a system dedicated to distributing data in real life.
>         This would be a read-only database scenario or (ignoring
>         single vs. multiple files) webserving. It's intended to
>         measure how effective page replacement is when clean pages
>         must be discarded to satisfy allocations.
> 
>         These two testcases would be constellations of sibling processes
>         mmapping a shared file-backed memory block larger than memory
>         (where it departs from database-land) and stomping all over it
>         in various patterns. Random, forward sequential, backward
>         sequential, mixtures of different patterns across processes,
>         mixtures of different patterns over time, varying frequencies
>         of access to different regions of the arena and checking to see
>         that proportions of memory being reclaimed corresponds to
>         frequency of access, and sudden shifts between (1) and (2) would
>         all be good things to measure, but the combinatorial explosion
>         of options here hurts.

Yes, that's complex.  Feasible though.  I'll work on this.
 
>         The measurable criterion is of course throughput shoveling data.
>         But there are many useful bits to measure here, e.g. the amount
>         of cpu consumed by replacement, how much actually gets scanned
>         for that cpu cost, and how right or wrong the guesses were when
>         variable frequency access is used.

I agree that throughput and CPU cost are the bottom line.
 
> (3) this is going to be a "slightly overburdened desktop" workload
>         here it's difficult to get a notion of how to appropriately
>         simulate the workload, but I have a number of things in mind
>         as to how to measure some useful things for one known case:
> 
>         (A) there are a number (300?) of tasks that start up, fault in
>                 some crud, and sleep forever -- in fact the majority
>                 They're just there in the background to chew up some
>                 space with per-task and per-mm pinned pages.

OK.

>         (B) there are a number of tasks that are spawned when timer
>                 goes off, then they stomp on a metric buttload of stuff
>                 and exit until spawned for the next interval, e.g.
>                 updatedb, and of course none of the data it uses is
>                 useful for anything else and is all used just once.
>                 This can be dcache, buffer cache, pagecache, or anything.

I installed 2.5.25+rmap on my desktop yesterday.  Come in this morning
to discover half of memory is inodes, quarter of memory is dentries and
I'm 40 megs into swap.  Sigh.

>         (C) there are a number of tasks that get prodded and fault on
>                 minor amounts of stuff and have mostly clean mappings
>                 but occasionally they'll all sleep for a long time
>                 (e.g. like end-users sleeping through updatedb runs)
>                 Basically, get a pool of processes, let them sleep,
>                 shoot a random process with signals at random times,
>                 and when shot, a process stomps over some clean data
>                 with basically random access before going back to
>                 sleep.
> 
>         The goal is to throw (A) out the window, control how much is
>         ever taken away from (C) and given to (B), and keep (C), who
>         is the real user of the thing, from seeing horribly bad worst
>         cases like after updatedb runs at 3AM. Or, perhaps, figuring
>         out who the real users are is the whole problem...
> 
>         Okay, even though the updatedb stuff can (and must) be solved
>         with dcache-specific stuff, the general problem remains as a
>         different kind of memory can be allocated in the same way.
>         Trying to fool the VM with different kinds of one-shot
>         allocations is probably the best variable here; specific timings
>         aren't really very interesting.
> 
>         The number that comes out of this is of course the peak
>         pagefault rate of the tasks in class (C).

This one sounds harder.  I suspect it can be assembled by glueing
together existing tools and applications, but making it repeatable
and quantifiable would be a challenge.
 
> The other swapping case is Netscape vs. xmms:
>         One huge big fat memory hog is so bloated its working set alone
>         drives the machine to swapping. This is poked and prodded at
>         repeatedly, at which time it of course faults in a bunch more
>         garbage from its inordinately and unreasonably bloated beyond
>         belief working set. Now poor little innocent xmms is running
>         in what's essentially a fixed amount of memory and cpu but
>         can't lose the cpu or the disk for too long or mp3's will skip.
> 
>         This test is meant to exercise the VM's control over how much cpu
>         and -disk bandwidth- it chews at one time so that well-behaved
>         users don't see unreasonable latencies as the VM swamps their
>         needed resources.
> 
>         I suspect the way to automate it is to generate signals to wake
>         the bloated mem hog at random intervals, and that mem hog then
>         randomly stomps over a bunch of memory. The innocent victim
>         keeps a fixed-size arena where it sequentially slurps in fresh
>         files, generates gibberish from their contents into a dedicated
>         piece of its fixed-size arena, and then squirts out the data at
>         what it wants to be a fixed rate to some character device. So
>         the write buffer will always be dirty and the read buffer clean,
>         except it doesn't really mmap. Oh, and of course, it takes a
>         substantial but not overwhelming amount of cpu (25-40% for
>         ancient p200's or something?) to generate the stuff it writes.
> 
>         Then the metric used is the variability in the read and write
>         rates and %cpu used of the victim.

The `working set simulator' could provide the memory hog function.
The victim application perhaps doesn't need to be anything as
fancy on day one.  Maybe just a kernel compile or such?

> Uh-oh, this stuff might take a while to write... any userspace helpers
> around?

I'll do the working set simulator.  That'll give you guys something
to crunch on.  Is gaussian reasonable?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 19:54                 ` Andrew Morton
@ 2002-07-11 20:05                   ` Rik van Riel
  2002-07-11 20:42                     ` Andrew Morton
  2002-07-11 22:54                   ` William Lee Irwin III
  2002-07-13 14:52                   ` Daniel Phillips
  2 siblings, 1 reply; 39+ messages in thread
From: Rik van Riel @ 2002-07-11 20:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Thu, 11 Jul 2002, Andrew Morton wrote:

> Rik thought these are pagecache and swap thrashers, but that's not
> the intent.  Think of the file as program text and the malloced
> memory as, well, malloced memory.
>
> The problem is the access pattern.  It shouldn't be random-uniform.
> But what should it be?  random-gaussian?

> Does this not capture what the VM is supposed to do?

Indeed. This sounds like a much much better test.


> useful pagecache and swapping everything out.  Our kernels have
> O_STREAMING because of this.   It simply removes as much pagecache
> as it can, each time ->nrpages reaches 256.  It's rather effective.

Now why does that remind me of drop-behind ? ;)


> I installed 2.5.25+rmap on my desktop yesterday.  Come in this morning
> to discover half of memory is inodes, quarter of memory is dentries and
> I'm 40 megs into swap.  Sigh.

As requested by Linus, this patch only has the mechanism
and none of the balancing changes.

I suspect Ed Tomlinson's patch will fix this issue.


> The `working set simulator' could provide the memory hog function.
> The victim application perhaps doesn't need to be anything as
> fancy on day one.  Maybe just a kernel compile or such?

Or maybe the interactive performance measurement thing by
Bob Matthews ?

	http://people.redhat.com/bmatthews/irman/ (IIRC)

> I'll do the working set simulator.  That'll give you guys something
> to crunch on.  Is gaussian reasonable?

Sounds ok to me.

kind regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 20:05                   ` Rik van Riel
@ 2002-07-11 20:42                     ` Andrew Morton
  2002-07-11 20:54                       ` Rik van Riel
  0 siblings, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2002-07-11 20:42 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

Rik van Riel wrote:
> 
> ...
> > useful pagecache and swapping everything out.  Our kernels have
> > O_STREAMING because of this.   It simply removes as much pagecache
> > as it can, each time ->nrpages reaches 256.  It's rather effective.
> 
> Now why does that remind me of drop-behind ? ;)

I looked at 2.4-ac as well.  Seems that the dropbehind there only
addresses reads?

This is a specialised application and frankly, I don't think
magical voodoo kernel logic will ever work as well as exposing
capabilities to the application.   The posix_fadvise() API is
basically ideal for this, but it's quite hard for Linux to
implement efficiently.   How do we efficiently discard the 
10,000 pages starting at page offset 25,000,000?

We can do that in O(not much) time with

	radix_tree_gang_lookup(void **pointers, int how_many, int starting_offset)

but that hasn't been written.  It would make truncate/invalidate_inode_pages
tons faster and cleaner too.
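For reference, the application side of that would be roughly the
sketch below once posix_fadvise() is actually wired up (the 256-page
window, the fdatasync per window and the 1GB total are placeholder
choices, not a recommendation):

/* stream_write.c -- stream data out and ask the kernel to drop the
 * pagecache behind us every 256 pages, much as O_STREAMING does.
 * The pages have to be clean before POSIX_FADV_DONTNEED can drop
 * them, hence the fdatasync().
 */
#define _XOPEN_SOURCE 600
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

#define PAGE	4096
#define WINDOW	(256 * PAGE)

int main(int argc, char **argv)
{
	static char buf[WINDOW];
	off_t done = 0;
	int fd, i;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return 1;
	for (i = 0; i < 1024; i++) {		/* write 1GB, say */
		if (write(fd, buf, WINDOW) != WINDOW)
			break;
		fdatasync(fd);			/* make the window clean */
		posix_fadvise(fd, done, WINDOW, POSIX_FADV_DONTNEED);
		done += WINDOW;
	}
	close(fd);
	return 0;
}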
 
> > I installed 2.5.25+rmap on my desktop yesterday.  Come in this morning
> > to discover half of memory is inodes, quarter of memory is dentries and
> > I'm 40 megs into swap.  Sigh.
> 
> As requested by Linus, this patch only has the mechanism
> and none of the balancing changes.
> 
> I suspect Ed Tomlinson's patch will fix this issue.

yup.


btw, I was looking into many-spindle writeback performance
yesterday.  It's pretty bad.  Test case is simply four disks,
four ext2 filesystems, four processes flat-out writing to each
disk.

Throughput is only 60% of O_DIRECT because one of the disk queues
fills up and everybody ends up blocking on that queue.

2.4 has the same problem, and it's basically unsolvable there
because of the global buffer LRU.

In 2.5, the balance_dirty() path is trivially solved by making
the caller of balance_dirty_pages only write back data against 
the superblock which he just dirtied.

However unless I set the dirty memory thresholds super-low
so that in fact none of the queues ever fills, we still hit
the same interqueue contention in the page reclaim code.

I was scratching my head over this for some time:  how come
there are dirty pages at the tail of the LRU, when the inactive
list is quite enormous?  I need to confirm this, but I suspect
it's metadata: we're moving pages to the head of the LRU when
they are first added to the inode, and when writeback is started.
But we're *not* performing that motion when the fs does
mark_buffer_dirty(bitmap block), for example.

So that dirty-against-a-full-queue bitmap block is a little
timebomb, worming its way to the head of the LRU.

Probably, a touch_buffer() in mark_buffer_dirty() will plug this,
but that's even more atomic operations, even more banging on
the pagemap_lru_lock.

I suspect the best fix here is to not have dirty or writeback 
pagecache pages on the LRU at all.  Throttle on memory coming
reclaimable, put the pages back on the LRU when they're clean,
etc.  As we have often discussed.  Big change.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 20:42                     ` Andrew Morton
@ 2002-07-11 20:54                       ` Rik van Riel
  2002-07-11 21:16                         ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: Rik van Riel @ 2002-07-11 20:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Thu, 11 Jul 2002, Andrew Morton wrote:
> Rik van Riel wrote:
> > ...
> > > useful pagecache and swapping everything out.  Our kernels have
> > > O_STREAMING because of this.   It simply removes as much pagecache
> > > as it can, each time ->nrpages reaches 256.  It's rather effective.
> >
> > Now why does that remind me of drop-behind ? ;)
>
> I looked at 2.4-ac as well.  Seems that the dropbehind there only
> addresses reads?

It should also work on linear writes.


> I suspect the best fix here is to not have dirty or writeback
> pagecache pages on the LRU at all.  Throttle on memory coming
> reclaimable, put the pages back on the LRU when they're clean,
> etc.  As we have often discussed.  Big change.

That just doesn't make sense, if you don't put the dirty pages
on the LRU then what incentive _do_ you have to write them out ?

Will you start writing them out once you run out of clean pages ?

Will you reclaim all glibc mapped pages before writing out dirty
pages ?

If the throttling is wrong, I propose we fix the throttling.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 20:54                       ` Rik van Riel
@ 2002-07-11 21:16                         ` Andrew Morton
  2002-07-11 21:41                           ` Rik van Riel
  0 siblings, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2002-07-11 21:16 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

Rik van Riel wrote:
> 
> On Thu, 11 Jul 2002, Andrew Morton wrote:
> > Rik van Riel wrote:
> > > ...
> > > > useful pagecache and swapping everything out.  Our kernels have
> > > > O_STREAMING because of this.   It simply removes as much pagecache
> > > > as it can, each time ->nrpages reaches 256.  It's rather effective.
> > >
> > > Now why does that remind me of drop-behind ? ;)
> >
> > I looked at 2.4-ac as well.  Seems that the dropbehind there only
> > addresses reads?
> 
> It should also work on linear writes.

The only call site for drop_behind() in -ac is generic_file_readahead().

> > I suspect the best fix here is to not have dirty or writeback
> > pagecache pages on the LRU at all.  Throttle on memory coming
> > reclaimable, put the pages back on the LRU when they're clean,
> > etc.  As we have often discussed.  Big change.
> 
> That just doesn't make sense, if you don't put the dirty pages
> on the LRU then what incentive _do_ you have to write them out ?

We have dirty page accounting.  If the page reclaim code decides
there's too much dirty memory then kick pdflush, and sleep on
IO completion's movement of reclaimable pages back onto the LRU.

Making page reclaimers perform writeback in shrink_cache()
just has awful latency problems.  If we don't do that then
there's just no point in keeping those pages on the LRU
because all we do is scan past them and blow cycles.

> ...
> If the throttling is wrong, I propose we fix the throttling.

How?  (Without adding more list scanning)

-
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 21:16                         ` Andrew Morton
@ 2002-07-11 21:41                           ` Rik van Riel
  2002-07-11 22:38                             ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: Rik van Riel @ 2002-07-11 21:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Thu, 11 Jul 2002, Andrew Morton wrote:
> Rik van Riel wrote:

> > > I looked at 2.4-ac as well.  Seems that the dropbehind there only
> > > addresses reads?
> >
> > It should also work on linear writes.
>
> The only call site for drop_behind() in -ac is generic_file_readahead().

generic_file_write() calls deactivate_page() if it crosses
the page boundary (ie. if it is done writing this page)

> > > I suspect the best fix here is to not have dirty or writeback
> > > pagecache pages on the LRU at all.  Throttle on memory coming
> > > reclaimable, put the pages back on the LRU when they're clean,
> > > etc.  As we have often discussed.  Big change.
> >
> > That just doesn't make sense, if you don't put the dirty pages
> > on the LRU then what incentive _do_ you have to write them out ?
>
> We have dirty page accounting.  If the page reclaim code decides
> there's too much dirty memory then kick pdflush, and sleep on
> IO completion's movement of reclaimable pages back onto the LRU.

At what point in the LRU ?

Are you proposing to reclaim free pages before considering
dirty pages ?

> Making page reclaimers perform writeback in shrink_cache()
> just has awful latency problems.  If we don't do that then
> there's just no point in keeping those pages on the LRU
> because all we do is scan past them and blow cycles.

Why does it have latency problems ?

Keeping them on the LRU _does_ make sense since we know
when we want to evict these pages.  Putting them aside
on a laundry list might make sense though, provided that
they are immediately made a candidate for replacement
after IO completion.

> > ...
> > If the throttling is wrong, I propose we fix the throttling.
>
> How?  (Without adding more list scanning)

For one, we shouldn't let every process go into
try_to_free_pages() and check for itself if the
pages really aren't freeable.

It is enough if one thread (kswapd) does this; scanning
more often won't change the status of the pages.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 21:41                           ` Rik van Riel
@ 2002-07-11 22:38                             ` Andrew Morton
  2002-07-11 23:18                               ` Rik van Riel
  2002-07-13 15:08                               ` Daniel Phillips
  0 siblings, 2 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-11 22:38 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

Rik van Riel wrote:
> 
> On Thu, 11 Jul 2002, Andrew Morton wrote:
> > Rik van Riel wrote:
> 
> > > > I looked at 2.4-ac as well.  Seems that the dropbehind there only
> > > > addresses reads?
> > >
> > > It should also work on linear writes.
> >
> > The only call site for drop_behind() in -ac is generic_file_readahead().
> 
> generic_file_write() calls deactivate_page() if it crosses
> the page boundary (ie. if it is done writing this page)

Ah, OK.  I tried lots of those sorts of things.  But fact is
that moving the unwanted pages to the far end of the inactive list
just isn't effective: pagecache which will be used at some time
in the future always ends up getting evicted.

> > > > I suspect the best fix here is to not have dirty or writeback
> > > > pagecache pages on the LRU at all.  Throttle on memory coming
> > > > reclaimable, put the pages back on the LRU when they're clean,
> > > > etc.  As we have often discussed.  Big change.
> > >
> > > That just doesn't make sense, if you don't put the dirty pages
> > > on the LRU then what incentive _do_ you have to write them out ?
> >
> > We have dirty page accounting.  If the page reclaim code decides
> > there's too much dirty memory then kick pdflush, and sleep on
> > IO completion's movement of reclaimable pages back onto the LRU.
> 
> At what point in the LRU ?

Interesting question.  If those pages haven't been moved to the
active list and if they're not referenced then one could just
reclaim them as soon as IO completes.   After all, that's basically
what we do now.
 
> Are you proposing to reclaim free pages before considering
> dirty pages ?

Well that would be a lower-latency implementation.  And given
that the machine is known to be sloshing pagecache pages at
a great rate, accuracy of replacement of those pages probably
isn't very important.

But yes, at some point you do need to stop carving away at the
clean pagecache and wait on writeback completion.  Not sure
how to balance that.
 
> > Making page reclaimers perform writeback in shrink_cache()
> > just has awful latency problems.  If we don't do that then
> > there's just no point in keeping those pages on the LRU
> > because all we do is scan past them and blow cycles.
> 
> Why does it have latency problems ?

Because the request queue may be full.   Page allocators
spend tons of time in get_request_wait.  Always have.

However it just occurs to me that we're doing bad things
in there.  Note how blkdev_release_request only wakes
a waiter when 32 requests are available.  That basically
guarantees a 0.3 second sleep.   If we kill the batch_requests
concept then the sleep time becomes just

	nr_flushing_processes * request interval

ie: 0.01 to 0.02 seconds.  hmm.  Yes.  Ouch.  Ugh.

Suppose you're running a massive `dd of=foo'.  Some innocent
task tries to allocate a page and hits a dirty one.  It
sleeps in get_request_wait().  dd is sleeping there too.
Now 32 requests complete and both tasks get woken.  The innocent
page allocator starts running again.  And `dd' immediately jams
another 32 requests into the queue.  It's very unfair.

> Keeping them on the LRU _does_ make sense since we know
> when we want to evict these pages.  Putting them aside
> on a laundry list might make sense though, provided that
> they are immediately made a candidate for replacement
> after IO completion.

Yes, well that's basically what we do now, if you're referring
to PageWriteback (used to be PageLocked) pages.  We just shove
them onto the far end of the LRU and hope they don't come back
until IO completes. 

> > > ...
> > > If the throttling is wrong, I propose we fix the throttling.
> >
> > How?  (Without adding more list scanning)
> 
> For one, we shouldn't let every process go into
> try_to_free_pages() and check for itself if the
> pages really aren't freeable.

mm.  Well we do want processes to go in and reclaim their own
pages normally, to avoid a context switch (I guess.  No numbers
to back this up).   But if that reclaimer then hits a PageDirty
or PageWriteback page then yup, he needs to ask pdflush to do
some IO and then he needs to sleep on *any* page becoming freeable,
not *that* page.

But I think killing batch_requests may make all this rather better.

-
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 19:54                 ` Andrew Morton
  2002-07-11 20:05                   ` Rik van Riel
@ 2002-07-11 22:54                   ` William Lee Irwin III
  2002-07-13 14:52                   ` Daniel Phillips
  2 siblings, 0 replies; 39+ messages in thread
From: William Lee Irwin III @ 2002-07-11 22:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Thu, Jul 11, 2002 at 12:54:12PM -0700, Andrew Morton wrote:
> The problem is the access pattern.  It shouldn't be random-uniform.
> But what should it be?  random-gaussian?
> So: map a large file, access it random-gaussian.  malloc some memory,
> access it random-gaussian.  Apply eviction pressure. Measure throughput.
> Optimise throughput.
> Does this not capture what the VM is supposed to do?
> What workload is rmap supposed to be good at?

I wouldn't go through the trouble of Gaussian; a step distribution
would do, i.e. sets A and B with P(A) = p < q = P(B). "Effective
detection of the working set" is then seeing how much of the working
set was retained instead of reclaimed as p -> m(A)/m(A U B), and how
much determining it cost in terms of cpu. Or that's my first
impression. The distributions P(. | A) and P(. | B) don't matter
aside from the probabilities of the whole of A and B themselves, and
non-uniform choices create harder-to-analyze behaviour, since bits of
A may well be less likely than bits of B; making P(. | A) and
P(. | B) uniform sounds easiest to me.
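A sketch of that accessor, for concreteness (set sizes, p and the
touch count below are arbitrary knobs):

/* step.c -- touch a hot set A with probability p and a cold set B
 * with probability 1-p, uniformly within each set, with a fixed seed
 * so the pattern is repeatable.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE 4096

int main(void)
{
	size_t a_pages = 4096;			/* |A|, 16MB */
	size_t b_pages = 65536;			/* |B|, 256MB */
	double p = 0.9;				/* P(A) */
	long touches = 10 * 1000 * 1000, i;
	char *a = malloc(a_pages * PAGE);
	char *b = malloc(b_pages * PAGE);

	if (!a || !b)
		return 1;
	memset(a, 0, a_pages * PAGE);
	memset(b, 0, b_pages * PAGE);
	srand(1);
	for (i = 0; i < touches; i++) {
		if ((double)rand() / RAND_MAX < p)
			a[(rand() % a_pages) * PAGE]++;	/* hot set */
		else
			b[(rand() % b_pages) * PAGE]++;	/* cold set */
	}
	printf("done\n");
	return 0;
}

m(A) and m(B) fall straight out of a_pages and b_pages, and p can be
swept towards m(A)/m(A U B) between runs.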

Throughput is probably easy to disturb in unrelated ways, but (of course)
necessary to keep track of.

Cheers,
Bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 22:38                             ` Andrew Morton
@ 2002-07-11 23:18                               ` Rik van Riel
  2002-07-12 18:27                                 ` Paul Larson
  2002-07-12 19:28                                 ` Andrew Morton
  2002-07-13 15:08                               ` Daniel Phillips
  1 sibling, 2 replies; 39+ messages in thread
From: Rik van Riel @ 2002-07-11 23:18 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Thu, 11 Jul 2002, Andrew Morton wrote:

> > generic_file_write() calls deactivate_page() if it crosses
> > the page boundary (ie. if it is done writing this page)
>
> Ah, OK.  I tried lots of those sorts of things.  But fact is
> that moving the unwanted pages to the far end of the inactive list
> just isn't effective: pagecache which will be used at some time
> in the future always ends up getting evicted.

There's no way around that, unless you know your application
workload well enough to be able to predict the future.


> But yes, at some point you do need to stop carving away at the
> clean pagecache and wait on writeback completion.  Not sure
> how to balance that.

Keeping all the pages on the same LRU would be a start.
There's no reason you couldn't start (or even finish)
writeout of the dirty pages before they reach the end
of the LRU.  If a page is still dirty when it reaches
the end of the LRU it can be moved aside onto a laundry
list, from where it gets freed after IO completes.

As soon as you start putting dirty pages on a different
LRU list we'll almost certainly lose the ability to
balance things.


> Suppose you're running a massive `dd of=foo'.  Some innocent
> task tries to allocate a page and hits a dirty one.  It
> sleeps in get_request_wait().  dd is sleeping there too.
> Now 32 requests complete and both tasks get woken.  The innocent
> page allocator starts running again.  And `dd' immediately jams
> another 32 requests into the queue.  It's very unfair.

Fixing this unfairness when the queue is "almost full"
is probably a useful thing to do.


> > Keeping them on the LRU _does_ make sense since we know
> > when we want to evict these pages.  Putting them aside
> > on a laundry list might make sense though, provided that
> > they are immediately made a candidate for replacement
> > after IO completion.
>
> Yes, well that's basically what we do now, if you're referring
> to PageWriteback (used to be PageLocked) pages.  We just shove
> them onto the far end of the LRU and hope they don't come back
> until IO completes.

Which is wrong, not only because these pages have reached the
end of the LRU and want to get replaced, but also because we
just don't want to go through the trouble of scanning them
again.

This would be a good place to catch these pages and put them
on a separate list from which they go to the free list once
IO completes.


> > > > ...
> > > > If the throttling is wrong, I propose we fix the throttling.
> > >
> > > How?  (Without adding more list scanning)
> >
> > For one, we shouldn't let every process go into
> > try_to_free_pages() and check for itself if the
> > pages really aren't freeable.
>
> mm.  Well we do want processes to go in and reclaim their own
> pages normally, to avoid a context switch (I guess.  No numbers
> to back this up).

Falling from __alloc_pages into the pageout path shouldn't be
part of the fast path.  If it is we have bigger problems...


> But I think killing batch_requests may make all this rather better.

Probably.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 23:18                               ` Rik van Riel
@ 2002-07-12 18:27                                 ` Paul Larson
  2002-07-12 19:06                                   ` Andrew Morton
  2002-07-12 19:28                                 ` Andrew Morton
  1 sibling, 1 reply; 39+ messages in thread
From: Paul Larson @ 2002-07-12 18:27 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andrew Morton, William Lee Irwin III, Dave McCracken,
	Linux Memory Management

I've tried booting this patch on 2.5.25+rmap on an 8-way, with highmem.  I
got a lot of oopses on boot (couldn't see the top one because it scrolled
off the screen) and I haven't had time to set it up with a serial console yet,
but I will.  Before I do that though I wanted to know if there are any
known issues with my configuration.  I vaguely remember someone mentioning
problems with multiple swap partitions a while back and that's what I have

/etc/fstab:
/dev/sda5               swap                    swap    defaults        0 0
/dev/sda6               swap                    swap    defaults        0 0
/dev/sda7               swap                    swap    defaults        0 0
/dev/sda8               swap                    swap    defaults        0 0
/dev/sda9               swap                    swap    defaults        0 0
/dev/sda10              swap                    swap    defaults        0 0
/dev/sda11              swap                    swap    defaults        0 0
/dev/sda12              swap                    swap    defaults        0 0

for a total of about 15GB swap.

Any known problems with this?

Thanks,
Paul Larson


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-12 18:27                                 ` Paul Larson
@ 2002-07-12 19:06                                   ` Andrew Morton
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-12 19:06 UTC (permalink / raw)
  To: Paul Larson
  Cc: Rik van Riel, William Lee Irwin III, Dave McCracken,
	Linux Memory Management

Paul Larson wrote:
> 
> I've tried booting this patch on 2.5.25+rmap on an 8-way, with highmem.  I
> got a lot of oopses on boot (couldn't see the top one because it scrolled
> off the screen) and I haven't had time to set it up with a serial console yet,
> but I will.

you could stick a `for(;;);' in arch/i386/kernel/traps.c:die() to
stop the scrolling...

I was using Dave's patch for several hours yesterday, no probs.

>  Before I do that though I wanted to know if there are any
> known issues with my configuration.  I vaguely remember someone mentioning
> problems with multiple swap partitions a while back and that's what I have
> 
> /etc/fstab:
> /dev/sda5               swap                    swap    defaults        0 0
> /dev/sda6               swap                    swap    defaults        0 0
> /dev/sda7               swap                    swap    defaults        0 0
> /dev/sda8               swap                    swap    defaults        0 0
> /dev/sda9               swap                    swap    defaults        0 0
> /dev/sda10              swap                    swap    defaults        0 0
> /dev/sda11              swap                    swap    defaults        0 0
> /dev/sda12              swap                    swap    defaults        0 0
> 
> for a total of about 15GB swap.

In theory the limits are:

Max of 32 swapdevs
Max of 64G per swapdev.

I normally use two equal-priority disks for swap.  Works OK, but I did
considerable futzing in the swap code a while back so some bugs may have
been added.

-
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 23:18                               ` Rik van Riel
  2002-07-12 18:27                                 ` Paul Larson
@ 2002-07-12 19:28                                 ` Andrew Morton
  1 sibling, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2002-07-12 19:28 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

Rik van Riel wrote:
> 
> On Thu, 11 Jul 2002, Andrew Morton wrote:
> 
> > > generic_file_write() calls deactivate_page() if it crosses
> > > the page boundary (ie. if it is done writing this page)
> >
> > Ah, OK.  I tried lots of those sorts of things.  But fact is
> > that moving the unwanted pages to the far end of the inactive list
> > just isn't effective: pagecache which will be used at some time
> > in the future always ends up getting evicted.
> 
> There's no way around that, unless you know your application
> workload well enough to be able to predict the future.

Well, we do know that.  In fact probably most applications know
their buffering requirements better than the kernel does.
(soapbox: default open() mode should be unbuffered, and the
application developer should be forced to specify what caching
behaviour [s]he wants.  Ah well).

> ...
> > mm.  Well we do want processes to go in and reclaim their own
> > pages normally, to avoid a context switch (I guess.  No numbers
> > to back this up).
> 
> Falling from __alloc_pages into the pageout path shouldn't be
> part of the fast path.  If it is we have bigger problems...

Oh, direct reclaim is the common case when the system is working
hard (and when it's not, we don't care about anything).

It's the common case because, yup, kswapd is asleep on request
queues all the time.
 
> > But I think killing batch_requests may make all this rather better.
> 
> Probably.

It does.  Still testing what ended up being a pretty broad
patch.  Frankly, I'd rather not finish the patch - it's
a workaround for a basic design mistake.  This is one of the
biggest performance bugs in linux.  The biggest, actually.  Don't
underestimate it.  In 2.4 it is halving our multi-spindle
writeback throughput.  It makes machines unusable during heavy
writeback loads.

Anyway.  I've uploaded my current patchset to
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.25/

What I tend to do is to keep a FIFO of patches.  Every now and
then I flush out the code which is 1-2 weeks old.  This way it has
had a good amount of testing by the time I submit it.

So anytime anyone has anything else which is reasonably close to
ready and which needs stability testing,  I'd be happy to add
it to the mix.

Not much of that code is ready to go at present.  kmap stuff needs
confirmation against more hardware and I still have many filesystems
to hack away at.  O_DIRECT rework is awaiting Ben's kvecs.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:01         ` Andrew Morton
  2002-07-10 20:14           ` Rik van Riel
  2002-07-10 22:22           ` William Lee Irwin III
@ 2002-07-13 13:22           ` Daniel Phillips
  2002-07-13 13:30             ` William Lee Irwin III
  2002-07-13 13:41           ` Daniel Phillips
  3 siblings, 1 reply; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 13:22 UTC (permalink / raw)
  To: Andrew Morton, William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Wednesday 10 July 2002 22:01, Andrew Morton wrote:
> Devil's advocate here.  Sorry.
> > (4)  enables defragmentation of physical memory
> 
> Vapourware

Prediction: it will continue to be vaporware until after rmap is merged.
Meanwhile, there's no denying that fragmentation is a burning issue.

> > (5)  enables cooperative offlining of memory for friendly guest instance
> >         behavior in UML and/or LPAR settings
> 
> Vapourware

See "enables" above.  Though I agree we want the thing at parity or
better on its own merits, I don't see the point of throwing tomatoes at
the "enables" points.  Recommendation: separate the list into "improves"
and "enables".

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-13 13:22           ` Daniel Phillips
@ 2002-07-13 13:30             ` William Lee Irwin III
  2002-07-13 13:55               ` Daniel Phillips
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2002-07-13 13:30 UTC (permalink / raw)
  To: Daniel Phillips
  Cc: Andrew Morton, Rik van Riel, Dave McCracken, Linux Memory Management

At some point in the past, I wrote:
>>> (5)  enables cooperative offlining of memory for friendly guest instance
>>>         behavior in UML and/or LPAR settings

On Wednesday 10 July 2002 22:01, Andrew Morton wrote:
>> Vapourware

On Sat, Jul 13, 2002 at 03:22:28PM +0200, Daniel Phillips wrote:
> See "enables" above.  Though I agree we want the thing at parity or
> better on its own merits, I don't see the point of throwing tomatoes at
> the "enables" points.  Recommendation: separate the list into "improves"
> and "enables".

The direction has been set and I'm following it. These things are now
off the roadmap entirely regardless, or at least I won't pursue them
until the things needing to be done now are addressed.

Say, we could use a number of helpers with the quantitative measurement
effort. Is there any chance you could help out here as well? It'd
certainly help get the cost/benefit analysis of rmap going for the
merge, and maybe even pinpoint things needing to be addressed.

Cheers,
Bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:01         ` Andrew Morton
                             ` (2 preceding siblings ...)
  2002-07-13 13:22           ` Daniel Phillips
@ 2002-07-13 13:41           ` Daniel Phillips
  3 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 13:41 UTC (permalink / raw)
  To: Andrew Morton, William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Wednesday 10 July 2002 22:01, Andrew Morton wrote:
> Bill, please throw away your list and come up with a new one.
> Consisting of workloads and tests which we can run to evaluate
> and optimise page replacement algorithms.

I liked the list; I think it just needs to be reorganized.  All the
vaporware needs to go into an "enables" section (repeating myself)
and it needs to acquire a 'disadvantages' section, under which I'd
like to contribute:

  - For pure computational loads with no swapping, incurs unavoidable
    overhead on page setup and teardown

       (measure it)

  - Adds new struct page overhead of one word, plus two words per
    shared pte (share > 1)

       (measure this)

  - Introduces a new resource, pte chain nodes, with associated
    locks and management issues

       (show locking profiles)

  - Is thought to cause swap read fragmentation

       (demonstrate this, if possible)

And an advantage to add to Bill's 'enables' list:

  - Enables a swap fragmentation reduction algorithm based
    on finding virtually adjacent swapout candidates via the
    pte_chains

> Alternatively, please try to enumerate the `operating regions'
> for the page replacement code.  Then, we can identify measurable
> tests which exercise them.  Then we can identify combinations of
> those tests to model a `workload'.    We need to get this ball
> rolling somehow.

Strongly agreed that the focus has to be on workload modeling.
We should be able to organize the modeling strictly around the
advantages/disadvantages list.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 20:38               ` Rik van Riel
@ 2002-07-13 13:42                 ` Daniel Phillips
  0 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 13:42 UTC (permalink / raw)
  To: Rik van Riel, Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Wednesday 10 July 2002 22:38, Rik van Riel wrote:
> On Wed, 10 Jul 2002, Andrew Morton wrote:
> 
> > How do the BSD and proprietary kernel developers evaluate
> > their VMs?
> 
> Proper statistics inside their VM, combined with some
> basic workload testing.
> 
> Most importantly, they don't seem to hang themselves
> on benchmarks. Benchmarks can be won by finetuning
> whatever you have but measuring a new basis (still
> without tuning) by benchmarking it just doesn't work.

As I recall, Matt likes to talk about load counts on heavily loaded
mail servers.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-13 13:30             ` William Lee Irwin III
@ 2002-07-13 13:55               ` Daniel Phillips
  0 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 13:55 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Andrew Morton, Rik van Riel, Dave McCracken,
	Linux Memory Management, rwhron

On Saturday 13 July 2002 15:30, William Lee Irwin III wrote:
> On Sat, Jul 13, 2002 at 03:22:28PM +0200, Daniel Phillips wrote:
> > See "enables" above.  Though I agree we want the thing at parity or
> > better on its own merits, I don't see the point of throwing tomatoes at
> > the "enables" points.  Recommendation: separate the list into "improves"
> > and "enables".
> 
> The direction has been set and I'm following it. These things are now
> off the roadmap entirely regardless, or at least I won't pursue them
> until the things needing to be done now are addressed.

> Say, we could use a number of helpers with the quantitative measurement
> effort. Is there any chance you could help out here as well?

Forget it :-)

I'll help with the test design.  We have a bunch of people ready to do
the work on the actual benchmarking.  We need to provide precise statements
of the test model, and put out a call for testers.  I'll help with that
too.

> It'd certainly help get the cost/benefit analysis of rmap going for the
> merge, and maybe even pinpoint things needing to be addressed.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  0:39             ` Andrew Morton
  2002-07-11  0:47               ` Rik van Riel
  2002-07-11  1:51               ` William Lee Irwin III
@ 2002-07-13 14:08               ` Daniel Phillips
  2002-07-13 14:20               ` Daniel Phillips
  3 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 14:08 UTC (permalink / raw)
  To: Andrew Morton, William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Thursday 11 July 2002 02:39, Andrew Morton wrote:
> example 1:  "run thirty processes which mmap a common 50000 page file and
> touch its pages in a random-but-always-the-same pattern at fifty pages
> per second.  Then run (dbench|tiobench|kernel build|slocate|foo).

Postmark looks like an excellent benchmark to add to the list; in fact,
from the description, I'd put it at the front of the list.  It's
apparently much more stable than dbench, a property we desperately
need just now.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  0:47               ` Rik van Riel
  2002-07-11  1:27                 ` Andrew Morton
@ 2002-07-13 14:10                 ` Daniel Phillips
  1 sibling, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 14:10 UTC (permalink / raw)
  To: Rik van Riel, Andrew Morton
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Thursday 11 July 2002 02:47, Rik van Riel wrote:
> On Wed, 10 Jul 2002, Andrew Morton wrote:
> 
> > A lot of it should be fairly simple.  We have tons of pagecache-intensive
> > workloads.  But we have gaps when it comes to the VM.  In the area of
> > page replacement.
> 
> Umm, page replacement is about identifying the working set
> and paging out those pages which are not in the working set.
> 
> None of the benchmark examples you give have anything like
> a working set.

What we need is a short C program that simulates a working set.  I
believe we discussed such a thing briefly at Ottawa.

This is just random-fu.  The main challenge is making it short and
sweet.
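
For instance, something as dumb as an arena toucher that reports its
own fault counts (the arena size and touch count below are arbitrary):

/* faults.c -- touch an arena in a fixed-seed random pattern and
 * report minor/major fault counts as a crude working set probe.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/resource.h>

#define PAGE	4096
#define PAGES	(128 * 1024)		/* 512MB */

int main(void)
{
	char *mem = malloc((size_t)PAGES * PAGE);
	struct rusage ru;
	long i;

	if (!mem)
		return 1;
	srand(42);			/* same pattern every run */
	for (i = 0; i < 50 * 1000 * 1000; i++)
		mem[((size_t)rand() % PAGES) * PAGE]++;
	getrusage(RUSAGE_SELF, &ru);
	printf("minflt %ld majflt %ld\n", ru.ru_minflt, ru.ru_majflt);
	return 0;
}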

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11  0:39             ` Andrew Morton
                                 ` (2 preceding siblings ...)
  2002-07-13 14:08               ` Daniel Phillips
@ 2002-07-13 14:20               ` Daniel Phillips
  3 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 14:20 UTC (permalink / raw)
  To: Andrew Morton, William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Thursday 11 July 2002 02:39, Andrew Morton wrote:
> I can do the coding, although my /bin/sh skills
> are woeful...

Careful... I've seen some recent test loads where the script time
dominated run time, and this was for filesystem operations!

Little C programs are both more readable and more accurate.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-10 22:22           ` William Lee Irwin III
  2002-07-11  0:39             ` Andrew Morton
@ 2002-07-13 14:45             ` Daniel Phillips
  1 sibling, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 14:45 UTC (permalink / raw)
  To: William Lee Irwin III, Andrew Morton
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Thursday 11 July 2002 00:22, William Lee Irwin III wrote:
> As far as operating regions for page replacement go I see 3 obvious ones:
> (1) lots of writeback with no swap
> (2) churning clean pages with no swap
> (3) swapping

Another big variable is the balance of streaming IO versus program workload.
Within streaming IO there is the read/write balance (starting to get away 
from core VM here).  There's also the balance of file cache activity vs
anonymous memory.  It goes on of course - we have to boil this down to a few 
reasonably orthogonal variables.

Your specific load examples can be thought of as specific points in a 
two-dimensional test model with two variables:

  - Total working set relative to physical memory (should range from 
    significantly less to significantly more)

  - Balance of read accesses vs write accesses

Adding up the above, there are 5 knobs to turn so far.  This whole thing 
sounds like it can be modeled elegantly by a couple of C programs, one to 
generate memory loads and another to generate file loads, or we may be able 
to identify an off-the-shelf benchmark program that provides some of the 
tunables we need.
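
As a strawman for the memory-load half, with just the two knobs from
the list above exposed on the command line (everything else hardwired
and arbitrary; _SC_PHYS_PAGES is a Linux/glibc extension):

/* memload.c -- "memload 1.5 0.3" runs an arena sized at 1.5x RAM with
 * 30% write accesses and 70% reads, uniformly random.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	double ratio = argc > 1 ? atof(argv[1]) : 0.5;	/* working set / RAM */
	double wfrac = argc > 2 ? atof(argv[2]) : 0.5;	/* fraction of writes */
	long page = sysconf(_SC_PAGESIZE);
	long ram_pages = sysconf(_SC_PHYS_PAGES);	/* Linux/glibc extension */
	size_t pages = (size_t)(ram_pages * ratio);
	char *mem = malloc(pages * page);
	volatile char sink = 0;
	long i;

	if (!mem)
		return 1;
	srand(1);
	for (i = 0; i < 100 * 1000 * 1000L; i++) {
		size_t p = ((size_t)rand() % pages) * page;

		if ((double)rand() / RAND_MAX < wfrac)
			mem[p]++;			/* write access */
		else
			sink += mem[p];			/* read access */
	}
	return 0;
}

The file-load half would be the same loop run over a mapped file
instead of malloced memory.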

Needless to say, we need more than one or two points on each axis.  We need 
to capture our data in some organized way.  How?

Clearly, this benchmarking project as I've described it is starting to blow 
up into unreasonable dimensions.  I'd like to return to the idea of going 
through the original list of advantages and disadvantages item by item and 
identifying exactly which variables in our complete test space are relevant.  
This lets us set the knobs on our test programs, define the test, and know 
which numbers we want to capture/graph.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 19:54                 ` Andrew Morton
  2002-07-11 20:05                   ` Rik van Riel
  2002-07-11 22:54                   ` William Lee Irwin III
@ 2002-07-13 14:52                   ` Daniel Phillips
  2 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 14:52 UTC (permalink / raw)
  To: Andrew Morton, William Lee Irwin III
  Cc: Rik van Riel, Dave McCracken, Linux Memory Management

On Thursday 11 July 2002 21:54, Andrew Morton wrote:
> The problem is the access pattern.  It shouldn't be random-uniform.
> But what should it be?  random-gaussian?

It should be kinda-fractal.  That is, random distributions of random
distributions.  In concrete terms, this could mean running one random
load for a while then randomly switching to a different random load.
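
For instance (sizes and phase lengths below are arbitrary):

/* phases.c -- run a random localized load for a random while, then
 * jump to a different one: a crude "distribution of distributions".
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE	4096
#define PAGES	(256 * 1024)		/* 1GB arena */

int main(void)
{
	char *mem = malloc((size_t)PAGES * PAGE);
	int phase;

	if (!mem)
		return 1;
	srand(7);
	for (phase = 0; phase < 100; phase++) {
		size_t base = (size_t)rand() % PAGES;		/* where this phase lives */
		size_t span = 1 + (size_t)rand() % (PAGES / 8);	/* how big it is */
		long touches = 1000000 + rand() % 9000000;	/* how long it lasts */
		long i;

		for (i = 0; i < touches; i++)
			mem[((base + (size_t)rand() % span) % PAGES) * PAGE]++;
	}
	printf("done\n");
	return 0;
}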

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


* Re: [PATCH] Optimize out pte_chain take three
  2002-07-11 22:38                             ` Andrew Morton
  2002-07-11 23:18                               ` Rik van Riel
@ 2002-07-13 15:08                               ` Daniel Phillips
  1 sibling, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2002-07-13 15:08 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel
  Cc: William Lee Irwin III, Dave McCracken, Linux Memory Management

On Friday 12 July 2002 00:38, Andrew Morton wrote:
> But yes, at some point you do need to stop carving away at the
> clean pagecache and wait on writeback completion.  Not sure
> how to balance that.

Alert alert!  We're supposed to be benchmarking rmap, not fixing the
world just now.  We should just ensure that our rmap and non-rmap
kernels are equally crippled with respect to purging of streaming
IO, unless we have a very specific reason why the situation should
be affected by the difference in scanning strategies.

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/


end of thread, other threads:[~2002-07-13 15:08 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-07-09 19:04 [PATCH] Optimize out pte_chain take two Dave McCracken
2002-07-10  0:21 ` Andrew Morton
2002-07-10 14:33   ` [PATCH] Optimize out pte_chain take three Dave McCracken
2002-07-10 15:18     ` Rik van Riel
2002-07-10 17:32       ` William Lee Irwin III
2002-07-10 20:01         ` Andrew Morton
2002-07-10 20:14           ` Rik van Riel
2002-07-10 20:28             ` Andrew Morton
2002-07-10 20:38               ` Rik van Riel
2002-07-13 13:42                 ` Daniel Phillips
2002-07-10 20:33             ` Martin J. Bligh
2002-07-10 22:22           ` William Lee Irwin III
2002-07-11  0:39             ` Andrew Morton
2002-07-11  0:47               ` Rik van Riel
2002-07-11  1:27                 ` Andrew Morton
2002-07-13 14:10                 ` Daniel Phillips
2002-07-11  1:51               ` William Lee Irwin III
2002-07-11  2:28                 ` William Lee Irwin III
2002-07-11 19:54                 ` Andrew Morton
2002-07-11 20:05                   ` Rik van Riel
2002-07-11 20:42                     ` Andrew Morton
2002-07-11 20:54                       ` Rik van Riel
2002-07-11 21:16                         ` Andrew Morton
2002-07-11 21:41                           ` Rik van Riel
2002-07-11 22:38                             ` Andrew Morton
2002-07-11 23:18                               ` Rik van Riel
2002-07-12 18:27                                 ` Paul Larson
2002-07-12 19:06                                   ` Andrew Morton
2002-07-12 19:28                                 ` Andrew Morton
2002-07-13 15:08                               ` Daniel Phillips
2002-07-11 22:54                   ` William Lee Irwin III
2002-07-13 14:52                   ` Daniel Phillips
2002-07-13 14:08               ` Daniel Phillips
2002-07-13 14:20               ` Daniel Phillips
2002-07-13 14:45             ` Daniel Phillips
2002-07-13 13:22           ` Daniel Phillips
2002-07-13 13:30             ` William Lee Irwin III
2002-07-13 13:55               ` Daniel Phillips
2002-07-13 13:41           ` Daniel Phillips
