* [PATCH 00/10] per device dirty throttling -v5
@ 2007-04-20 15:51 ` Peter Zijlstra
  0 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

The latest version of the per-device dirty throttling patches.

Against 2.6.21-rc6-mm1; the first patch is included only for ease of
application. Andrew can of course just drop the patch it reverts.

Merged BDI_DIRTY and BDI_UNSTABLE into BDI_RECLAIMABLE, and multiplied
bdi_stat_delta() by the number of counters summed.
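
Roughly, the error margin being scaled (sketch only; the helper itself is
introduced later in the series and may differ in detail):

/*
 * A percpu_counter read can be off by up to about one batch per CPU,
 * so a check against the sum of N such counters has to widen its
 * slack to N * bdi_stat_delta().
 */
static inline unsigned long bdi_stat_delta(void)
{
        return num_online_cpus() * FBC_BATCH;
}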

Thanks to Miklos for pointing these out.

-- 



* [PATCH 01/10] revert per-backing_dev-dirty-and-writeback-page-accounting
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:51   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: revert.patch --]
[-- Type: text/plain, Size: 6388 bytes --]

For ease of application.

---
 block/ll_rw_blk.c           |   29 -----------------------------
 fs/buffer.c                 |    1 -
 include/linux/backing-dev.h |    2 --
 mm/page-writeback.c         |   13 ++-----------
 mm/truncate.c               |    1 -
 5 files changed, 2 insertions(+), 44 deletions(-)

Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c	2007-04-10 16:30:55.000000000 +0200
+++ linux-2.6/block/ll_rw_blk.c	2007-04-10 16:35:24.000000000 +0200
@@ -201,8 +201,6 @@ EXPORT_SYMBOL(blk_queue_softirq_done);
  **/
 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 {
-	struct backing_dev_info *bdi = &q->backing_dev_info;
-
 	/*
 	 * set defaults
 	 */
@@ -210,8 +208,6 @@ void blk_queue_make_request(request_queu
 	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
 	blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
 	q->make_request_fn = mfn;
-	atomic_long_set(&bdi->nr_dirty, 0);
-	atomic_long_set(&bdi->nr_writeback, 0);
 	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
 	blk_queue_hardsect_size(q, 512);
 	blk_queue_dma_alignment(q, 511);
@@ -3978,19 +3974,6 @@ static ssize_t queue_max_hw_sectors_show
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
 
-static ssize_t queue_nr_dirty_show(struct request_queue *q, char *page)
-{
-	return sprintf(page, "%lu\n",
-		atomic_long_read(&q->backing_dev_info.nr_dirty));
-
-}
-
-static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page)
-{
-	return sprintf(page, "%lu\n",
-		atomic_long_read(&q->backing_dev_info.nr_writeback));
-
-}
 
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -4021,16 +4004,6 @@ static struct queue_sysfs_entry queue_ma
 	.show = queue_max_hw_sectors_show,
 };
 
-static struct queue_sysfs_entry queue_nr_dirty_entry = {
-	.attr = {.name = "nr_dirty", .mode = S_IRUGO },
-	.show = queue_nr_dirty_show,
-};
-
-static struct queue_sysfs_entry queue_nr_writeback_entry = {
-	.attr = {.name = "nr_writeback", .mode = S_IRUGO },
-	.show = queue_nr_writeback_show,
-};
-
 static struct queue_sysfs_entry queue_iosched_entry = {
 	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
 	.show = elv_iosched_show,
@@ -4043,8 +4016,6 @@ static struct attribute *default_attrs[]
 	&queue_initial_ra_entry.attr,
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
-	&queue_nr_dirty_entry.attr,
-	&queue_nr_writeback_entry.attr,
 	&queue_iosched_entry.attr,
 	NULL,
 };
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c	2007-04-10 16:30:15.000000000 +0200
+++ linux-2.6/fs/buffer.c	2007-04-10 16:35:03.000000000 +0200
@@ -740,7 +740,6 @@ int __set_page_dirty_buffers(struct page
 	if (page->mapping) {	/* Race with truncate? */
 		if (mapping_cap_account_dirty(mapping)) {
 			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			atomic_long_inc(&mapping->backing_dev_info->nr_dirty);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-10 16:30:55.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-10 16:35:03.000000000 +0200
@@ -30,8 +30,6 @@ struct backing_dev_info {
 	unsigned long ra_thrash_bytes;	/* estimated thrashing threshold */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
-	atomic_long_t nr_dirty;	/* Pages dirty against this BDI */
-	atomic_long_t nr_writeback;/* Pages under writeback against this BDI */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2007-04-10 16:30:15.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2007-04-10 16:35:03.000000000 +0200
@@ -828,8 +828,6 @@ int __set_page_dirty_nobuffers(struct pa
 			BUG_ON(mapping2 != mapping);
 			if (mapping_cap_account_dirty(mapping)) {
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
-				atomic_long_inc(&mapping->backing_dev_info->
-						nr_dirty);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
@@ -963,7 +961,6 @@ int clear_page_dirty_for_io(struct page 
 		 */
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			atomic_long_dec(&mapping->backing_dev_info->nr_dirty);
 			return 1;
 		}
 		return 0;
@@ -982,13 +979,10 @@ int test_clear_page_writeback(struct pag
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret) {
+		if (ret)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			atomic_long_dec(&mapping->backing_dev_info->
-					nr_writeback);
-		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -1008,13 +1002,10 @@ int test_set_page_writeback(struct page 
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret) {
+		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			atomic_long_inc(&mapping->backing_dev_info->
-					nr_writeback);
-		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c	2007-04-10 16:30:15.000000000 +0200
+++ linux-2.6/mm/truncate.c	2007-04-10 16:35:03.000000000 +0200
@@ -70,7 +70,6 @@ void cancel_dirty_page(struct page *page
 	if (TestClearPageDirty(page)) {
 		struct address_space *mapping = page->mapping;
 		if (mapping && mapping_cap_account_dirty(mapping)) {
-			atomic_long_dec(&mapping->backing_dev_info->nr_dirty);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			if (account_size)
 				task_io_account_cancelled_write(account_size);

-- 



* [PATCH 02/10] nfs: remove congestion_end()
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:51   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: nfs_congestion_fixup.patch --]
[-- Type: text/plain, Size: 2215 bytes --]

It's redundant; clear_bdi_congested() already wakes the waiters.
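
For reference, clear_bdi_congested() already ends with the same wakeup;
approximately (sketch, not verbatim):

void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
{
        enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[rw];

        bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
        clear_bit(bit, &bdi->state);
        smp_mb__after_clear_bit();
        if (waitqueue_active(wqh))      /* the wakeup congestion_end() repeated */
                wake_up(wqh);
}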

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 fs/nfs/write.c              |    4 +---
 include/linux/backing-dev.h |    1 -
 mm/backing-dev.c            |   13 -------------
 3 files changed, 1 insertion(+), 17 deletions(-)

Index: linux-2.6-mm/fs/nfs/write.c
===================================================================
--- linux-2.6-mm.orig/fs/nfs/write.c	2007-04-05 16:24:50.000000000 +0200
+++ linux-2.6-mm/fs/nfs/write.c	2007-04-05 16:25:04.000000000 +0200
@@ -235,10 +235,8 @@ static void nfs_end_page_writeback(struc
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, WRITE);
-		congestion_end(WRITE);
-	}
 }
 
 /*
Index: linux-2.6-mm/include/linux/backing-dev.h
===================================================================
--- linux-2.6-mm.orig/include/linux/backing-dev.h	2007-04-05 16:24:50.000000000 +0200
+++ linux-2.6-mm/include/linux/backing-dev.h	2007-04-05 16:25:08.000000000 +0200
@@ -96,7 +96,6 @@ void clear_bdi_congested(struct backing_
 void set_bdi_congested(struct backing_dev_info *bdi, int rw);
 long congestion_wait(int rw, long timeout);
 long congestion_wait_interruptible(int rw, long timeout);
-void congestion_end(int rw);
 
 #define bdi_cap_writeback_dirty(bdi) \
 	(!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
Index: linux-2.6-mm/mm/backing-dev.c
===================================================================
--- linux-2.6-mm.orig/mm/backing-dev.c	2007-04-05 16:24:50.000000000 +0200
+++ linux-2.6-mm/mm/backing-dev.c	2007-04-05 16:25:16.000000000 +0200
@@ -70,16 +70,3 @@ long congestion_wait_interruptible(int r
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait_interruptible);
-
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);

-- 



* [PATCH 03/10] lib: dampen the percpu_counter FBC_BATCH
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:51   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: percpu_counter_batch.patch --]
[-- Type: text/plain, Size: 949 bytes --]

With the current logic the percpu_counter's accuracy delta is quadratic in
the number of CPUs in the system; reduce this to O(n log n).
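
For example, with NR_CPUS = 64 the old batch is NR_CPUS*2 = 128, so the
worst-case drift summed over all CPUs is about 64 * 128 = 8192 per counter,
i.e. quadratic in the CPU count; with 8*ilog2(NR_CPUS) = 48 it is about
64 * 48 = 3072, i.e. n log n.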

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/percpu_counter.h |    7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

Index: linux-2.6-mm/include/linux/percpu_counter.h
===================================================================
--- linux-2.6-mm.orig/include/linux/percpu_counter.h
+++ linux-2.6-mm/include/linux/percpu_counter.h
@@ -11,6 +11,7 @@
 #include <linux/threads.h>
 #include <linux/percpu.h>
 #include <linux/types.h>
+#include <linux/log2.h>
 
 #ifdef CONFIG_SMP
 
@@ -20,11 +21,7 @@ struct percpu_counter {
 	s32 *counters;
 };
 
-#if NR_CPUS >= 16
-#define FBC_BATCH	(NR_CPUS*2)
-#else
-#define FBC_BATCH	(NR_CPUS*4)
-#endif
+#define FBC_BATCH	(8*ilog2(NR_CPUS))
 
 static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {

-- 



* [PATCH 04/10] lib: percpu_counter_mod64
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:51   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: percpu_counter_mod.patch --]
[-- Type: text/plain, Size: 2297 bytes --]

Add percpu_counter_mod64() to allow modifications larger than the s32 range
of percpu_counter_mod().
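
A hypothetical caller, for illustration only (the names below are made up);
the point is that the delta no longer has to fit in an s32:

static struct percpu_counter bytes_written;     /* made-up example counter */

static void account_write(unsigned long nr_pages)
{
        /* a byte count like this can exceed the s32 range */
        percpu_counter_mod64(&bytes_written, (s64)nr_pages << PAGE_SHIFT);
}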

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/percpu_counter.h |    9 +++++++++
 lib/percpu_counter.c           |   28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

Index: linux-2.6/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.orig/include/linux/percpu_counter.h	2007-04-12 13:54:55.000000000 +0200
+++ linux-2.6/include/linux/percpu_counter.h	2007-04-12 14:00:21.000000000 +0200
@@ -36,6 +36,7 @@ static inline void percpu_counter_destro
 }
 
 void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
+void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount);
 s64 percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
@@ -81,6 +82,14 @@ percpu_counter_mod(struct percpu_counter
 	preempt_enable();
 }
 
+static inline void
+percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
+{
+	preempt_disable();
+	fbc->count += amount;
+	preempt_enable();
+}
+
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
 	return fbc->count;
Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c	2006-07-31 13:07:38.000000000 +0200
+++ linux-2.6/lib/percpu_counter.c	2007-04-12 14:17:12.000000000 +0200
@@ -25,6 +25,34 @@ void percpu_counter_mod(struct percpu_co
 }
 EXPORT_SYMBOL(percpu_counter_mod);
 
+void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
+{
+	long count;
+	s32 *pcount;
+	int cpu;
+
+	if (amount >= FBC_BATCH || amount <= -FBC_BATCH) {
+		spin_lock(&fbc->lock);
+		fbc->count += amount;
+		spin_unlock(&fbc->lock);
+		return;
+	}
+
+	cpu = get_cpu();
+	pcount = per_cpu_ptr(fbc->counters, cpu);
+	count = *pcount + amount;
+	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
+		spin_lock(&fbc->lock);
+		fbc->count += count;
+		*pcount = 0;
+		spin_unlock(&fbc->lock);
+	} else {
+		*pcount = count;
+	}
+	put_cpu();
+}
+EXPORT_SYMBOL(percpu_counter_mod64);
+
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()

-- 



* [PATCH 05/10] mm: bdi init hooks
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:51   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: bdi_init.patch --]
[-- Type: text/plain, Size: 13430 bytes --]

Provide BDI constructor/destructor hooks.
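
The hooks are empty inlines for now so that all call sites can be wired up
in one go; the next patch turns them into real functions that set up the
per-BDI counters. A minimal sketch of the expected pairing (made-up names):

static struct backing_dev_info example_bdi;     /* made-up */

static int __init example_init(void)
{
        example_bdi = default_backing_dev_info;
        bdi_init(&example_bdi);
        return 0;
}

static void __exit example_exit(void)
{
        bdi_destroy(&example_bdi);
}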

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 block/ll_rw_blk.c               |    2 ++
 drivers/block/rd.c              |    6 ++++++
 drivers/char/mem.c              |    2 ++
 drivers/mtd/mtdcore.c           |    5 +++++
 fs/char_dev.c                   |    1 +
 fs/configfs/configfs_internal.h |    2 ++
 fs/configfs/inode.c             |    8 ++++++++
 fs/configfs/mount.c             |    2 ++
 fs/fuse/inode.c                 |    2 ++
 fs/hugetlbfs/inode.c            |    3 +++
 fs/nfs/client.c                 |    3 +++
 fs/ocfs2/dlm/dlmfs.c            |    6 +++++-
 fs/ramfs/inode.c                |    7 ++++++-
 fs/sysfs/inode.c                |    5 +++++
 fs/sysfs/mount.c                |    2 ++
 fs/sysfs/sysfs.h                |    1 +
 include/linux/backing-dev.h     |    7 +++++++
 kernel/cpuset.c                 |    3 +++
 mm/shmem.c                      |    1 +
 mm/swap.c                       |    2 ++
 20 files changed, 68 insertions(+), 2 deletions(-)

Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c	2007-04-12 11:35:53.000000000 +0200
+++ linux-2.6/block/ll_rw_blk.c	2007-04-12 13:19:40.000000000 +0200
@@ -1771,6 +1771,7 @@ static void blk_release_queue(struct kob
 
 	blk_trace_shutdown(q);
 
+	bdi_destroy(&q->backing_dev_info);
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -1836,6 +1837,7 @@ request_queue_t *blk_alloc_queue_node(gf
 	q->kobj.ktype = &queue_ktype;
 	kobject_init(&q->kobj);
 	q->backing_dev_info = default_backing_dev_info;
+	bdi_init(&q->backing_dev_info);
 
 	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
 	q->backing_dev_info.unplug_io_data = q;
Index: linux-2.6/drivers/block/rd.c
===================================================================
--- linux-2.6.orig/drivers/block/rd.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/drivers/block/rd.c	2007-04-12 11:35:59.000000000 +0200
@@ -411,6 +411,9 @@ static void __exit rd_cleanup(void)
 		blk_cleanup_queue(rd_queue[i]);
 	}
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+	bdi_destroy(&rd_file_backing_dev_info);
+	bdi_destroy(&rd_backing_dev_info);
 }
 
 /*
@@ -421,6 +424,9 @@ static int __init rd_init(void)
 	int i;
 	int err = -ENOMEM;
 
+	bdi_init(&rd_backing_dev_info);
+	bdi_init(&rd_file_backing_dev_info);
+
 	if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
 			(rd_blocksize & (rd_blocksize-1))) {
 		printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
Index: linux-2.6/drivers/char/mem.c
===================================================================
--- linux-2.6.orig/drivers/char/mem.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/drivers/char/mem.c	2007-04-12 11:35:59.000000000 +0200
@@ -987,6 +987,8 @@ static int __init chr_dev_init(void)
 			      MKDEV(MEM_MAJOR, devlist[i].minor),
 			      devlist[i].name);
 
+	bdi_init(&zero_bdi);
+
 	return 0;
 }
 
Index: linux-2.6/fs/char_dev.c
===================================================================
--- linux-2.6.orig/fs/char_dev.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/fs/char_dev.c	2007-04-12 11:35:59.000000000 +0200
@@ -546,6 +546,7 @@ static struct kobject *base_probe(dev_t 
 void __init chrdev_init(void)
 {
 	cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
+	bdi_init(&directly_mappable_cdev_bdi);
 }
 
 
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/fs/fuse/inode.c	2007-04-12 11:35:59.000000000 +0200
@@ -415,6 +415,7 @@ static struct fuse_conn *new_conn(void)
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
+		bdi_init(&fc->bdi);
 		fc->reqctr = 0;
 		fc->blocked = 1;
 		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
@@ -428,6 +429,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
+		bdi_destroy(&fc->bdi);
 		kfree(fc);
 	}
 }
Index: linux-2.6/fs/nfs/client.c
===================================================================
--- linux-2.6.orig/fs/nfs/client.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/fs/nfs/client.c	2007-04-12 11:35:59.000000000 +0200
@@ -657,6 +657,8 @@ static void nfs_server_set_fsinfo(struct
 	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	bdi_init(&server->backing_dev_info);
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 	server->backing_dev_info.ra_pages0 = min_t(unsigned, server->rpages,
 				VM_MIN_READAHEAD >> (PAGE_CACHE_SHIFT - 10));
@@ -789,6 +791,7 @@ void nfs_free_server(struct nfs_server *
 	nfs_put_client(server->nfs_client);
 
 	nfs_free_iostats(server->io_stats);
+	bdi_destroy(&server->backing_dev_info);
 	kfree(server);
 	nfs_release_automount_timer();
 	dprintk("<-- nfs_free_server()\n");
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-12 11:35:57.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-12 13:19:40.000000000 +0200
@@ -36,6 +36,13 @@ struct backing_dev_info {
 	void *unplug_io_data;
 };
 
+static inline void bdi_init(struct backing_dev_info *bdi)
+{
+}
+
+static inline void bdi_destroy(struct backing_dev_info *bdi)
+{
+}
 
 /*
  * Flags in backing_dev_info::capability
Index: linux-2.6/drivers/mtd/mtdcore.c
===================================================================
--- linux-2.6.orig/drivers/mtd/mtdcore.c	2007-04-03 13:58:08.000000000 +0200
+++ linux-2.6/drivers/mtd/mtdcore.c	2007-04-12 11:37:45.000000000 +0200
@@ -60,6 +60,7 @@ int add_mtd_device(struct mtd_info *mtd)
 			break;
 		}
 	}
+	bdi_init(mtd->backing_dev_info);
 
 	BUG_ON(mtd->writesize == 0);
 	mutex_lock(&mtd_table_mutex);
@@ -142,6 +143,10 @@ int del_mtd_device (struct mtd_info *mtd
 	}
 
 	mutex_unlock(&mtd_table_mutex);
+
+	if (mtd->backing_dev_info)
+		bdi_destroy(mtd->backing_dev_info);
+
 	return ret;
 }
 
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/hugetlbfs/inode.c	2007-04-12 12:11:15.000000000 +0200
@@ -819,6 +819,8 @@ static int __init init_hugetlbfs_fs(void
  out:
 	if (error)
 		kmem_cache_destroy(hugetlbfs_inode_cachep);
+	else
+		bdi_init(&hugetlbfs_backing_dev_info);
 	return error;
 }
 
@@ -826,6 +828,7 @@ static void __exit exit_hugetlbfs_fs(voi
 {
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
 	unregister_filesystem(&hugetlbfs_fs_type);
+	bdi_destroy(&hugetlbfs_backing_dev_info);
 }
 
 module_init(init_hugetlbfs_fs)
Index: linux-2.6/fs/ocfs2/dlm/dlmfs.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/dlm/dlmfs.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/ocfs2/dlm/dlmfs.c	2007-04-12 12:08:18.000000000 +0200
@@ -614,8 +614,10 @@ bail:
 			kmem_cache_destroy(dlmfs_inode_cache);
 		if (cleanup_worker)
 			destroy_workqueue(user_dlm_worker);
-	} else
+	} else {
+		bdi_init(&dlmfs_backing_dev_info);
 		printk("OCFS2 User DLM kernel interface loaded\n");
+	}
 	return status;
 }
 
@@ -627,6 +629,8 @@ static void __exit exit_dlmfs_fs(void)
 	destroy_workqueue(user_dlm_worker);
 
 	kmem_cache_destroy(dlmfs_inode_cache);
+
+	bdi_destroy(&dlmfs_backing_dev_info);
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/configfs/configfs_internal.h
===================================================================
--- linux-2.6.orig/fs/configfs/configfs_internal.h	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/configfs/configfs_internal.h	2007-04-12 11:51:13.000000000 +0200
@@ -55,6 +55,8 @@ extern int configfs_is_root(struct confi
 
 extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
 extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern void configfs_inode_init(void);
+extern void configfs_inode_exit(void);
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/configfs/inode.c	2007-04-12 13:04:45.000000000 +0200
@@ -255,4 +255,12 @@ void configfs_hash_and_remove(struct den
 	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
+void __init configfs_inode_init(void)
+{
+	bdi_init(&configfs_backing_dev_info);
+}
 
+void __exit configfs_inode_exit(void)
+{
+	bdi_destroy(&configfs_backing_dev_info);
+}
Index: linux-2.6/fs/configfs/mount.c
===================================================================
--- linux-2.6.orig/fs/configfs/mount.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/configfs/mount.c	2007-04-12 12:07:34.000000000 +0200
@@ -156,6 +156,7 @@ static int __init configfs_init(void)
 		configfs_dir_cachep = NULL;
 	}
 
+	configfs_inode_init();
 out:
 	return err;
 }
@@ -166,6 +167,7 @@ static void __exit configfs_exit(void)
 	subsystem_unregister(&config_subsys);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
+	configfs_inode_exit();
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/ramfs/inode.c
===================================================================
--- linux-2.6.orig/fs/ramfs/inode.c	2007-04-12 11:53:41.000000000 +0200
+++ linux-2.6/fs/ramfs/inode.c	2007-04-12 12:06:22.000000000 +0200
@@ -210,12 +210,17 @@ static struct file_system_type rootfs_fs
 
 static int __init init_ramfs_fs(void)
 {
-	return register_filesystem(&ramfs_fs_type);
+	int ret;
+	ret = register_filesystem(&ramfs_fs_type);
+	if (!ret)
+		bdi_init(&ramfs_backing_dev_info);
+	return ret;
 }
 
 static void __exit exit_ramfs_fs(void)
 {
 	unregister_filesystem(&ramfs_fs_type);
+	bdi_destroy(&ramfs_backing_dev_info);
 }
 
 module_init(init_ramfs_fs)
Index: linux-2.6/fs/sysfs/inode.c
===================================================================
--- linux-2.6.orig/fs/sysfs/inode.c	2007-04-03 13:58:18.000000000 +0200
+++ linux-2.6/fs/sysfs/inode.c	2007-04-12 12:23:04.000000000 +0200
@@ -299,3 +299,8 @@ int sysfs_hash_and_remove(struct dentry 
 
 	return found ? 0 : -ENOENT;
 }
+
+void __init sysfs_inode_init(void)
+{
+	bdi_init(&sysfs_backing_dev_info);
+}
Index: linux-2.6/fs/sysfs/mount.c
===================================================================
--- linux-2.6.orig/fs/sysfs/mount.c	2007-04-03 13:58:18.000000000 +0200
+++ linux-2.6/fs/sysfs/mount.c	2007-04-12 12:23:08.000000000 +0200
@@ -108,6 +108,8 @@ int __init sysfs_init(void)
 	} else
 		goto out_err;
 out:
+	if (!err)
+		sysfs_inode_init();
 	return err;
 out_err:
 	kmem_cache_destroy(sysfs_dir_cachep);
Index: linux-2.6/fs/sysfs/sysfs.h
===================================================================
--- linux-2.6.orig/fs/sysfs/sysfs.h	2007-04-03 13:58:18.000000000 +0200
+++ linux-2.6/fs/sysfs/sysfs.h	2007-04-12 12:22:53.000000000 +0200
@@ -16,6 +16,7 @@ extern struct kmem_cache *sysfs_dir_cach
 extern void sysfs_delete_inode(struct inode *inode);
 extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
 extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern void sysfs_inode_init(void);
 
 extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *);
 extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
Index: linux-2.6/kernel/cpuset.c
===================================================================
--- linux-2.6.orig/kernel/cpuset.c	2007-04-12 12:26:33.000000000 +0200
+++ linux-2.6/kernel/cpuset.c	2007-04-12 12:37:43.000000000 +0200
@@ -1921,6 +1921,7 @@ int __init cpuset_init_early(void)
 
 	tsk->cpuset = &top_cpuset;
 	tsk->cpuset->mems_generation = cpuset_mems_generation++;
+
 	return 0;
 }
 
@@ -1963,6 +1964,8 @@ int __init cpuset_init(void)
 	/* memory_pressure_enabled is in root cpuset only */
 	if (err == 0)
 		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
+	if (!err)
+		bdi_init(&cpuset_backing_dev_info);
 out:
 	return err;
 }
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2007-04-12 12:28:01.000000000 +0200
+++ linux-2.6/mm/shmem.c	2007-04-12 12:28:14.000000000 +0200
@@ -2478,6 +2478,7 @@ static int __init init_tmpfs(void)
 		printk(KERN_ERR "Could not kern_mount tmpfs\n");
 		goto out1;
 	}
+	bdi_init(&shmem_backing_dev_info);
 	return 0;
 
 out1:
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c	2007-04-12 12:30:04.000000000 +0200
+++ linux-2.6/mm/swap.c	2007-04-12 12:37:18.000000000 +0200
@@ -550,6 +550,8 @@ void __init swap_setup(void)
 {
 	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 
+	bdi_init(swapper_space.backing_dev_info);
+
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
 		page_cluster = 2;

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* [PATCH 05/10] mm: bdi init hooks
@ 2007-04-20 15:51   ` Peter Zijlstra
  0 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:51 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: bdi_init.patch --]
[-- Type: text/plain, Size: 13655 bytes --]

provide BDI constructor/destructor hooks

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 block/ll_rw_blk.c               |    2 ++
 drivers/block/rd.c              |    6 ++++++
 drivers/char/mem.c              |    2 ++
 drivers/mtd/mtdcore.c           |    5 +++++
 fs/char_dev.c                   |    1 +
 fs/configfs/configfs_internal.h |    2 ++
 fs/configfs/inode.c             |    8 ++++++++
 fs/configfs/mount.c             |    2 ++
 fs/fuse/inode.c                 |    2 ++
 fs/hugetlbfs/inode.c            |    3 +++
 fs/nfs/client.c                 |    3 +++
 fs/ocfs2/dlm/dlmfs.c            |    6 +++++-
 fs/ramfs/inode.c                |    7 ++++++-
 fs/sysfs/inode.c                |    5 +++++
 fs/sysfs/mount.c                |    2 ++
 fs/sysfs/sysfs.h                |    1 +
 include/linux/backing-dev.h     |    7 +++++++
 kernel/cpuset.c                 |    3 +++
 mm/shmem.c                      |    1 +
 mm/swap.c                       |    2 ++
 20 files changed, 68 insertions(+), 2 deletions(-)

Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c	2007-04-12 11:35:53.000000000 +0200
+++ linux-2.6/block/ll_rw_blk.c	2007-04-12 13:19:40.000000000 +0200
@@ -1771,6 +1771,7 @@ static void blk_release_queue(struct kob
 
 	blk_trace_shutdown(q);
 
+	bdi_destroy(&q->backing_dev_info);
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -1836,6 +1837,7 @@ request_queue_t *blk_alloc_queue_node(gf
 	q->kobj.ktype = &queue_ktype;
 	kobject_init(&q->kobj);
 	q->backing_dev_info = default_backing_dev_info;
+	bdi_init(&q->backing_dev_info);
 
 	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
 	q->backing_dev_info.unplug_io_data = q;
Index: linux-2.6/drivers/block/rd.c
===================================================================
--- linux-2.6.orig/drivers/block/rd.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/drivers/block/rd.c	2007-04-12 11:35:59.000000000 +0200
@@ -411,6 +411,9 @@ static void __exit rd_cleanup(void)
 		blk_cleanup_queue(rd_queue[i]);
 	}
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+	bdi_destroy(&rd_file_backing_dev_info);
+	bdi_destroy(&rd_backing_dev_info);
 }
 
 /*
@@ -421,6 +424,9 @@ static int __init rd_init(void)
 	int i;
 	int err = -ENOMEM;
 
+	bdi_init(&rd_backing_dev_info);
+	bdi_init(&rd_file_backing_dev_info);
+
 	if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
 			(rd_blocksize & (rd_blocksize-1))) {
 		printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
Index: linux-2.6/drivers/char/mem.c
===================================================================
--- linux-2.6.orig/drivers/char/mem.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/drivers/char/mem.c	2007-04-12 11:35:59.000000000 +0200
@@ -987,6 +987,8 @@ static int __init chr_dev_init(void)
 			      MKDEV(MEM_MAJOR, devlist[i].minor),
 			      devlist[i].name);
 
+	bdi_init(&zero_bdi);
+
 	return 0;
 }
 
Index: linux-2.6/fs/char_dev.c
===================================================================
--- linux-2.6.orig/fs/char_dev.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/fs/char_dev.c	2007-04-12 11:35:59.000000000 +0200
@@ -546,6 +546,7 @@ static struct kobject *base_probe(dev_t 
 void __init chrdev_init(void)
 {
 	cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
+	bdi_init(&directly_mappable_cdev_bdi);
 }
 
 
Index: linux-2.6/fs/fuse/inode.c
===================================================================
--- linux-2.6.orig/fs/fuse/inode.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/fs/fuse/inode.c	2007-04-12 11:35:59.000000000 +0200
@@ -415,6 +415,7 @@ static struct fuse_conn *new_conn(void)
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
+		bdi_init(&fc->bdi);
 		fc->reqctr = 0;
 		fc->blocked = 1;
 		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
@@ -428,6 +429,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
+		bdi_destroy(&fc->bdi);
 		kfree(fc);
 	}
 }
Index: linux-2.6/fs/nfs/client.c
===================================================================
--- linux-2.6.orig/fs/nfs/client.c	2007-04-12 11:35:51.000000000 +0200
+++ linux-2.6/fs/nfs/client.c	2007-04-12 11:35:59.000000000 +0200
@@ -657,6 +657,8 @@ static void nfs_server_set_fsinfo(struct
 	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	bdi_init(&server->backing_dev_info);
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 	server->backing_dev_info.ra_pages0 = min_t(unsigned, server->rpages,
 				VM_MIN_READAHEAD >> (PAGE_CACHE_SHIFT - 10));
@@ -789,6 +791,7 @@ void nfs_free_server(struct nfs_server *
 	nfs_put_client(server->nfs_client);
 
 	nfs_free_iostats(server->io_stats);
+	bdi_destroy(&server->backing_dev_info);
 	kfree(server);
 	nfs_release_automount_timer();
 	dprintk("<-- nfs_free_server()\n");
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-12 11:35:57.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-12 13:19:40.000000000 +0200
@@ -36,6 +36,13 @@ struct backing_dev_info {
 	void *unplug_io_data;
 };
 
+static inline void bdi_init(struct backing_dev_info *bdi)
+{
+}
+
+static inline void bdi_destroy(struct backing_dev_info *bdi)
+{
+}
 
 /*
  * Flags in backing_dev_info::capability
Index: linux-2.6/drivers/mtd/mtdcore.c
===================================================================
--- linux-2.6.orig/drivers/mtd/mtdcore.c	2007-04-03 13:58:08.000000000 +0200
+++ linux-2.6/drivers/mtd/mtdcore.c	2007-04-12 11:37:45.000000000 +0200
@@ -60,6 +60,7 @@ int add_mtd_device(struct mtd_info *mtd)
 			break;
 		}
 	}
+	bdi_init(mtd->backing_dev_info);
 
 	BUG_ON(mtd->writesize == 0);
 	mutex_lock(&mtd_table_mutex);
@@ -142,6 +143,10 @@ int del_mtd_device (struct mtd_info *mtd
 	}
 
 	mutex_unlock(&mtd_table_mutex);
+
+	if (mtd->backing_dev_info)
+		bdi_destroy(mtd->backing_dev_info);
+
 	return ret;
 }
 
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/hugetlbfs/inode.c	2007-04-12 12:11:15.000000000 +0200
@@ -819,6 +819,8 @@ static int __init init_hugetlbfs_fs(void
  out:
 	if (error)
 		kmem_cache_destroy(hugetlbfs_inode_cachep);
+	else
+		bdi_init(&hugetlbfs_backing_dev_info);
 	return error;
 }
 
@@ -826,6 +828,7 @@ static void __exit exit_hugetlbfs_fs(voi
 {
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
 	unregister_filesystem(&hugetlbfs_fs_type);
+	bdi_destroy(&hugetlbfs_backing_dev_info);
 }
 
 module_init(init_hugetlbfs_fs)
Index: linux-2.6/fs/ocfs2/dlm/dlmfs.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/dlm/dlmfs.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/ocfs2/dlm/dlmfs.c	2007-04-12 12:08:18.000000000 +0200
@@ -614,8 +614,10 @@ bail:
 			kmem_cache_destroy(dlmfs_inode_cache);
 		if (cleanup_worker)
 			destroy_workqueue(user_dlm_worker);
-	} else
+	} else {
+		bdi_init(&dlmfs_backing_dev_info);
 		printk("OCFS2 User DLM kernel interface loaded\n");
+	}
 	return status;
 }
 
@@ -627,6 +629,8 @@ static void __exit exit_dlmfs_fs(void)
 	destroy_workqueue(user_dlm_worker);
 
 	kmem_cache_destroy(dlmfs_inode_cache);
+
+	bdi_destroy(&dlmfs_backing_dev_info);
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/configfs/configfs_internal.h
===================================================================
--- linux-2.6.orig/fs/configfs/configfs_internal.h	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/configfs/configfs_internal.h	2007-04-12 11:51:13.000000000 +0200
@@ -55,6 +55,8 @@ extern int configfs_is_root(struct confi
 
 extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
 extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern void configfs_inode_init(void);
+extern void configfs_inode_exit(void);
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/configfs/inode.c	2007-04-12 13:04:45.000000000 +0200
@@ -255,4 +255,12 @@ void configfs_hash_and_remove(struct den
 	mutex_unlock(&dir->d_inode->i_mutex);
 }
 
+void __init configfs_inode_init(void)
+{
+	bdi_init(&configfs_backing_dev_info);
+}
 
+void __exit configfs_inode_exit(void)
+{
+	bdi_destroy(&configfs_backing_dev_info);
+}
Index: linux-2.6/fs/configfs/mount.c
===================================================================
--- linux-2.6.orig/fs/configfs/mount.c	2007-04-03 13:58:17.000000000 +0200
+++ linux-2.6/fs/configfs/mount.c	2007-04-12 12:07:34.000000000 +0200
@@ -156,6 +156,7 @@ static int __init configfs_init(void)
 		configfs_dir_cachep = NULL;
 	}
 
+	configfs_inode_init();
 out:
 	return err;
 }
@@ -166,6 +167,7 @@ static void __exit configfs_exit(void)
 	subsystem_unregister(&config_subsys);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
+	configfs_inode_exit();
 }
 
 MODULE_AUTHOR("Oracle");
Index: linux-2.6/fs/ramfs/inode.c
===================================================================
--- linux-2.6.orig/fs/ramfs/inode.c	2007-04-12 11:53:41.000000000 +0200
+++ linux-2.6/fs/ramfs/inode.c	2007-04-12 12:06:22.000000000 +0200
@@ -210,12 +210,17 @@ static struct file_system_type rootfs_fs
 
 static int __init init_ramfs_fs(void)
 {
-	return register_filesystem(&ramfs_fs_type);
+	int ret;
+	ret = register_filesystem(&ramfs_fs_type);
+	if (!ret)
+		bdi_init(&ramfs_backing_dev_info);
+	return ret;
 }
 
 static void __exit exit_ramfs_fs(void)
 {
 	unregister_filesystem(&ramfs_fs_type);
+	bdi_destroy(&ramfs_backing_dev_info);
 }
 
 module_init(init_ramfs_fs)
Index: linux-2.6/fs/sysfs/inode.c
===================================================================
--- linux-2.6.orig/fs/sysfs/inode.c	2007-04-03 13:58:18.000000000 +0200
+++ linux-2.6/fs/sysfs/inode.c	2007-04-12 12:23:04.000000000 +0200
@@ -299,3 +299,8 @@ int sysfs_hash_and_remove(struct dentry 
 
 	return found ? 0 : -ENOENT;
 }
+
+void __init sysfs_inode_init(void)
+{
+	bdi_init(&sysfs_backing_dev_info);
+}
Index: linux-2.6/fs/sysfs/mount.c
===================================================================
--- linux-2.6.orig/fs/sysfs/mount.c	2007-04-03 13:58:18.000000000 +0200
+++ linux-2.6/fs/sysfs/mount.c	2007-04-12 12:23:08.000000000 +0200
@@ -108,6 +108,8 @@ int __init sysfs_init(void)
 	} else
 		goto out_err;
 out:
+	if (!err)
+		sysfs_inode_init();
 	return err;
 out_err:
 	kmem_cache_destroy(sysfs_dir_cachep);
Index: linux-2.6/fs/sysfs/sysfs.h
===================================================================
--- linux-2.6.orig/fs/sysfs/sysfs.h	2007-04-03 13:58:18.000000000 +0200
+++ linux-2.6/fs/sysfs/sysfs.h	2007-04-12 12:22:53.000000000 +0200
@@ -16,6 +16,7 @@ extern struct kmem_cache *sysfs_dir_cach
 extern void sysfs_delete_inode(struct inode *inode);
 extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
 extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+extern void sysfs_inode_init(void);
 
 extern int sysfs_dirent_exist(struct sysfs_dirent *, const unsigned char *);
 extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
Index: linux-2.6/kernel/cpuset.c
===================================================================
--- linux-2.6.orig/kernel/cpuset.c	2007-04-12 12:26:33.000000000 +0200
+++ linux-2.6/kernel/cpuset.c	2007-04-12 12:37:43.000000000 +0200
@@ -1921,6 +1921,7 @@ int __init cpuset_init_early(void)
 
 	tsk->cpuset = &top_cpuset;
 	tsk->cpuset->mems_generation = cpuset_mems_generation++;
+
 	return 0;
 }
 
@@ -1963,6 +1964,8 @@ int __init cpuset_init(void)
 	/* memory_pressure_enabled is in root cpuset only */
 	if (err == 0)
 		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
+	if (!err)
+		bdi_init(&cpuset_backing_dev_info);
 out:
 	return err;
 }
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2007-04-12 12:28:01.000000000 +0200
+++ linux-2.6/mm/shmem.c	2007-04-12 12:28:14.000000000 +0200
@@ -2478,6 +2478,7 @@ static int __init init_tmpfs(void)
 		printk(KERN_ERR "Could not kern_mount tmpfs\n");
 		goto out1;
 	}
+	bdi_init(&shmem_backing_dev_info);
 	return 0;
 
 out1:
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c	2007-04-12 12:30:04.000000000 +0200
+++ linux-2.6/mm/swap.c	2007-04-12 12:37:18.000000000 +0200
@@ -550,6 +550,8 @@ void __init swap_setup(void)
 {
 	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 
+	bdi_init(swapper_space.backing_dev_info);
+
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
 		page_cluster = 2;

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* [PATCH 06/10] mm: scalable bdi statistics counters.
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:52   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:52 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: bdi_stat.patch --]
[-- Type: text/plain, Size: 3643 bytes --]

Provide scalable per backing_dev_info statistics counters.
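
As an aside (not part of the patch), a minimal sketch of how callers are
expected to use these helpers; the example_* names are made up here, and
concrete bdi_stat_item values such as BDI_RECLAIMABLE and BDI_WRITEBACK are
only added by later patches in this series:

#include <linux/backing-dev.h>

static void example_account_writeback(struct backing_dev_info *bdi, int start)
{
	/* irq-safe variants; the __-prefixed ones assume irqs are already off */
	if (start)
		inc_bdi_stat(bdi, BDI_WRITEBACK);
	else
		dec_bdi_stat(bdi, BDI_WRITEBACK);
}

static s64 example_read_writeback(struct backing_dev_info *bdi)
{
	/* cheap, possibly slightly stale read of the per-cpu counter */
	return bdi_stat(bdi, BDI_WRITEBACK);
}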

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/backing-dev.h |   50 ++++++++++++++++++++++++++++++++++++++++++--
 mm/backing-dev.c            |   26 ++++++++++++++++++++++
 2 files changed, 74 insertions(+), 2 deletions(-)

Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-12 13:27:13.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-12 13:28:40.000000000 +0200
@@ -8,6 +8,7 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
+#include <linux/percpu_counter.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -24,6 +25,10 @@ enum bdi_state {
 
 typedef int (congested_fn)(void *, int);
 
+enum bdi_stat_item {
+	NR_BDI_STAT_ITEMS
+};
+
 struct backing_dev_info {
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long ra_pages0; /* min readahead on start of file */
@@ -34,14 +39,55 @@ struct backing_dev_info {
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
+
+	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
 };
 
-static inline void bdi_init(struct backing_dev_info *bdi)
+void bdi_init(struct backing_dev_info *bdi);
+void bdi_destroy(struct backing_dev_info *bdi);
+
+static inline void __mod_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item, s32 amount)
+{
+	percpu_counter_mod(&bdi->bdi_stat[item], amount);
+}
+
+static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	__mod_bdi_stat(bdi, item, 1);
+}
+
+static inline void inc_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_bdi_stat(bdi, item);
+	local_irq_restore(flags);
+}
+
+static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
 {
+	__mod_bdi_stat(bdi, item, -1);
+}
+
+static inline void dec_bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__dec_bdi_stat(bdi, item);
+	local_irq_restore(flags);
 }
 
-static inline void bdi_destroy(struct backing_dev_info *bdi)
+static inline s64 bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
 {
+	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
 }
 
 /*
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c	2007-04-12 13:27:10.000000000 +0200
+++ linux-2.6/mm/backing-dev.c	2007-04-12 13:28:26.000000000 +0200
@@ -5,6 +5,30 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 
+void bdi_init(struct backing_dev_info *bdi)
+{
+	int i;
+
+	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
+		return;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_init(&bdi->bdi_stat[i], 0);
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	int i;
+
+	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
+		return;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -70,3 +94,5 @@ long congestion_wait_interruptible(int r
 	return ret;
 }
 EXPORT_SYMBOL(congestion_wait_interruptible);
+
+

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* [PATCH 07/10] mm: count reclaimable pages per BDI
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:52   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:52 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: bdi_stat_reclaimable.patch --]
[-- Type: text/plain, Size: 4411 bytes --]

Count per BDI reclaimable pages; nr_reclaimable = nr_dirty + nr_unstable.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 fs/buffer.c                 |    2 ++
 fs/nfs/write.c              |    7 +++++++
 include/linux/backing-dev.h |    1 +
 mm/page-writeback.c         |    4 ++++
 mm/truncate.c               |    2 ++
 5 files changed, 16 insertions(+)

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c	2007-04-20 15:20:48.000000000 +0200
+++ linux-2.6/fs/buffer.c	2007-04-20 15:26:31.000000000 +0200
@@ -740,6 +740,8 @@ int __set_page_dirty_buffers(struct page
 	if (page->mapping) {	/* Race with truncate? */
 		if (mapping_cap_account_dirty(mapping)) {
 			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			__inc_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:20:48.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
@@ -828,6 +828,8 @@ int __set_page_dirty_nobuffers(struct pa
 			BUG_ON(mapping2 != mapping);
 			if (mapping_cap_account_dirty(mapping)) {
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				__inc_bdi_stat(mapping->backing_dev_info,
+						BDI_RECLAIMABLE);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
@@ -961,6 +963,8 @@ int clear_page_dirty_for_io(struct page 
 		 */
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			return 1;
 		}
 		return 0;
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c	2007-04-20 15:20:48.000000000 +0200
+++ linux-2.6/mm/truncate.c	2007-04-20 15:27:38.000000000 +0200
@@ -71,6 +71,8 @@ void cancel_dirty_page(struct page *page
 		struct address_space *mapping = page->mapping;
 		if (mapping && mapping_cap_account_dirty(mapping)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			if (account_size)
 				task_io_account_cancelled_write(account_size);
 		}
Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c	2007-04-20 15:20:05.000000000 +0200
+++ linux-2.6/fs/nfs/write.c	2007-04-20 15:27:07.000000000 +0200
@@ -449,6 +449,7 @@ nfs_mark_request_commit(struct nfs_page 
 	nfsi->ncommit++;
 	spin_unlock(&nfsi->req_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 #endif
@@ -509,6 +510,8 @@ static void nfs_cancel_commit_list(struc
 	while(!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		nfs_list_remove_request(req);
 		nfs_inode_remove_request(req);
 		nfs_unlock_request(req);
@@ -1234,6 +1237,8 @@ nfs_commit_list(struct inode *inode, str
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
@@ -1258,6 +1263,8 @@ static void nfs_commit_done(struct rpc_t
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 
 		dprintk("NFS: commit (%s/%Ld %d@%Ld)",
 			req->wb_context->dentry->d_inode->i_sb->s_id,
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-20 15:20:48.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-20 15:25:47.000000000 +0200
@@ -26,6 +26,7 @@ enum bdi_state {
 typedef int (congested_fn)(void *, int);
 
 enum bdi_stat_item {
+	BDI_RECLAIMABLE,
 	NR_BDI_STAT_ITEMS
 };
 

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* [PATCH 08/10] mm: count writeback pages per BDI
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:52   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:52 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: bdi_stat_writeback.patch --]
[-- Type: text/plain, Size: 2075 bytes --]

Count per BDI writeback pages.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/backing-dev.h |    1 +
 mm/page-writeback.c         |   12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
@@ -979,14 +979,18 @@ int test_clear_page_writeback(struct pag
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
+		if (ret) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi))
+				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -1002,14 +1006,18 @@ int test_set_page_writeback(struct page 
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
+		if (!ret) {
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi))
+				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-20 15:25:47.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-20 15:28:17.000000000 +0200
@@ -27,6 +27,7 @@ typedef int (congested_fn)(void *, int);
 
 enum bdi_stat_item {
 	BDI_RECLAIMABLE,
+	BDI_WRITEBACK,
 	NR_BDI_STAT_ITEMS
 };
 

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* [PATCH 09/10] mm: expose BDI statistics in sysfs.
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:52   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:52 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: bdi_stat_sysfs.patch --]
[-- Type: text/plain, Size: 1814 bytes --]

Expose the per BDI stats in /sys/block/<dev>/queue/*
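
By way of example (not part of the patch), the new attributes can then be read
from userspace; a small sketch, assuming a device named sda and the attribute
names added below:

#include <stdio.h>

int main(void)
{
	static const char *attrs[] = {
		"/sys/block/sda/queue/reclaimable_pages",
		"/sys/block/sda/queue/writeback_pages",
	};
	char buf[64];
	unsigned int i;

	for (i = 0; i < 2; i++) {
		FILE *f = fopen(attrs[i], "r");

		if (!f)
			continue;
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", attrs[i], buf);
		fclose(f);
	}
	return 0;
}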

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 block/ll_rw_blk.c |   32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

Index: linux-2.6-mm/block/ll_rw_blk.c
===================================================================
--- linux-2.6-mm.orig/block/ll_rw_blk.c
+++ linux-2.6-mm/block/ll_rw_blk.c
@@ -3976,6 +3976,15 @@ static ssize_t queue_max_hw_sectors_show
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
 
+static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%lld\n", bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE));
+}
+
+static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%lld\n", bdi_stat(&q->backing_dev_info, BDI_WRITEBACK));
+}
 
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -4006,6 +4020,16 @@ static struct queue_sysfs_entry queue_ma
 	.show = queue_max_hw_sectors_show,
 };
 
+static struct queue_sysfs_entry queue_reclaimable_entry = {
+	.attr = {.name = "reclaimable_pages", .mode = S_IRUGO },
+	.show = queue_nr_reclaimable_show,
+};
+
+static struct queue_sysfs_entry queue_writeback_entry = {
+	.attr = {.name = "writeback_pages", .mode = S_IRUGO },
+	.show = queue_nr_writeback_show,
+};
+
 static struct queue_sysfs_entry queue_iosched_entry = {
 	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
 	.show = elv_iosched_show,
@@ -4018,6 +4047,8 @@ static struct attribute *default_attrs[]
 	&queue_initial_ra_entry.attr,
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
+	&queue_reclaimable_entry.attr,
+	&queue_writeback_entry.attr,
 	&queue_iosched_entry.attr,
 	NULL,
 };

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* [PATCH 10/10] mm: per device dirty threshold
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-20 15:52   ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-20 15:52 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: miklos, akpm, neilb, dgc, tomoki.sekiyama.qu, a.p.zijlstra,
	nikita, trond.myklebust, yingchao.zhou

[-- Attachment #1: writeback-balance-per-backing_dev.patch --]
[-- Type: text/plain, Size: 12897 bytes --]

Scale writeback cache per backing device, proportional to its writeout speed.

By decoupling the BDI dirty thresholds, a number of problems we currently have
will go away, namely:

 - mutual interference starvation (for any number of BDIs);
 - deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).

It might be that all dirty pages are for a single BDI while other BDIs are
idling. By giving each BDI a 'fair' share of the dirty limit, each one can have
dirty pages outstanding and make progress.

A global threshold also creates a deadlock for stacked BDIs; when A writes to
B, and A generates enough dirty pages to get throttled, B will never start
writeback until the dirty pages go away. Again, by giving each BDI its own
'independent' dirty limit, this problem is avoided.

So the problem is to determine how to distribute the total dirty limit across
the BDIs fairly and efficiently. A BDI that has a large dirty limit but does
not have any dirty pages outstanding is a waste.
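
Concretely, each BDI's limit is the global dirty threshold scaled by that
BDI's share of recent writeout completions. A simplified sketch of the split
that get_dirty_limits() below performs (names here are illustrative):

static long example_bdi_thresh(long dirty_thresh, long bdi_completions,
			       long total_completions)
{
	/* this BDI's fraction of all writeout completions in the period */
	long long tmp = (long long)dirty_thresh * bdi_completions;

	/* total_completions is never zero: get_writeout_scale() always adds
	 * a half-cycle base to the divisor */
	return (long)(tmp / total_completions);
}

The real code additionally clips the result against a reserve: the globally
unused headroom below the dirty limit plus the pages this BDI already has
dirty or under writeback, so an idle BDI cannot claim more than what is left.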

What is done is to keep a floating proportion between the BDIs based on writeback
writeback completions. This way faster/more active devices get a larger share
than slower/idle devices.
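
A rough worked example (numbers invented): if device A completed 900 writeouts
during the current period and device B completed 100, A gets roughly 90% of
the dirty limit and B roughly 10%. Each elapsed period the per-BDI completion
counts are approximately halved (the 'if (++i > thresh) i /= 2' approximation
described in the code below), so the share of a device that goes idle decays
instead of persisting. A sketch of that lazy decay, mirroring what
bdi_writeout_norm() does for each period a BDI slept through:

static unsigned long example_decay(unsigned long count, int periods_missed)
{
	/* subtract half, rounded up, once per missed period; each step
	 * leaves count / 2 behind, just like bdi_writeout_norm() */
	while (periods_missed-- > 0)
		count -= (count + 1) >> 1;
	return count;
}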

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/backing-dev.h |   51 ++++++++++++
 mm/backing-dev.c            |    3 
 mm/page-writeback.c         |  181 ++++++++++++++++++++++++++++++++++++--------
 3 files changed, 206 insertions(+), 29 deletions(-)

Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-20 15:28:17.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-20 15:33:59.000000000 +0200
@@ -28,6 +28,7 @@ typedef int (congested_fn)(void *, int);
 enum bdi_stat_item {
 	BDI_RECLAIMABLE,
 	BDI_WRITEBACK,
+	BDI_WRITEOUT,
 	NR_BDI_STAT_ITEMS
 };
 
@@ -43,6 +44,13 @@ struct backing_dev_info {
 	void *unplug_io_data;
 
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
+
+	/*
+	 * data used for scaling the writeback cache
+	 */
+	spinlock_t lock;	/* protect the cycle count */
+	unsigned long cycles;	/* writeout cycles */
+	int dirty_exceeded;
 };
 
 void bdi_init(struct backing_dev_info *bdi);
@@ -54,6 +62,12 @@ static inline void __mod_bdi_stat(struct
 	percpu_counter_mod(&bdi->bdi_stat[item], amount);
 }
 
+static inline void __mod_bdi_stat64(struct backing_dev_info *bdi,
+		enum bdi_stat_item item, s64 amount)
+{
+	percpu_counter_mod64(&bdi->bdi_stat[item], amount);
+}
+
 static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item)
 {
@@ -86,12 +100,49 @@ static inline void dec_bdi_stat(struct b
 	local_irq_restore(flags);
 }
 
+static inline s64 __bdi_stat(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	return percpu_counter_read(&bdi->bdi_stat[item]);
+}
+
 static inline s64 bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item)
 {
 	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
 }
 
+static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	return percpu_counter_sum(&bdi->bdi_stat[item]);
+}
+
+static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
+		enum bdi_stat_item item)
+{
+	s64 sum;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sum = __bdi_stat_sum(bdi, item);
+	local_irq_restore(flags);
+
+	return sum;
+}
+
+/*
+ * maximal error of a stat counter.
+ */
+static inline unsigned long bdi_stat_delta(void)
+{
+#ifdef CONFIG_SMP
+	return NR_CPUS * FBC_BATCH;
+#else
+	return 1UL;
+#endif
+}
+
 /*
  * Flags in backing_dev_info::capability
  * - The first two flags control whether dirty pages will contribute to the
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2007-04-20 15:35:01.000000000 +0200
@@ -49,8 +49,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +101,88 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by tracking a floating average per BDI and a global floating
+ * average. We optimize away the '/= 2' for the global average by noting that:
+ *
+ *  if (++i > thresh) i /= 2:
+ *
+ * Can be approximated by:
+ *
+ *   thresh/2 + (++i % thresh/2)
+ *
+ * Furthermore, when we choose thresh to be 2^n it can be written in terms of
+ * binary operations and wraparound artifacts disappear.
+ *
+ * Also note that this yields a natural counter of the elapsed periods:
+ *
+ *   i / thresh
+ *
+ * Its monotonically increasing property can be applied to mitigate the wrap-
+ * around issue.
+ */
+static int vm_cycle_shift __read_mostly;
+static struct percpu_counter vm_writeout_total;
+
+/*
+ * Sync up the per BDI average to the global cycle.
+ */
+static void bdi_writeout_norm(struct backing_dev_info *bdi)
+{
+	int bits = vm_cycle_shift;
+	unsigned long cycle = 1UL << bits;
+	unsigned long mask = ~(cycle - 1);
+	unsigned long global_cycle = percpu_counter_read(&vm_writeout_total);
+	unsigned long flags;
+
+	global_cycle <<= 1;
+	global_cycle &= mask;
+
+	if ((bdi->cycles & mask) == global_cycle)
+		return;
+
+	spin_lock_irqsave(&bdi->lock, flags);
+	bdi->cycles &= mask;
+	while (bdi->cycles != global_cycle) {
+		unsigned long val = __bdi_stat(bdi, BDI_WRITEOUT);
+		unsigned long half = (val + 1) >> 1;
+
+		if (!val)
+			break;
+
+		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
+		bdi->cycles += cycle;
+	}
+	bdi->cycles = global_cycle;
+	spin_unlock_irqrestore(&bdi->lock, flags);
+}
+
+static void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	bdi_writeout_norm(bdi);
+
+	__inc_bdi_stat(bdi, BDI_WRITEOUT);
+	percpu_counter_mod(&vm_writeout_total, 1);
+}
+
+void get_writeout_scale(struct backing_dev_info *bdi, long *scale, long *div)
+{
+	int bits = vm_cycle_shift - 1;
+	unsigned long cycle = 1UL << bits;
+	unsigned long mask = cycle - 1;
+	unsigned long total = percpu_counter_read(&vm_writeout_total);
+
+	if (bdi_cap_writeback_dirty(bdi)) {
+		bdi_writeout_norm(bdi);
+		*scale = bdi_stat(bdi, BDI_WRITEOUT);
+	} else
+		*scale = 0;
+
+	*div = cycle + (total & mask);
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -158,8 +238,8 @@ static unsigned long determine_dirtyable
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-					struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +273,30 @@ get_dirty_limits(long *pbackground, long
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		long long tmp = dirty;
+		long reserve;
+		long scale, div;
+
+		get_writeout_scale(bdi, &scale, &div);
+
+		tmp *= scale;
+		do_div(tmp, div);
+
+		reserve = dirty -
+			(global_page_state(NR_FILE_DIRTY) +
+			 global_page_state(NR_WRITEBACK) +
+			 global_page_state(NR_UNSTABLE_NFS));
+
+		if (reserve < 0)
+			reserve = 0;
+
+		reserve += bdi_stat(bdi, BDI_RECLAIMABLE) +
+			bdi_stat(bdi, BDI_WRITEBACK);
+
+		*pbdi_dirty = min((long)tmp, reserve);
+	}
 }
 
 /*
@@ -204,9 +308,11 @@ get_dirty_limits(long *pbackground, long
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +327,15 @@ static void balance_dirty_pages(struct a
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 				break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,16 +343,27 @@ static void balance_dirty_pages(struct a
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					 	&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
+
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+				       &bdi_thresh, bdi);
+
+			if (bdi_thresh < 2*bdi_stat_delta()) {
+				bdi_nr_reclaimable =
+					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+				bdi_nr_writeback =
+					bdi_stat_sum(bdi, BDI_WRITEBACK);
+			} else {
+				bdi_nr_reclaimable =
+					bdi_stat(bdi, BDI_RECLAIMABLE);
+				bdi_nr_writeback =
+					bdi_stat(bdi, BDI_WRITEBACK);
+			}
+
+			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+				break;
+
 			pages_written += write_chunk - wbc.nr_to_write;
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
@@ -254,9 +371,9 @@ static void balance_dirty_pages(struct a
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-		<= dirty_thresh && dirty_exceeded)
-			dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +387,9 @@ static void balance_dirty_pages(struct a
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	     (!laptop_mode && (nr_reclaimable > background_thresh)))
+			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+					  + global_page_state(NR_UNSTABLE_NFS)
+					  > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +425,7 @@ void balance_dirty_pages_ratelimited_nr(
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -342,7 +461,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
 	}
 
         for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -377,7 +496,7 @@ static void background_writeout(unsigned
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -585,6 +704,8 @@ void __init page_writeback_init(void)
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+	vm_cycle_shift = 1 + ilog2(vm_total_pages);
+	percpu_counter_init(&vm_writeout_total, 0);
 }
 
 /**
@@ -988,8 +1109,10 @@ int test_clear_page_writeback(struct pag
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_writeback_dirty(bdi))
+			if (bdi_cap_writeback_dirty(bdi)) {
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c	2007-04-20 15:20:11.000000000 +0200
+++ linux-2.6/mm/backing-dev.c	2007-04-20 15:31:42.000000000 +0200
@@ -12,6 +12,9 @@ void bdi_init(struct backing_dev_info *b
 	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
 		return;
 
+	spin_lock_init(&bdi->lock);
+	bdi->cycles = 0;
+	bdi->dirty_exceeded = 0;
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_init(&bdi->bdi_stat[i], 0);
 }

-- 


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 03/10] lib: dampen the percpu_counter FBC_BATCH
  2007-04-20 15:51   ` Peter Zijlstra
@ 2007-04-21  9:55     ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:51:57 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> With the current logic the percpu_counter's accuracy delta is quadratic
> wrt the number of cpus in the system, reduce this to O(n ln n).
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/percpu_counter.h |    7 ++-----
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6-mm/include/linux/percpu_counter.h
> ===================================================================
> --- linux-2.6-mm.orig/include/linux/percpu_counter.h
> +++ linux-2.6-mm/include/linux/percpu_counter.h
> @@ -11,6 +11,7 @@
>  #include <linux/threads.h>
>  #include <linux/percpu.h>
>  #include <linux/types.h>
> +#include <linux/log2.h>
>  
>  #ifdef CONFIG_SMP
>  
> @@ -20,11 +21,7 @@ struct percpu_counter {
>  	s32 *counters;
>  };
>  
> -#if NR_CPUS >= 16
> -#define FBC_BATCH	(NR_CPUS*2)
> -#else
> -#define FBC_BATCH	(NR_CPUS*4)
> -#endif
> +#define FBC_BATCH	(8*ilog2(NR_CPUS))
>  
>  static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
>  {

I worry that this might be too small when there are hundreds of CPUs online.

With 1024 CPUs we go for the lock once per 80 counts.  That's not much. 

If we have 1024 CPUs, each one of which is incrementing this counter at N
Hz, we have 1024/80=12 CPUs all going for the same lock at N Hz.  It could
get bad.
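
For concreteness, the arithmetic can be checked with a throwaway user-space
program (editor's sketch, not part of the patch set) comparing the old and new
FBC_BATCH formulas and the rough worst-case drift of one counter, which is
about NR_CPUS * FBC_BATCH:

#include <stdio.h>

/* mirror of the kernel's ilog2() for this sketch */
static int ilog2(unsigned int n)
{
	int l = -1;

	while (n) {
		n >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	unsigned int cpus[] = { 2, 16, 128, 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned int n = cpus[i];
		unsigned int old_batch = (n >= 16) ? n * 2 : n * 4;
		unsigned int new_batch = 8 * ilog2(n);

		printf("%4u cpus: batch %4u -> %3u, max drift %7u -> %6u\n",
		       n, old_batch, new_batch, n * old_batch, n * new_batch);
	}
	return 0;
}

At 1024 CPUs the drift bound drops from roughly 2M to roughly 80K counts,
which is presumably the gain being bought; the price is the once-per-80-counts
locking noted above.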

But I don't know what the gain is for this loss.  Your changelog should
have told us.

What problem is this patch solving?

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 03/10] lib: dampen the percpu_counter FBC_BATCH
@ 2007-04-21  9:55     ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:51:57 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> With the current logic the percpu_counter's accuracy delta is quadratic
> wrt the number of cpus in the system, reduce this to O(n ln n).
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/percpu_counter.h |    7 ++-----
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6-mm/include/linux/percpu_counter.h
> ===================================================================
> --- linux-2.6-mm.orig/include/linux/percpu_counter.h
> +++ linux-2.6-mm/include/linux/percpu_counter.h
> @@ -11,6 +11,7 @@
>  #include <linux/threads.h>
>  #include <linux/percpu.h>
>  #include <linux/types.h>
> +#include <linux/log2.h>
>  
>  #ifdef CONFIG_SMP
>  
> @@ -20,11 +21,7 @@ struct percpu_counter {
>  	s32 *counters;
>  };
>  
> -#if NR_CPUS >= 16
> -#define FBC_BATCH	(NR_CPUS*2)
> -#else
> -#define FBC_BATCH	(NR_CPUS*4)
> -#endif
> +#define FBC_BATCH	(8*ilog2(NR_CPUS))
>  
>  static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
>  {

I worry that this might be too small when there are hundreds of CPUs online.

With 1024 CPUs we go for the lock once per 80 counts.  That's not much. 

If we have 1024 CPUs, each one of which is incrementing this counter at N
Hz, we have 1024/80=12 CPUs all going for the same lock at N Hz.  It could
get bad.

But I don't know what the gain is for this loss.  Your changelog should
have told us.

What problem is this patch solving?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
  2007-04-20 15:51   ` Peter Zijlstra
@ 2007-04-21  9:55     ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:51:58 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Add percpu_counter_mod64() to allow large modifications.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/percpu_counter.h |    9 +++++++++
>  lib/percpu_counter.c           |   28 ++++++++++++++++++++++++++++
>  2 files changed, 37 insertions(+)
> 
> Index: linux-2.6/include/linux/percpu_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/percpu_counter.h	2007-04-12 13:54:55.000000000 +0200
> +++ linux-2.6/include/linux/percpu_counter.h	2007-04-12 14:00:21.000000000 +0200
> @@ -36,6 +36,7 @@ static inline void percpu_counter_destro
>  }
>  
>  void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
> +void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount);
>  s64 percpu_counter_sum(struct percpu_counter *fbc);
>  
>  static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> @@ -81,6 +82,14 @@ percpu_counter_mod(struct percpu_counter
>  	preempt_enable();
>  }
>  
> +static inline void
> +percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
> +{
> +	preempt_disable();
> +	fbc->count += amount;
> +	preempt_enable();
> +}
> +
>  static inline s64 percpu_counter_read(struct percpu_counter *fbc)
>  {
>  	return fbc->count;
> Index: linux-2.6/lib/percpu_counter.c
> ===================================================================
> --- linux-2.6.orig/lib/percpu_counter.c	2006-07-31 13:07:38.000000000 +0200
> +++ linux-2.6/lib/percpu_counter.c	2007-04-12 14:17:12.000000000 +0200
> @@ -25,6 +25,34 @@ void percpu_counter_mod(struct percpu_co
>  }
>  EXPORT_SYMBOL(percpu_counter_mod);
>  
> +void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
> +{
> +	long count;
> +	s32 *pcount;
> +	int cpu;
> +
> +	if (amount >= FBC_BATCH || amount <= -FBC_BATCH) {
> +		spin_lock(&fbc->lock);
> +		fbc->count += amount;
> +		spin_unlock(&fbc->lock);
> +		return;
> +	}

This is wrong, a little.

If the counter was at -FBC_BATCH/2 and the caller passed in FBC_BATCH, we
could just set the cpu-local counter to FBC_BATCH/2 instead of going for
the lock.

Probably doesn't matter though.
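
A sketch of that alternative (editor's illustration, not a proposed
replacement): fold the amount into the cpu-local delta first and only take the
lock when the combined value actually leaves the batch window, so the
large-amount special case above disappears:

void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
{
	s64 count;
	s32 *pcount;
	int cpu;

	cpu = get_cpu();
	pcount = per_cpu_ptr(fbc->counters, cpu);
	count = *pcount + amount;
	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
		/* combined delta left the window: flush it under the lock */
		spin_lock(&fbc->lock);
		fbc->count += count;
		*pcount = 0;
		spin_unlock(&fbc->lock);
	} else {
		/* still within +/-FBC_BATCH, fits the s32 per-cpu slot */
		*pcount = count;
	}
	put_cpu();
}

With that, -FBC_BATCH/2 plus FBC_BATCH simply leaves FBC_BATCH/2 in the
per-cpu slot, as described.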

> +	cpu = get_cpu();
> +	pcount = per_cpu_ptr(fbc->counters, cpu);
> +	count = *pcount + amount;
> +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> +		spin_lock(&fbc->lock);
> +		fbc->count += count;
> +		*pcount = 0;
> +		spin_unlock(&fbc->lock);
> +	} else {
> +		*pcount = count;
> +	}
> +	put_cpu();
> +}
> +EXPORT_SYMBOL(percpu_counter_mod64);

Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
PAE has only 64,000,000 pages and won't be using deltas of more than 4
gigapages?

<Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
another changelog bug>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
@ 2007-04-21  9:55     ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:51:58 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Add percpu_counter_mod64() to allow large modifications.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/percpu_counter.h |    9 +++++++++
>  lib/percpu_counter.c           |   28 ++++++++++++++++++++++++++++
>  2 files changed, 37 insertions(+)
> 
> Index: linux-2.6/include/linux/percpu_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/percpu_counter.h	2007-04-12 13:54:55.000000000 +0200
> +++ linux-2.6/include/linux/percpu_counter.h	2007-04-12 14:00:21.000000000 +0200
> @@ -36,6 +36,7 @@ static inline void percpu_counter_destro
>  }
>  
>  void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
> +void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount);
>  s64 percpu_counter_sum(struct percpu_counter *fbc);
>  
>  static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> @@ -81,6 +82,14 @@ percpu_counter_mod(struct percpu_counter
>  	preempt_enable();
>  }
>  
> +static inline void
> +percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
> +{
> +	preempt_disable();
> +	fbc->count += amount;
> +	preempt_enable();
> +}
> +
>  static inline s64 percpu_counter_read(struct percpu_counter *fbc)
>  {
>  	return fbc->count;
> Index: linux-2.6/lib/percpu_counter.c
> ===================================================================
> --- linux-2.6.orig/lib/percpu_counter.c	2006-07-31 13:07:38.000000000 +0200
> +++ linux-2.6/lib/percpu_counter.c	2007-04-12 14:17:12.000000000 +0200
> @@ -25,6 +25,34 @@ void percpu_counter_mod(struct percpu_co
>  }
>  EXPORT_SYMBOL(percpu_counter_mod);
>  
> +void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
> +{
> +	long count;
> +	s32 *pcount;
> +	int cpu;
> +
> +	if (amount >= FBC_BATCH || amount <= -FBC_BATCH) {
> +		spin_lock(&fbc->lock);
> +		fbc->count += amount;
> +		spin_unlock(&fbc->lock);
> +		return;
> +	}

This is wrong, a little.

If the counter was at -FBC_BATCH/2 and the caller passed in FBC_BATCH, we
could just set the cpu-local counter to FBC_BATCH/2 instead of going for
the lock.

Probably doesn't matter though.

> +	cpu = get_cpu();
> +	pcount = per_cpu_ptr(fbc->counters, cpu);
> +	count = *pcount + amount;
> +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> +		spin_lock(&fbc->lock);
> +		fbc->count += count;
> +		*pcount = 0;
> +		spin_unlock(&fbc->lock);
> +	} else {
> +		*pcount = count;
> +	}
> +	put_cpu();
> +}
> +EXPORT_SYMBOL(percpu_counter_mod64);

Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
PAE has only 64,000,000 pages and won't be using deltas of more than 4
gigapages?

<Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
another changelog bug>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 07/10] mm: count reclaimable pages per BDI
  2007-04-20 15:52   ` Peter Zijlstra
@ 2007-04-21  9:55     ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:01 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Count per BDI reclaimable pages; nr_reclaimable = nr_dirty + nr_unstable.

hm.  Aggregating dirty and unstable at inc/dec time is a bit kludgy.  If
later on we want to know just "dirty" then we're in trouble.

I can see the logic behind it though.

Perhaps one could have separate BDI_DIRTY and BDI_UNSTABLE and treat them
separately at inc/dec time, but give them the same numerical value, so
they in fact refer to the same counter.  That's kludgy too.
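
One way to spell out that aliasing idea (names and values below are the
editor's reading of the suggestion, not from the patches): keep distinct
enumerators for the two call sites but give them the same slot, so the
inc/dec paths stay self-documenting while the storage stays merged:

enum bdi_stat_item {
	BDI_DIRTY,
	BDI_UNSTABLE = BDI_DIRTY,	/* alias: shares the counter */
	BDI_RECLAIMABLE = BDI_DIRTY,	/* name used when reading the sum */
	BDI_WRITEBACK,
	NR_BDI_STAT_ITEMS
};

The downside is the one already noted: "just dirty" still cannot be recovered
later; the aliases only keep the call sites readable.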

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 07/10] mm: count reclaimable pages per BDI
@ 2007-04-21  9:55     ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:01 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Count per BDI reclaimable pages; nr_reclaimable = nr_dirty + nr_unstable.

hm.  Aggregating dirty and unstable at inc/dec time is a bit kludgy.  If
later on we want to know just "dirty" then we're in trouble.

I can see the logic behind it though.

Perhaps one could have separate BDI_DIRTY and BDI_UNSTABLE and treat them
separately at inc/dec time, but give them the same numerical value, so
they in fact refer to the same counter.  That's kludgy too.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 08/10] mm: count writeback pages per BDI
  2007-04-20 15:52   ` Peter Zijlstra
@ 2007-04-21  9:55     ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:02 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Count per BDI writeback pages.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/backing-dev.h |    1 +
>  mm/page-writeback.c         |   12 ++++++++++--
>  2 files changed, 11 insertions(+), 2 deletions(-)
> 
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
> +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> @@ -979,14 +979,18 @@ int test_clear_page_writeback(struct pag
>  	int ret;
>  
>  	if (mapping) {
> +		struct backing_dev_info *bdi = mapping->backing_dev_info;
>  		unsigned long flags;
>  
>  		write_lock_irqsave(&mapping->tree_lock, flags);
>  		ret = TestClearPageWriteback(page);
> -		if (ret)
> +		if (ret) {
>  			radix_tree_tag_clear(&mapping->page_tree,
>  						page_index(page),
>  						PAGECACHE_TAG_WRITEBACK);
> +			if (bdi_cap_writeback_dirty(bdi))
> +				__dec_bdi_stat(bdi, BDI_WRITEBACK);

Why do we test bdi_cap_writeback_dirty() here?

If we remove that test, we end up accumulating statistics for
non-writebackable backing devs, but does that matter?  Probably the common
case is writebackable backing-devs, so eliminating the test-n-branch might
be a net microgain.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 08/10] mm: count writeback pages per BDI
@ 2007-04-21  9:55     ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:02 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Count per BDI writeback pages.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/backing-dev.h |    1 +
>  mm/page-writeback.c         |   12 ++++++++++--
>  2 files changed, 11 insertions(+), 2 deletions(-)
> 
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
> +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> @@ -979,14 +979,18 @@ int test_clear_page_writeback(struct pag
>  	int ret;
>  
>  	if (mapping) {
> +		struct backing_dev_info *bdi = mapping->backing_dev_info;
>  		unsigned long flags;
>  
>  		write_lock_irqsave(&mapping->tree_lock, flags);
>  		ret = TestClearPageWriteback(page);
> -		if (ret)
> +		if (ret) {
>  			radix_tree_tag_clear(&mapping->page_tree,
>  						page_index(page),
>  						PAGECACHE_TAG_WRITEBACK);
> +			if (bdi_cap_writeback_dirty(bdi))
> +				__dec_bdi_stat(bdi, BDI_WRITEBACK);

Why do we test bdi_cap_writeback_dirty() here?

If we remove that test, we end up accumulating statistics for
non-writebackable backing devs, but does that matter?  Probably the common
case is writebackable backing-devs, so eliminating the test-n-branch might
be a net microgain.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 09/10] mm: expose BDI statistics in sysfs.
  2007-04-20 15:52   ` Peter Zijlstra
@ 2007-04-21  9:55     ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:03 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Expose the per BDI stats in /sys/block/<dev>/queue/*
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  block/ll_rw_blk.c |   32 ++++++++++++++++++++++++++++++++
>  1 file changed, 32 insertions(+)
> 
> Index: linux-2.6-mm/block/ll_rw_blk.c
> ===================================================================
> --- linux-2.6-mm.orig/block/ll_rw_blk.c
> +++ linux-2.6-mm/block/ll_rw_blk.c
> @@ -3976,6 +3976,15 @@ static ssize_t queue_max_hw_sectors_show
>  	return queue_var_show(max_hw_sectors_kb, (page));
>  }
>  
> +static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
> +{
> +	return sprintf(page, "%lld\n", bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE));
> +}

We try to present memory statistics to userspace in bytes or kbytes rather
than number-of-pages.  Because page-size varies between architectures and
between .configs.  Displaying number-of-pages is just inviting people to write
it-broke-when-i-moved-it-to-ia64 applications.

Plus kbytes is a bit more user-friendly, particularly when the user will
want to compare these numbers to /proc/meminfo, for example.

Using %llu might be more appropriate than %lld.
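
A hypothetical reworking along those lines (editor's sketch; reporting kB is
Andrew's suggestion, not something the patch does):

static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
{
	unsigned long long kb = (unsigned long long)
		bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE)
			<< (PAGE_SHIFT - 10);

	return sprintf(page, "%llu\n", kb);
}

The << (PAGE_SHIFT - 10) converts pages to kilobytes, matching how the
existing *_kb queue attributes are reported.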


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 09/10] mm: expose BDI statistics in sysfs.
@ 2007-04-21  9:55     ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:03 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Expose the per BDI stats in /sys/block/<dev>/queue/*
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  block/ll_rw_blk.c |   32 ++++++++++++++++++++++++++++++++
>  1 file changed, 32 insertions(+)
> 
> Index: linux-2.6-mm/block/ll_rw_blk.c
> ===================================================================
> --- linux-2.6-mm.orig/block/ll_rw_blk.c
> +++ linux-2.6-mm/block/ll_rw_blk.c
> @@ -3976,6 +3976,15 @@ static ssize_t queue_max_hw_sectors_show
>  	return queue_var_show(max_hw_sectors_kb, (page));
>  }
>  
> +static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
> +{
> +	return sprintf(page, "%lld\n", bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE));
> +}

We try to present memory statistics to userspace in bytes or kbytes rather
than number-of-pages.  Because page-size varies between architectures and
between .configs.  Displaying number-of-pages is just inviting people to write
it-broke-when-i-moved-it-to-ia64 applications.

Plus kbytes is a bit more user-friendly, particularly when the user will
want to compare these numbers to /proc/meminfo, for example.

Using %llu might be more appropriate than %lld.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-20 15:52   ` Peter Zijlstra
@ 2007-04-21  9:55     ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:04 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Scale writeback cache per backing device, proportional to its writeout speed.
> 
> By decoupling the BDI dirty thresholds a number of problems we currently have
> will go away, namely:
> 
>  - mutual interference starvation (for any number of BDIs);
>  - deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).
> 
> It might be that all dirty pages are for a single BDI while other BDIs are
> idling. By giving each BDI a 'fair' share of the dirty limit, each one can have
> dirty pages outstanding and make progress.
> 
> A global threshold also creates a deadlock for stacked BDIs; when A writes to
> B, and A generates enough dirty pages to get throttled, B will never start
> writeback until the dirty pages go away. Again, by giving each BDI its own
> 'independent' dirty limit, this problem is avoided.
> 
> So the problem is to determine how to distribute the total dirty limit across
> the BDIs fairly and efficiently. A BDI that has a large dirty limit but does
> not have any dirty pages outstanding is a waste.
> 
> What is done is to keep a floating proportion between the BDIs based on
> writeback completions. This way faster/more active devices get a larger share
> than slower/idle devices.

This is a pretty major improvement to various nasty corner-cases, if it
works.

Does it work?  Please describe the testing you did, and the results.

Has this been confirmed to fix Miklos's FUSE and loopback problems?


> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/backing-dev.h |   51 ++++++++++++
>  mm/backing-dev.c            |    3 
>  mm/page-writeback.c         |  181 ++++++++++++++++++++++++++++++++++++--------
>  3 files changed, 206 insertions(+), 29 deletions(-)
> 
> Index: linux-2.6/include/linux/backing-dev.h
> ===================================================================
> --- linux-2.6.orig/include/linux/backing-dev.h	2007-04-20 15:28:17.000000000 +0200
> +++ linux-2.6/include/linux/backing-dev.h	2007-04-20 15:33:59.000000000 +0200
> @@ -28,6 +28,7 @@ typedef int (congested_fn)(void *, int);
>  enum bdi_stat_item {
>  	BDI_RECLAIMABLE,
>  	BDI_WRITEBACK,
> +	BDI_WRITEOUT,
>  	NR_BDI_STAT_ITEMS
>  };

Whoa, head is now swimming.  What's the difference between "writeback" and
"writeout"?

> @@ -43,6 +44,13 @@ struct backing_dev_info {
>  	void *unplug_io_data;
>  
>  	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
> +
> +	/*
> +	 * data used for scaling the writeback cache
> +	 */
> +	spinlock_t lock;	/* protect the cycle count */
> +	unsigned long cycles;	/* writeout cycles */
> +	int dirty_exceeded;
>  };
>  
>  void bdi_init(struct backing_dev_info *bdi);
> @@ -54,6 +62,12 @@ static inline void __mod_bdi_stat(struct
>  	percpu_counter_mod(&bdi->bdi_stat[item], amount);
>  }
>  
> +static inline void __mod_bdi_stat64(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item, s64 amount)
> +{
> +	percpu_counter_mod64(&bdi->bdi_stat[item], amount);
> +}
> +
>  static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
>  		enum bdi_stat_item item)
>  {
> @@ -86,12 +100,49 @@ static inline void dec_bdi_stat(struct b
>  	local_irq_restore(flags);
>  }
>  
> +static inline s64 __bdi_stat(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item)
> +{
> +	return percpu_counter_read(&bdi->bdi_stat[item]);
> +}
> +
>  static inline s64 bdi_stat(struct backing_dev_info *bdi,
>  		enum bdi_stat_item item)
>  {
>  	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
>  }

So here, the __ means "it doesn't do the force-it-positive" treatment.

> +static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item)
> +{
> +	return percpu_counter_sum(&bdi->bdi_stat[item]);
> +}
> +
> +static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item)
> +{
> +	s64 sum;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	sum = __bdi_stat_sum(bdi, item);
> +	local_irq_restore(flags);
> +
> +	return sum;
> +}

And here __ means "not safe to use if this counter is updated from
interrupt context".

At least, I think that's what it all means.  The lack of code comments
casts some doubt.


The interfaces here could do with a little more thought wrt regularity,
naming and commenting, methinks.
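
For instance, the intended contract could be captured in one block near the
accessors (the wording below is the editor's guess at that contract, to be
confirmed against the code):

/*
 * bdi_stat()       - approximate value, clamped to >= 0; cheap read.
 * __bdi_stat()     - approximate value, may be transiently negative.
 * bdi_stat_sum()   - exact value; sums all per-cpu deltas, irq safe.
 * __bdi_stat_sum() - exact value; caller must have irqs disabled if the
 *                    counter is also modified from interrupt context.
 */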


> +/*
> + * maximal error of a stat counter.
> + */
> +static inline unsigned long bdi_stat_delta(void)
> +{
> +#ifdef CONFIG_SMP
> +	return NR_CPUS * FBC_BATCH;

This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.

> +#else
> +	return 1UL;

The UL is pretty pointless IMO.  The compiler will happily convert "1" to
unsigned long here.  And if we later change the return type to signed char,
we don't have to remember to edit this line too.

> +#endif
> +}
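
Taking the two comments above together, a hypothetical variant would bound the
error by the CPUs that can actually carry a per-cpu delta rather than by
CONFIG_NR_CPUS (num_possible_cpus() is an existing helper; whether it is the
right bound here is an open question):

static inline unsigned long bdi_stat_delta(void)
{
#ifdef CONFIG_SMP
	return num_possible_cpus() * FBC_BATCH;
#else
	return 1;
#endif
}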
>
>  /*
>   * Flags in backing_dev_info::capability
>   * - The first two flags control whether dirty pages will contribute to the
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:35:01.000000000 +0200
> @@ -49,8 +49,6 @@
>   */
>  static long ratelimit_pages = 32;
>  
> -static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
> -
>  /*
>   * When balance_dirty_pages decides that the caller needs to perform some
>   * non-background writeback, this is how many pages it will attempt to write.
> @@ -103,6 +101,88 @@ EXPORT_SYMBOL(laptop_mode);
>  static void background_writeout(unsigned long _min_pages);
>  
>  /*
> + * Scale the writeback cache size proportional to the relative writeout speeds.
> + *
> + * We do this by tracking a floating average per BDI and a global floating
> + * average. We optimize away the '/= 2' for the global average by noting that:
> + *
> + *  if (++i > thresh) i /= 2:
> + *
> + * Can be approximated by:
> + *
> + *   thresh/2 + (++i % thresh/2)
> + *
> + * Furthermore, when we choose thresh to be 2^n it can be written in terms of
> + * binary operations and wraparound artifacts disappear.
> + *
> + * Also note that this yields a natural counter of the elapsed periods:
> + *
> + *   i / thresh
> + *
> + * Its monotonically increasing property can be applied to mitigate the wrap-
> + * around issue.
> + */
> +static int vm_cycle_shift __read_mostly;
> +static struct percpu_counter vm_writeout_total;
> +
> +/*
> + * Sync up the per BDI average to the global cycle.
> + */
> +static void bdi_writeout_norm(struct backing_dev_info *bdi)
> +{
> +	int bits = vm_cycle_shift;
> +	unsigned long cycle = 1UL << bits;
> +	unsigned long mask = ~(cycle - 1);
> +	unsigned long global_cycle = percpu_counter_read(&vm_writeout_total);
> +	unsigned long flags;
> +
> +	global_cycle <<= 1;
> +	global_cycle &= mask;
> +
> +	if ((bdi->cycles & mask) == global_cycle)
> +		return;
> +
> +	spin_lock_irqsave(&bdi->lock, flags);
> +	bdi->cycles &= mask;
> +	while (bdi->cycles != global_cycle) {
> +		unsigned long val = __bdi_stat(bdi, BDI_WRITEOUT);
> +		unsigned long half = (val + 1) >> 1;
> +
> +		if (!val)
> +			break;
> +
> +		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
> +		bdi->cycles += cycle;
> +	}
> +	bdi->cycles = global_cycle;
> +	spin_unlock_irqrestore(&bdi->lock, flags);
> +}

Here we get to the real critical substance of the patchset, and I don't
have a clue what it's doing nor how it's doing it.  And I bet nobody else
does either.

<continues to wonder wtf "writeout" is.  Perhaps knowing that would help>

I dunno.  I'm sure it's very good code but I don't have the time nor
inclination to reverse engineer the design from the implementation.

This is a very important part of the kernel - one of the few most important
parts, really.  See all the crap going around about CPU schedulers at
present?  Well hoo-boy, if we get this part of code even a little bit
wrong, they won't know what hit them.

So please, spend quite a lot of time thinking about how we can make this
code as comprehensible and approachable and maintainable as possible. 
Often this is done with comments ;)
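
As a reading aid, here is an editor's toy model of what the code above appears
to do (assuming the halve-per-period reading is right): two devices complete
writeout at a 3:1 rate, and every time the global count crosses a period
boundary each per-device count is halved, so the per-device counts settle into
the same 3:1 proportion while still tracking recent behaviour:

#include <stdio.h>

#define PERIOD 1024			/* stand-in for 1 << vm_cycle_shift */

static unsigned long total, dev[2];

static void writeout(int d)
{
	dev[d]++;
	if (++total % PERIOD == 0) {	/* period boundary: age all devices */
		dev[0] -= dev[0] / 2;
		dev[1] -= dev[1] / 2;
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 100000; i++) {
		writeout(0);		/* fast device: three writeouts... */
		writeout(0);
		writeout(0);
		writeout(1);		/* ...per writeout on the slow one */
	}
	printf("dev0: %lu  dev1: %lu  (~3:1)\n", dev[0], dev[1]);
	return 0;
}

The per-device share used for the dirty limit would then be roughly
dev[i] / (dev[0] + dev[1]), which looks like what get_writeout_scale() returns
as scale/div.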

> +static void __bdi_writeout_inc(struct backing_dev_info *bdi)
> +{
> +	bdi_writeout_norm(bdi);

I'm assuming that "norm" here means "normalise".  There's a hint for me.

> +	__inc_bdi_stat(bdi, BDI_WRITEOUT);
> +	percpu_counter_mod(&vm_writeout_total, 1);
> +}
> +
> +void get_writeout_scale(struct backing_dev_info *bdi, long *scale, long *div)
> +{
> +	int bits = vm_cycle_shift - 1;
> +	unsigned long cycle = 1UL << bits;
> +	unsigned long mask = cycle - 1;
> +	unsigned long total = percpu_counter_read(&vm_writeout_total);
> +
> +	if (bdi_cap_writeback_dirty(bdi)) {
> +		bdi_writeout_norm(bdi);
> +		*scale = bdi_stat(bdi, BDI_WRITEOUT);
> +	} else
> +		*scale = 0;
> +
> +	*div = cycle + (total & mask);
> +}

I suppose that if I stared at this for long enough I could work out what
it's doing, and why it's doing it.  But given that it needs comments
telling others that, there isn't much point in me blowing the time to do
so.

It should have static scope.


Are all the per-bdi counters being exposed in sysfs?  I think not. 
Probably they should be?


> +/*
>   * Work out the current dirty-memory clamping and background writeout
>   * thresholds.
>   *
> @@ -158,8 +238,8 @@ static unsigned long determine_dirtyable
>  }
>  
>  static void
> -get_dirty_limits(long *pbackground, long *pdirty,
> -					struct address_space *mapping)
> +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
> +		 struct backing_dev_info *bdi)
>  {
>  	int background_ratio;		/* Percentages */
>  	int dirty_ratio;
> @@ -193,6 +273,30 @@ get_dirty_limits(long *pbackground, long
>  	}
>  	*pbackground = background;
>  	*pdirty = dirty;
> +
> +	if (bdi) {
> +		long long tmp = dirty;
> +		long reserve;
> +		long scale, div;
> +
> +		get_writeout_scale(bdi, &scale, &div);
> +
> +		tmp *= scale;
> +		do_div(tmp, div);
> +
> +		reserve = dirty -
> +			(global_page_state(NR_FILE_DIRTY) +
> +			 global_page_state(NR_WRITEBACK) +
> +			 global_page_state(NR_UNSTABLE_NFS));
> +
> +		if (reserve < 0)
> +			reserve = 0;
> +
> +		reserve += bdi_stat(bdi, BDI_RECLAIMABLE) +
> +			bdi_stat(bdi, BDI_WRITEBACK);
> +
> +		*pbdi_dirty = min((long)tmp, reserve);

min_t is preferred

tmp isn't a particularly good identifier

> +	}
>  }
>  
>  /*
> @@ -204,9 +308,11 @@ get_dirty_limits(long *pbackground, long
>   */
>  static void balance_dirty_pages(struct address_space *mapping)
>  {
> -	long nr_reclaimable;
> +	long bdi_nr_reclaimable;
> +	long bdi_nr_writeback;
>  	long background_thresh;
>  	long dirty_thresh;
> +	long bdi_thresh;
>  	unsigned long pages_written = 0;
>  	unsigned long write_chunk = sync_writeback_pages();
>  
> @@ -221,15 +327,15 @@ static void balance_dirty_pages(struct a
>  			.range_cyclic	= 1,
>  		};
>  
> -		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
> -		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> -					global_page_state(NR_UNSTABLE_NFS);
> -		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
> -			dirty_thresh)
> +		get_dirty_limits(&background_thresh, &dirty_thresh,
> +				&bdi_thresh, bdi);
> +		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> +		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> +		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
>  				break;
>  
> -		if (!dirty_exceeded)
> -			dirty_exceeded = 1;
> +		if (!bdi->dirty_exceeded)
> +			bdi->dirty_exceeded = 1;
>  
>  		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
>  		 * Unstable writes are a feature of certain networked
> @@ -237,16 +343,27 @@ static void balance_dirty_pages(struct a
>  		 * written to the server's write cache, but has not yet
>  		 * been flushed to permanent storage.
>  		 */
> -		if (nr_reclaimable) {
> +		if (bdi_nr_reclaimable) {
>  			writeback_inodes(&wbc);
> -			get_dirty_limits(&background_thresh,
> -					 	&dirty_thresh, mapping);
> -			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> -					global_page_state(NR_UNSTABLE_NFS);
> -			if (nr_reclaimable +
> -				global_page_state(NR_WRITEBACK)
> -					<= dirty_thresh)
> -						break;
> +
> +			get_dirty_limits(&background_thresh, &dirty_thresh,
> +				       &bdi_thresh, bdi);
> +
> +			if (bdi_thresh < 2*bdi_stat_delta()) {
> +				bdi_nr_reclaimable =
> +					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> +				bdi_nr_writeback =
> +					bdi_stat_sum(bdi, BDI_WRITEBACK);
> +			} else {
> +				bdi_nr_reclaimable =
> +					bdi_stat(bdi, BDI_RECLAIMABLE);
> +				bdi_nr_writeback =
> +					bdi_stat(bdi, BDI_WRITEBACK);
> +			}
> +
> +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> +				break;
> +
>  			pages_written += write_chunk - wbc.nr_to_write;
>  			if (pages_written >= write_chunk)
>  				break;		/* We've done our duty */
> @@ -254,9 +371,9 @@ static void balance_dirty_pages(struct a
>  		congestion_wait(WRITE, HZ/10);
>  	}
>  
> -	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
> -		<= dirty_thresh && dirty_exceeded)
> -			dirty_exceeded = 0;
> +	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
> +			bdi->dirty_exceeded)
> +		bdi->dirty_exceeded = 0;
>  
>  	if (writeback_in_progress(bdi))
>  		return;		/* pdflush is already working this queue */
> @@ -270,7 +387,9 @@ static void balance_dirty_pages(struct a
>  	 * background_thresh, to keep the amount of dirty memory low.
>  	 */
>  	if ((laptop_mode && pages_written) ||
> -	     (!laptop_mode && (nr_reclaimable > background_thresh)))
> +			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
> +					  + global_page_state(NR_UNSTABLE_NFS)
> +					  > background_thresh)))
>  		pdflush_operation(background_writeout, 0);
>  }

Did you test laptop mode?

> @@ -306,7 +425,7 @@ void balance_dirty_pages_ratelimited_nr(
>  	unsigned long *p;
>  
>  	ratelimit = ratelimit_pages;
> -	if (dirty_exceeded)
> +	if (mapping->backing_dev_info->dirty_exceeded)
>  		ratelimit = 8;
>  
>  	/*
> @@ -342,7 +461,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
>  	}
>  
>          for ( ; ; ) {
> -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>  
>                  /*
>                   * Boost the allowable dirty threshold a bit for page
> @@ -377,7 +496,7 @@ static void background_writeout(unsigned
>  		long background_thresh;
>  		long dirty_thresh;
>  
> -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>  		if (global_page_state(NR_FILE_DIRTY) +
>  			global_page_state(NR_UNSTABLE_NFS) < background_thresh
>  				&& min_pages <= 0)
> @@ -585,6 +704,8 @@ void __init page_writeback_init(void)
>  	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
>  	writeback_set_ratelimit();
>  	register_cpu_notifier(&ratelimit_nb);
> +	vm_cycle_shift = 1 + ilog2(vm_total_pages);
> +	percpu_counter_init(&vm_writeout_total, 0);
>  }
>  
>  /**
> @@ -988,8 +1109,10 @@ int test_clear_page_writeback(struct pag
>  			radix_tree_tag_clear(&mapping->page_tree,
>  						page_index(page),
>  						PAGECACHE_TAG_WRITEBACK);
> -			if (bdi_cap_writeback_dirty(bdi))
> +			if (bdi_cap_writeback_dirty(bdi)) {
>  				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> +				__bdi_writeout_inc(bdi);
> +			}
>  		}
>  		write_unlock_irqrestore(&mapping->tree_lock, flags);
>  	} else {
> Index: linux-2.6/mm/backing-dev.c
> ===================================================================
> --- linux-2.6.orig/mm/backing-dev.c	2007-04-20 15:20:11.000000000 +0200
> +++ linux-2.6/mm/backing-dev.c	2007-04-20 15:31:42.000000000 +0200
> @@ -12,6 +12,9 @@ void bdi_init(struct backing_dev_info *b
>  	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
>  		return;
>  
> +	spin_lock_init(&bdi->lock);
> +	bdi->cycles = 0;
> +	bdi->dirty_exceeded = 0;
>  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
>  		percpu_counter_init(&bdi->bdi_stat[i], 0);
>  }
> 

ho hum, I'll toss it all in -mm, see what happens.

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
@ 2007-04-21  9:55     ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21  9:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Fri, 20 Apr 2007 17:52:04 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> Scale writeback cache per backing device, proportional to its writeout speed.
> 
> By decoupling the BDI dirty thresholds a number of problems we currently have
> will go away, namely:
> 
>  - mutual interference starvation (for any number of BDIs);
>  - deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).
> 
> It might be that all dirty pages are for a single BDI while other BDIs are
> idling. By giving each BDI a 'fair' share of the dirty limit, each one can have
> dirty pages outstanding and make progress.
> 
> A global threshold also creates a deadlock for stacked BDIs; when A writes to
> B, and A generates enough dirty pages to get throttled, B will never start
> writeback until the dirty pages go away. Again, by giving each BDI its own
> 'independent' dirty limit, this problem is avoided.
> 
> So the problem is to determine how to distribute the total dirty limit across
> the BDIs fairly and efficiently. A BDI that has a large dirty limit but does
> not have any dirty pages outstanding is a waste.
> 
> What is done is to keep a floating proportion between the BDIs based on
> writeback completions. This way faster/more active devices get a larger share
> than slower/idle devices.

This is a pretty major improvement to various nasty corner-cases, if it
works.

Does it work?  Please describe the testing you did, and the results.

Has this been confirmed to fix Miklos's FUSE and loopback problems?


> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/linux/backing-dev.h |   51 ++++++++++++
>  mm/backing-dev.c            |    3 
>  mm/page-writeback.c         |  181 ++++++++++++++++++++++++++++++++++++--------
>  3 files changed, 206 insertions(+), 29 deletions(-)
> 
> Index: linux-2.6/include/linux/backing-dev.h
> ===================================================================
> --- linux-2.6.orig/include/linux/backing-dev.h	2007-04-20 15:28:17.000000000 +0200
> +++ linux-2.6/include/linux/backing-dev.h	2007-04-20 15:33:59.000000000 +0200
> @@ -28,6 +28,7 @@ typedef int (congested_fn)(void *, int);
>  enum bdi_stat_item {
>  	BDI_RECLAIMABLE,
>  	BDI_WRITEBACK,
> +	BDI_WRITEOUT,
>  	NR_BDI_STAT_ITEMS
>  };

Whoa, head is now swimming.  What's the difference between "writeback" and
"writeout"?

> @@ -43,6 +44,13 @@ struct backing_dev_info {
>  	void *unplug_io_data;
>  
>  	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
> +
> +	/*
> +	 * data used for scaling the writeback cache
> +	 */
> +	spinlock_t lock;	/* protect the cycle count */
> +	unsigned long cycles;	/* writeout cycles */
> +	int dirty_exceeded;
>  };
>  
>  void bdi_init(struct backing_dev_info *bdi);
> @@ -54,6 +62,12 @@ static inline void __mod_bdi_stat(struct
>  	percpu_counter_mod(&bdi->bdi_stat[item], amount);
>  }
>  
> +static inline void __mod_bdi_stat64(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item, s64 amount)
> +{
> +	percpu_counter_mod64(&bdi->bdi_stat[item], amount);
> +}
> +
>  static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
>  		enum bdi_stat_item item)
>  {
> @@ -86,12 +100,49 @@ static inline void dec_bdi_stat(struct b
>  	local_irq_restore(flags);
>  }
>  
> +static inline s64 __bdi_stat(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item)
> +{
> +	return percpu_counter_read(&bdi->bdi_stat[item]);
> +}
> +
>  static inline s64 bdi_stat(struct backing_dev_info *bdi,
>  		enum bdi_stat_item item)
>  {
>  	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
>  }

So here, the __ means "it doesn't do the force-it-positive" treatment.

> +static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item)
> +{
> +	return percpu_counter_sum(&bdi->bdi_stat[item]);
> +}
> +
> +static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
> +		enum bdi_stat_item item)
> +{
> +	s64 sum;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	sum = __bdi_stat_sum(bdi, item);
> +	local_irq_restore(flags);
> +
> +	return sum;
> +}

And here __ means "not safe to use if this counter is updated from
interrupt context".

At least, I think that's what it all means.  The lack of code comments
casts some doubt.


The interfaces here could do with a little more thought wrt regularity,
naming and commenting, methinks.


> +/*
> + * maximal error of a stat counter.
> + */
> +static inline unsigned long bdi_stat_delta(void)
> +{
> +#ifdef CONFIG_SMP
> +	return NR_CPUS * FBC_BATCH;

This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.

> +#else
> +	return 1UL;

The UL is pretty pointless IMO.  The compiler will happily convert "1" to
unsigned long here.  And if we later change the return type to signed char,
we don't have to remember to edit this line too.

> +#endif
> +}
>
>  /*
>   * Flags in backing_dev_info::capability
>   * - The first two flags control whether dirty pages will contribute to the
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:35:01.000000000 +0200
> @@ -49,8 +49,6 @@
>   */
>  static long ratelimit_pages = 32;
>  
> -static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
> -
>  /*
>   * When balance_dirty_pages decides that the caller needs to perform some
>   * non-background writeback, this is how many pages it will attempt to write.
> @@ -103,6 +101,88 @@ EXPORT_SYMBOL(laptop_mode);
>  static void background_writeout(unsigned long _min_pages);
>  
>  /*
> + * Scale the writeback cache size proportional to the relative writeout speeds.
> + *
> + * We do this by tracking a floating average per BDI and a global floating
> + * average. We optimize away the '/= 2' for the global average by noting that:
> + *
> + *  if (++i > thresh) i /= 2:
> + *
> + * Can be approximated by:
> + *
> + *   thresh/2 + (++i % thresh/2)
> + *
> + * Furthermore, when we choose thresh to be 2^n it can be written in terms of
> + * binary operations and wraparound artifacts disappear.
> + *
> + * Also note that this yields a natural counter of the elapsed periods:
> + *
> + *   i / thresh
> + *
> + * Its monotonically increasing property can be applied to mitigate the wrap-
> + * around issue.
> + */
> +static int vm_cycle_shift __read_mostly;
> +static struct percpu_counter vm_writeout_total;
> +
> +/*
> + * Sync up the per BDI average to the global cycle.
> + */
> +static void bdi_writeout_norm(struct backing_dev_info *bdi)
> +{
> +	int bits = vm_cycle_shift;
> +	unsigned long cycle = 1UL << bits;
> +	unsigned long mask = ~(cycle - 1);
> +	unsigned long global_cycle = percpu_counter_read(&vm_writeout_total);
> +	unsigned long flags;
> +
> +	global_cycle <<= 1;
> +	global_cycle &= mask;
> +
> +	if ((bdi->cycles & mask) == global_cycle)
> +		return;
> +
> +	spin_lock_irqsave(&bdi->lock, flags);
> +	bdi->cycles &= mask;
> +	while (bdi->cycles != global_cycle) {
> +		unsigned long val = __bdi_stat(bdi, BDI_WRITEOUT);
> +		unsigned long half = (val + 1) >> 1;
> +
> +		if (!val)
> +			break;
> +
> +		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
> +		bdi->cycles += cycle;
> +	}
> +	bdi->cycles = global_cycle;
> +	spin_unlock_irqrestore(&bdi->lock, flags);
> +}

Here we get to the real critical substance of the patchset, and I don't
have a clue what it's doing nor how it's doing it.  And I bet nobody else
does either.

<continues to wonder wtf "writeout" is.  Perhaps knowing that would help>

I dunno.  I'm sure it's very good code but I don't have the time nor
inclination to reverse engineer the design from the implementation.

This is a very important part of the kernel - one of the few most important
parts, really.  See all the crap going around about CPU schedulers at
present?  Well hoo-boy, if we get this part of code even a little bit
wrong, they won't know what hit them.

So please, spend quite a lot of time thinking about how we can make this
code as comprehensible and approachable and maintainable as possible. 
Often this is done with comments ;)

> +static void __bdi_writeout_inc(struct backing_dev_info *bdi)
> +{
> +	bdi_writeout_norm(bdi);

I'm assuming that "norm" here means "normalise".  There's a hint for me.

> +	__inc_bdi_stat(bdi, BDI_WRITEOUT);
> +	percpu_counter_mod(&vm_writeout_total, 1);
> +}
> +
> +void get_writeout_scale(struct backing_dev_info *bdi, long *scale, long *div)
> +{
> +	int bits = vm_cycle_shift - 1;
> +	unsigned long cycle = 1UL << bits;
> +	unsigned long mask = cycle - 1;
> +	unsigned long total = percpu_counter_read(&vm_writeout_total);
> +
> +	if (bdi_cap_writeback_dirty(bdi)) {
> +		bdi_writeout_norm(bdi);
> +		*scale = bdi_stat(bdi, BDI_WRITEOUT);
> +	} else
> +		*scale = 0;
> +
> +	*div = cycle + (total & mask);
> +}

I suppose that if I stared at this for long enough I could work out what
it's doing, and why it's doing it.  But given that it needs comments
telling others that, there isn't much point in me blowing the time to do
so.

It should have static scope.


Are all the per-bdi counters being exposed in sysfs?  I think not. 
Probably they should be?


> +/*
>   * Work out the current dirty-memory clamping and background writeout
>   * thresholds.
>   *
> @@ -158,8 +238,8 @@ static unsigned long determine_dirtyable
>  }
>  
>  static void
> -get_dirty_limits(long *pbackground, long *pdirty,
> -					struct address_space *mapping)
> +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
> +		 struct backing_dev_info *bdi)
>  {
>  	int background_ratio;		/* Percentages */
>  	int dirty_ratio;
> @@ -193,6 +273,30 @@ get_dirty_limits(long *pbackground, long
>  	}
>  	*pbackground = background;
>  	*pdirty = dirty;
> +
> +	if (bdi) {
> +		long long tmp = dirty;
> +		long reserve;
> +		long scale, div;
> +
> +		get_writeout_scale(bdi, &scale, &div);
> +
> +		tmp *= scale;
> +		do_div(tmp, div);
> +
> +		reserve = dirty -
> +			(global_page_state(NR_FILE_DIRTY) +
> +			 global_page_state(NR_WRITEBACK) +
> +			 global_page_state(NR_UNSTABLE_NFS));
> +
> +		if (reserve < 0)
> +			reserve = 0;
> +
> +		reserve += bdi_stat(bdi, BDI_RECLAIMABLE) +
> +			bdi_stat(bdi, BDI_WRITEBACK);
> +
> +		*pbdi_dirty = min((long)tmp, reserve);

min_t is preferred

tmp isn't a particularly good identifier

> +	}
>  }
>  
>  /*
> @@ -204,9 +308,11 @@ get_dirty_limits(long *pbackground, long
>   */
>  static void balance_dirty_pages(struct address_space *mapping)
>  {
> -	long nr_reclaimable;
> +	long bdi_nr_reclaimable;
> +	long bdi_nr_writeback;
>  	long background_thresh;
>  	long dirty_thresh;
> +	long bdi_thresh;
>  	unsigned long pages_written = 0;
>  	unsigned long write_chunk = sync_writeback_pages();
>  
> @@ -221,15 +327,15 @@ static void balance_dirty_pages(struct a
>  			.range_cyclic	= 1,
>  		};
>  
> -		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
> -		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> -					global_page_state(NR_UNSTABLE_NFS);
> -		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
> -			dirty_thresh)
> +		get_dirty_limits(&background_thresh, &dirty_thresh,
> +				&bdi_thresh, bdi);
> +		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> +		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> +		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
>  				break;
>  
> -		if (!dirty_exceeded)
> -			dirty_exceeded = 1;
> +		if (!bdi->dirty_exceeded)
> +			bdi->dirty_exceeded = 1;
>  
>  		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
>  		 * Unstable writes are a feature of certain networked
> @@ -237,16 +343,27 @@ static void balance_dirty_pages(struct a
>  		 * written to the server's write cache, but has not yet
>  		 * been flushed to permanent storage.
>  		 */
> -		if (nr_reclaimable) {
> +		if (bdi_nr_reclaimable) {
>  			writeback_inodes(&wbc);
> -			get_dirty_limits(&background_thresh,
> -					 	&dirty_thresh, mapping);
> -			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> -					global_page_state(NR_UNSTABLE_NFS);
> -			if (nr_reclaimable +
> -				global_page_state(NR_WRITEBACK)
> -					<= dirty_thresh)
> -						break;
> +
> +			get_dirty_limits(&background_thresh, &dirty_thresh,
> +				       &bdi_thresh, bdi);
> +
> +			if (bdi_thresh < 2*bdi_stat_delta()) {
> +				bdi_nr_reclaimable =
> +					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> +				bdi_nr_writeback =
> +					bdi_stat_sum(bdi, BDI_WRITEBACK);
> +			} else {
> +				bdi_nr_reclaimable =
> +					bdi_stat(bdi, BDI_RECLAIMABLE);
> +				bdi_nr_writeback =
> +					bdi_stat(bdi, BDI_WRITEBACK);
> +			}
> +
> +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> +				break;
> +
>  			pages_written += write_chunk - wbc.nr_to_write;
>  			if (pages_written >= write_chunk)
>  				break;		/* We've done our duty */
> @@ -254,9 +371,9 @@ static void balance_dirty_pages(struct a
>  		congestion_wait(WRITE, HZ/10);
>  	}
>  
> -	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
> -		<= dirty_thresh && dirty_exceeded)
> -			dirty_exceeded = 0;
> +	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
> +			bdi->dirty_exceeded)
> +		bdi->dirty_exceeded = 0;
>  
>  	if (writeback_in_progress(bdi))
>  		return;		/* pdflush is already working this queue */
> @@ -270,7 +387,9 @@ static void balance_dirty_pages(struct a
>  	 * background_thresh, to keep the amount of dirty memory low.
>  	 */
>  	if ((laptop_mode && pages_written) ||
> -	     (!laptop_mode && (nr_reclaimable > background_thresh)))
> +			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
> +					  + global_page_state(NR_UNSTABLE_NFS)
> +					  > background_thresh)))
>  		pdflush_operation(background_writeout, 0);
>  }

Did you test laptop mode?

> @@ -306,7 +425,7 @@ void balance_dirty_pages_ratelimited_nr(
>  	unsigned long *p;
>  
>  	ratelimit = ratelimit_pages;
> -	if (dirty_exceeded)
> +	if (mapping->backing_dev_info->dirty_exceeded)
>  		ratelimit = 8;
>  
>  	/*
> @@ -342,7 +461,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
>  	}
>  
>          for ( ; ; ) {
> -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>  
>                  /*
>                   * Boost the allowable dirty threshold a bit for page
> @@ -377,7 +496,7 @@ static void background_writeout(unsigned
>  		long background_thresh;
>  		long dirty_thresh;
>  
> -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
>  		if (global_page_state(NR_FILE_DIRTY) +
>  			global_page_state(NR_UNSTABLE_NFS) < background_thresh
>  				&& min_pages <= 0)
> @@ -585,6 +704,8 @@ void __init page_writeback_init(void)
>  	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
>  	writeback_set_ratelimit();
>  	register_cpu_notifier(&ratelimit_nb);
> +	vm_cycle_shift = 1 + ilog2(vm_total_pages);
> +	percpu_counter_init(&vm_writeout_total, 0);
>  }
>  
>  /**
> @@ -988,8 +1109,10 @@ int test_clear_page_writeback(struct pag
>  			radix_tree_tag_clear(&mapping->page_tree,
>  						page_index(page),
>  						PAGECACHE_TAG_WRITEBACK);
> -			if (bdi_cap_writeback_dirty(bdi))
> +			if (bdi_cap_writeback_dirty(bdi)) {
>  				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> +				__bdi_writeout_inc(bdi);
> +			}
>  		}
>  		write_unlock_irqrestore(&mapping->tree_lock, flags);
>  	} else {
> Index: linux-2.6/mm/backing-dev.c
> ===================================================================
> --- linux-2.6.orig/mm/backing-dev.c	2007-04-20 15:20:11.000000000 +0200
> +++ linux-2.6/mm/backing-dev.c	2007-04-20 15:31:42.000000000 +0200
> @@ -12,6 +12,9 @@ void bdi_init(struct backing_dev_info *b
>  	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
>  		return;
>  
> +	spin_lock_init(&bdi->lock);
> +	bdi->cycles = 0;
> +	bdi->dirty_exceeded = 0;
>  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
>  		percpu_counter_init(&bdi->bdi_stat[i], 0);
>  }
> 

ho hum, I'll toss it all in -mm, see what happens.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 10:38       ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-21 10:38 UTC (permalink / raw)
  To: akpm
  Cc: a.p.zijlstra, linux-mm, linux-kernel, miklos, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> On Fri, 20 Apr 2007 17:52:04 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > Scale writeback cache per backing device, proportional to its writeout speed.
> > 
> > By decoupling the BDI dirty thresholds a number of problems we currently have
> > will go away, namely:
> > 
> >  - mutual interference starvation (for any number of BDIs);
> >  - deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).
> > 
> > It might be that all dirty pages are for a single BDI while other BDIs are
> > idling. By giving each BDI a 'fair' share of the dirty limit, each one can have
> > dirty pages outstanding and make progress.
> > 
> > A global threshold also creates a deadlock for stacked BDIs; when A writes to
> > B, and A generates enough dirty pages to get throttled, B will never start
> > writeback until the dirty pages go away. Again, by giving each BDI its own
> > 'independent' dirty limit, this problem is avoided.
> > 
> > So the problem is to determine how to distribute the total dirty limit across
> > the BDIs fairly and efficiently. A BDI that has a large dirty limit but does
> > not have any dirty pages outstanding is a waste.
> > 
> > What is done is to keep a floating proportion between the BDIs based on
> > writeback completions. This way faster/more active devices get a larger share
> > than slower/idle devices.
> 
> This is a pretty major improvement to various nasty corner-cases, if it
> works.
> 
> Does it work?  Please describe the testing you did, and the results.
> 
> Has this been confirmed to fix Miklos's FUSE and loopback problems?

I haven't yet tested it (will do), but I'm sure it does solve the
deadlock in balance_dirty_pages(), if for no other reason than that when
the queue is idle (no dirty or writeback pages), it allows the
caller to dirty some more pages.

The other deadlock, in throttle_vm_writeout() is still to be solved.

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 10:38       ` Miklos Szeredi
@ 2007-04-21 10:54         ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21 10:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: a.p.zijlstra, linux-mm, linux-kernel, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 12:38:45 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:

> The other deadlock, in throttle_vm_writeout() is still to be solved.

Let's go back to the original changelog:

Author: marcelo.tosatti <marcelo.tosatti>
Date:   Tue Mar 8 17:25:19 2005 +0000

    [PATCH] vm: pageout throttling
    
    With silly pageout testcases it is possible to place huge amounts of memory
    under I/O.  With a large request queue (CFQ uses 8192 requests) it is
    possible to place _all_ memory under I/O at the same time.
    
    This means that all memory is pinned and unreclaimable and the VM gets
    upset and goes oom.
    
    The patch limits the amount of memory which is under pageout writeout to be
    a little more than the amount of memory at which balance_dirty_pages()
    callers will synchronously throttle.
    
    This means that heavy pageout activity can starve heavy writeback activity
    completely, but heavy writeback activity will not cause starvation of
    pageout.  Because we don't want a simple `dd' to be causing excessive
    latencies in page reclaim.
    
    Signed-off-by: Andrew Morton <akpm@osdl.org>
    Signed-off-by: Linus Torvalds <torvalds@osdl.org>

(A good one!  I wrote it ;))


I believe that the combination of dirty-page-tracking and its calls to
balance_dirty_pages() mean that we can now never get more than dirty_ratio
of memory into the dirty-or-writeback condition.

The vm scanner can convert dirty pages into clean, under-writeback pages,
but it cannot increase the total of dirty+writeback.

Hence I assert that the problem which throttle_vm_writeout() was designed
to address can no longer happen, so we can simply remove it.

(There might be problems with ZONE_DMA or ZONE_NORMAL 100% full of
dirty+writeback pages, but throttle_vm_writeout() won't help in this case
anyway)

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 03/10] lib: dampen the percpu_counter FBC_BATCH
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 10:58       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 10:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> On Fri, 20 Apr 2007 17:51:57 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > With the current logic the percpu_counter's accuracy delta is quadric
> > wrt the number of cpus in the system, reduce this to O(n ln n).
> > 
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > ---
> >  include/linux/percpu_counter.h |    7 ++-----
> >  1 file changed, 2 insertions(+), 5 deletions(-)
> > 
> > Index: linux-2.6-mm/include/linux/percpu_counter.h
> > ===================================================================
> > --- linux-2.6-mm.orig/include/linux/percpu_counter.h
> > +++ linux-2.6-mm/include/linux/percpu_counter.h
> > @@ -11,6 +11,7 @@
> >  #include <linux/threads.h>
> >  #include <linux/percpu.h>
> >  #include <linux/types.h>
> > +#include <linux/log2.h>
> >  
> >  #ifdef CONFIG_SMP
> >  
> > @@ -20,11 +21,7 @@ struct percpu_counter {
> >  	s32 *counters;
> >  };
> >  
> > -#if NR_CPUS >= 16
> > -#define FBC_BATCH	(NR_CPUS*2)
> > -#else
> > -#define FBC_BATCH	(NR_CPUS*4)
> > -#endif
> > +#define FBC_BATCH	(8*ilog2(NR_CPUS))
> >  
> >  static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> >  {
> 
> I worry that this might be too small when there are hundreds of CPUs online.
> 
> With 1024 CPUs we go for the lock once per 80 counts.  That's not much. 
> 
> If we have 1024 CPUs, each one of which is incrementing this counter at N
> Hz, we have 1024/80=12 CPUs all going for the same lock at N Hz.  It could
> get bad.
> 
> But I don't know what the gain is for this loss.  Your changelog should
> have told us.
> 
> What problem is this patch solving?

In 10/10 I introduce bdi_stat_delta(), which gives the maximum error of a
single counter. That is used to switch between precise
(percpu_counter_sum) and imprecise (percpu_counter_read) accesses of the
stats.

I worried that the current quadratic error would be too large; and as the
ZVC counters also use a logarithmic error bound, I thought it would be
good to have the same here.
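
A back-of-the-envelope userspace sketch (not kernel code) of the
worst-case drift of percpu_counter_read() under the old and new
FBC_BATCH, assuming every CPU's local delta sits just below the batch:

#include <stdio.h>

/* crude stand-in for the kernel's ilog2() */
static int ilog2(unsigned int n)
{
	int l = -1;

	while (n) {
		l++;
		n >>= 1;
	}
	return l;
}

int main(void)
{
	unsigned int cpus[] = { 2, 16, 64, 1024 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int n = cpus[i];
		unsigned int old_batch = (n >= 16) ? n * 2 : n * 4;
		unsigned int new_batch = 8 * ilog2(n);

		/* worst case: every cpu delta is just shy of the batch */
		printf("%4u cpus: old error ~%8u  new error ~%6u\n",
		       n, n * old_batch, n * new_batch);
	}
	return 0;
}

The old drift grows roughly as O(n^2), the new one as O(n log n); for
1024 CPUs that is ~2M vs ~80k counts of possible error.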


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 11:02       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 11:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> On Fri, 20 Apr 2007 17:51:58 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > Add percpu_counter_mod64() to allow large modifications.
> > 
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > ---
> >  include/linux/percpu_counter.h |    9 +++++++++
> >  lib/percpu_counter.c           |   28 ++++++++++++++++++++++++++++
> >  2 files changed, 37 insertions(+)
> > 
> > Index: linux-2.6/include/linux/percpu_counter.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/percpu_counter.h	2007-04-12 13:54:55.000000000 +0200
> > +++ linux-2.6/include/linux/percpu_counter.h	2007-04-12 14:00:21.000000000 +0200
> > @@ -36,6 +36,7 @@ static inline void percpu_counter_destro
> >  }
> >  
> >  void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
> > +void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount);
> >  s64 percpu_counter_sum(struct percpu_counter *fbc);
> >  
> >  static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > @@ -81,6 +82,14 @@ percpu_counter_mod(struct percpu_counter
> >  	preempt_enable();
> >  }
> >  
> > +static inline void
> > +percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
> > +{
> > +	preempt_disable();
> > +	fbc->count += amount;
> > +	preempt_enable();
> > +}
> > +
> >  static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> >  {
> >  	return fbc->count;
> > Index: linux-2.6/lib/percpu_counter.c
> > ===================================================================
> > --- linux-2.6.orig/lib/percpu_counter.c	2006-07-31 13:07:38.000000000 +0200
> > +++ linux-2.6/lib/percpu_counter.c	2007-04-12 14:17:12.000000000 +0200
> > @@ -25,6 +25,34 @@ void percpu_counter_mod(struct percpu_co
> >  }
> >  EXPORT_SYMBOL(percpu_counter_mod);
> >  
> > +void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount)
> > +{
> > +	long count;
> > +	s32 *pcount;
> > +	int cpu;
> > +
> > +	if (amount >= FBC_BATCH || amount <= -FBC_BATCH) {
> > +		spin_lock(&fbc->lock);
> > +		fbc->count += amount;
> > +		spin_unlock(&fbc->lock);
> > +		return;
> > +	}
> 
> This is wrong, a little.
> 
> If the counter was at -FBC_BATCH/2 and the caller passed in FBC_BATCH, we
> could just set the cpu-local counter to FBC_BATCH/2 instead of going for
> the lock.
> 
> Probably doesn't matter though.

Right, I could have taken along the current percpu offset.

> > +	cpu = get_cpu();
> > +	pcount = per_cpu_ptr(fbc->counters, cpu);
> > +	count = *pcount + amount;
> > +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> > +		spin_lock(&fbc->lock);
> > +		fbc->count += count;
> > +		*pcount = 0;
> > +		spin_unlock(&fbc->lock);
> > +	} else {
> > +		*pcount = count;
> > +	}
> > +	put_cpu();
> > +}
> > +EXPORT_SYMBOL(percpu_counter_mod64);
> 
> Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
> PAE has only 64,000,000 pages and won't be using deltas of more than 4
> gigapages?
> 
> <Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
> another changelog bug>

Yeah, /me chastises himself for that...

This is because percpu_counter is s64 instead of the native long; I need
to halve the counter at some point (bdi_writeout_norm) and do that by
subtracting half the current value.

If percpu_counter_mod is limited to s32, this might not always work
(although in practice it might just fit).
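
To make the s32-vs-s64 point concrete, a tiny userspace illustration
(the counter value is a made-up extreme, not something measured):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t count = 6000000000LL;		/* made-up large s64 value */
	int64_t half  = (count + 1) >> 1;	/* the delta to subtract   */

	/* an s32 argument could not carry this delta */
	printf("half = %lld, fits in s32? %s\n",
	       (long long)half, half <= INT32_MAX ? "yes" : "no");
	return 0;
}

In practice the counters probably never get that big, but the interface
shouldn't have to rely on that.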




^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 07/10] mm: count reclaimable pages per BDI
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 11:04       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 11:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> On Fri, 20 Apr 2007 17:52:01 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > Count per BDI reclaimable pages; nr_reclaimable = nr_dirty + nr_unstable.
> 
> hm.  Aggregating dirty and unstable at inc/dec time is a bit kludgy.  If
> later on we want to know just "dirty" then we're in trouble.
> 
> I can see the logic behind it though.
> 
> Perhaps one could have separate BDI_DIRTY and BDI_UNSTABLE and treat them
> separately at inc/dec time, but give them the same numerical value, so
> they in fact refer to the same counter.  That's kludgy too.

:-(

I struggled with it too; don't have a ready solution either. I'll do
whatever the consensus agrees upon.
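
For what it's worth, the 'same numerical value' idea would presumably
look something like this (hypothetical, not in any of the patches):

enum bdi_stat_item {
	BDI_DIRTY,
	BDI_UNSTABLE = BDI_DIRTY,	/* alias: two names, one counter */
	BDI_WRITEBACK,
	NR_BDI_STAT_ITEMS
};

Callers could then keep inc/dec'ing BDI_DIRTY vs BDI_UNSTABLE at the
natural spots while the storage stays a single counter, but as said,
that is kludgy too.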


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 08/10] mm: count writeback pages per BDI
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 11:07       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 11:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> On Fri, 20 Apr 2007 17:52:02 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > Count per BDI writeback pages.
> > 
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > ---
> >  include/linux/backing-dev.h |    1 +
> >  mm/page-writeback.c         |   12 ++++++++++--
> >  2 files changed, 11 insertions(+), 2 deletions(-)
> > 
> > Index: linux-2.6/mm/page-writeback.c
> > ===================================================================
> > --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
> > +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> > @@ -979,14 +979,18 @@ int test_clear_page_writeback(struct pag
> >  	int ret;
> >  
> >  	if (mapping) {
> > +		struct backing_dev_info *bdi = mapping->backing_dev_info;
> >  		unsigned long flags;
> >  
> >  		write_lock_irqsave(&mapping->tree_lock, flags);
> >  		ret = TestClearPageWriteback(page);
> > -		if (ret)
> > +		if (ret) {
> >  			radix_tree_tag_clear(&mapping->page_tree,
> >  						page_index(page),
> >  						PAGECACHE_TAG_WRITEBACK);
> > +			if (bdi_cap_writeback_dirty(bdi))
> > +				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> 
> Why do we test bdi_cap_writeback_dirty() here?
> 
> If we remove that test, we end up accumulating statistics for
> non-writebackable backing devs, but does that matter? 

It would not, had I not cheated:

+void bdi_init(struct backing_dev_info *bdi)
+{
+       int i;
+
+       if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
+               return;
+
+       for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+               percpu_counter_init(&bdi->bdi_stat[i], 0);
+}
+EXPORT_SYMBOL(bdi_init);

>  Probably the common
> case is writebackable backing-devs, so eliminating the test-n-branch might
> be a net microgain.

Time vs space. Now we don't even have storage for those BDIs.

I don't particularly care about this point though; I just thought it might be
worthwhile to save on the percpu data.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 09/10] mm: expose BDI statistics in sysfs.
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 11:08       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 11:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> On Fri, 20 Apr 2007 17:52:03 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > Expose the per BDI stats in /sys/block/<dev>/queue/*
> > 
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > ---
> >  block/ll_rw_blk.c |   32 ++++++++++++++++++++++++++++++++
> >  1 file changed, 32 insertions(+)
> > 
> > Index: linux-2.6-mm/block/ll_rw_blk.c
> > ===================================================================
> > --- linux-2.6-mm.orig/block/ll_rw_blk.c
> > +++ linux-2.6-mm/block/ll_rw_blk.c
> > @@ -3976,6 +3976,15 @@ static ssize_t queue_max_hw_sectors_show
> >  	return queue_var_show(max_hw_sectors_kb, (page));
> >  }
> >  
> > +static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
> > +{
> > +	return sprintf(page, "%lld\n", bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE));
> > +}
> 
> We try to present memory statistics to userspace in bytes or kbytes rather
> than number-of-pages.  Because page-size varies between architectures and
> between .configs.  Displaying number-of-pages is just inviting people to write
> it-broke-when-i-moved-it-to-ia64 applications.
> 
> Plus kbytes is a bit more user-friendly, particularly when the user will
> want to compare these numbers to /proc/meminfo, for example.
> 
> Using %llu might be more appropriate than %lld.


Right, the biggest problem I actually have with this piece of code is
that it does not represent all BDIs in the system. For example, the BDI
of NFS mounts is not accessible.

Will fix the other points.
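
A sketch of what the kbytes + %llu variant might look like (untested,
and still tied to the request_queue rather than a generic per-BDI spot):

static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page)
{
	unsigned long long kb = (unsigned long long)
		bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE)
			<< (PAGE_SHIFT - 10);

	return sprintf(page, "%llu\n", kb);
}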


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21  9:55     ` Andrew Morton
@ 2007-04-21 12:01       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 12:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> On Fri, 20 Apr 2007 17:52:04 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > Scale writeback cache per backing device, proportional to its writeout speed.
> > 
> > By decoupling the BDI dirty thresholds a number of problems we currently have
> > will go away, namely:
> > 
> >  - mutual interference starvation (for any number of BDIs);
> >  - deadlocks with stacked BDIs (loop, FUSE and local NFS mounts).
> > 
> > It might be that all dirty pages are for a single BDI while other BDIs are
> > idling. By giving each BDI a 'fair' share of the dirty limit, each one can have
> > dirty pages outstanding and make progress.
> > 
> > A global threshold also creates a deadlock for stacked BDIs; when A writes to
> > B, and A generates enough dirty pages to get throttled, B will never start
> > writeback until the dirty pages go away. Again, by giving each BDI its own
> > 'independent' dirty limit, this problem is avoided.
> > 
> > So the problem is to determine how to distribute the total dirty limit across
> > the BDIs fairly and efficiently. A DBI that has a large dirty limit but does
> > not have any dirty pages outstanding is a waste.
> > 
> > What is done is to keep a floating proportion between the DBIs based on
> > writeback completions. This way faster/more active devices get a larger share
> > than slower/idle devices.
> 
> This is a pretty major improvement to various nasty corner-cases, if it
> works.
> 
> Does it work?  Please describe the testing you did, and the results.

The testing I did was several dd instances racing each other to various
devices; usually one in a loop and the other a single, timed, instance.

I tested disk vs disk, disk vs usbstick, and disk vs nfs-mount.

Using the debug patch from the last series (the one which exposed the
actual ratio assigned and the total), I monitored (where possible) that
the ratio tracked the relative writeout speeds.

The main indicator was that the writes should complete in roughly the
same time as if they were done on an idle system.

The disk vs usbstick case gave the most dramatic improvement; on mainline
the usbstick is totally starved by a heavy disk writer, while with these
patches it takes about the same time as it would on an idle system.

A number of results went along with the first series; those still stand.

> Has this been confirmed to fix Miklos's FUSE and loopback problems?

I must defer to Miklos for that.

> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > ---
> >  include/linux/backing-dev.h |   51 ++++++++++++
> >  mm/backing-dev.c            |    3 
> >  mm/page-writeback.c         |  181 ++++++++++++++++++++++++++++++++++++--------
> >  3 files changed, 206 insertions(+), 29 deletions(-)
> > 
> > Index: linux-2.6/include/linux/backing-dev.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/backing-dev.h	2007-04-20 15:28:17.000000000 +0200
> > +++ linux-2.6/include/linux/backing-dev.h	2007-04-20 15:33:59.000000000 +0200
> > @@ -28,6 +28,7 @@ typedef int (congested_fn)(void *, int);
> >  enum bdi_stat_item {
> >  	BDI_RECLAIMABLE,
> >  	BDI_WRITEBACK,
> > +	BDI_WRITEOUT,
> >  	NR_BDI_STAT_ITEMS
> >  };
> 
> Whoa, head is now swimming.  What's the difference between "writeback" and
> "writeout"?

writeback is the number of pages in the writeback state.

writeout is a relative proportion (against all other BDIs) of completed
writeouts.

> > @@ -43,6 +44,13 @@ struct backing_dev_info {
> >  	void *unplug_io_data;
> >  
> >  	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
> > +
> > +	/*
> > +	 * data used for scaling the writeback cache
> > +	 */
> > +	spinlock_t lock;	/* protect the cycle count */
> > +	unsigned long cycles;	/* writeout cycles */
> > +	int dirty_exceeded;
> >  };
> >  
> >  void bdi_init(struct backing_dev_info *bdi);
> > @@ -54,6 +62,12 @@ static inline void __mod_bdi_stat(struct
> >  	percpu_counter_mod(&bdi->bdi_stat[item], amount);
> >  }
> >  
> > +static inline void __mod_bdi_stat64(struct backing_dev_info *bdi,
> > +		enum bdi_stat_item item, s64 amount)
> > +{
> > +	percpu_counter_mod64(&bdi->bdi_stat[item], amount);
> > +}
> > +
> >  static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
> >  		enum bdi_stat_item item)
> >  {
> > @@ -86,12 +100,49 @@ static inline void dec_bdi_stat(struct b
> >  	local_irq_restore(flags);
> >  }
> >  
> > +static inline s64 __bdi_stat(struct backing_dev_info *bdi,
> > +		enum bdi_stat_item item)
> > +{
> > +	return percpu_counter_read(&bdi->bdi_stat[item]);
> > +}
> > +
> >  static inline s64 bdi_stat(struct backing_dev_info *bdi,
> >  		enum bdi_stat_item item)
> >  {
> >  	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
> >  }
> 
> So here, the __ means "it doesn't do the force-it-positive" treatment.
> 
> > +static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
> > +		enum bdi_stat_item item)
> > +{
> > +	return percpu_counter_sum(&bdi->bdi_stat[item]);
> > +}
> > +
> > +static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
> > +		enum bdi_stat_item item)
> > +{
> > +	s64 sum;
> > +	unsigned long flags;
> > +
> > +	local_irq_save(flags);
> > +	sum = __bdi_stat_sum(bdi, item);
> > +	local_irq_restore(flags);
> > +
> > +	return sum;
> > +}
> 
> And here __ means "not safe to use if this counter is updated from
> interrupt context".
> 
> At least, I think that's what it all means.  The lack of code comments
> casts some doubt.
> 
> 
> The interfaces here could do with a little more thought wrt regularity,
> naming and commenting, methinks.

good points, shall consider.

> 
> > +/*
> > + * maximal error of a stat counter.
> > + */
> > +static inline unsigned long bdi_stat_delta(void)
> > +{
> > +#ifdef CONFIG_SMP
> > +	return NR_CPUS * FBC_BATCH;
> 
> This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.
> 
> > +#else
> > +	return 1UL;
> 
> The UL is pretty pointless IMO.  The compiler will happily convert "1" to
> unsigned long here.  And if we later change the return type to signed char,
> we don't have to remember to edit this line too.

me and my pedantry.

> > +#endif
> > +}
> >
> >  /*
> >   * Flags in backing_dev_info::capability
> >   * - The first two flags control whether dirty pages will contribute to the
> > Index: linux-2.6/mm/page-writeback.c
> > ===================================================================
> > --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> > +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:35:01.000000000 +0200
> > @@ -49,8 +49,6 @@
> >   */
> >  static long ratelimit_pages = 32;
> >  
> > -static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
> > -
> >  /*
> >   * When balance_dirty_pages decides that the caller needs to perform some
> >   * non-background writeback, this is how many pages it will attempt to write.
> > @@ -103,6 +101,88 @@ EXPORT_SYMBOL(laptop_mode);
> >  static void background_writeout(unsigned long _min_pages);
> >  
> >  /*
> > + * Scale the writeback cache size proportional to the relative writeout speeds.
> > + *
> > + * We do this by tracking a floating average per BDI and a global floating
> > + * average. We optimize away the '/= 2' for the global average by noting that:
> > + *
> > + *  if (++i > thresh) i /= 2:
> > + *
> > + * Can be approximated by:
> > + *
> > + *   thresh/2 + (++i % thresh/2)
> > + *
> > + * Furthermore, when we choose thresh to be 2^n it can be written in terms of
> > + * binary operations and wraparound artifacts disappear.
> > + *
> > + * Also note that this yields a natural counter of the elapsed periods:
> > + *
> > + *   i / thresh
> > + *
> > + * Its monotonous increasing property can be applied to mitigate the wrap-
> > + * around issue.
> > + */

Whaha, and here I thought this was an adequate comment :-/
Obviously it sucked, since you are rather confused.

> > +static int vm_cycle_shift __read_mostly;
> > +static struct percpu_counter vm_writeout_total;
> > +
> > +/*
> > + * Sync up the per BDI average to the global cycle.
> > + */
> > +static void bdi_writeout_norm(struct backing_dev_info *bdi)
> > +{
> > +	int bits = vm_cycle_shift;
> > +	unsigned long cycle = 1UL << bits;
> > +	unsigned long mask = ~(cycle - 1);
> > +	unsigned long global_cycle = percpu_counter_read(&vm_writeout_total);
> > +	unsigned long flags;
> > +
> > +	global_cycle <<= 1;
> > +	global_cycle &= mask;
> > +
> > +	if ((bdi->cycles & mask) == global_cycle)
> > +		return;
> > +
> > +	spin_lock_irqsave(&bdi->lock, flags);
> > +	bdi->cycles &= mask;
> > +	while (bdi->cycles != global_cycle) {
> > +		unsigned long val = __bdi_stat(bdi, BDI_WRITEOUT);
> > +		unsigned long half = (val + 1) >> 1;
> > +
> > +		if (!val)
> > +			break;
> > +
> > +		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
> > +		bdi->cycles += cycle;
> > +	}
> > +	bdi->cycles = global_cycle;
> > +	spin_unlock_irqrestore(&bdi->lock, flags);
> > +}
> 
> Here we get to the real critical substance of the patchset, and I don't
> have a clue what it's doing nor how it's doing it.  And I bet nobody else
> does either.

I shall send a comment patch; but let me try to explain:

I am trying to keep a floating proportion between the BDIs based on
writeout events. That is, each device is given a share equal to its
proportion of completed writebacks (writeback: we are in the process of
writing; writeout: we have written). This proportion is measured over a
'time'-span that is itself measured in writeouts.

Example:

  device A completes 4, device B completes 12 and device C 16 writes.
This gives a 4:12:16 of 32 ratio. Now, assume the 'time'-span is 32
writes. This will force the counters to get halved: 2:6:8 of 16.

Now the devices complete: A:8 B:8 C:0, another 16, making 32 again.
2+8=10 : 6+8=14 : 8+0=8 of 32, or, because it's a full period:
5:7:4 of 16.

That is basically what happens; the implementation tries to be a little
smart about it, because it wants to avoid having to traverse all BDIs
when a period expires.

See how the total runs up to 32, gets halved, runs up to 32 again, gets
halved, etc.

That is the
  if (++i > thresh) i /= 2;
from that comment above, which we approximate by:
  thresh/2 + (++i % thresh/2)

The thresh = 2^n -> bit operations part is clear, I hope.

Now we note that the total (i) is ever increasing; when we look at
  i / (thresh/2)
we see that it is the number of periods that have expired.

If we then keep track of which period each BDI is in, we can 'normalize'
the (per-BDI) counter whenever we detect that the total went into
another period. This is what bdi_writeout_norm() does. bdi->cycles is the
local period (shifted left a bit to align with the global period bits so
that wraparound is handled naturally), and global_cycle the global
period.
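
To make that concrete, a tiny userspace toy of the same bookkeeping: it
keeps the ever-increasing total and lazily halves each BDI's counter
once per elapsed period. It deliberately skips the bit-twiddling and the
bdi->cycles alignment, so it is an illustration, not the kernel code:

#include <stdio.h>

#define PERIOD	32UL	/* raw writeouts per decay step */

struct bdi { unsigned long writeout; unsigned long period; };

static unsigned long total;	/* global writeout count, never reset */

/* catch a BDI up with the global period, halving once per step missed */
static void bdi_norm(struct bdi *b)
{
	unsigned long global_period = total / PERIOD;

	while (b->period < global_period) {
		b->writeout -= (b->writeout + 1) >> 1;	/* halve, round down */
		b->period++;
	}
}

static void writeout(struct bdi *b)
{
	bdi_norm(b);
	b->writeout++;
	total++;
}

int main(void)
{
	struct bdi a = { 0, 0 }, b = { 0, 0 }, c = { 0, 0 };
	unsigned long i;

	for (i = 0; i < 4;  i++) writeout(&a);
	for (i = 0; i < 12; i++) writeout(&b);
	for (i = 0; i < 16; i++) writeout(&c);
	for (i = 0; i < 8;  i++) writeout(&a);
	for (i = 0; i < 8;  i++) writeout(&b);

	bdi_norm(&a); bdi_norm(&b); bdi_norm(&c);
	printf("A:%lu B:%lu C:%lu of %lu\n", a.writeout, b.writeout,
	       c.writeout, a.writeout + b.writeout + c.writeout);
	return 0;
}

Replaying the example above it prints A:10 B:14 C:8 of 32, the same
shares as the 10:14:8 step in the walkthrough; no BDI is touched until
it is next used (or queried), which is what saves the traversal.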

> <continues to wonder wtf "writeout" is.  Perhaps knowing that would help>
> 
> I dunno.  I'm sure it's very good code but I don't have the time nor
> inclination to reverse engineer the design from the implementation.
> 
> This is a very important part of the kernel - one of the few most important
> parts, really.  See all the crap going around about CPU schedulers at
> present?  Well hoo-boy, if we get this part of code even a little bit
> wrong, they won't know what hit them.
> 
> So please, spend quite a lot of time thinking about how we can make this
> code as comprehensible and approachable and maintainable as possible. 
> Often this is done with comments ;)
> 
> > +static void __bdi_writeout_inc(struct backing_dev_info *bdi)
> > +{
> > +	bdi_writeout_norm(bdi);
> 
> I'm assuming that "norm" here means "normalise".  There's a hint for me.
> 
> > +	__inc_bdi_stat(bdi, BDI_WRITEOUT);
> > +	percpu_counter_mod(&vm_writeout_total, 1);
> > +}
> > +
> > +void get_writeout_scale(struct backing_dev_info *bdi, long *scale, long *div)
> > +{
> > +	int bits = vm_cycle_shift - 1;
> > +	unsigned long cycle = 1UL << bits;
> > +	unsigned long mask = cycle - 1;
> > +	unsigned long total = percpu_counter_read(&vm_writeout_total);
> > +
> > +	if (bdi_cap_writeback_dirty(bdi)) {
> > +		bdi_writeout_norm(bdi);
> > +		*scale = bdi_stat(bdi, BDI_WRITEOUT);
> > +	} else
> > +		*scale = 0;
> > +
> > +	*div = cycle + (total & mask);
> > +}
> 
> I suppose that if I stared at this for long enough I could work out what
> it's doing, and why it's doing it.  But given that it needs comments
> telling others that, there isn't much point in me blowing the time to do
> so.

Right, so with the above clear (I hope), bdi_writeout_inc() tracks the
per-BDI and global writeout events. We need to normalize the counter
before incrementing because the global period might have expired due to
another BDI's activity.

get_writeout_scale() does the magic of getting the current ratio.
Remember the example; say that the current state for A, B and C is:
  3:8:7 of 18
and we're currently interested in A's share.

The 18 (*div) is obtained from 16 (cycle) + 2 (total & 15), and
the 3 (*scale) is read from the per-BDI counter (again, after
normalizing it, for the global period might have advanced since we
last showed interest in it).

(nr * 3) / 18 gives A's share of nr.

If anything remains unclear, please holler.
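
In code, with a made-up dirty limit of 1000 pages (purely for
illustration), the same arithmetic reads:

#include <stdio.h>

int main(void)
{
	long long dirty = 1000;			/* made-up global dirty limit */
	long long scale[] = { 3, 8, 7 };	/* per-BDI BDI_WRITEOUT       */
	long long div = 18;			/* cycle + (total & mask)     */
	int i;

	for (i = 0; i < 3; i++)
		printf("bdi %d: %lld of %lld pages\n",
		       i, dirty * scale[i] / div, dirty);
	return 0;
}

That yields 166, 444 and 388 pages, together 998, just shy of the full
limit due to the integer division.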

> It should have static scope.

ok

> Are all the per-bdi counters being exposed in sysfs?  I think not. 
> Probably they should be?

As stated in that sysfs email, I'd rather we had a per-BDI sysfs
spot; the current location only covers disks.

> > +/*
> >   * Work out the current dirty-memory clamping and background writeout
> >   * thresholds.
> >   *
> > @@ -158,8 +238,8 @@ static unsigned long determine_dirtyable
> >  }
> >  
> >  static void
> > -get_dirty_limits(long *pbackground, long *pdirty,
> > -					struct address_space *mapping)
> > +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
> > +		 struct backing_dev_info *bdi)
> >  {
> >  	int background_ratio;		/* Percentages */
> >  	int dirty_ratio;
> > @@ -193,6 +273,30 @@ get_dirty_limits(long *pbackground, long
> >  	}
> >  	*pbackground = background;
> >  	*pdirty = dirty;
> > +
> > +	if (bdi) {
> > +		long long tmp = dirty;
> > +		long reserve;
> > +		long scale, div;
> > +
> > +		get_writeout_scale(bdi, &scale, &div);
> > +
> > +		tmp *= scale;
> > +		do_div(tmp, div);
> > +
> > +		reserve = dirty -
> > +			(global_page_state(NR_FILE_DIRTY) +
> > +			 global_page_state(NR_WRITEBACK) +
> > +			 global_page_state(NR_UNSTABLE_NFS));
> > +
> > +		if (reserve < 0)
> > +			reserve = 0;
> > +
> > +		reserve += bdi_stat(bdi, BDI_RECLAIMABLE) +
> > +			bdi_stat(bdi, BDI_WRITEBACK);
> > +
> > +		*pbdi_dirty = min((long)tmp, reserve);
> 
> min_t is preferred
> 
> tmp isn't a particularly good identifier

ok, shall fix.

> > +	}
> >  }
> >  
> >  /*
> > @@ -204,9 +308,11 @@ get_dirty_limits(long *pbackground, long
> >   */
> >  static void balance_dirty_pages(struct address_space *mapping)
> >  {
> > -	long nr_reclaimable;
> > +	long bdi_nr_reclaimable;
> > +	long bdi_nr_writeback;
> >  	long background_thresh;
> >  	long dirty_thresh;
> > +	long bdi_thresh;
> >  	unsigned long pages_written = 0;
> >  	unsigned long write_chunk = sync_writeback_pages();
> >  
> > @@ -221,15 +327,15 @@ static void balance_dirty_pages(struct a
> >  			.range_cyclic	= 1,
> >  		};
> >  
> > -		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
> > -		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > -					global_page_state(NR_UNSTABLE_NFS);
> > -		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
> > -			dirty_thresh)
> > +		get_dirty_limits(&background_thresh, &dirty_thresh,
> > +				&bdi_thresh, bdi);
> > +		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > +		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > +		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> >  				break;
> >  
> > -		if (!dirty_exceeded)
> > -			dirty_exceeded = 1;
> > +		if (!bdi->dirty_exceeded)
> > +			bdi->dirty_exceeded = 1;
> >  
> >  		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> >  		 * Unstable writes are a feature of certain networked
> > @@ -237,16 +343,27 @@ static void balance_dirty_pages(struct a
> >  		 * written to the server's write cache, but has not yet
> >  		 * been flushed to permanent storage.
> >  		 */
> > -		if (nr_reclaimable) {
> > +		if (bdi_nr_reclaimable) {
> >  			writeback_inodes(&wbc);
> > -			get_dirty_limits(&background_thresh,
> > -					 	&dirty_thresh, mapping);
> > -			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > -					global_page_state(NR_UNSTABLE_NFS);
> > -			if (nr_reclaimable +
> > -				global_page_state(NR_WRITEBACK)
> > -					<= dirty_thresh)
> > -						break;
> > +
> > +			get_dirty_limits(&background_thresh, &dirty_thresh,
> > +				       &bdi_thresh, bdi);
> > +
> > +			if (bdi_thresh < 2*bdi_stat_delta()) {
> > +				bdi_nr_reclaimable =
> > +					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> > +				bdi_nr_writeback =
> > +					bdi_stat_sum(bdi, BDI_WRITEBACK);
> > +			} else {
> > +				bdi_nr_reclaimable =
> > +					bdi_stat(bdi, BDI_RECLAIMABLE);
> > +				bdi_nr_writeback =
> > +					bdi_stat(bdi, BDI_WRITEBACK);
> > +			}
> > +
> > +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > +				break;
> > +
> >  			pages_written += write_chunk - wbc.nr_to_write;
> >  			if (pages_written >= write_chunk)
> >  				break;		/* We've done our duty */
> > @@ -254,9 +371,9 @@ static void balance_dirty_pages(struct a
> >  		congestion_wait(WRITE, HZ/10);
> >  	}
> >  
> > -	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
> > -		<= dirty_thresh && dirty_exceeded)
> > -			dirty_exceeded = 0;
> > +	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
> > +			bdi->dirty_exceeded)
> > +		bdi->dirty_exceeded = 0;
> >  
> >  	if (writeback_in_progress(bdi))
> >  		return;		/* pdflush is already working this queue */
> > @@ -270,7 +387,9 @@ static void balance_dirty_pages(struct a
> >  	 * background_thresh, to keep the amount of dirty memory low.
> >  	 */
> >  	if ((laptop_mode && pages_written) ||
> > -	     (!laptop_mode && (nr_reclaimable > background_thresh)))
> > +			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
> > +					  + global_page_state(NR_UNSTABLE_NFS)
> > +					  > background_thresh)))
> >  		pdflush_operation(background_writeout, 0);
> >  }
> 
> Did you test laptop mode?

admittedly, no. Shall do.

> > @@ -306,7 +425,7 @@ void balance_dirty_pages_ratelimited_nr(
> >  	unsigned long *p;
> >  
> >  	ratelimit = ratelimit_pages;
> > -	if (dirty_exceeded)
> > +	if (mapping->backing_dev_info->dirty_exceeded)
> >  		ratelimit = 8;
> >  
> >  	/*
> > @@ -342,7 +461,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
> >  	}
> >  
> >          for ( ; ; ) {
> > -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> > +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> >  
> >                  /*
> >                   * Boost the allowable dirty threshold a bit for page
> > @@ -377,7 +496,7 @@ static void background_writeout(unsigned
> >  		long background_thresh;
> >  		long dirty_thresh;
> >  
> > -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> > +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> >  		if (global_page_state(NR_FILE_DIRTY) +
> >  			global_page_state(NR_UNSTABLE_NFS) < background_thresh
> >  				&& min_pages <= 0)
> > @@ -585,6 +704,8 @@ void __init page_writeback_init(void)
> >  	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
> >  	writeback_set_ratelimit();
> >  	register_cpu_notifier(&ratelimit_nb);
> > +	vm_cycle_shift = 1 + ilog2(vm_total_pages);
> > +	percpu_counter_init(&vm_writeout_total, 0);
> >  }
> >  
> >  /**
> > @@ -988,8 +1109,10 @@ int test_clear_page_writeback(struct pag
> >  			radix_tree_tag_clear(&mapping->page_tree,
> >  						page_index(page),
> >  						PAGECACHE_TAG_WRITEBACK);
> > -			if (bdi_cap_writeback_dirty(bdi))
> > +			if (bdi_cap_writeback_dirty(bdi)) {
> >  				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> > +				__bdi_writeout_inc(bdi);
> > +			}
> >  		}
> >  		write_unlock_irqrestore(&mapping->tree_lock, flags);
> >  	} else {
> > Index: linux-2.6/mm/backing-dev.c
> > ===================================================================
> > --- linux-2.6.orig/mm/backing-dev.c	2007-04-20 15:20:11.000000000 +0200
> > +++ linux-2.6/mm/backing-dev.c	2007-04-20 15:31:42.000000000 +0200
> > @@ -12,6 +12,9 @@ void bdi_init(struct backing_dev_info *b
> >  	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
> >  		return;
> >  
> > +	spin_lock_init(&bdi->lock);
> > +	bdi->cycles = 0;
> > +	bdi->dirty_exceeded = 0;
> >  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> >  		percpu_counter_init(&bdi->bdi_stat[i], 0);
> >  }
> > 
> 
> ho hum, I'll toss it all in -mm, see what happens.

You show great confidence, thanks!


^ permalink raw reply	[flat|nested] 110+ messages in thread

> >   * non-background writeback, this is how many pages it will attempt to write.
> > @@ -103,6 +101,88 @@ EXPORT_SYMBOL(laptop_mode);
> >  static void background_writeout(unsigned long _min_pages);
> >  
> >  /*
> > + * Scale the writeback cache size proportional to the relative writeout speeds.
> > + *
> > + * We do this by tracking a floating average per BDI and a global floating
> > + * average. We optimize away the '/= 2' for the global average by noting that:
> > + *
> > + *  if (++i > thresh) i /= 2:
> > + *
> > + * Can be approximated by:
> > + *
> > + *   thresh/2 + (++i % thresh/2)
> > + *
> > + * Furthermore, when we choose thresh to be 2^n it can be written in terms of
> > + * binary operations and wraparound artifacts disappear.
> > + *
> > + * Also note that this yields a natural counter of the elapsed periods:
> > + *
> > + *   i / thresh
> > + *
> > + * Its monotonically increasing property can be applied to mitigate the wrap-
> > + * around issue.
> > + */

Whaha, and here I thought this was an adequate comment :-/
Obviously it sucked, since you are rather confused.

> > +static int vm_cycle_shift __read_mostly;
> > +static struct percpu_counter vm_writeout_total;
> > +
> > +/*
> > + * Sync up the per BDI average to the global cycle.
> > + */
> > +static void bdi_writeout_norm(struct backing_dev_info *bdi)
> > +{
> > +	int bits = vm_cycle_shift;
> > +	unsigned long cycle = 1UL << bits;
> > +	unsigned long mask = ~(cycle - 1);
> > +	unsigned long global_cycle = percpu_counter_read(&vm_writeout_total);
> > +	unsigned long flags;
> > +
> > +	global_cycle <<= 1;
> > +	global_cycle &= mask;
> > +
> > +	if ((bdi->cycles & mask) == global_cycle)
> > +		return;
> > +
> > +	spin_lock_irqsave(&bdi->lock, flags);
> > +	bdi->cycles &= mask;
> > +	while (bdi->cycles != global_cycle) {
> > +		unsigned long val = __bdi_stat(bdi, BDI_WRITEOUT);
> > +		unsigned long half = (val + 1) >> 1;
> > +
> > +		if (!val)
> > +			break;
> > +
> > +		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
> > +		bdi->cycles += cycle;
> > +	}
> > +	bdi->cycles = global_cycle;
> > +	spin_unlock_irqrestore(&bdi->lock, flags);
> > +}
> 
> Here we get to the real critical substance of the patchset, and I don't
> have a clue what it's doing nor how it's doing it.  And I bet nobody else
> does either.

I shall send a comment patch; but let me try to explain:

I am trying to keep a floating proportion between the BDIs based on
writeout events. That is, each device is given a share equal to its
proportion of completed writebacks (writeback, we are in the process of
writing vs. writeout, we have written). This proportion is measured in a
'time'-span measured itself in writeouts.

Example:

  device A completes 4, device B completes 12 and device C 16 writes.
This gives a 4:12:16 of 32 ratio. Now, assume the 'time'-span is 32
writes. This will force the counters to get halved: 2:6:8 of 16.

Now the devices complete: A:8 B:8 C:0, another 16, making 32 again.
2+8=10 : 6+8=14 : 8+0=8 of 32, or, because it's a full period:
5:7:4 of 16.

That is basically what happens; the implementation tries to be a little
smart about it, because it wants to avoid having to traverse all BDIs
when a period expires.

See how the total runs up to 32, gets halved, runs up to 32 again, gets
halved, etc.
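
In its naive form, the bookkeeping from that example would look something
like the userspace sketch below (illustrative only, not the kernel code;
the 'time'-span and the >= test are picked to match the example, and the
halving rounds up like the kernel's (val + 1) >> 1). Note that it has to
walk every BDI when the period expires, which is exactly what the real
implementation tries to avoid:

#include <stdio.h>

#define NR_BDIS	3
#define THRESH	32			/* the 'time'-span, in writeouts */

static unsigned long total;		/* all writeouts this period     */
static unsigned long writeouts[NR_BDIS];/* completed writeouts per BDI   */

static void writeout(int bdi)
{
	int i;

	writeouts[bdi]++;
	if (++total >= THRESH) {	/* period expired: halve everything */
		for (i = 0; i < NR_BDIS; i++)
			writeouts[i] = (writeouts[i] + 1) / 2;
		total /= 2;
	}
}

static void show(void)
{
	printf("A:%lu B:%lu C:%lu of %lu\n",
	       writeouts[0], writeouts[1], writeouts[2], total);
}

int main(void)
{
	int i;

	for (i = 0; i < 4;  i++) writeout(0);	/* A completes 4  */
	for (i = 0; i < 12; i++) writeout(1);	/* B completes 12 */
	for (i = 0; i < 16; i++) writeout(2);	/* C completes 16 */
	show();					/* prints 2:6:8 of 16 */

	for (i = 0; i < 8; i++) writeout(0);	/* A completes 8 */
	for (i = 0; i < 8; i++) writeout(1);	/* B completes 8 */
	show();					/* prints 5:7:4 of 16 */

	return 0;
}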

That is the
  if (++i > thresh) i /= 2;
from that comment above, which we approximate by:
  thresh/2 + (++i % thresh/2)

The thresh = 2^n -> bit operations part is clear, I hope.

Now we note that the total (i) is ever increasing; when we look at
  i / (thresh/2)
we see that this is the number of periods expired.

If we then keep track of which period each BDI is in, we can 'normalize'
the (per bdi) counter whenever we detect that the total went into
another period. This is what bdi_writeout_norm() does. bdi->cycles is the
local period (shifted left a bit to align with the global period bits so
that wraparound is handled naturally), and global_cycle the global
period.

> <continues to wonder wtf "writeout" is.  Perhaps knowing that would help>
> 
> I dunno.  I'm sure it's very good code but I don't have the time nor
> inclination to reverse engineer the design from the implementation.
> 
> This is a very important part of the kernel - one of the few most important
> parts, really.  See all the crap going around about CPU schedulers at
> present?  Well hoo-boy, if we get this part of code even a little bit
> wrong, they won't know what hit them.
> 
> So please, spend quite a lot of time thinking about how we can make this
> code as comprehensible and approachable and maintainable as possible. 
> Often this is done with comments ;)
> 
> > +static void __bdi_writeout_inc(struct backing_dev_info *bdi)
> > +{
> > +	bdi_writeout_norm(bdi);
> 
> I'm assuming that "norm" here means "normalise".  There's a hint for me.
> 
> > +	__inc_bdi_stat(bdi, BDI_WRITEOUT);
> > +	percpu_counter_mod(&vm_writeout_total, 1);
> > +}
> > +
> > +void get_writeout_scale(struct backing_dev_info *bdi, long *scale, long *div)
> > +{
> > +	int bits = vm_cycle_shift - 1;
> > +	unsigned long cycle = 1UL << bits;
> > +	unsigned long mask = cycle - 1;
> > +	unsigned long total = percpu_counter_read(&vm_writeout_total);
> > +
> > +	if (bdi_cap_writeback_dirty(bdi)) {
> > +		bdi_writeout_norm(bdi);
> > +		*scale = bdi_stat(bdi, BDI_WRITEOUT);
> > +	} else
> > +		*scale = 0;
> > +
> > +	*div = cycle + (total & mask);
> > +}
> 
> I suppose that if I stared at this for long enough I could work out what
> it's doing, and why it's doing it.  But given that it needs comments
> telling others that, there isn't much point in me blowing the time to do
> so.

Right, so with the above clear (I hope), bdi_writeout_inc() tracks the
per bdi and global writeout events. We need to normalize the counter
before incrementing because the global period might have expired due to
another BDI's activity.

get_writeout_scale() does the magic of getting the current ratio.
Remember the example; say that the current state for A, B and C is:
  3:8:7 of 18
and we're currently interested in A's share.

The 18 (*div) is obtained from 16 (cycle) + 2 (total & 15),
the 3 (*scale) is read from the per BDI counter (again, after
normalizing it, for the global period might have advanced since we
last showed interest in it).

(nr * 3) / 18 gives A's share of nr.
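
In plain numbers (again just illustrative userspace arithmetic, with the
dirty limit picked arbitrarily; the kernel does the same division with
do_div() in get_dirty_limits()):

#include <stdio.h>

int main(void)
{
	unsigned long cycle = 16;	/* 2^(vm_cycle_shift - 1)                */
	unsigned long total = 2;	/* writeouts into the current period     */
	unsigned long scale = 3;	/* A's normalized BDI_WRITEOUT count     */
	unsigned long div = cycle + (total & (cycle - 1));	/* 16 + 2 = 18   */
	unsigned long dirty = 1800;	/* say, the global dirty limit in pages  */

	/* A's per-BDI dirty threshold: dirty * scale / div = 1800 * 3 / 18 */
	printf("A's share: %lu of %lu pages\n", dirty * scale / div, dirty);
	return 0;
}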

If anything remains unclear, please holler.

> It should have static scope.

ok

> Are all the per-bdi counters being exposed in sysfs?  I think not. 
> Probably they should be?

As stated in that sysfs email, I'd rather we had a per-bdi sysfs
spot; the current location is only for disks.

> > +/*
> >   * Work out the current dirty-memory clamping and background writeout
> >   * thresholds.
> >   *
> > @@ -158,8 +238,8 @@ static unsigned long determine_dirtyable
> >  }
> >  
> >  static void
> > -get_dirty_limits(long *pbackground, long *pdirty,
> > -					struct address_space *mapping)
> > +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
> > +		 struct backing_dev_info *bdi)
> >  {
> >  	int background_ratio;		/* Percentages */
> >  	int dirty_ratio;
> > @@ -193,6 +273,30 @@ get_dirty_limits(long *pbackground, long
> >  	}
> >  	*pbackground = background;
> >  	*pdirty = dirty;
> > +
> > +	if (bdi) {
> > +		long long tmp = dirty;
> > +		long reserve;
> > +		long scale, div;
> > +
> > +		get_writeout_scale(bdi, &scale, &div);
> > +
> > +		tmp *= scale;
> > +		do_div(tmp, div);
> > +
> > +		reserve = dirty -
> > +			(global_page_state(NR_FILE_DIRTY) +
> > +			 global_page_state(NR_WRITEBACK) +
> > +			 global_page_state(NR_UNSTABLE_NFS));
> > +
> > +		if (reserve < 0)
> > +			reserve = 0;
> > +
> > +		reserve += bdi_stat(bdi, BDI_RECLAIMABLE) +
> > +			bdi_stat(bdi, BDI_WRITEBACK);
> > +
> > +		*pbdi_dirty = min((long)tmp, reserve);
> 
> min_t is preferred
> 
> tmp isn't a particularly good identifier

ok, shall fix.

> > +	}
> >  }
> >  
> >  /*
> > @@ -204,9 +308,11 @@ get_dirty_limits(long *pbackground, long
> >   */
> >  static void balance_dirty_pages(struct address_space *mapping)
> >  {
> > -	long nr_reclaimable;
> > +	long bdi_nr_reclaimable;
> > +	long bdi_nr_writeback;
> >  	long background_thresh;
> >  	long dirty_thresh;
> > +	long bdi_thresh;
> >  	unsigned long pages_written = 0;
> >  	unsigned long write_chunk = sync_writeback_pages();
> >  
> > @@ -221,15 +327,15 @@ static void balance_dirty_pages(struct a
> >  			.range_cyclic	= 1,
> >  		};
> >  
> > -		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
> > -		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > -					global_page_state(NR_UNSTABLE_NFS);
> > -		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
> > -			dirty_thresh)
> > +		get_dirty_limits(&background_thresh, &dirty_thresh,
> > +				&bdi_thresh, bdi);
> > +		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > +		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > +		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> >  				break;
> >  
> > -		if (!dirty_exceeded)
> > -			dirty_exceeded = 1;
> > +		if (!bdi->dirty_exceeded)
> > +			bdi->dirty_exceeded = 1;
> >  
> >  		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> >  		 * Unstable writes are a feature of certain networked
> > @@ -237,16 +343,27 @@ static void balance_dirty_pages(struct a
> >  		 * written to the server's write cache, but has not yet
> >  		 * been flushed to permanent storage.
> >  		 */
> > -		if (nr_reclaimable) {
> > +		if (bdi_nr_reclaimable) {
> >  			writeback_inodes(&wbc);
> > -			get_dirty_limits(&background_thresh,
> > -					 	&dirty_thresh, mapping);
> > -			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > -					global_page_state(NR_UNSTABLE_NFS);
> > -			if (nr_reclaimable +
> > -				global_page_state(NR_WRITEBACK)
> > -					<= dirty_thresh)
> > -						break;
> > +
> > +			get_dirty_limits(&background_thresh, &dirty_thresh,
> > +				       &bdi_thresh, bdi);
> > +
> > +			if (bdi_thresh < 2*bdi_stat_delta()) {
> > +				bdi_nr_reclaimable =
> > +					bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> > +				bdi_nr_writeback =
> > +					bdi_stat_sum(bdi, BDI_WRITEBACK);
> > +			} else {
> > +				bdi_nr_reclaimable =
> > +					bdi_stat(bdi, BDI_RECLAIMABLE);
> > +				bdi_nr_writeback =
> > +					bdi_stat(bdi, BDI_WRITEBACK);
> > +			}
> > +
> > +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > +				break;
> > +
> >  			pages_written += write_chunk - wbc.nr_to_write;
> >  			if (pages_written >= write_chunk)
> >  				break;		/* We've done our duty */
> > @@ -254,9 +371,9 @@ static void balance_dirty_pages(struct a
> >  		congestion_wait(WRITE, HZ/10);
> >  	}
> >  
> > -	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
> > -		<= dirty_thresh && dirty_exceeded)
> > -			dirty_exceeded = 0;
> > +	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
> > +			bdi->dirty_exceeded)
> > +		bdi->dirty_exceeded = 0;
> >  
> >  	if (writeback_in_progress(bdi))
> >  		return;		/* pdflush is already working this queue */
> > @@ -270,7 +387,9 @@ static void balance_dirty_pages(struct a
> >  	 * background_thresh, to keep the amount of dirty memory low.
> >  	 */
> >  	if ((laptop_mode && pages_written) ||
> > -	     (!laptop_mode && (nr_reclaimable > background_thresh)))
> > +			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
> > +					  + global_page_state(NR_UNSTABLE_NFS)
> > +					  > background_thresh)))
> >  		pdflush_operation(background_writeout, 0);
> >  }
> 
> Did you test laptop mode?

admittedly, no. Shall do.

> > @@ -306,7 +425,7 @@ void balance_dirty_pages_ratelimited_nr(
> >  	unsigned long *p;
> >  
> >  	ratelimit = ratelimit_pages;
> > -	if (dirty_exceeded)
> > +	if (mapping->backing_dev_info->dirty_exceeded)
> >  		ratelimit = 8;
> >  
> >  	/*
> > @@ -342,7 +461,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
> >  	}
> >  
> >          for ( ; ; ) {
> > -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> > +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> >  
> >                  /*
> >                   * Boost the allowable dirty threshold a bit for page
> > @@ -377,7 +496,7 @@ static void background_writeout(unsigned
> >  		long background_thresh;
> >  		long dirty_thresh;
> >  
> > -		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
> > +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> >  		if (global_page_state(NR_FILE_DIRTY) +
> >  			global_page_state(NR_UNSTABLE_NFS) < background_thresh
> >  				&& min_pages <= 0)
> > @@ -585,6 +704,8 @@ void __init page_writeback_init(void)
> >  	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
> >  	writeback_set_ratelimit();
> >  	register_cpu_notifier(&ratelimit_nb);
> > +	vm_cycle_shift = 1 + ilog2(vm_total_pages);
> > +	percpu_counter_init(&vm_writeout_total, 0);
> >  }
> >  
> >  /**
> > @@ -988,8 +1109,10 @@ int test_clear_page_writeback(struct pag
> >  			radix_tree_tag_clear(&mapping->page_tree,
> >  						page_index(page),
> >  						PAGECACHE_TAG_WRITEBACK);
> > -			if (bdi_cap_writeback_dirty(bdi))
> > +			if (bdi_cap_writeback_dirty(bdi)) {
> >  				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> > +				__bdi_writeout_inc(bdi);
> > +			}
> >  		}
> >  		write_unlock_irqrestore(&mapping->tree_lock, flags);
> >  	} else {
> > Index: linux-2.6/mm/backing-dev.c
> > ===================================================================
> > --- linux-2.6.orig/mm/backing-dev.c	2007-04-20 15:20:11.000000000 +0200
> > +++ linux-2.6/mm/backing-dev.c	2007-04-20 15:31:42.000000000 +0200
> > @@ -12,6 +12,9 @@ void bdi_init(struct backing_dev_info *b
> >  	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
> >  		return;
> >  
> > +	spin_lock_init(&bdi->lock);
> > +	bdi->cycles = 0;
> > +	bdi->dirty_exceeded = 0;
> >  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> >  		percpu_counter_init(&bdi->bdi_stat[i], 0);
> >  }
> > 
> 
> ho hum, I'll toss it all in -mm, see what happens.

You show great confidence, thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 12:01       ` Peter Zijlstra
@ 2007-04-21 12:15         ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 12:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou


> > > +/*
> > > + * maximal error of a stat counter.
> > > + */
> > > +static inline unsigned long bdi_stat_delta(void)
> > > +{
> > > +#ifdef CONFIG_SMP
> > > +	return NR_CPUS * FBC_BATCH;
> > 
> > This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.

Right, I knew about that but, uhm.

I wanted to make that num_online_cpus(), and install a hotplug notifier
to fold the percpu delta back into the total on cpu offline.

But I have to look into doing that hotplug notifier stuff.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
@ 2007-04-21 12:15         ` Peter Zijlstra
  0 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 12:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

> > > +/*
> > > + * maximal error of a stat counter.
> > > + */
> > > +static inline unsigned long bdi_stat_delta(void)
> > > +{
> > > +#ifdef CONFIG_SMP
> > > +	return NR_CPUS * FBC_BATCH;
> > 
> > This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.

Right, I knew about that but, uhm.

I wanted to make that num_online_cpus(), and install a hotplug notifier
to fold the percpu delta back into the total on cpu offline.

But I have to look into doing that hotplug notifier stuff.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
  2007-04-21 11:02       ` Peter Zijlstra
@ 2007-04-21 19:21         ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21 19:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 13:02:26 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> > > +	cpu = get_cpu();
> > > +	pcount = per_cpu_ptr(fbc->counters, cpu);
> > > +	count = *pcount + amount;
> > > +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> > > +		spin_lock(&fbc->lock);
> > > +		fbc->count += count;
> > > +		*pcount = 0;
> > > +		spin_unlock(&fbc->lock);
> > > +	} else {
> > > +		*pcount = count;
> > > +	}
> > > +	put_cpu();
> > > +}
> > > +EXPORT_SYMBOL(percpu_counter_mod64);
> > 
> > Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
> > PAE has only 64,000,000 pages and won't be using deltas of more than 4
> > gigapages?
> > 
> > <Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
> > another changelog bug>
> 
> Yeah, /me chastises himself for that...
> 
> This is because percpu_counter is s64 instead of the native long; I need
> to halve the counter at some point (bdi_writeout_norm) and do that by
> subtracting half the current value.

ah, the mysterious bdi_writeout_norm().

I don't think it's possible to precisely halve a percpu_counter - there has
to be some error involved.  I guess that's acceptable within the
inscrutable bdi_writeout_norm().

otoh, there's a chance that the attempt to halve the counter will take the
counter negative, due to races.  Does the elusive bdi_writeout_norm()
handle that?  If not, it should.  If it does, then there should be comments
around the places where this is being handled, because it is subtle, and unobvious,
and others might break it by accident.

> If percpu_counter_mod is limited to s32 this might not always work
> (although in practice it might just fit).

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
@ 2007-04-21 19:21         ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-21 19:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 13:02:26 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> > > +	cpu = get_cpu();
> > > +	pcount = per_cpu_ptr(fbc->counters, cpu);
> > > +	count = *pcount + amount;
> > > +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> > > +		spin_lock(&fbc->lock);
> > > +		fbc->count += count;
> > > +		*pcount = 0;
> > > +		spin_unlock(&fbc->lock);
> > > +	} else {
> > > +		*pcount = count;
> > > +	}
> > > +	put_cpu();
> > > +}
> > > +EXPORT_SYMBOL(percpu_counter_mod64);
> > 
> > Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
> > PAE has only 64,000,000 pages and won't be using deltas of more than 4
> > gigapages?
> > 
> > <Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
> > another changelog bug>
> 
> Yeah, /me chastises himself for that...
> 
> This is because percpu_counter is s64 instead of the native long; I need
> to halve the counter at some point (bdi_writeout_norm) and do that by
> subtracting half the current value.

ah, the mysterious bdi_writeout_norm().

I don't think it's possible to precisely halve a percpu_counter - there has
to be some error involved.  I guess that's acceptable within the
inscrutable bdi_writeout_norm().

otoh, there's a chance that the attempt to halve the counter will take the
counter negative, due to races.  Does the elusive bdi_writeout_norm()
handle that?  If not, it should.  If it does, then there should be comments
around the places where this is being handled, because it is subtle, and unobvious,
and others might break it by accident.

> If percpu_counter_mod is limited to s32 this might not always work
> (although in practice it might just fit).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
  2007-04-21 19:21         ` Andrew Morton
@ 2007-04-21 19:30           ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 19:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 12:21 -0700, Andrew Morton wrote:
> On Sat, 21 Apr 2007 13:02:26 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > > > +	cpu = get_cpu();
> > > > +	pcount = per_cpu_ptr(fbc->counters, cpu);
> > > > +	count = *pcount + amount;
> > > > +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> > > > +		spin_lock(&fbc->lock);
> > > > +		fbc->count += count;
> > > > +		*pcount = 0;
> > > > +		spin_unlock(&fbc->lock);
> > > > +	} else {
> > > > +		*pcount = count;
> > > > +	}
> > > > +	put_cpu();
> > > > +}
> > > > +EXPORT_SYMBOL(percpu_counter_mod64);
> > > 
> > > Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
> > > PAE has only 64,000,000 pages and won't be using deltas of more than 4
> > > gigapages?
> > > 
> > > <Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
> > > another changelog bug>
> > 
> > Yeah, /me chastises himself for that...
> > 
> > This is because percpu_counter is s64 instead of the native long; I need
> > to halve the counter at some point (bdi_writeout_norm) and do that by
> > subtracting half the current value.
> 
> ah, the mysterious bdi_writeout_norm().
> 
> I don't think it's possible to precisely halve a percpu_counter - there has
> to be some error involved.  I guess that's acceptable within the
> inscrutable bdi_writeout_norm().
> 
> otoh, there's a chance that the attempt to halve the counter will take the
> counter negative, due to races.  Does the elusive bdi_writeout_norm()
> handle that?  If not, it should.  If it does, then there should be comments
> around the places where this is being handled, because it is subtle, and unobvious,
> and others might break it by accident.

The counter it is halving is only ever incremented, so we might be off a
little, but only to the safe side (e.g. if we read 10 while two new
writeouts race in, we subtract 5 and end up at 7 instead of 6 - a slight
over-estimate, never a negative value).

I shall do the comment thing along with all the other missing
comments :-)


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 04/10] lib: percpu_counter_mod64
@ 2007-04-21 19:30           ` Peter Zijlstra
  0 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 19:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 12:21 -0700, Andrew Morton wrote:
> On Sat, 21 Apr 2007 13:02:26 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > > > +	cpu = get_cpu();
> > > > +	pcount = per_cpu_ptr(fbc->counters, cpu);
> > > > +	count = *pcount + amount;
> > > > +	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
> > > > +		spin_lock(&fbc->lock);
> > > > +		fbc->count += count;
> > > > +		*pcount = 0;
> > > > +		spin_unlock(&fbc->lock);
> > > > +	} else {
> > > > +		*pcount = count;
> > > > +	}
> > > > +	put_cpu();
> > > > +}
> > > > +EXPORT_SYMBOL(percpu_counter_mod64);
> > > 
> > > Bloaty.  Surely we won't be needing this on 32-bit kernels?  Even monster
> > > PAE has only 64,000,000 pages and won't be using deltas of more than 4
> > > gigapages?
> > > 
> > > <Does even 64-bit need to handle 4 gigapages in a single hit?  /me suspects
> > > another changelog bug>
> > 
> > Yeah, /me chastises himself for that...
> > 
> > This is because percpu_counter is s64 instead of the native long; I need
> > to halve the counter at some point (bdi_writeout_norm) and do that by
> > subtracting half the current value.
> 
> ah, the mysterious bdi_writeout_norm().
> 
> I don't think it's possible to precisely halve a percpu_counter - there has
> to be some error involved.  I guess that's acceptable within the
> inscrutable bdi_writeout_norm().
> 
> otoh, there's a chance that the attempt to halve the counter will take the
> counter negative, due to races.  Does the elusive bdi_writeout_norm()
> handle that?  If not, it should.  If it does, then there should be comments
> around the places where this is being handled, because it is subtle, and unobvious,
> and others might break it by accident.

The counter it is halving is only ever incremented, so we might be off a
little, but only to the safe side (e.g. if we read 10 while two new
writeouts race in, we subtract 5 and end up at 7 instead of 6 - a slight
over-estimate, never a negative value).

I shall do the comment thing along with all the other missing
comments :-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 12:15         ` Peter Zijlstra
@ 2007-04-21 19:50           ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 19:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 14:15 +0200, Peter Zijlstra wrote:
> > > > +/*
> > > > + * maximal error of a stat counter.
> > > > + */
> > > > +static inline unsigned long bdi_stat_delta(void)
> > > > +{
> > > > +#ifdef CONFIG_SMP
> > > > +	return NR_CPUS * FBC_BATCH;
> > > 
> > > This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.
> 
> Right, I knew about that but, uhm.
> 
> I wanted to make that num_online_cpus(), and install a hotplug notifier
> to fold the percpu delta back into the total on cpu offline.
> 
> But I have to look into doing that hotplug notifier stuff.

Something like this should do, I think; I just looked at other hotplug
code and imitated the pattern.

I assumed CONFIG_HOTPLUG_CPU requires CONFIG_SMP; I didn't actually try
that one :-)

---

In order to estimate the per stat counter error more accurately, use
num_online_cpus() instead of NR_CPUS and install a cpu hotplug notifier
(when cpu hotplug is enabled) that flushes whatever percpu delta was
present into the total on cpu unplug.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/backing-dev.h    |    6 ++++-
 include/linux/percpu_counter.h |    1 
 lib/percpu_counter.c           |   11 +++++++++
 mm/backing-dev.c               |   47 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-21 21:33:28.000000000 +0200
@@ -51,6 +51,10 @@ struct backing_dev_info {
 	spinlock_t lock;	/* protect the cycle count */
 	unsigned long cycles;	/* writeout cycles */
 	int dirty_exceeded;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct notifier_block hotplug_nb;
+#endif
 };
 
 void bdi_init(struct backing_dev_info *bdi);
@@ -137,7 +141,7 @@ static inline s64 bdi_stat_sum(struct ba
 static inline unsigned long bdi_stat_delta(void)
 {
 #ifdef CONFIG_SMP
-	return NR_CPUS * FBC_BATCH;
+	return num_online_cpus() * FBC_BATCH;
 #else
 	return 1UL;
 #endif
Index: linux-2.6/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.orig/include/linux/percpu_counter.h	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/include/linux/percpu_counter.h	2007-04-21 21:33:17.000000000 +0200
@@ -38,6 +38,7 @@ static inline void percpu_counter_destro
 void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
 void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount);
 s64 percpu_counter_sum(struct percpu_counter *fbc);
+void percpu_counter_fold(struct percpu_counter *fbc, int cpu);
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/lib/percpu_counter.c	2007-04-21 21:33:17.000000000 +0200
@@ -72,3 +72,14 @@ s64 percpu_counter_sum(struct percpu_cou
 	return ret < 0 ? 0 : ret;
 }
 EXPORT_SYMBOL(percpu_counter_sum);
+
+void percpu_counter_fold(struct percpu_counter *fbc, int cpu)
+{
+	s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+	if (*pcount) {
+		spin_lock(&fbc->lock);
+		fbc->count += *pcount;
+		*pcount = 0;
+		spin_unlock(&fbc->lock);
+	}
+}
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/mm/backing-dev.c	2007-04-21 21:34:47.000000000 +0200
@@ -4,6 +4,49 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int bdi_stat_fold(struct notifier_block *nb,
+		unsigned long action, void *hcpu)
+{
+	struct backing_dev_info *bdi =
+		container_of(nb, struct backing_dev_info, hotplug_nb);
+	unsigned long flags;
+	int cpu = (unsigned long)hcpu;
+	int i;
+
+	if (action == CPU_DEAD) {
+		local_irq_save(flags);
+		for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+			percpu_counter_fold(&bdi->bdi_stat[i], cpu);
+		local_irq_restore(flags);
+	}
+	return NOTIFY_OK;
+}
+
+static void bdi_init_hotplug(struct backing_dev_info *bdi)
+{
+	bdi->hotplug_nb = (struct notifier_block){
+		.notifier_call = bdi_stat_fold,
+		.priority = 0,
+	};
+	register_hotcpu_notifier(&bdi->hotplug_nb);
+}
+
+static void bdi_destroy_hotplug(struct backing_dev_info *bdi)
+{
+	unregister_hotcpu_notifier(&bdi->hotplug_nb);
+}
+#else
+static void bdi_init_hotplug(struct backing_dev_info *bdi)
+{
+}
+
+static void bdi_destroy_hotplug(struct backing_dev_info *bdi)
+{
+}
+#endif
 
 void bdi_init(struct backing_dev_info *bdi)
 {
@@ -17,6 +60,8 @@ void bdi_init(struct backing_dev_info *b
 	bdi->dirty_exceeded = 0;
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_init(&bdi->bdi_stat[i], 0);
+
+	bdi_init_hotplug(bdi);
 }
 EXPORT_SYMBOL(bdi_init);
 
@@ -27,6 +72,8 @@ void bdi_destroy(struct backing_dev_info
 	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
 		return;
 
+	bdi_destroy_hotplug(bdi);
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
 }



^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
@ 2007-04-21 19:50           ` Peter Zijlstra
  0 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-21 19:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 14:15 +0200, Peter Zijlstra wrote:
> > > > +/*
> > > > + * maximal error of a stat counter.
> > > > + */
> > > > +static inline unsigned long bdi_stat_delta(void)
> > > > +{
> > > > +#ifdef CONFIG_SMP
> > > > +	return NR_CPUS * FBC_BATCH;
> > > 
> > > This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.
> 
> Right, I knew about that but, uhm.
> 
> I wanted to make that num_online_cpus(), and install a hotplug notifier
> to fold the percpu delta back into the total on cpu offline.
> 
> But I have to look into doing that hotplug notifier stuff.

Something like this should do, I think; I just looked at other hotplug
code and imitated the pattern.

I assumed CONFIG_HOTPLUG_CPU requires CONFIG_SMP; I didn't actually try
that one :-)

---

In order to estimate the per stat counter error more accurately, use
num_online_cpus() instead of NR_CPUS and install a cpu hotplug notifier
(when cpu hotplug is enabled) that flushes whatever percpu delta was
present into the total on cpu unplug.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/backing-dev.h    |    6 ++++-
 include/linux/percpu_counter.h |    1 
 lib/percpu_counter.c           |   11 +++++++++
 mm/backing-dev.c               |   47 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/include/linux/backing-dev.h	2007-04-21 21:33:28.000000000 +0200
@@ -51,6 +51,10 @@ struct backing_dev_info {
 	spinlock_t lock;	/* protect the cycle count */
 	unsigned long cycles;	/* writeout cycles */
 	int dirty_exceeded;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	struct notifier_block hotplug_nb;
+#endif
 };
 
 void bdi_init(struct backing_dev_info *bdi);
@@ -137,7 +141,7 @@ static inline s64 bdi_stat_sum(struct ba
 static inline unsigned long bdi_stat_delta(void)
 {
 #ifdef CONFIG_SMP
-	return NR_CPUS * FBC_BATCH;
+	return num_online_cpus() * FBC_BATCH;
 #else
 	return 1UL;
 #endif
Index: linux-2.6/include/linux/percpu_counter.h
===================================================================
--- linux-2.6.orig/include/linux/percpu_counter.h	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/include/linux/percpu_counter.h	2007-04-21 21:33:17.000000000 +0200
@@ -38,6 +38,7 @@ static inline void percpu_counter_destro
 void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
 void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount);
 s64 percpu_counter_sum(struct percpu_counter *fbc);
+void percpu_counter_fold(struct percpu_counter *fbc, int cpu);
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/lib/percpu_counter.c	2007-04-21 21:33:17.000000000 +0200
@@ -72,3 +72,14 @@ s64 percpu_counter_sum(struct percpu_cou
 	return ret < 0 ? 0 : ret;
 }
 EXPORT_SYMBOL(percpu_counter_sum);
+
+void percpu_counter_fold(struct percpu_counter *fbc, int cpu)
+{
+	s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+	if (*pcount) {
+		spin_lock(&fbc->lock);
+		fbc->count += *pcount;
+		*pcount = 0;
+		spin_unlock(&fbc->lock);
+	}
+}
Index: linux-2.6/mm/backing-dev.c
===================================================================
--- linux-2.6.orig/mm/backing-dev.c	2007-04-21 21:32:49.000000000 +0200
+++ linux-2.6/mm/backing-dev.c	2007-04-21 21:34:47.000000000 +0200
@@ -4,6 +4,49 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int bdi_stat_fold(struct notifier_block *nb,
+		unsigned long action, void *hcpu)
+{
+	struct backing_dev_info *bdi =
+		container_of(nb, struct backing_dev_info, hotplug_nb);
+	unsigned long flags;
+	int cpu = (unsigned long)hcpu;
+	int i;
+
+	if (action == CPU_DEAD) {
+		local_irq_save(flags);
+		for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+			percpu_counter_fold(&bdi->bdi_stat[i], cpu);
+		local_irq_restore(flags);
+	}
+	return NOTIFY_OK;
+}
+
+static void bdi_init_hotplug(struct backing_dev_info *bdi)
+{
+	bdi->hotplug_nb = (struct notifier_block){
+		.notifier_call = bdi_stat_fold,
+		.priority = 0,
+	};
+	register_hotcpu_notifier(&bdi->hotplug_nb);
+}
+
+static void bdi_destroy_hotplug(struct backing_dev_info *bdi)
+{
+	unregister_hotcpu_notifier(&bdi->hotplug_nb);
+}
+#else
+static void bdi_init_hotplug(struct backing_dev_info *bdi)
+{
+}
+
+static void bdi_destroy_hotplug(struct backing_dev_info *bdi)
+{
+}
+#endif
 
 void bdi_init(struct backing_dev_info *bdi)
 {
@@ -17,6 +60,8 @@ void bdi_init(struct backing_dev_info *b
 	bdi->dirty_exceeded = 0;
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_init(&bdi->bdi_stat[i], 0);
+
+	bdi_init_hotplug(bdi);
 }
 EXPORT_SYMBOL(bdi_init);
 
@@ -27,6 +72,8 @@ void bdi_destroy(struct backing_dev_info
 	if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
 		return;
 
+	bdi_destroy_hotplug(bdi);
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
 }


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 10:54         ` Andrew Morton
@ 2007-04-21 20:25           ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-21 20:25 UTC (permalink / raw)
  To: akpm
  Cc: miklos, a.p.zijlstra, linux-mm, linux-kernel, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > The other deadlock, in throttle_vm_writeout() is still to be solved.
> 
> Let's go back to the original changelog:
> 
> Author: marcelo.tosatti <marcelo.tosatti>
> Date:   Tue Mar 8 17:25:19 2005 +0000
> 
>     [PATCH] vm: pageout throttling
>     
>     With silly pageout testcases it is possible to place huge amounts of memory
>     under I/O.  With a large request queue (CFQ uses 8192 requests) it is
>     possible to place _all_ memory under I/O at the same time.
>     
>     This means that all memory is pinned and unreclaimable and the VM gets
>     upset and goes oom.
>     
>     The patch limits the amount of memory which is under pageout writeout to be
>     a little more than the amount of memory at which balance_dirty_pages()
>     callers will synchronously throttle.
>     
>     This means that heavy pageout activity can starve heavy writeback activity
>     completely, but heavy writeback activity will not cause starvation of
>     pageout.  Because we don't want a simple `dd' to be causing excessive
>     latencies in page reclaim.
>     
>     Signed-off-by: Andrew Morton <akpm@osdl.org>
>     Signed-off-by: Linus Torvalds <torvalds@osdl.org>
> 
> (A good one!  I wrote it ;))
> 
> 
> I believe that the combination of dirty-page-tracking and its calls to
> balance_dirty_pages() mean that we can now never get more than dirty_ratio
> of memory into the dirty-or-writeback condition.
> 
> The vm scanner can convert dirty pages into clean, under-writeback pages,
> but it cannot increase the total of dirty+writeback.

What about swapout?  That can increase the number of writeback pages,
without decreasing the number of dirty pages, no?

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
@ 2007-04-21 20:25           ` Miklos Szeredi
  0 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-21 20:25 UTC (permalink / raw)
  To: akpm
  Cc: miklos, a.p.zijlstra, linux-mm, linux-kernel, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > The other deadlock, in throttle_vm_writeout() is still to be solved.
> 
> Let's go back to the original changelog:
> 
> Author: marcelo.tosatti <marcelo.tosatti>
> Date:   Tue Mar 8 17:25:19 2005 +0000
> 
>     [PATCH] vm: pageout throttling
>     
>     With silly pageout testcases it is possible to place huge amounts of memory
>     under I/O.  With a large request queue (CFQ uses 8192 requests) it is
>     possible to place _all_ memory under I/O at the same time.
>     
>     This means that all memory is pinned and unreclaimable and the VM gets
>     upset and goes oom.
>     
>     The patch limits the amount of memory which is under pageout writeout to be
>     a little more than the amount of memory at which balance_dirty_pages()
>     callers will synchronously throttle.
>     
>     This means that heavy pageout activity can starve heavy writeback activity
>     completely, but heavy writeback activity will not cause starvation of
>     pageout.  Because we don't want a simple `dd' to be causing excessive
>     latencies in page reclaim.
>     
>     Signed-off-by: Andrew Morton <akpm@osdl.org>
>     Signed-off-by: Linus Torvalds <torvalds@osdl.org>
> 
> (A good one!  I wrote it ;))
> 
> 
> I believe that the combination of dirty-page-tracking and its calls to
> balance_dirty_pages() mean that we can now never get more than dirty_ratio
> of memory into the dirty-or-writeback condition.
> 
> The vm scanner can convert dirty pages into clean, under-writeback pages,
> but it cannot increase the total of dirty+writeback.

What about swapout?  That can increase the number of writeback pages,
without decreasing the number of dirty pages, no?

Miklos

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 08/10] mm: count writeback pages per BDI
  2007-04-21 11:07       ` Peter Zijlstra
@ 2007-04-22  7:19         ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-22  7:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 13:07:16 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> > On Fri, 20 Apr 2007 17:52:02 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > 
> > > Count per BDI writeback pages.
> > > 
> > > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > ---
> > >  include/linux/backing-dev.h |    1 +
> > >  mm/page-writeback.c         |   12 ++++++++++--
> > >  2 files changed, 11 insertions(+), 2 deletions(-)
> > > 
> > > Index: linux-2.6/mm/page-writeback.c
> > > ===================================================================
> > > --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
> > > +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> > > @@ -979,14 +979,18 @@ int test_clear_page_writeback(struct pag
> > >  	int ret;
> > >  
> > >  	if (mapping) {
> > > +		struct backing_dev_info *bdi = mapping->backing_dev_info;
> > >  		unsigned long flags;
> > >  
> > >  		write_lock_irqsave(&mapping->tree_lock, flags);
> > >  		ret = TestClearPageWriteback(page);
> > > -		if (ret)
> > > +		if (ret) {
> > >  			radix_tree_tag_clear(&mapping->page_tree,
> > >  						page_index(page),
> > >  						PAGECACHE_TAG_WRITEBACK);
> > > +			if (bdi_cap_writeback_dirty(bdi))
> > > +				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> > 
> > Why do we test bdi_cap_writeback_dirty() here?
> > 
> > If we remove that test, we end up accumulating statistics for
> > non-writebackable backing devs, but does that matter? 
> 
> It would not, had I not cheated:
> 
> +void bdi_init(struct backing_dev_info *bdi)
> +{
> +       int i;
> +
> +       if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
> +               return;
> +
> +       for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> +               percpu_counter_init(&bdi->bdi_stat[i], 0);
> +}
> +EXPORT_SYMBOL(bdi_init);
> 
> >  Probably the common
> > case is writebackable backing-devs, so eliminating the test-n-branch might
> > be a net microgain.
> 
> Time vs space. Now we don't even have storage for those BDIs..
> 
> Don't particularly care on this point though, I just thought it might be
> worthwhile to save on the percpu data.

It could be that we never call test_clear_page_writeback() against
!bdi_cap_writeback_dirty() pages anyway.  I can't think why we would, but
the relationships there aren't very clear.  Does "don't account for dirty
memory" imply "doesn't ever do writeback"?  One would need to check, and
it's perhaps a bit fragile.

It's worth checking though.  Boy we're doing a lot of stuff in there
nowadays.

OT: it might be worth looking into batching this work up - the predominant
caller should be mpage_end_io_write(), and he has a whole bunch of pages
which are usually all from the same file, all contiguous.  It's pretty
inefficient to be handling that data one-page-at-a-time, and some
significant speedups may be available.

Instead, everyone seems to think that variable pagecache page size is the
only way of improving things.  Shudder.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 08/10] mm: count writeback pages per BDI
@ 2007-04-22  7:19         ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-22  7:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 13:07:16 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
> > On Fri, 20 Apr 2007 17:52:02 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > 
> > > Count per BDI writeback pages.
> > > 
> > > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > ---
> > >  include/linux/backing-dev.h |    1 +
> > >  mm/page-writeback.c         |   12 ++++++++++--
> > >  2 files changed, 11 insertions(+), 2 deletions(-)
> > > 
> > > Index: linux-2.6/mm/page-writeback.c
> > > ===================================================================
> > > --- linux-2.6.orig/mm/page-writeback.c	2007-04-20 15:27:28.000000000 +0200
> > > +++ linux-2.6/mm/page-writeback.c	2007-04-20 15:28:10.000000000 +0200
> > > @@ -979,14 +979,18 @@ int test_clear_page_writeback(struct pag
> > >  	int ret;
> > >  
> > >  	if (mapping) {
> > > +		struct backing_dev_info *bdi = mapping->backing_dev_info;
> > >  		unsigned long flags;
> > >  
> > >  		write_lock_irqsave(&mapping->tree_lock, flags);
> > >  		ret = TestClearPageWriteback(page);
> > > -		if (ret)
> > > +		if (ret) {
> > >  			radix_tree_tag_clear(&mapping->page_tree,
> > >  						page_index(page),
> > >  						PAGECACHE_TAG_WRITEBACK);
> > > +			if (bdi_cap_writeback_dirty(bdi))
> > > +				__dec_bdi_stat(bdi, BDI_WRITEBACK);
> > 
> > Why do we test bdi_cap_writeback_dirty() here?
> > 
> > If we remove that test, we end up accumulating statistics for
> > non-writebackable backing devs, but does that matter? 
> 
> It would not, had I not cheated:
> 
> +void bdi_init(struct backing_dev_info *bdi)
> +{
> +       int i;
> +
> +       if (!(bdi_cap_writeback_dirty(bdi) || bdi_cap_account_dirty(bdi)))
> +               return;
> +
> +       for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> +               percpu_counter_init(&bdi->bdi_stat[i], 0);
> +}
> +EXPORT_SYMBOL(bdi_init);
> 
> >  Probably the common
> > case is writebackable backing-devs, so eliminating the test-n-branch might
> > be a net microgain.
> 
> Time vs space. Now we don't even have storage for those BDIs..
> 
> Don't particularly care on this point though, I just thought it might be
> worthwhile to save on the percpu data.

It could be that we never call test_clear_page_writeback() against
!bdi_cap_writeback_dirty() pages anyway.  I can't think why we would, but
the relationships there aren't very clear.  Does "don't account for dirty
memory" imply "doesn't ever do writeback"?  One would need to check, and
it's perhaps a bit fragile.

It's worth checking though.  Boy we're doing a lot of stuff in there
nowadays.

OT: it might be worth looking into batching this work up - the predominant
caller should be mpage_end_io_write(), and he has a whole bunch of pages
which are usually all from the same file, all contiguous.  It's pretty
inefficient to be handling that data one-page-at-a-time, and some
significant speedups may be available.

Instead, everyone seems to think that variable pagecache page size is the
only way of improving things.  Shudder.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 12:01       ` Peter Zijlstra
@ 2007-04-22  7:26         ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-22  7:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 14:01:36 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
>
> > > +
> > > +		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
> > > +		bdi->cycles += cycle;
> > > +	}
> > > +	bdi->cycles = global_cycle;
> > > +	spin_unlock_irqrestore(&bdi->lock, flags);
> > > +}
> > 
> > Here we get to the real critical substance of the patchset, and I don't
> > have a clue what it's doing nor how it's doing it.  And I bet nobody else
> > does either.
> 
> I shall send a comment patch; but let me try to explain:
> 
> I am trying to keep a floating proportion between the BDIs based on
> writeout events.

The term "writeout event" hasn't been defined.  I assume that it refers to
something like "one call to balance_dirty_pages()".  Or maybe "one pass
through balance_dirty_pages()'s inner loop".  Or maybe something else. 
This is important, because the reader is already a bit lost.

> That is, each device is given a share equal to its
> proportion of completed writebacks

In what units are "writebacks" measured?  Pages?

> (writeback, we are in the process of
> writing vs. writeout, we have written). This proportion is measured in a
> 'time'-span measured itself in writeouts.

time is measured how?  jiffies?  Calls to balance_dirty_pages(), or passes
around its inner loop, or...

> Example:
> 
>   device A completes 4, device B completes 12 and device C 16 writes.

writes of what?  One page??

I think you get my point ;) Please start from the top.  Define terms before
using them, always specify in what units all things are being measured,
assume *no* prior knowledge apart from general kernel-fu.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
@ 2007-04-22  7:26         ` Andrew Morton
  0 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-22  7:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007 14:01:36 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Sat, 2007-04-21 at 02:55 -0700, Andrew Morton wrote:
>
> > > +
> > > +		__mod_bdi_stat64(bdi, BDI_WRITEOUT, -half);
> > > +		bdi->cycles += cycle;
> > > +	}
> > > +	bdi->cycles = global_cycle;
> > > +	spin_unlock_irqrestore(&bdi->lock, flags);
> > > +}
> > 
> > Here we get to the real critical substance of the patchset, and I don't
> > have a clue what it's doing nor how it's doing it.  And I bet nobody else
> > does either.
> 
> I shall send a comment patch; but let me try to explain:
> 
> I am trying to keep a floating proportion between the BDIs based on
> writeout events.

The term "writeout event" hasn't been defined.  I assume that it refers to
something like "one call to balance_dirty_pages()".  Or maybe "one pass
through balance_dirty_pages()'s inner loop".  Or maybe something else. 
This is important, because the reader is already a bit lost.

> That is, each device is given a share equal to its
> proportion of completed writebacks

In what units are "writebacks" measured?  Pages?

> (writeback, we are in the process of
> writing vs. writeout, we have written). This proportion is measured in a
> 'time'-span measured itself in writeouts.

time is measured how?  jiffies?  Calls to balance_dirty_pages(), or passes
around its inner loop, or...

> Example:
> 
>   device A completes 4, device B completes 12 and device C 16 writes.

writes of what?  One page??

I think you get my point ;) Please start from the top.  Define terms before
using them, always specify in what units all things are being measured,
assume *no* prior knowledge apart from general kernel-fu.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 08/10] mm: count writeback pages per BDI
  2007-04-22  7:19         ` Andrew Morton
@ 2007-04-22  9:08           ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-22  9:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sun, 2007-04-22 at 00:19 -0700, Andrew Morton wrote:

> It could be that we never call test_clear_page_writeback() against
> !bdi_cap_writeback_dirty() pages anyway.  I can't think why we would, but
> the relationships there aren't very clear.  Does "don't account for dirty
> memory" imply "doesn't ever do writeback"?  One would need to check, and
> it's perhaps a bit fragile.

I did, that's how that test ended up there; I guess a comment would have
been a good thing, no? :-)

end_swap_bio_write() calls end_page_writeback(), and
swap_backing_dev_info has neither cap_writeback nor cap_account_dirty.

> It's worth checking though.  Boy we're doing a lot of stuff in there
> nowadays.
> 
> OT: it might be worth looking into batching this work up - the predominant
> caller should be mpage_end_io_write(), and he has a whole bunch of pages
> which are usually all from the same file, all contiguous.  It's pretty
> inefficient to be handling that data one-page-at-a-time, and some
> significant speedups may be available.

Right, that might be a good spot to hook into, I'll have a look.

> Instead, everyone seems to think that variable pagecache page size is the
> only way of improving things.  Shudder.

hehe, I guess you haven't looked at my concurrent pagecache patches yet
either :-)


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 00/10] per device dirty throttling -v5
  2007-04-20 15:51 ` Peter Zijlstra
@ 2007-04-22  9:57   ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-22  9:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou


So after a cheerful five hours making all the new crap with which they have
inflicted me actually compile, first up is powerpc:

initcall 0xc0000000006dc650: .scsi_complete_async_scans+0x0/0x1dc() returned 0.
initcall 0xc0000000006dc650 ran for 0 msecs: .scsi_complete_async_scans+0x0/0x1dc()
Calling initcall 0xc0000000006ea1d0: .tcp_congestion_default+0x0/0x18()
initcall 0xc0000000006ea1d0: .tcp_congestion_default+0x0/0x18() returned 0.
initcall 0xc0000000006ea1d0 ran for 0 msecs: .tcp_congestion_default+0x0/0x18()
Freeing unused kernel memory: 292k freed
EXT3-fs: INFO: recovery required on readonly filesystem.
EXT3-fs: write access will be enabled during recovery.
Unable to handle kernel paging request for data at address 0x0000000f
Faulting instruction address: 0xc0000000001d060c
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=4 PowerMac
Modules linked in:
NIP: c0000000001d060c LR: c0000000000aaecc CTR: 0000000000000000
REGS: c0000000080f70c0 TRAP: 0300   Tainted: G      D  (2.6.21-rc7-mm1)
MSR: 9000000000009032 <EE,ME,IR,DR>  CR: 24024028  XER: 000fffff
DAR: 000000000000000f, DSISR: 0000000040000000
TASK = c0000000080e47f0[1] 'init' THREAD: c0000000080f4000 CPU: 2
GPR00: c0000000000aaecc c0000000080f7340 c0000000006f69f8 c00000000c45d768 
GPR04: 0000000000000001 0000000000000000 c00000000064e700 c00000000064e030 
GPR08: 0000000000000030 ffffffffffffffff 0000000000000013 0000000000000010 
GPR12: 0000000024024024 c000000000628980 0000000000000000 c0000000090090d8 
GPR16: c0000000090090c0 c0000000090090a8 0000000000000000 0000000000000000 
GPR20: 0000000000000000 c000000000733cd0 0000000000000000 0000000000000002 
GPR24: c0000000080f7630 000000000007ccde c000000009858908 c00000000984400c 
GPR28: c00000000c45d768 c00000000bc2bcb0 c000000000658aa0 c000000002372818 
NIP [c0000000001d060c] .percpu_counter_mod+0x2c/0xd8
LR [c0000000000aaecc] .__set_page_dirty_nobuffers+0x14c/0x17c
Call Trace:
[c0000000080f7340] [c0000000001088e4] .alloc_page_buffers+0x58/0x100 (unreliable)
[c0000000080f73d0] [c0000000000aaecc] .__set_page_dirty_nobuffers+0x14c/0x17c
[c0000000080f7460] [c000000000106328] .mark_buffer_dirty+0x5c/0x70
[c0000000080f74e0] [c00000000015efb8] .do_one_pass+0x55c/0x6a8
[c0000000080f75c0] [c00000000015f3f8] .journal_recover+0x1c0/0x1c8
[c0000000080f7670] [c000000000164178] .journal_load+0xcc/0x178
[c0000000080f7700] [c000000000151134] .ext3_fill_super+0xfc0/0x1a44
[c0000000080f7840] [c0000000000d8554] .get_sb_bdev+0x200/0x260
[c0000000080f7920] [c000000000152454] .ext3_get_sb+0x20/0x38
[c0000000080f79a0] [c0000000000d8900] .vfs_kern_mount+0x80/0x108
[c0000000080f7a40] [c0000000000f7a94] .do_mount+0x2e4/0x930
[c0000000080f7d60] [c00000000011a4f4] .compat_sys_mount+0xf4/0x2b0
[c0000000080f7e30] [c00000000000872c] syscall_exit+0x0/0x40
Instruction dump:
4bffff00 7c0802a6 fb81ffe0 fbe1fff8 fba1ffe8 7c7c1b78 f8010010 f821ff71 
a16d000a e9230010 796b1f24 7d2948f8 <7fab482a> 801d0000 7c002214 7c1f07b4 
DART table allocated at: c00000007f000000

I'll drop 'em.

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 20:25           ` Miklos Szeredi
@ 2007-04-23  6:14             ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-23  6:14 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, linux-mm, linux-kernel, neilb, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Sat, 2007-04-21 at 22:25 +0200, Miklos Szeredi wrote: 
> > > The other deadlock, in throttle_vm_writeout() is still to be solved.
> > 
> > Let's go back to the original changelog:
> > 
> > Author: marcelo.tosatti <marcelo.tosatti>
> > Date:   Tue Mar 8 17:25:19 2005 +0000
> > 
> >     [PATCH] vm: pageout throttling
> >     
> >     With silly pageout testcases it is possible to place huge amounts of memory
> >     under I/O.  With a large request queue (CFQ uses 8192 requests) it is
> >     possible to place _all_ memory under I/O at the same time.
> >     
> >     This means that all memory is pinned and unreclaimable and the VM gets
> >     upset and goes oom.
> >     
> >     The patch limits the amount of memory which is under pageout writeout to be
> >     a little more than the amount of memory at which balance_dirty_pages()
> >     callers will synchronously throttle.
> >     
> >     This means that heavy pageout activity can starve heavy writeback activity
> >     completely, but heavy writeback activity will not cause starvation of
> >     pageout.  Because we don't want a simple `dd' to be causing excessive
> >     latencies in page reclaim.
> >     
> >     Signed-off-by: Andrew Morton <akpm@osdl.org>
> >     Signed-off-by: Linus Torvalds <torvalds@osdl.org>
> > 
> > (A good one!  I wrote it ;))
> > 
> > 
> > I believe that the combination of dirty-page-tracking and its calls to
> > balance_dirty_pages() mean that we can now never get more than dirty_ratio
> > of memory into the dirty-or-writeback condition.
> > 
> > The vm scanner can convert dirty pages into clean, under-writeback pages,
> > but it cannot increase the total of dirty+writeback.
> 
> What about swapout?  That can increase the number of writeback pages,
> without decreasing the number of dirty pages, no?

Could we not solve that by enabling cap_account_writeback on
swapper_space, and thereby account for swap writeback pages? Then the VM
knows it has outstanding IO and need not panic.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-23  6:14             ` Peter Zijlstra
@ 2007-04-23  6:29               ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-23  6:29 UTC (permalink / raw)
  To: a.p.zijlstra
  Cc: miklos, akpm, linux-mm, linux-kernel, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > > > The other deadlock, in throttle_vm_writeout() is still to be solved.
> > > 
> > > Let's go back to the original changelog:
> > > 
> > > Author: marcelo.tosatti <marcelo.tosatti>
> > > Date:   Tue Mar 8 17:25:19 2005 +0000
> > > 
> > >     [PATCH] vm: pageout throttling
> > >     
> > >     With silly pageout testcases it is possible to place huge amounts of memory
> > >     under I/O.  With a large request queue (CFQ uses 8192 requests) it is
> > >     possible to place _all_ memory under I/O at the same time.
> > >     
> > >     This means that all memory is pinned and unreclaimable and the VM gets
> > >     upset and goes oom.
> > >     
> > >     The patch limits the amount of memory which is under pageout writeout to be
> > >     a little more than the amount of memory at which balance_dirty_pages()
> > >     callers will synchronously throttle.
> > >     
> > >     This means that heavy pageout activity can starve heavy writeback activity
> > >     completely, but heavy writeback activity will not cause starvation of
> > >     pageout.  Because we don't want a simple `dd' to be causing excessive
> > >     latencies in page reclaim.
> > >     
> > >     Signed-off-by: Andrew Morton <akpm@osdl.org>
> > >     Signed-off-by: Linus Torvalds <torvalds@osdl.org>
> > > 
> > > (A good one!  I wrote it ;))
> > > 
> > > 
> > > I believe that the combination of dirty-page-tracking and its calls to
> > > balance_dirty_pages() mean that we can now never get more than dirty_ratio
> > > of memory into the dirty-or-writeback condition.
> > > 
> > > The vm scanner can convert dirty pages into clean, under-writeback pages,
> > > but it cannot increase the total of dirty+writeback.
> > 
> > What about swapout?  That can increase the number of writeback pages,
> > without decreasing the number of dirty pages, no?
> 
> Could we not solve that by enabling cap_account_writeback on
> swapper_space, and thereby account swap writeback pages. Then the VM
> knows it has outstanding IO and need not panic.

Hmm, I'm not sure that would be right, because then those writeback
pages would be accounted twice: once for swapper_space, and once for
the real device.

So there's a condition where lots of anonymous pages are turned into
swap-cache writeback pages, and we should somehow throttle this, because

>>>     This means that all memory is pinned and unreclaimable and the VM gets
>>>     upset and goes oom.

although it's not quite clear in my mind how the VM gets upset about
this.

One way to throttle just the swapout activity is to do the per-bdi
accounting on swapper_space, and limit the number of writeback pages
to e.g. the global threshold + 10%, which is basically what
throttle_vm_writeout() currently does, only now it does it
indiscriminately, and not just on swap writeback pages.

Does this make any sense?

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-23  6:29               ` Miklos Szeredi
@ 2007-04-23  6:39                 ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-23  6:39 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: a.p.zijlstra, linux-mm, linux-kernel, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Mon, 23 Apr 2007 08:29:59 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:

> > > What about swapout?  That can increase the number of writeback pages,
> > > without decreasing the number of dirty pages, no?
> > 
> > Could we not solve that by enabling cap_account_writeback on
> > swapper_space, and thereby account swap writeback pages. Then the VM
> > knows it has outstanding IO and need not panic.
> 
> Hmm, I'm not sure that would be right, because then those writeback
> pages would be accounted twice: once for swapper_space, and once for
> the real device.
> 
> So there's a condition, when lots of anonymous pages are turned into
> swap-cache writeback pages, and we should somehow throttle this, because
> 
> >>>     This means that all memory is pinned and unreclaimable and the VM gets
> >>>     upset and goes oom.
> 
> although, it's not quite clear in my mind, how the VM gets upset about
> this.

I've been scratching my head on and off for a couple of days over this.

We've traditionally had reclaim problems when there's a huge amount of
dirty MAP_SHARED data, which the VM didn't know was dirty.  It's the old
"map a file which is the same size as physical memory and write to it all"
stresstest.

But we do not have such problems with anonymous memory, and I'm darned if I
can remember why :(


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-21 12:15         ` Peter Zijlstra
@ 2007-04-23 15:48           ` Christoph Lameter
  -1 siblings, 0 replies; 110+ messages in thread
From: Christoph Lameter @ 2007-04-23 15:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, linux-mm, linux-kernel, miklos, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Sat, 21 Apr 2007, Peter Zijlstra wrote:

> > > This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.
> 
> Right, I knew about that but, uhm.
> 
> I wanted to make that num_online_cpus(), and install a hotplug notifier
> to fold the percpu delta back into the total on cpu offline.

Use nr_cpu_ids instead. It contains the maximum number of possible cpus on
this hardware and allows the hotplug case to be handled easily.
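
[Editorial sketch: why the choice of constant matters for the per-cpu
 counter error bound discussed earlier in the thread.  The names and the
 batch size below are assumptions for illustration, not the patch's API.]

        /* worst-case value hiding in the per-cpu deltas of one counter */
        #define EXAMPLE_BDI_STAT_BATCH  8

        static unsigned long example_bdi_stat_error(unsigned int cpus)
        {
                /*
                 * 'cpus' should be nr_cpu_ids (possible cpus on this box),
                 * not CONFIG_NR_CPUS: a 2-way machine then contributes
                 * 2 * 8 = 16 pages of slack instead of 1024 * 8 = 8192.
                 */
                return cpus * EXAMPLE_BDI_STAT_BATCH;
        }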


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-23 15:48           ` Christoph Lameter
@ 2007-04-23 15:58             ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-23 15:58 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, linux-mm, linux-kernel, miklos, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Mon, 2007-04-23 at 08:48 -0700, Christoph Lameter wrote:
> On Sat, 21 Apr 2007, Peter Zijlstra wrote:
> 
> > > > This is enormously wrong for CONFIG_NR_CPUS=1024 on a 2-way.
> > 
> > Right, I knew about that but, uhm.
> > 
> > I wanted to make that num_online_cpus(), and install a hotplug notifier
> > to fold the percpu delta back into the total on cpu offline.
> 
> Use nr_cpu_ids instead. Contains the maximum possible cpus on this 
> hardware and allows to handle the hotplug case easily.

Ooh, that's handy... /me ditches the hotplug code again.
That is, unless it's very common to have half-empty boxens..?


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-23 15:58             ` Peter Zijlstra
@ 2007-04-23 16:08               ` Christoph Lameter
  -1 siblings, 0 replies; 110+ messages in thread
From: Christoph Lameter @ 2007-04-23 16:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, linux-mm, linux-kernel, miklos, neilb, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Mon, 23 Apr 2007, Peter Zijlstra wrote:

> Ooh, thats handy... /me ditches the hotplug code again.
> That is, unless its very common to have half empty boxens.. ?

It's up to the arch code to establish reasonable boundaries...


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-20 15:52   ` Peter Zijlstra
@ 2007-04-24  2:58     ` Neil Brown
  -1 siblings, 0 replies; 110+ messages in thread
From: Neil Brown @ 2007-04-24  2:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mm, linux-kernel, miklos, akpm, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Friday April 20, a.p.zijlstra@chello.nl wrote:
> Scale writeback cache per backing device, proportional to its writeout speed.

So it works like this:

 We account for writeout in full pages.
 When a page has the Writeback flag cleared, we account that as a
 successfully retired write for the relevant bdi.
 By using floating averages we keep track of how many writes each bdi
 has retired 'recently' where the unit of time in which we understand
 'recently' is a single page written.

 We keep a floating average for each bdi, and a floating average for
 the total writeouts (that 'average' is, of course, 1.)

 Using these numbers we can calculate what fraction of 'recently'
 retired writes were retired by each bdi (get_writeout_scale).

 Multiplying this fraction by the system-wide number of pages that are
 allowed to be dirty before write-throttling, we get the number of
 pages that the bdi can have dirty before write-throttling the bdi.
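
 [Editorial sketch: the arithmetic just described, in code form.  Only
  get_writeout_scale() is named by the patch; the helper below and its
  parameters are made up for illustration.]

        static unsigned long example_bdi_thresh(unsigned long bdi_writeouts,
                                                unsigned long total_writeouts,
                                                unsigned long dirty_thresh)
        {
                if (!total_writeouts)           /* no writeout history yet */
                        return dirty_thresh;

                /* share of 'recently' retired writes times the global limit */
                return (unsigned long)((unsigned long long)dirty_thresh *
                                       bdi_writeouts / total_writeouts);
        }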

 I note that the same fraction is *not* applied to background_thresh.
 Should it be?  I guess not - there would be interesting starting
 transients, as a bdi which had done no writeout would not be allowed
 any dirty pages, so background writeout would start immediately,
 which isn't what you want... or is it?

 For each bdi we also track the number of (dirty, writeback, unstable)
 pages and do not allow this to exceed the limit set for this bdi.

 The calculations involving 'reserve' in get_dirty_limits are a little
 confusing.  It looks like you are calculating how much total head-room
 there is for the bdi (pages that the system can still dirty - pages
 this bdi has dirty) and making sure the number returned in pbdi_dirty
 doesn't allow more than that to be used.  This is probably a
 reasonable thing to do but it doesn't feel like the right place.  I
 think get_dirty_limits should return the raw threshold, and
 balance_dirty_pages should do both tests - the bdi-local test and the
 system-wide test.

 Currently you have a rather odd situation where
+			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+				break;
 might include numbers obtained with bdi_stat_sum being compared with
 numbers obtained with bdi_stat.


 With these patches, the VM still (I think) assumes that each BDI has
 a reasonable queue limit, so that writeback_inodes will block on a
 full queue.  If a BDI has a very large queue, balance_dirty_pages
 will simply turn lots of DIRTY pages into WRITEBACK pages and then
 think "We've done our duty" without actually blocking at all.

 With the extra accounting that we now have, I would like to see
 balance_dirty_pages dirty pages wait until RECLAIMABLE+WRITEBACK is
 actually less than 'threshold'.  This would probably mean that we
 would need to support per-bdi background_writeout to smooth things
 out.  Maybe that it fodder for another patch-set.

 You set:
+	vm_cycle_shift = 1 + ilog2(vm_total_pages);

 Can you explain that?  My experience is that scaling dirty limits
 with main memory isn't what we really want.  When you get machines
 with very large memory, the amount that you want to be dirty is more
 a function of the speed of your IO devices, rather than the amount
 of memory, otherwise you can sometimes see large filesystem lags
 ('sync' taking minutes?)

 I wonder if it makes sense to try to limit the dirty data for a bdi
 to the amount that it can write out in some period of time - maybe 3
 seconds.  Probably configurable.  You seem to have almost all the
 infrastructure in place to do that, and I think it could be a
 valuable feature.

 At least, I think vm_cycle_shift should be tied (loosely) to 
   dirty_ratio * vm_total_pages
 ??

On the whole, looks good!

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  2:58     ` Neil Brown
@ 2007-04-24  7:09       ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-24  7:09 UTC (permalink / raw)
  To: Neil Brown
  Cc: linux-mm, linux-kernel, miklos, akpm, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Tue, 2007-04-24 at 12:58 +1000, Neil Brown wrote:
> On Friday April 20, a.p.zijlstra@chello.nl wrote:
> > Scale writeback cache per backing device, proportional to its writeout speed.
> 
> So it works like this:
> 
>  We account for writeout in full pages.
>  When a page has the Writeback flag cleared, we account that as a
>  successfully retired write for the relevant bdi.
>  By using floating averages we keep track of how many writes each bdi
>  has retired 'recently' where the unit of time in which we understand
>  'recently' is a single page written.

That is actually the period I keep referring to. So 'recently' is the
last 'period' number of writeout completions.

>  We keep a floating average for each bdi, and a floating average for
>  the total writeouts (that 'average' is, of course, 1.)

1 in the sense of unity, yes :-)

>  Using these numbers we can calculate what fraction of 'recently'
>  retired writes were retired by each bdi (get_writeout_scale).
> 
>  Multiplying this fraction by the system-wide number of pages that are
>  allowed to be dirty before write-throttling, we get the number of
>  pages that the bdi can have dirty before write-throttling the bdi.
> 
>  I note that the same fraction is *not* applied to background_thresh.
>  Should it be?  I guess not - there would be interesting starting
>  transients, as a bdi which had done no writeout would not be allowed
>  any dirty pages, so background writeout would start immediately,
>  which isn't what you want... or is it?

This is something I have not been able to come to a conclusive answer
on yet...

>  For each bdi we also track the number of (dirty, writeback, unstable)
>  pages and do not allow this to exceed the limit set for this bdi.
> 
>  The calculations involving 'reserve' in get_dirty_limits are a little
>  confusing.  It looks like you are calculating how much total head-room
>  there is for the bdi (pages that the system can still dirty - pages
>  this bdi has dirty) and making sure the number returned in pbdi_dirty
>  doesn't allow more than that to be used.  

Yes, it limits the earned share of the total dirty limit to the possible
share, ensuring that the total dirty limit is never exceeded.

This is especially relevant when the proportions change faster than the
pages get written out, ie. when the period << total dirty limit.
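
[Editorial sketch: one assumed form of the clipping being discussed, not
 the patch verbatim.  The bdi's earned share is capped by the head-room
 the system still has, so the sum over all bdis cannot push the total
 past the global dirty limit.]

        static unsigned long example_clip_bdi_thresh(unsigned long bdi_thresh,
                                                     unsigned long dirty_thresh,
                                                     unsigned long total_dirty,
                                                     unsigned long bdi_dirty)
        {
                unsigned long other_dirty = total_dirty - bdi_dirty;
                unsigned long reserve = dirty_thresh > other_dirty ?
                                        dirty_thresh - other_dirty : 0;

                /* earned share, limited to the remaining global head-room */
                return bdi_thresh < reserve ? bdi_thresh : reserve;
        }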

> This is probably a
>  reasonable thing to do but it doesn't feel like the right place.  I
>  think get_dirty_limits should return the raw threshold, and
>  balance_dirty_pages should do both tests - the bdi-local test and the
>  system-wide test.

Ok, that makes sense I guess.

>  Currently you have a rather odd situation where
> +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> +				break;
>  might include numbers obtained with bdi_stat_sum being compared with
>  numbers obtained with bdi_stat.

Yes, I was aware of that. The bdi_thresh is based on bdi_stat() numbers,
whereas the others could be bdi_stat_sum(). I think this is ok, since
the threshold is a 'guess' anyway, we just _need_ to ensure we do not
get trapped by writeouts not arriving (due to getting stuck in the per
cpu deltas).  -- I have all this commented in the new version.

>  With these patches, the VM still (I think) assumes that each BDI has
>  a reasonable queue limit, so that writeback_inodes will block on a
>  full queue.  If a BDI has a very large queue, balance_dirty_pages
>  will simply turn lots of DIRTY pages into WRITEBACK pages and then
>  think "We've done our duty" without actually blocking at all.

It will block once we exceed the total number of dirty pages allowed for
that BDI. But yes, this does not take away the need for queue limits.

This work was primarily aimed at allowing multiple queues to not
interfere as much, so they all can make progress and not get starved.

>  With the extra accounting that we now have, I would like to see
>  balance_dirty_pages dirty pages wait until RECLAIMABLE+WRITEBACK is
>  actually less than 'threshold'.  This would probably mean that we
>  would need to support per-bdi background_writeout to smooth things
>  out.  Maybe that it fodder for another patch-set.

Indeed, I still have to wrap my mind around the background thing. Your
input is appreciated.

>  You set:
> +	vm_cycle_shift = 1 + ilog2(vm_total_pages);
> 
>  Can you explain that?

You found the one random knob I hid :-)

>   My experience is that scaling dirty limits
>  with main memory isn't what we really want.  When you get machines
>  with very large memory, the amount that you want to be dirty is more
>  a function of the speed of your IO devices, rather than the amount
>  of memory, otherwise you can sometimes see large filesystem lags
>  ('sync' taking minutes?)
> 
>  I wonder if it makes sense to try to limit the dirty data for a bdi
>  to the amount that it can write out in some period of time - maybe 3
>  seconds.  Probably configurable.  You seem to have almost all the
>  infrastructure in place to do that, and I think it could be a
>  valuable feature.
> 
>  At least, I think vm_cycle_shift should be tied (loosely) to 
>    dirty_ratio * vm_total_pages
>  ??

Yes, I initially tried that; but I convinced myself that the math doing
the floating average couldn't handle vm_cycle_shift shrinking (getting
larger does seem fine).

I will look at that again, because I think you are absolutely right.

The currently set variable is related to the initial dirty limit, in that
that too is set based on vm_total_pages. It just doesn't adjust
afterwards :-(

Specifically, what goes wrong is that when we shrink vm_cycle_shift, the
total cycle count gains bits from the average, and will not match up
with the BDI cycle anymore. This could blow away the full BDI average.

Hmm, that might not be as bad as I thought, people don't fiddle with
dirty_ratio that often anyway.

Yes, I shall tie it to dirty_ratio once again.

> On the whole, looks good!

Thanks for taking the time to look at it in detail!

The latest code is online here:
  http://programming.kicks-ass.net/kernel-patches/balance_dirty_pages/

I shall post it again after the new -mm kernel hits the streets, and
incorporate all feedback.

I hope the comments made things clearer, not created more confusion... It
seems I have a lot to learn when it comes to writing skillz :-/


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  7:09       ` Peter Zijlstra
@ 2007-04-24  8:19         ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-24  8:19 UTC (permalink / raw)
  To: a.p.zijlstra
  Cc: neilb, linux-mm, linux-kernel, miklos, akpm, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > This is probably a
> >  reasonable thing to do but it doesn't feel like the right place.  I
> >  think get_dirty_limits should return the raw threshold, and
> >  balance_dirty_pages should do both tests - the bdi-local test and the
> >  system-wide test.
> 
> Ok, that makes sense I guess.

Well, my narrow-minded world view says it's not such a good idea,
because it would again introduce the deadlock scenario we're trying
to avoid.

In a sense allowing a queue to go over the global limit just a little
bit is a good thing.  Actually the very original code does that: if
writeback was started for "write_chunk" number of pages, then we allow
"ratelimit" (8) _new_ pages to be dirtied, effectively ignoring the
global limit.

That's why I've been saying that the current code is so unfair: if
there are lots of dirty pages to be written back to a particular
device, then balance_dirty_pages() allows the dirty producer to make
even more pages dirty, but if there are _no_ dirty pages for a device,
and we are over the limit, then that dirty producer is allowed
absolutely no new dirty pages until the global counts subside.
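
[Editorial sketch: a toy model of the "soft" limiting described above.
 The constants and names are assumptions taken from this description,
 not kernel code.]

        enum { EXAMPLE_WRITE_CHUNK = 12, EXAMPLE_RATELIMIT = 8 };

        /* how many new pages the caller may dirty before being rechecked */
        static int example_new_dirty_allowance(int over_global_limit,
                                               int pages_writeback_started)
        {
                if (!over_global_limit)
                        return EXAMPLE_RATELIMIT;       /* normal fast path */
                if (pages_writeback_started >= EXAMPLE_WRITE_CHUNK)
                        return EXAMPLE_RATELIMIT;       /* "did our duty" */
                return 0;                               /* keep throttling */
        }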

I'm still not quite sure what purpose the above "soft" limiting
serves.  It seems to just give an advantage to writers which managed to
accumulate lots of dirty pages, and can then convert that into even
more dirtyings.

Would it make sense to remove this behavior, and ensure that
balance_dirty_pages() doesn't return until the per-queue limits have
been complied with?

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  8:19         ` Miklos Szeredi
@ 2007-04-24  8:31           ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-24  8:31 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: neilb, linux-mm, linux-kernel, akpm, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Tue, 2007-04-24 at 10:19 +0200, Miklos Szeredi wrote:
> > > This is probably a
> > >  reasonable thing to do but it doesn't feel like the right place.  I
> > >  think get_dirty_limits should return the raw threshold, and
> > >  balance_dirty_pages should do both tests - the bdi-local test and the
> > >  system-wide test.
> > 
> > Ok, that makes sense I guess.
> 
> Well, my narrow minded world view says it's not such a good idea,
> because it would again introduce the deadlock scenario, we're trying
> to avoid.

I was only referring to the placement of the clipping; and exactly where
that happens does not affect the deadlock.

> In a sense allowing a queue to go over the global limit just a little
> bit is a good thing.  Actually the very original code does that: if
> writeback was started for "write_chunk" number of pages, then we allow
> "ratelimit" (8) _new_ pages to be dirtied, effectively ignoring the
> global limit.

It might be time to get rid of that rate-limiting.
balance_dirty_pages()'s fast path is not nearly as heavy as it used to
be. All these fancy counter systems have removed quite a bit of
iteration from there.

> That's why I've been saying, that the current code is so unfair: if
> there are lots of dirty pages to be written back to a particular
> device, then balance_dirty_pages() allows the dirty producer to make
> even more pages dirty, but if there are _no_ dirty pages for a device,
> and we are over the limit, then that dirty producer is allowed
> absolutely no new dirty pages until the global counts subside.

Well, that got fixed on a per-device basis with this patch; it is still
true for multiple tasks writing to the same device.

> I'm still not quite sure what purpose the above "soft" limiting
> serves.  It seems to just give advantage to writers, which managed to
> accumulate lots of dirty pages, and then can convert that into even
> more dirtyings.

The queues only limit the actual in-flight writeback pages;
balance_dirty_pages() considers all pages that might become writeback as
well as those that are.

> Would it make sense to remove this behavior, and ensure that
> balance_dirty_pages() doesn't return until the per-queue limits have
> been complied with?

I don't think that will help; balance_dirty_pages() drives the queues.
That is, it converts pages from merely dirty to writeback.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  8:31           ` Peter Zijlstra
@ 2007-04-24  9:14             ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-24  9:14 UTC (permalink / raw)
  To: a.p.zijlstra
  Cc: miklos, neilb, linux-mm, linux-kernel, akpm, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > > > This is probably a
> > > >  reasonable thing to do but it doesn't feel like the right place.  I
> > > >  think get_dirty_limits should return the raw threshold, and
> > > >  balance_dirty_pages should do both tests - the bdi-local test and the
> > > >  system-wide test.
> > > 
> > > Ok, that makes sense I guess.
> > 
> > Well, my narrow minded world view says it's not such a good idea,
> > because it would again introduce the deadlock scenario, we're trying
> > to avoid.
> 
> I was only referring to the placement of the clipping; and exactly where
> that happens does not affect the deadlock.

OK.

> > In a sense allowing a queue to go over the global limit just a little
> > bit is a good thing.  Actually the very original code does that: if
> > writeback was started for "write_chunk" number of pages, then we allow
> > "ratelimit" (8) _new_ pages to be dirtied, effectively ignoring the
> > global limit.
> 
> It might be time to get rid of that rate-limiting.
> balance_dirty_pages()'s fast path is not nearly as heavy as it used to
> be. All these fancy counter systems have removed quite a bit of
> iteration from there.

Hmm.  The rate limiting probably makes lots of sense for
dirty_exceeded==0, when ratelimit can be a nice large value.

For dirty_exceeded==1 it may make sense to disable ratelimiting; OTOH
a granularity of 8 pages probably doesn't matter, because the
granularity of the percpu counter is usually larger (except on UP).
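
(The check in question, roughly -- simplified and from memory, so the
details may differ from the actual mm/page-writeback.c:)

	ratelimit = ratelimit_pages;		/* nice and large normally */
	if (dirty_exceeded)
		ratelimit = 8;			/* tiny once we're over the limit */

	p = &get_cpu_var(ratelimits);		/* per-cpu count of recent dirtyings */
	*p += nr_pages_dirtied;
	if (unlikely(*p >= ratelimit)) {
		*p = 0;
		put_cpu_var(ratelimits);
		balance_dirty_pages(mapping);	/* the expensive throttling path */
		return;
	}
	put_cpu_var(ratelimits);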

> > That's why I've been saying, that the current code is so unfair: if
> > there are lots of dirty pages to be written back to a particular
> > device, then balance_dirty_pages() allows the dirty producer to make
> > even more pages dirty, but if there are _no_ dirty pages for a device,
> > and we are over the limit, then that dirty producer is allowed
> > absolutely no new dirty pages until the global counts subside.
> 
> Well, that got fixed on a per device basis with this patch, it is still
> true for multiple tasks writing to the same device.

Yes, this is the part of this patchset I'm personally interested in ;)

> > I'm still not quite sure what purpose the above "soft" limiting
> > serves.  It seems to just give advantage to writers, which managed to
> > accumulate lots of dirty pages, and then can convert that into even
> > more dirtyings.
> 
> The queues only limit the actual in-flight writeback pages,
> balance_dirty_pages() considers all pages that might become writeback as
> well as those that are.
> 
> > Would it make sense to remove this behavior, and ensure that
> > balance_dirty_pages() doesn't return until the per-queue limits have
> > been complied with?
> 
> I don't think that will help, balance_dirty_pages drives the queues.
> That is, it converts pages from mere dirty to writeback.

Yes.  But the current logic says that if you convert "write_chunk" dirty
pages to writeback, you are allowed to dirty "ratelimit" more.

D: number of dirty pages
W: number of writeback pages
L: global limit
C: write_chunk = ratelimit_pages * 1.5
R: ratelimit

If D+W >= L, then R = 8

Let's assume, that D == L and W == 0.  And that all of the dirty pages
belong to a single device.  Also for simplicity, lets assume an
infinite length queue, and a slow device.

Then while converting the dirty pages to writeback, D / C * R new
dirty pages can be created.  So when all existing dirty have been
converted:

  D = L / C * R
  W = L

  D + W = L * (1 + R / C)

So we see that we're now even further above the limit than before the
conversion.  This means that we starve writers to other devices, which
don't have as many dirty pages, because until the slow device finishes
these writes they will not get to do anything.
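
Just to put rough numbers on it (illustrative values only):

	L = 60000 pages			(~234MB dirty limit with 4k pages)
	C = 1500			(write_chunk, with ratelimit_pages ~= 1000)
	R = 8

	new dirtyings = L / C * R = 60000 / 1500 * 8 = 320
	D + W         = 320 + 60000 = 60320 ~= 1.005 * L

So the overshoot from R/C itself is small; the real trouble is that a
whole limit's worth of writeback is now parked on the slow queue, and we
stay over the limit until it completes.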

Your patch helps this in that if the other writers have an empty queue
and no dirty, they will be allowed to slowly start writing.  But they
will not gain their full share until the slow dirty-hog goes below the
global limit, which may take some time.

So I think the logical thing to do is: if the dirty-hog is over its
queue limit, don't let it dirty any more until its dirty+writeback goes
below the limit.  That allows other devices to more quickly gain
their share of dirty pages.

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  9:14             ` Miklos Szeredi
@ 2007-04-24  9:26               ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-24  9:26 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: neilb, linux-mm, linux-kernel, akpm, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Tue, 2007-04-24 at 11:14 +0200, Miklos Szeredi wrote:

> > > I'm still not quite sure what purpose the above "soft" limiting
> > > serves.  It seems to just give advantage to writers, which managed to
> > > accumulate lots of dirty pages, and then can convert that into even
> > > more dirtyings.
> > 
> > The queues only limit the actual in-flight writeback pages,
> > balance_dirty_pages() considers all pages that might become writeback as
> > well as those that are.
> > 
> > > Would it make sense to remove this behavior, and ensure that
> > > balance_dirty_pages() doesn't return until the per-queue limits have
> > > been complied with?
> > 
> > I don't think that will help, balance_dirty_pages drives the queues.
> > That is, it converts pages from mere dirty to writeback.
> 
> Yes.  But current logic says, that if you convert "write_chunk" dirty
> to writeback, you are allowed to dirty "ratelimit" more. 
> 
> D: number of dirty pages
> W: number of writeback pages
> L: global limit
> C: write_chunk = ratelimit_pages * 1.5
> R: ratelimit
> 
> If D+W >= L, then R = 8
> 
> Let's assume, that D == L and W == 0.  And that all of the dirty pages
> belong to a single device.  Also for simplicity, lets assume an
> infinite length queue, and a slow device.
> 
> Then while converting the dirty pages to writeback, D / C * R new
> dirty pages can be created.  So when all existing dirty have been
> converted:
> 
>   D = L / C * R
>   W = L
> 
>   D + W = L * (1 + R / C)
> 
> So we see, that we're now even more above the limit than before the
> conversion.  This means, that we starve writers to other devices,
> which don't have as many dirty pages, because until the slow device
> doesn't finish these writes they will not get to do anything.
> 
> Your patch helps this in that if the other writers have an empty queue
> and no dirty, they will be allowed to slowly start writing.  But they
> will not gain their full share until the slow dirty-hog goes below the
> global limit, which may take some time.
> 
> So I think the logical thing to do, is if the dirty-hog is over it's
> queue limit, don't let it dirty any more until it's dirty+writeback go
> below the limit.  That allowes other devices to more quickly gain
> their share of dirty pages.

Ahh, now I see; I had totally blocked out these few lines:

			pages_written += write_chunk - wbc.nr_to_write;
			if (pages_written >= write_chunk)
				break;		/* We've done our duty */

yeah, those look dubious indeed... And reading back Neil's comments, I
think he agrees.

Shall we just kill those?


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  9:26               ` Peter Zijlstra
@ 2007-04-24  9:47                 ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-24  9:47 UTC (permalink / raw)
  To: a.p.zijlstra
  Cc: miklos, neilb, linux-mm, linux-kernel, akpm, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> Ahh, now I see; I had totally blocked out these few lines:
> 
> 			pages_written += write_chunk - wbc.nr_to_write;
> 			if (pages_written >= write_chunk)
> 				break;		/* We've done our duty */
> 
> yeah, those look dubious indeed... And reading back Neil's comments, I
> think he agrees.
> 
> Shall we just kill those?

I think we should.

Although I'm a little afraid that Akpm will tell me again that I'm a
stupid git, and that those lines are in fact vitally important ;)

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24  9:47                 ` Miklos Szeredi
@ 2007-04-24 10:00                   ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-24 10:00 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: a.p.zijlstra, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Tue, 24 Apr 2007 11:47:20 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:

> > Ahh, now I see; I had totally blocked out these few lines:
> > 
> > 			pages_written += write_chunk - wbc.nr_to_write;
> > 			if (pages_written >= write_chunk)
> > 				break;		/* We've done our duty */
> > 
> > yeah, those look dubious indeed... And reading back Neil's comments, I
> > think he agrees.
> > 
> > Shall we just kill those?
> 
> I think we should.
> 
> Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> stupid git, and that those lines are in fact vitally important ;)
> 

It depends what they're replaced with.

That code is there, iirc, to prevent a process from getting stuck in
balance_dirty_pages() forever due to the dirtying activity of other
processes.

hm, we ask the process to write write_chunk pages each go around the loop.
So if it wrote write_chunk/2 pages on the first pass it might end up writing
write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
if it does happen - the upper bound is write_chunk*2-1, I think.
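
Spelling that worst case out: a pass can leave us just below the
threshold, at pages_written == write_chunk - 1, and the next pass can
then complete a full write_chunk, so

	max pages_written = (write_chunk - 1) + write_chunk
	                  = 2 * write_chunk - 1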

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 10:00                   ` Andrew Morton
@ 2007-04-24 10:12                     ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-24 10:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Miklos Szeredi, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Tue, 2007-04-24 at 03:00 -0700, Andrew Morton wrote:
> On Tue, 24 Apr 2007 11:47:20 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:
> 
> > > Ahh, now I see; I had totally blocked out these few lines:
> > > 
> > > 			pages_written += write_chunk - wbc.nr_to_write;
> > > 			if (pages_written >= write_chunk)
> > > 				break;		/* We've done our duty */
> > > 
> > > yeah, those look dubious indeed... And reading back Neil's comments, I
> > > think he agrees.
> > > 
> > > Shall we just kill those?
> > 
> > I think we should.
> > 
> > Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> > stupid git, and that those lines are in fact vitally important ;)
> > 
> 
> It depends what they're replaced with.
> 
> That code is there, iirc, to prevent a process from getting stuck in
> balance_dirty_pages() forever due to the dirtying activity of other
> processes.
> 
> hm, we ask the process to write write_chunk pages each go around the loop.
> So if it wrote write-chunk/2 pages on the first pass it might end up writing
> write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
> if it does happen - the upper bound is write_chunk*2-1, I think.

Right, but I think the problem is that it's dirty -> writeback, not dirty
-> writeback completed.

Ie. they don't guarantee progress; it could be that the total
nr_reclaimable + nr_writeback will steadily increase due to this break.

How about ensuring that vm_writeout_total increases by at least
2*sync_writeback_pages() during our stay in balance_dirty_pages()? That
way we have the guarantee that more pages get written out than can be
dirtied.
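
Something along these lines, say (hypothetical sketch only, untested;
vm_writeout_total taken here as the count of completed writeouts):

	written_start = vm_writeout_total;

	for (;;) {
		/* ... kick off writeback for this bdi as before ... */

		if (vm_writeout_total - written_start >=
				2 * sync_writeback_pages())
			break;	/* more pages completed than we may dirty */

		congestion_wait(WRITE, HZ/10);
	}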


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 10:12                     ` Peter Zijlstra
@ 2007-04-24 10:19                       ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-24 10:19 UTC (permalink / raw)
  To: a.p.zijlstra
  Cc: akpm, miklos, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > > > Ahh, now I see; I had totally blocked out these few lines:
> > > > 
> > > > 			pages_written += write_chunk - wbc.nr_to_write;
> > > > 			if (pages_written >= write_chunk)
> > > > 				break;		/* We've done our duty */
> > > > 
> > > > yeah, those look dubious indeed... And reading back Neil's comments, I
> > > > think he agrees.
> > > > 
> > > > Shall we just kill those?
> > > 
> > > I think we should.
> > > 
> > > Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> > > stupid git, and that those lines are in fact vitally important ;)
> > > 
> > 
> > It depends what they're replaced with.
> > 
> > That code is there, iirc, to prevent a process from getting stuck in
> > balance_dirty_pages() forever due to the dirtying activity of other
> > processes.
> > 
> > hm, we ask the process to write write_chunk pages each go around the loop.
> > So if it wrote write-chunk/2 pages on the first pass it might end up writing
> > write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
> > if it does happen - the upper bound is write_chunk*2-1, I think.
> 
> Right, but I think the problem is that its dirty -> writeback, not dirty
> -> writeback completed.
> 
> Ie. they don't guarantee progress, it could be that the total
> nr_reclaimable + nr_writeback will steadily increase due to this break.
> 
> How about ensuring that vm_writeout_total increases least
> 2*sync_writeback_pages() during our stay in balance_dirty_pages(). That
> way we have the guarantee that more pages get written out than can be
> dirtied.

No, because that's a global counter, which many writers could be
looking at.

We'd need a per-task writeout counter, but when finishing the write we
don't know anymore which task it was performed for.

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 10:19                       ` Miklos Szeredi
@ 2007-04-24 10:24                         ` Peter Zijlstra
  -1 siblings, 0 replies; 110+ messages in thread
From: Peter Zijlstra @ 2007-04-24 10:24 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: akpm, neilb, linux-mm, linux-kernel, dgc, tomoki.sekiyama.qu,
	nikita, trond.myklebust, yingchao.zhou

On Tue, 2007-04-24 at 12:19 +0200, Miklos Szeredi wrote:
> > > > > Ahh, now I see; I had totally blocked out these few lines:
> > > > > 
> > > > > 			pages_written += write_chunk - wbc.nr_to_write;
> > > > > 			if (pages_written >= write_chunk)
> > > > > 				break;		/* We've done our duty */
> > > > > 
> > > > > yeah, those look dubious indeed... And reading back Neil's comments, I
> > > > > think he agrees.
> > > > > 
> > > > > Shall we just kill those?
> > > > 
> > > > I think we should.
> > > > 
> > > > Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> > > > stupid git, and that those lines are in fact vitally important ;)
> > > > 
> > > 
> > > It depends what they're replaced with.
> > > 
> > > That code is there, iirc, to prevent a process from getting stuck in
> > > balance_dirty_pages() forever due to the dirtying activity of other
> > > processes.
> > > 
> > > hm, we ask the process to write write_chunk pages each go around the loop.
> > > So if it wrote write-chunk/2 pages on the first pass it might end up writing
> > > write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
> > > if it does happen - the upper bound is write_chunk*2-1, I think.
> > 
> > Right, but I think the problem is that its dirty -> writeback, not dirty
> > -> writeback completed.
> > 
> > Ie. they don't guarantee progress, it could be that the total
> > nr_reclaimable + nr_writeback will steadily increase due to this break.
> > 
> > How about ensuring that vm_writeout_total increases least
> > 2*sync_writeback_pages() during our stay in balance_dirty_pages(). That
> > way we have the guarantee that more pages get written out than can be
> > dirtied.
> 
> No, because that's a global counter, which many writers could be
> looking at.
> 
> We'd need a per-task writeout counter, but when finishing the write we
> don't know anymore which task it was performed for.

Yeah, just reached that conclusion myself too - again, I ran into that
when trying to figure out how to do the per task balancing right.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 10:12                     ` Peter Zijlstra
@ 2007-04-24 10:40                       ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-24 10:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Miklos Szeredi, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Tue, 24 Apr 2007 12:12:18 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Tue, 2007-04-24 at 03:00 -0700, Andrew Morton wrote:
> > On Tue, 24 Apr 2007 11:47:20 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:
> > 
> > > > Ahh, now I see; I had totally blocked out these few lines:
> > > > 
> > > > 			pages_written += write_chunk - wbc.nr_to_write;
> > > > 			if (pages_written >= write_chunk)
> > > > 				break;		/* We've done our duty */
> > > > 
> > > > yeah, those look dubious indeed... And reading back Neil's comments, I
> > > > think he agrees.
> > > > 
> > > > Shall we just kill those?
> > > 
> > > I think we should.
> > > 
> > > Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> > > stupid git, and that those lines are in fact vitally important ;)
> > > 
> > 
> > It depends what they're replaced with.
> > 
> > That code is there, iirc, to prevent a process from getting stuck in
> > balance_dirty_pages() forever due to the dirtying activity of other
> > processes.
> > 
> > hm, we ask the process to write write_chunk pages each go around the loop.
> > So if it wrote write-chunk/2 pages on the first pass it might end up writing
> > write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
> > if it does happen - the upper bound is write_chunk*2-1, I think.
> 
> Right, but I think the problem is that its dirty -> writeback, not dirty
> -> writeback completed.
> 
> Ie. they don't guarantee progress, it could be that the total
> nr_reclaimable + nr_writeback will steadily increase due to this break.

Don't think so.  We call balance_dirty_pages() once per ratelimit_pages
dirtyings and when we get there, we write 1.5*ratelimit_pages pages.


^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 10:40                       ` Andrew Morton
@ 2007-04-24 11:22                         ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-24 11:22 UTC (permalink / raw)
  To: akpm
  Cc: a.p.zijlstra, miklos, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> On Tue, 24 Apr 2007 12:12:18 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > On Tue, 2007-04-24 at 03:00 -0700, Andrew Morton wrote:
> > > On Tue, 24 Apr 2007 11:47:20 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:
> > > 
> > > > > Ahh, now I see; I had totally blocked out these few lines:
> > > > > 
> > > > > 			pages_written += write_chunk - wbc.nr_to_write;
> > > > > 			if (pages_written >= write_chunk)
> > > > > 				break;		/* We've done our duty */
> > > > > 
> > > > > yeah, those look dubious indeed... And reading back Neil's comments, I
> > > > > think he agrees.
> > > > > 
> > > > > Shall we just kill those?
> > > > 
> > > > I think we should.
> > > > 
> > > > Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> > > > stupid git, and that those lines are in fact vitally important ;)
> > > > 
> > > 
> > > It depends what they're replaced with.
> > > 
> > > That code is there, iirc, to prevent a process from getting stuck in
> > > balance_dirty_pages() forever due to the dirtying activity of other
> > > processes.
> > > 
> > > hm, we ask the process to write write_chunk pages each go around the loop.
> > > So if it wrote write-chunk/2 pages on the first pass it might end up writing
> > > write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
> > > if it does happen - the upper bound is write_chunk*2-1, I think.
> > 
> > Right, but I think the problem is that its dirty -> writeback, not dirty
> > -> writeback completed.
> > 
> > Ie. they don't guarantee progress, it could be that the total
> > nr_reclaimable + nr_writeback will steadily increase due to this break.
> 
> Don't think so.  We call balance_dirty_pages() once per ratelimit_pages
> dirtyings and when we get there, we write 1.5*ratelimit_pages pages.

No, we _start_ writeback for 1.5*ratelimit_pages pages, but do not
wait for those writebacks to finish.

So for a slow device and a fast writer, dirty+writeback can indeed
increase beyond the dirty threshold.

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 11:22                         ` Miklos Szeredi
@ 2007-04-24 11:50                           ` Andrew Morton
  -1 siblings, 0 replies; 110+ messages in thread
From: Andrew Morton @ 2007-04-24 11:50 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: a.p.zijlstra, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

On Tue, 24 Apr 2007 13:22:02 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:

> > On Tue, 24 Apr 2007 12:12:18 +0200 Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > 
> > > On Tue, 2007-04-24 at 03:00 -0700, Andrew Morton wrote:
> > > > On Tue, 24 Apr 2007 11:47:20 +0200 Miklos Szeredi <miklos@szeredi.hu> wrote:
> > > > 
> > > > > > Ahh, now I see; I had totally blocked out these few lines:
> > > > > > 
> > > > > > 			pages_written += write_chunk - wbc.nr_to_write;
> > > > > > 			if (pages_written >= write_chunk)
> > > > > > 				break;		/* We've done our duty */
> > > > > > 
> > > > > > yeah, those look dubious indeed... And reading back Neil's comments, I
> > > > > > think he agrees.
> > > > > > 
> > > > > > Shall we just kill those?
> > > > > 
> > > > > I think we should.
> > > > > 
> > > > > Athough I'm a little afraid, that Akpm will tell me again, that I'm a
> > > > > stupid git, and that those lines are in fact vitally important ;)
> > > > > 
> > > > 
> > > > It depends what they're replaced with.
> > > > 
> > > > That code is there, iirc, to prevent a process from getting stuck in
> > > > balance_dirty_pages() forever due to the dirtying activity of other
> > > > processes.
> > > > 
> > > > hm, we ask the process to write write_chunk pages each go around the loop.
> > > > So if it wrote write-chunk/2 pages on the first pass it might end up writing
> > > > write_chunk*1.5 pages total.  I guess that's rare and doesn't matter much
> > > > if it does happen - the upper bound is write_chunk*2-1, I think.
> > > 
> > > Right, but I think the problem is that its dirty -> writeback, not dirty
> > > -> writeback completed.
> > > 
> > > Ie. they don't guarantee progress, it could be that the total
> > > nr_reclaimable + nr_writeback will steadily increase due to this break.
> > 
> > Don't think so.  We call balance_dirty_pages() once per ratelimit_pages
> > dirtyings and when we get there, we write 1.5*ratelimit_pages pages.
> 
> No, we _start_ writeback for 1.5*ratelimit_pages pages, but do not
> wait for those writebacks to finish.
> 
> So for a slow device and a fast writer, dirty+writeback can indeed
> increase beyond the dirty threshold.
> 

Nope, try it.

If a process dirties 1000 pages it'll then go into balance_dirty_pages()
and start writeback against 1,500 pages.  When we hit dirty_ratio that
process will be required to write back 1,500 pages for each eight pages
which it dirtied.  We'll quickly reach the stage where there are no longer
1,500 pages to be written back and the process will block in
balance_dirty_pages() until the dirty+writeback level subsides.
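
(For reference, the 1,500 and the eight come from roughly this:

	/* mm/page-writeback.c, approximately */
	static inline long sync_writeback_pages(void)
	{
		return ratelimit_pages + ratelimit_pages / 2;	/* the write_chunk */
	}

with ratelimit_pages sized from the amount of memory at boot, and with
the per-cpu ratelimit dropping to 8 once dirty_exceeded is set.)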





^ permalink raw reply	[flat|nested] 110+ messages in thread

* Re: [PATCH 10/10] mm: per device dirty threshold
  2007-04-24 11:50                           ` Andrew Morton
@ 2007-04-24 12:07                             ` Miklos Szeredi
  -1 siblings, 0 replies; 110+ messages in thread
From: Miklos Szeredi @ 2007-04-24 12:07 UTC (permalink / raw)
  To: akpm
  Cc: miklos, a.p.zijlstra, neilb, linux-mm, linux-kernel, dgc,
	tomoki.sekiyama.qu, nikita, trond.myklebust, yingchao.zhou

> > No, we _start_ writeback for 1.5*ratelimit_pages pages, but do not
> > wait for those writebacks to finish.
> > 
> > So for a slow device and a fast writer, dirty+writeback can indeed
> > increase beyond the dirty threshold.
> > 
> 
> Nope, try it.
> 
> If a process dirties 1000 pages it'll then go into balance_dirty_pages()
> and start writeback against 1,500 pages.  When we hit dirty_ratio that
> process will be required to write back 1,500 pages for each eight pages
> which it dirtied.  We'll quickly reach the stage where there are no longer
> 1,500 pages to be written back and the process will block in
> balance_dirty_pages() until the dirty+writeback level subsides.

OK.  I was confused by this:

static long ratelimit_pages = 32;

and didn't realize that that 32 is totally irrelevant.

So I'm still right that for N dirty pages the writer is allowed to
dirty N/1500*8 more pages, but I agree that this isn't really
an issue.
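
To put rough numbers on it: if memory serves, ratelimit_pages is
recomputed in page_writeback_init() (and again on CPU hotplug) from
the amount of memory and the number of online CPUs, and capped at
4 MB worth of pages -- about 1024 pages with 4 KiB pages -- which is
where the ~1,500-page write chunk (1.5 * ratelimit_pages) above comes
from; the static 32 only covers the moments before that.  So with a
dirty threshold of, say, N = 30,000 pages (~117 MB), the writer gets
to dirty at most about N/1500*8 = 160 extra pages (~640 KB) past the
threshold before everything reclaimable has been pushed into
writeback and balance_dirty_pages() blocks it -- an overshoot of
roughly half a percent, whatever the threshold.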

Miklos

^ permalink raw reply	[flat|nested] 110+ messages in thread

end of thread, other threads:[~2007-04-24 12:08 UTC | newest]

Thread overview: 110+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-20 15:51 [PATCH 00/10] per device dirty throttling -v5 Peter Zijlstra
2007-04-20 15:51 ` Peter Zijlstra
2007-04-20 15:51 ` [PATCH 01/10] revert per-backing_dev-dirty-and-writeback-page-accounting Peter Zijlstra
2007-04-20 15:51   ` Peter Zijlstra
2007-04-20 15:51 ` [PATCH 02/10] nfs: remove congestion_end() Peter Zijlstra
2007-04-20 15:51   ` Peter Zijlstra
2007-04-20 15:51 ` [PATCH 03/10] lib: dampen the percpu_counter FBC_BATCH Peter Zijlstra
2007-04-20 15:51   ` Peter Zijlstra
2007-04-21  9:55   ` Andrew Morton
2007-04-21  9:55     ` Andrew Morton
2007-04-21 10:58     ` Peter Zijlstra
2007-04-21 10:58       ` Peter Zijlstra
2007-04-20 15:51 ` [PATCH 04/10] lib: percpu_counter_mod64 Peter Zijlstra
2007-04-20 15:51   ` Peter Zijlstra
2007-04-21  9:55   ` Andrew Morton
2007-04-21  9:55     ` Andrew Morton
2007-04-21 11:02     ` Peter Zijlstra
2007-04-21 11:02       ` Peter Zijlstra
2007-04-21 19:21       ` Andrew Morton
2007-04-21 19:21         ` Andrew Morton
2007-04-21 19:30         ` Peter Zijlstra
2007-04-21 19:30           ` Peter Zijlstra
2007-04-20 15:51 ` [PATCH 05/10] mm: bdi init hooks Peter Zijlstra
2007-04-20 15:51   ` Peter Zijlstra
2007-04-20 15:52 ` [PATCH 06/10] mm: scalable bdi statistics counters Peter Zijlstra
2007-04-20 15:52   ` Peter Zijlstra
2007-04-20 15:52 ` [PATCH 07/10] mm: count reclaimable pages per BDI Peter Zijlstra
2007-04-20 15:52   ` Peter Zijlstra
2007-04-21  9:55   ` Andrew Morton
2007-04-21  9:55     ` Andrew Morton
2007-04-21 11:04     ` Peter Zijlstra
2007-04-21 11:04       ` Peter Zijlstra
2007-04-20 15:52 ` [PATCH 08/10] mm: count writeback " Peter Zijlstra
2007-04-20 15:52   ` Peter Zijlstra
2007-04-21  9:55   ` Andrew Morton
2007-04-21  9:55     ` Andrew Morton
2007-04-21 11:07     ` Peter Zijlstra
2007-04-21 11:07       ` Peter Zijlstra
2007-04-22  7:19       ` Andrew Morton
2007-04-22  7:19         ` Andrew Morton
2007-04-22  9:08         ` Peter Zijlstra
2007-04-22  9:08           ` Peter Zijlstra
2007-04-20 15:52 ` [PATCH 09/10] mm: expose BDI statistics in sysfs Peter Zijlstra
2007-04-20 15:52   ` Peter Zijlstra
2007-04-21  9:55   ` Andrew Morton
2007-04-21  9:55     ` Andrew Morton
2007-04-21 11:08     ` Peter Zijlstra
2007-04-21 11:08       ` Peter Zijlstra
2007-04-20 15:52 ` [PATCH 10/10] mm: per device dirty threshold Peter Zijlstra
2007-04-20 15:52   ` Peter Zijlstra
2007-04-21  9:55   ` Andrew Morton
2007-04-21  9:55     ` Andrew Morton
2007-04-21 10:38     ` Miklos Szeredi
2007-04-21 10:38       ` Miklos Szeredi
2007-04-21 10:54       ` Andrew Morton
2007-04-21 10:54         ` Andrew Morton
2007-04-21 20:25         ` Miklos Szeredi
2007-04-21 20:25           ` Miklos Szeredi
2007-04-23  6:14           ` Peter Zijlstra
2007-04-23  6:14             ` Peter Zijlstra
2007-04-23  6:29             ` Miklos Szeredi
2007-04-23  6:29               ` Miklos Szeredi
2007-04-23  6:39               ` Andrew Morton
2007-04-23  6:39                 ` Andrew Morton
2007-04-21 12:01     ` Peter Zijlstra
2007-04-21 12:01       ` Peter Zijlstra
2007-04-21 12:15       ` Peter Zijlstra
2007-04-21 12:15         ` Peter Zijlstra
2007-04-21 19:50         ` Peter Zijlstra
2007-04-21 19:50           ` Peter Zijlstra
2007-04-23 15:48         ` Christoph Lameter
2007-04-23 15:48           ` Christoph Lameter
2007-04-23 15:58           ` Peter Zijlstra
2007-04-23 15:58             ` Peter Zijlstra
2007-04-23 16:08             ` Christoph Lameter
2007-04-23 16:08               ` Christoph Lameter
2007-04-22  7:26       ` Andrew Morton
2007-04-22  7:26         ` Andrew Morton
2007-04-24  2:58   ` Neil Brown
2007-04-24  2:58     ` Neil Brown
2007-04-24  7:09     ` Peter Zijlstra
2007-04-24  7:09       ` Peter Zijlstra
2007-04-24  8:19       ` Miklos Szeredi
2007-04-24  8:19         ` Miklos Szeredi
2007-04-24  8:31         ` Peter Zijlstra
2007-04-24  8:31           ` Peter Zijlstra
2007-04-24  9:14           ` Miklos Szeredi
2007-04-24  9:14             ` Miklos Szeredi
2007-04-24  9:26             ` Peter Zijlstra
2007-04-24  9:26               ` Peter Zijlstra
2007-04-24  9:47               ` Miklos Szeredi
2007-04-24  9:47                 ` Miklos Szeredi
2007-04-24 10:00                 ` Andrew Morton
2007-04-24 10:00                   ` Andrew Morton
2007-04-24 10:12                   ` Peter Zijlstra
2007-04-24 10:12                     ` Peter Zijlstra
2007-04-24 10:19                     ` Miklos Szeredi
2007-04-24 10:19                       ` Miklos Szeredi
2007-04-24 10:24                       ` Peter Zijlstra
2007-04-24 10:24                         ` Peter Zijlstra
2007-04-24 10:40                     ` Andrew Morton
2007-04-24 10:40                       ` Andrew Morton
2007-04-24 11:22                       ` Miklos Szeredi
2007-04-24 11:22                         ` Miklos Szeredi
2007-04-24 11:50                         ` Andrew Morton
2007-04-24 11:50                           ` Andrew Morton
2007-04-24 12:07                           ` Miklos Szeredi
2007-04-24 12:07                             ` Miklos Szeredi
2007-04-22  9:57 ` [PATCH 00/10] per device dirty throttling -v5 Andrew Morton
2007-04-22  9:57   ` Andrew Morton
