* [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 12:45 UTC
  To: Linux Kernel; +Cc: kenneth.w.chen, Andrew Morton, thornber

Hi,

Here's a first cut at killing global plugging of block devices to reduce
the nasty contention blk_plug_lock caused. This introduces per-queue
plugging, controlled by the backing_dev_info. Observations:

- Most uses of blk_run_queues() without a specific context were bogus.
  Usually the act of kicking the targets in question should be (and
  already is) performed by other activities, such as making vm
  flushing run to free memory.

- Some uses of blk_run_queues() really just want to kick the final queue
  the bio goes to. I've added bio_sync() (BIO_RW_SYNC) to manage
  those; if the queue needs unplugging, we'll do it while already
  holding the queue lock.

- The dm bit needs careful examination and checking by Joe. It could be
  more clever and maintain plug state for each target; for now I just
  added dm_table unplug-all functionality.

Patch is against 2.6.4-rc2-mm1.
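
To make the new calling conventions concrete, here's a sketch
(illustration only, not part of the patch; the example_* names are made
up, the interfaces are the ones added below):

	/* waiting on pages of one mapping? kick just its backing
	 * device instead of every plugged queue in the system */
	static int example_sync_page(struct page *page)
	{
		blk_run_address_space(page->mapping);
		return 0;
	}

	/* synchronous one-shot I/O? tag the bio sync, and the final
	 * queue gets unplugged while its lock is already held */
	static void example_sync_read(struct bio *bio)
	{
		submit_bio(READ_SYNC, bio);
	}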

diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/block/ll_rw_blk.c linux-2.6.4-rc2-mm1-plug/drivers/block/ll_rw_blk.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/block/ll_rw_blk.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/block/ll_rw_blk.c	2004-03-10 13:30:51.438072138 +0100
@@ -42,12 +42,6 @@
  */
 static kmem_cache_t *request_cachep;
 
-/*
- * plug management
- */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
 static wait_queue_head_t congestion_wqh[2];
 
 /*
@@ -247,8 +241,6 @@
 	 */
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
-	INIT_LIST_HEAD(&q->plug_list);
-
 	blk_queue_activity_fn(q, NULL, NULL);
 }
 
@@ -1100,13 +1092,11 @@
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
 	 * which will restart the queueing
 	 */
-	if (!blk_queue_plugged(q)
-	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
-		spin_lock(&blk_plug_lock);
-		list_add_tail(&q->plug_list, &blk_plug_list);
+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+		return;
+
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		spin_unlock(&blk_plug_lock);
-	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1118,15 +1108,12 @@
 int blk_remove_plug(request_queue_t *q)
 {
 	WARN_ON(!irqs_disabled());
-	if (blk_queue_plugged(q)) {
-		spin_lock(&blk_plug_lock);
-		list_del_init(&q->plug_list);
-		del_timer(&q->unplug_timer);
-		spin_unlock(&blk_plug_lock);
-		return 1;
-	}
 
-	return 0;
+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+		return 0;
+
+	del_timer(&q->unplug_timer);
+	return 1;
 }
 
 EXPORT_SYMBOL(blk_remove_plug);
@@ -1157,14 +1144,11 @@
  *   Linux uses plugging to build bigger requests queues before letting
  *   the device have at them. If a queue is plugged, the I/O scheduler
  *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged (either by manually calling this function, or by
- *   calling blk_run_queues()), the request_fn defined for the
- *   queue is invoked and transfers started.
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
  **/
-void generic_unplug_device(void *data)
+void generic_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
-
 	spin_lock_irq(q->queue_lock);
 	__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
@@ -1172,9 +1156,23 @@
 
 EXPORT_SYMBOL(generic_unplug_device);
 
+void blk_backing_dev_unplug(struct backing_dev_info *bdi)
+{
+	request_queue_t *q = bdi->unplug_io_data;
+
+	/*
+	 * devices don't necessarily have an ->unplug_fn defined
+	 */
+	if (q->unplug_fn)
+		q->unplug_fn(q);
+}
+
+EXPORT_SYMBOL(blk_backing_dev_unplug);
+
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
+
 	q->unplug_fn(q);
 }
 
@@ -1241,41 +1239,13 @@
 
 EXPORT_SYMBOL(blk_run_queue);
 
-/**
- * blk_run_queues - fire all plugged queues
- *
- * Description:
- *   Start I/O on all plugged queues known to the block layer. Queues that
- *   are currently stopped are ignored. This is equivalent to the older
- *   tq_disk task queue run.
- **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
-void blk_run_queues(void)
+void blk_run_backing_dev(struct backing_dev_info *bdi)
 {
-	LIST_HEAD(local_plug_list);
-
-	spin_lock_irq(&blk_plug_lock);
-
-	/*
-	 * this will happen fairly often
-	 */
-	if (list_empty(&blk_plug_list))
-		goto out;
-
-	list_splice_init(&blk_plug_list, &local_plug_list);
-	
-	while (!list_empty(&local_plug_list)) {
-		request_queue_t *q = blk_plug_entry(local_plug_list.next);
-
-		spin_unlock_irq(&blk_plug_lock);
-		q->unplug_fn(q);
-		spin_lock_irq(&blk_plug_lock);
-	}
-out:
-	spin_unlock_irq(&blk_plug_lock);
+	if (bdi)
+		bdi->unplug_io_fn(bdi);
 }
 
-EXPORT_SYMBOL(blk_run_queues);
+EXPORT_SYMBOL(blk_run_backing_dev);
 
 /**
  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
@@ -1382,6 +1352,10 @@
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
 	atomic_set(&q->refcnt, 1);
+
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+
 	return q;
 }
 
@@ -2036,7 +2010,6 @@
 	DEFINE_WAIT(wait);
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
 
-	blk_run_queues();
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
@@ -2294,7 +2267,7 @@
 	if (blk_queue_plugged(q)) {
 		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
 
-		if (nr_queued == q->unplug_thresh)
+		if (nr_queued == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm.c	2004-03-10 11:26:35.000000000 +0100
@@ -560,6 +560,13 @@
 	return 0;
 }
 
+static void dm_unplug_all(request_queue_t *q)
+{
+	struct mapped_device *md = q->queuedata;
+
+	dm_table_unplug_all(md->map);
+}
+
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
@@ -657,6 +664,7 @@
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
+	md->queue->unplug_fn = dm_unplug_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
@@ -880,7 +888,7 @@
 	 * Then we wait for the already mapped ios to
 	 * complete.
 	 */
-	blk_run_queues();
+	dm_table_unplug_all(md->map);
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -921,7 +929,7 @@
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
 
-	blk_run_queues();
+	dm_table_unplug_all(md->map);
 
 	return 0;
 }
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c	2004-03-09 15:27:36.000000000 +0100
@@ -668,7 +668,7 @@
 
 		/* out of memory -> run queues */
 		if (remaining)
-			blk_run_queues();
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 
 	/* drop reference, clones could have returned before we reach this */
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.h linux-2.6.4-rc2-mm1-plug/drivers/md/dm.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.h	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm.h	2004-03-10 11:18:48.000000000 +0100
@@ -116,6 +116,7 @@
 void dm_table_suspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+void dm_table_unplug_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-table.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-table.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-table.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-table.c	2004-03-10 13:42:26.244595842 +0100
@@ -882,6 +882,22 @@
 	return r;
 }
 
+void dm_table_unplug_all(struct dm_table *t)
+{
+	struct list_head *d, *devices;
+
+	devices = dm_table_get_devices(t);
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+
+		if (q->unplug_fn) {
+			set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+			q->unplug_fn(q);
+		}
+	}
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/md.c linux-2.6.4-rc2-mm1-plug/drivers/md/md.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/md.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/md.c	2004-03-10 13:27:17.273517490 +0100
@@ -335,6 +335,8 @@
 	struct bio_vec vec;
 	struct completion event;
 
+	rw |= (1 << BIO_RW_SYNC);
+
 	bio_init(&bio);
 	bio.bi_io_vec = &vec;
 	vec.bv_page = page;
@@ -349,7 +351,6 @@
 	bio.bi_private = &event;
 	bio.bi_end_io = bi_complete;
 	submit_bio(rw, &bio);
-	blk_run_queues();
 	wait_for_completion(&event);
 
 	return test_bit(BIO_UPTODATE, &bio.bi_flags);
@@ -2718,7 +2719,7 @@
 		run = thread->run;
 		if (run) {
 			run(thread->mddev);
-			blk_run_queues();
+			blk_run_queue(thread->mddev->queue);
 		}
 		if (signal_pending(current))
 			flush_signals(current);
@@ -3286,7 +3287,7 @@
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		blk_run_queues();
+		blk_run_queue(mddev->queue);
 
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid1.c linux-2.6.4-rc2-mm1-plug/drivers/md/raid1.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid1.c	2004-03-09 13:08:26.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/raid1.c	2004-03-10 12:14:46.000000000 +0100
@@ -451,6 +451,7 @@
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
+	blk_run_queue(conf->mddev->queue);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
 	
@@ -478,6 +479,7 @@
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
+	blk_run_queue(conf->mddev->queue);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	conf->nr_pending++;
@@ -644,6 +646,7 @@
 
 static void close_sync(conf_t *conf)
 {
+	blk_run_queue(conf->mddev->queue);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	spin_unlock_irq(&conf->resync_lock);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid5.c linux-2.6.4-rc2-mm1-plug/drivers/md/raid5.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid5.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/raid5.c	2004-03-10 12:15:19.000000000 +0100
@@ -249,6 +249,7 @@
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
+				blk_run_queue(conf->mddev->queue);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
@@ -1292,9 +1293,8 @@
 		}
 	}
 }
-static void raid5_unplug_device(void *data)
+static void raid5_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid6main.c linux-2.6.4-rc2-mm1-plug/drivers/md/raid6main.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid6main.c	2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/raid6main.c	2004-03-10 11:20:20.000000000 +0100
@@ -1454,9 +1454,8 @@
 		}
 	}
 }
-static void raid6_unplug_device(void *data)
+static void raid6_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid6_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/mtd/devices/blkmtd.c linux-2.6.4-rc2-mm1-plug/drivers/mtd/devices/blkmtd.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/mtd/devices/blkmtd.c	2004-03-09 13:08:26.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/mtd/devices/blkmtd.c	2004-03-10 13:03:22.000000000 +0100
@@ -147,8 +147,7 @@
 		bio->bi_private = &event;
 		bio->bi_end_io = bi_read_complete;
 		if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
-			submit_bio(READ, bio);
-			blk_run_queues();
+			submit_bio(READ_SYNC, bio);
 			wait_for_completion(&event);
 			err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
 			bio_put(bio);
@@ -179,8 +178,7 @@
 	init_completion(&event);
 	bio->bi_private = &event;
 	bio->bi_end_io = bi_write_complete;
-	submit_bio(WRITE, bio);
-	blk_run_queues();
+	submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&event);
 	DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
 	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/buffer.c linux-2.6.4-rc2-mm1-plug/fs/buffer.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/buffer.c	2004-03-09 15:24:26.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/buffer.c	2004-03-10 11:15:33.000000000 +0100
@@ -132,7 +132,7 @@
 	do {
 		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			blk_run_queues();
+			blk_run_address_space(bh->b_bdev->bd_inode->i_mapping);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
@@ -491,7 +491,6 @@
 	pg_data_t *pgdat;
 
 	wakeup_bdflush(1024);
-	blk_run_queues();
 	yield();
 
 	for_each_pgdat(pgdat) {
@@ -2928,7 +2927,7 @@
 
 int block_sync_page(struct page *page)
 {
-	blk_run_queues();
+	blk_run_address_space(page->mapping);
 	return 0;
 }
 
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/direct-io.c linux-2.6.4-rc2-mm1-plug/fs/direct-io.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/direct-io.c	2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/direct-io.c	2004-03-10 10:50:48.000000000 +0100
@@ -364,7 +364,7 @@
 		if (dio->bio_list == NULL) {
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			blk_run_queues();
+			blk_run_address_space(dio->inode->i_mapping);
 			io_schedule();
 			spin_lock_irqsave(&dio->bio_lock, flags);
 			dio->waiter = NULL;
@@ -1035,7 +1035,7 @@
 		if (ret == 0)
 			ret = dio->result;
 		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		if (should_wait) {
 			unsigned long flags;
 			/*
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/jfs/jfs_logmgr.c linux-2.6.4-rc2-mm1-plug/fs/jfs/jfs_logmgr.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/jfs/jfs_logmgr.c	2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/jfs/jfs_logmgr.c	2004-03-10 13:03:47.000000000 +0100
@@ -1971,8 +1971,7 @@
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ, bio);
-	blk_run_queues();
+	submit_bio(READ_SYNC, bio);
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
@@ -2116,9 +2115,8 @@
 
 	/* check if journaling to disk has been disabled */
 	if (!log->no_integrity) {
-		submit_bio(WRITE, bio);
+		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
-		blk_run_queues();
 	}
 	else {
 		bio->bi_size = 0;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/ntfs/compress.c linux-2.6.4-rc2-mm1-plug/fs/ntfs/compress.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/ntfs/compress.c	2004-02-18 04:57:58.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/ntfs/compress.c	2004-03-10 12:10:38.000000000 +0100
@@ -668,7 +668,7 @@
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_queues();
+			blk_run_address_space(mapping);
 			schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/ufs/truncate.c linux-2.6.4-rc2-mm1-plug/fs/ufs/truncate.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/ufs/truncate.c	2004-02-18 04:57:57.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/ufs/truncate.c	2004-03-10 12:11:06.000000000 +0100
@@ -456,7 +456,7 @@
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		yield();
 	}
 	offset = inode->i_size & uspi->s_fshift;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c	2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c	2004-03-10 13:13:49.000000000 +0100
@@ -1013,7 +1013,7 @@
 {
 	PB_TRACE(pb, "lock", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_sema);
 	PB_SET_OWNER(pb);
 	PB_TRACE(pb, "locked", 0);
@@ -1109,7 +1109,7 @@
 		if (atomic_read(&pb->pb_pin_count) == 0)
 			break;
 		if (atomic_read(&pb->pb_io_remaining))
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		schedule();
 	}
 	remove_wait_queue(&pb->pb_waiters, &wait);
@@ -1407,7 +1407,7 @@
 	if (pb->pb_flags & PBF_RUN_QUEUES) {
 		pb->pb_flags &= ~PBF_RUN_QUEUES;
 		if (atomic_read(&pb->pb_io_remaining) > 1)
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 	}
 }
 
@@ -1471,7 +1471,7 @@
 {
 	PB_TRACE(pb, "iowait", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_iodonesema);
 	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
 	return pb->pb_error;
@@ -1617,7 +1617,6 @@
 pagebuf_daemon(
 	void			*data)
 {
-	int			count;
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 
@@ -1640,7 +1639,6 @@
 
 		spin_lock(&pbd_delwrite_lock);
 
-		count = 0;
 		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
 			pb = list_entry(curr, page_buf_t, pb_list);
 
@@ -1657,7 +1655,7 @@
 				pb->pb_flags &= ~PBF_DELWRI;
 				pb->pb_flags |= PBF_WRITE;
 				list_move(&pb->pb_list, &tmp);
-				count++;
+				blk_run_address_space(pb->pb_target->pbr_mapping);
 			}
 		}
 
@@ -1671,8 +1669,6 @@
 
 		if (as_list_len > 0)
 			purge_addresses();
-		if (count)
-			blk_run_queues();
 
 		force_flush = 0;
 	} while (pagebuf_daemon_active);
@@ -1734,13 +1730,11 @@
 		pagebuf_lock(pb);
 		pagebuf_iostrategy(pb);
 		if (++flush_cnt > 32) {
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 			flush_cnt = 0;
 		}
 	}
 
-	blk_run_queues();
-
 	while (!list_empty(&tmp)) {
 		pb = list_entry(tmp.next, page_buf_t, pb_list);
 
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/backing-dev.h linux-2.6.4-rc2-mm1-plug/include/linux/backing-dev.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/backing-dev.h	2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/backing-dev.h	2004-03-10 11:06:45.000000000 +0100
@@ -28,6 +28,8 @@
 	int memory_backed;	/* Cannot clean pages with writepage */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	void (*unplug_io_fn)(struct backing_dev_info *);
+	void *unplug_io_data;
 };
 
 extern struct backing_dev_info default_backing_dev_info;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/bio.h linux-2.6.4-rc2-mm1-plug/include/linux/bio.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/bio.h	2004-02-18 04:59:06.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/bio.h	2004-03-10 13:01:45.000000000 +0100
@@ -124,6 +124,7 @@
 #define BIO_RW_AHEAD	1
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
+#define BIO_RW_SYNC	4
 
 /*
  * various member access, note that bio_data should of course not be used
@@ -138,6 +139,7 @@
 #define bio_cur_sectors(bio)	(bio_iovec(bio)->bv_len >> 9)
 #define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
+#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
 
 /*
  * will die
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/blkdev.h linux-2.6.4-rc2-mm1-plug/include/linux/blkdev.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/blkdev.h	2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/blkdev.h	2004-03-10 13:27:27.761319763 +0100
@@ -243,7 +243,7 @@
 typedef void (request_fn_proc) (request_queue_t *q);
 typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
 typedef int (prep_rq_fn) (request_queue_t *, struct request *);
-typedef void (unplug_fn) (void *q);
+typedef void (unplug_fn) (request_queue_t *);
 
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
@@ -315,8 +315,6 @@
 	unsigned long		bounce_pfn;
 	int			bounce_gfp;
 
-	struct list_head	plug_list;
-
 	/*
 	 * various queue flags, see QUEUE_* below
 	 */
@@ -369,8 +367,9 @@
 #define	QUEUE_FLAG_READFULL	3	/* write queue has been filled */
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
+#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 
-#define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
+#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 
@@ -514,7 +513,8 @@
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void __blk_stop_queue(request_queue_t *q);
-extern void blk_run_queue(request_queue_t *q);
+extern void blk_run_queue(request_queue_t *);
+extern void blk_run_backing_dev(struct backing_dev_info *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int, int);
@@ -525,6 +525,12 @@
 	return bdev->bd_disk->queue;
 }
 
+static inline void blk_run_address_space(struct address_space *mapping)
+{
+	if (mapping)
+		blk_run_backing_dev(mapping->backing_dev_info);
+}
+
 /*
  * end_request() and friends. Must be called with the request queue spinlock
  * acquired. All functions called within end_request() _must_be_ atomic.
@@ -571,7 +577,7 @@
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(void *);
+extern void generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(request_queue_t *);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/fs.h linux-2.6.4-rc2-mm1-plug/include/linux/fs.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/fs.h	2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/fs.h	2004-03-10 13:03:04.000000000 +0100
@@ -82,6 +82,8 @@
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
+#define READ_SYNC	(READ | BIO_RW_SYNC)
+#define WRITE_SYNC	(WRITE | BIO_RW_SYNC)
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/raid/md_k.h linux-2.6.4-rc2-mm1-plug/include/linux/raid/md_k.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/raid/md_k.h	2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/raid/md_k.h	2004-03-10 12:15:43.000000000 +0100
@@ -326,7 +326,6 @@
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
-		blk_run_queues();					\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -341,30 +340,5 @@
 	__wait_event_lock_irq(wq, condition, lock);			\
 } while (0)
 
-
-#define __wait_disk_event(wq, condition) 				\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		blk_run_queues();					\
-		schedule();						\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_disk_event(wq, condition) 					\
-do {									\
-	if (condition)	 						\
-		break;							\
-	__wait_disk_event(wq, condition);				\
-} while (0)
-
 #endif
 
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/disk.c linux-2.6.4-rc2-mm1-plug/kernel/power/disk.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/disk.c	2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/kernel/power/disk.c	2004-03-10 12:17:32.000000000 +0100
@@ -84,7 +84,6 @@
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }
 
 
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/pmdisk.c linux-2.6.4-rc2-mm1-plug/kernel/power/pmdisk.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/pmdisk.c	2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/kernel/power/pmdisk.c	2004-03-10 13:16:16.000000000 +0100
@@ -859,7 +859,6 @@
 
 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -895,6 +894,7 @@
 		goto Done;
 	}
 
+	rw |= BIO_RW_SYNC;
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/swsusp.c linux-2.6.4-rc2-mm1-plug/kernel/power/swsusp.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/swsusp.c	2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/kernel/power/swsusp.c	2004-03-10 12:17:52.000000000 +0100
@@ -707,11 +707,6 @@
 
 		free_some_memory();
 		
-		/* No need to invalidate any vfsmnt list -- 
-		 * they will be valid after resume, anyway.
-		 */
-		blk_run_queues();
-
 		/* Save state of all device drivers, and stop them. */		   
 		if ((res = device_suspend(4))==0)
 			/* If stopping device drivers worked, we proceed basically into
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/mm/mempool.c linux-2.6.4-rc2-mm1-plug/mm/mempool.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/mm/mempool.c	2004-02-18 04:58:32.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/mm/mempool.c	2004-03-09 15:24:39.000000000 +0100
@@ -233,8 +233,6 @@
 	if (!(gfp_mask & __GFP_WAIT))
 		return NULL;
 
-	blk_run_queues();
-
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	if (!pool->curr_nr)
 		io_schedule();
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/mm/readahead.c linux-2.6.4-rc2-mm1-plug/mm/readahead.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/mm/readahead.c	2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/mm/readahead.c	2004-03-10 11:04:49.000000000 +0100
@@ -15,9 +15,14 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
+static void default_unplug_io_fn(struct backing_dev_info *bdi)
+{
+}
+
 struct backing_dev_info default_backing_dev_info = {
 	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
 	.state		= 0,
+	.unplug_io_fn	= default_unplug_io_fn,
 };
 
 EXPORT_SYMBOL_GPL(default_backing_dev_info);

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Andrew Morton @ 2004-03-10 19:55 UTC
  To: Jens Axboe; +Cc: linux-kernel, kenneth.w.chen, thornber

Jens Axboe <axboe@suse.de> wrote:
>
> Here's a first cut at killing global plugging of block devices to reduce
> the nasty contention blk_plug_lock caused. This introduces per-queue
> plugging, controlled by the backing_dev_info.

This is such an improvement over what we have now it isn't funny.

Ken, the next -mm is starting to look like linux-3.1.0 so I think it
would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.


* RE: [PATCH] backing dev unplugging
From: Kenneth Chen @ 2004-03-10 20:03 UTC
  To: 'Andrew Morton', Jens Axboe; +Cc: linux-kernel, thornber

Andrew Morton wrote on Wednesday, March 10, 2004 11:56 AM
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Here's a first cut at killing global plugging of block devices to reduce
> > the nasty contention blk_plug_lock caused. This introduces per-queue
> > plugging, controlled by the backing_dev_info.
>
> This is such an improvement over what we have now it isn't funny.
>
> Ken, the next -mm is starting to look like linux-3.1.0 so I think it
> would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.

I'm planning a couple of experiments: one is just Jens's change on top of
what we have, so we can validate the backing dev unplug. Then we will run
2.6.4-rc2-mm1 + Jens's patch.

- Ken




* Re: [PATCH] backing dev unplugging
From: Jeff Garzik @ 2004-03-10 20:14 UTC
  To: Jens Axboe; +Cc: Linux Kernel, kenneth.w.chen, Andrew Morton, thornber

Jens Axboe wrote:
> Hi,
> 
> Here's a first cut at killing global plugging of block devices to reduce
> the nasty contention blk_plug_lock caused. This introduces per-queue
> plugging, controlled by the backing_dev_info. Observations:
> 
> - Most uses of blk_run_queues() without a specific context were bogus.
>   Usually the act of kicking the targets in question should be (and
>   already is) performed by other activities, such as making vm
>   flushing run to free memory.
> 
> - Some uses of blk_run_queues() really just want to kick the final queue
>   the bio goes to. I've added bio_sync() (BIO_RW_SYNC) to manage
>   those; if the queue needs unplugging, we'll do it while already
>   holding the queue lock.
> 
> - The dm bit needs careful examination and checking by Joe. It could be
>   more clever and maintain plug state for each target; for now I just
>   added dm_table unplug-all functionality.
> 
> Patch is against 2.6.4-rc2-mm1.

I like it a lot.

Any chance some of these newly-shortened functions can become static 
inline as well?


> @@ -1100,13 +1092,11 @@
>  	 * don't plug a stopped queue, it must be paired with blk_start_queue()
>  	 * which will restart the queueing
>  	 */
> -	if (!blk_queue_plugged(q)
> -	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
> -		spin_lock(&blk_plug_lock);
> -		list_add_tail(&q->plug_list, &blk_plug_list);
> +	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
> +		return;
> +
> +	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
>  		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
> -		spin_unlock(&blk_plug_lock);
> -	}
>  }
>  
>  EXPORT_SYMBOL(blk_plug_device);
> @@ -1118,15 +1108,12 @@
>  int blk_remove_plug(request_queue_t *q)
>  {
>  	WARN_ON(!irqs_disabled());
> -	if (blk_queue_plugged(q)) {
> -		spin_lock(&blk_plug_lock);
> -		list_del_init(&q->plug_list);
> -		del_timer(&q->unplug_timer);
> -		spin_unlock(&blk_plug_lock);
> -		return 1;
> -	}
>  
> -	return 0;
> +	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> +		return 0;
> +
> +	del_timer(&q->unplug_timer);
> +	return 1;
>  }
>  
>  EXPORT_SYMBOL(blk_remove_plug);

I tend to like

	if (test_and_clear_bit())
		call_out_of_line_function()

style for small functions like this, and inline those.
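
Something like this, say (a sketch only; __blk_remove_plug is a made-up
name for the out-of-line part):

	static inline int blk_remove_plug(request_queue_t *q)
	{
		if (test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
			return __blk_remove_plug(q);	/* del_timer() etc. */
		return 0;
	}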

	Jeff





* Re: [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 20:17 UTC
  To: Andrew Morton; +Cc: linux-kernel, kenneth.w.chen, thornber

On Wed, Mar 10 2004, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Here's a first cut at killing global plugging of block devices to reduce
> > the nasty contention blk_plug_lock caused. This introduces per-queue
> > plugging, controlled by the backing_dev_info.
> 
> This is such an improvement over what we have now it isn't funny.

It is pretty scary, once I dove into it...

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 20:19 UTC
  To: Jeff Garzik; +Cc: Linux Kernel, kenneth.w.chen, Andrew Morton, thornber

On Wed, Mar 10 2004, Jeff Garzik wrote:
> Jens Axboe wrote:
> >Hi,
> >
> >Here's a first cut at killing global plugging of block devices to reduce
> >the nasty contention blk_plug_lock caused. This introduces per-queue
> >plugging, controlled by the backing_dev_info. Observations:
> >
> >- Most uses of blk_run_queues() without a specific context were bogus.
> >  Usually the act of kicking the targets in question should be (and
> >  already is) performed by other activities, such as making vm
> >  flushing run to free memory.
> >
> >- Some uses of blk_run_queues() really just want to kick the final queue
> >  the bio goes to. I've added bio_sync() (BIO_RW_SYNC) to manage
> >  those; if the queue needs unplugging, we'll do it while already
> >  holding the queue lock.
> >
> >- The dm bit needs careful examination and checking by Joe. It could be
> >  more clever and maintain plug state for each target; for now I just
> >  added dm_table unplug-all functionality.
> >
> >Patch is against 2.6.4-rc2-mm1.
> 
> I like it a lot.

Thanks

> Any chance some of these newly-shortened functions can become static 
> inline as well?

Yeah, most likely. Remember this is just a first version; there's room
for a little cleanup here and there, for sure.

> >@@ -1100,13 +1092,11 @@
> > 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
> > 	 * which will restart the queueing
> > 	 */
> >-	if (!blk_queue_plugged(q)
> >-	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
> >-		spin_lock(&blk_plug_lock);
> >-		list_add_tail(&q->plug_list, &blk_plug_list);
> >+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
> >+		return;
> >+
> >+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> > 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
> >-		spin_unlock(&blk_plug_lock);
> >-	}
> > }
> > 
> > EXPORT_SYMBOL(blk_plug_device);
> >@@ -1118,15 +1108,12 @@
> > int blk_remove_plug(request_queue_t *q)
> > {
> > 	WARN_ON(!irqs_disabled());
> >-	if (blk_queue_plugged(q)) {
> >-		spin_lock(&blk_plug_lock);
> >-		list_del_init(&q->plug_list);
> >-		del_timer(&q->unplug_timer);
> >-		spin_unlock(&blk_plug_lock);
> >-		return 1;
> >-	}
> > 
> >-	return 0;
> >+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> >+		return 0;
> >+
> >+	del_timer(&q->unplug_timer);
> >+	return 1;
> > }
> > 
> > EXPORT_SYMBOL(blk_remove_plug);
> 
> I tend to like
> 
> 	if (test_and_clear_bit())
> 		call_out_of_line_function()
> 
> style for small functions like this, and inline those.

I like to do

	if (expected condition)
		do_stuff

	slow_path

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 20:20 UTC
  To: Kenneth Chen; +Cc: 'Andrew Morton', linux-kernel, thornber

On Wed, Mar 10 2004, Kenneth Chen wrote:
> Andrew Morton wrote on Wednesday, March 10, 2004 11:56 AM
> > Jens Axboe <axboe@suse.de> wrote:
> > >
> > > Here's a first cut at killing global plugging of block devices to reduce
> > > the nasty contention blk_plug_lock caused. This introduces per-queue
> > > plugging, controlled by the backing_dev_info.
> >
> > This is such an improvement over what we have now it isn't funny.
> >
> > Ken, the next -mm is starting to look like linux-3.1.0 so I think it
> > would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.
> 
> I'm planning a couple of experiments: one is just Jens's change on top of
> what we have, so we can validate the backing dev unplug. Then we will run
> 2.6.4-rc2-mm1 + Jens's patch.

It'll be hard to apply the patch against anything but -mm, since it
builds on (or at least will conflict with) other changes in there. I
deliberately made a -mm version this time, though I usually make a
-linus version and adapt if necessary, so that Andrew can get some
testing on this.

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Jesse Barnes @ 2004-03-10 20:45 UTC
  To: Jens Axboe; +Cc: Kenneth Chen, 'Andrew Morton', linux-kernel, thornber

On Wed, Mar 10, 2004 at 09:20:25PM +0100, Jens Axboe wrote:
> It'll be hard to apply the patch against anything but -mm, since it
> builds on (or at least will conflict with) other changes in there. I
> deliberately made a -mm version this time, though I usually make a
> -linus version and adapt if necessary, so that Andrew can get some
> testing on this.

I tried applying it to a BK tree (fresh as of a couple of hours ago),
ignoring the dm-related changes, and it seemed to work OK after I fixed
up rejects in direct-io.c and blkdev.h (which didn't seem hard, unless
I'm missing something):

10 qla2200 fc controllers, 64 cpus, 112 disks

stock BK tree: ~43945 I/Os per second
w/Jens' patch: ~47149 I/Os per second

Jesse


* Re: [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 20:49 UTC
  To: Kenneth Chen, 'Andrew Morton', linux-kernel, thornber

On Wed, Mar 10 2004, Jesse Barnes wrote:
> On Wed, Mar 10, 2004 at 09:20:25PM +0100, Jens Axboe wrote:
> > It'll be hard to apply the patch against anything but -mm, since it
> > builds on (or at least will conflict with) other changes in there. I
> > deliberately made a -mm version this time, though I usually make a
> > -linus version and adapt if necessary, so that Andrew can get some
> > testing on this.
> 
> I tried applying it to a BK tree (fresh as of a couple of hours ago),
> ignoring the dm-related changes, and it seemed to work OK after I fixed
> up rejects in direct-io.c and blkdev.h (which didn't seem hard, unless
> I'm missing something):
> 
> 10 qla2200 fc controllers, 64 cpus, 112 disks
> 
> stock BK tree: ~43945 I/Os per second
> w/Jens' patch: ~47149 I/Os per second

Do you have a profile for both runs (what was the workload, btw)?

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Andrew Morton @ 2004-03-10 21:00 UTC
  To: Jens Axboe; +Cc: linux-kernel, kenneth.w.chen, thornber

Jens Axboe <axboe@suse.de> wrote:
>
> Here's a first cut at killing global plugging of block devices to reduce
> the nasty contention blk_plug_lock caused.

Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?

If so, what are the lock ranking issues here?  The queue lock is not held
yet, so it seems pretty simple?


* Re: [PATCH] backing dev unplugging
From: Jesse Barnes @ 2004-03-10 21:01 UTC
  To: Jens Axboe, linux-kernel

On Wed, Mar 10, 2004 at 09:52:37PM +0100, Jens Axboe wrote:
> And what fs, if any? The XFS changes probably need someone knowledgeable
> about XFS internals to verify them.

Yeah, sorry.  I forgot to post CPU time info too (I'm putting that
together now).  The benchmark is doing small, direct I/O requests
directly to the block devices (e.g. /dev/sdd1).

Jesse


* Re: [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 21:02 UTC
  To: Andrew Morton; +Cc: linux-kernel, kenneth.w.chen, thornber

On Wed, Mar 10 2004, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Here's a first cut at killing global plugging of block devices to reduce
> > the nasty contention blk_plug_lock caused.
> 
> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?

Ugh yes, we certainly should.

> If so, what are the lock ranking issues here?  The queue lock is not
> held yet, so it seems pretty simple?

As far as I can tell, it's pretty straightforward. The unplug_fn() will
grab the queue lock for 'ordinary' devices; for dm on dm you'd nest the
map locks inside one another (which should be quite alright, as far as I
can tell, without pulling any nasty tricks).
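
Something like this, perhaps (a sketch of the fix being discussed, not
yet in the patch; field name as in your question):

	static void dm_unplug_all(request_queue_t *q)
	{
		struct mapped_device *md = q->queuedata;

		read_lock(&md->map_lock);
		if (md->map)
			dm_table_unplug_all(md->map);
		read_unlock(&md->map_lock);
	}

The underlying queues' unplug functions only take their own queue locks
inside the read lock, so the ordering stays simple.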

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Jens Axboe @ 2004-03-10 21:02 UTC
  To: linux-kernel; +Cc: jbarnes

On Wed, Mar 10 2004, Jesse Barnes wrote:
> On Wed, Mar 10, 2004 at 09:52:37PM +0100, Jens Axboe wrote:
> > And what fs, if any? The XFS changes probably need someone knowledgeable
> > about XFS internals to verify them.
> 
> Yeah, sorry.  I forgot to post CPU time info too (I'm putting that
> together now).  The benchmark is doing small, direct I/O requests
> directly to the block devices (e.g. /dev/sdd1).

Neat, thanks; looking forward to seeing them.

-- 
Jens Axboe



* Re: [PATCH] backing dev unplugging
From: Jesse Barnes @ 2004-03-10 21:35 UTC
  To: Jens Axboe; +Cc: linux-kernel

On Wed, Mar 10, 2004 at 10:02:50PM +0100, Jens Axboe wrote:
> On Wed, Mar 10 2004, Jesse Barnes wrote:
> > On Wed, Mar 10, 2004 at 09:52:37PM +0100, Jens Axboe wrote:
> > > And what fs, if any? The XFS changes probably need someone knowledgeable
> > > about XFS internals to verify them.
> > 
> > Yeah, sorry.  I forgot to post CPU time info too (I'm putting that
> > together now).  The benchmark is doing small, direct I/O requests
> > directly to the block devices (e.g. /dev/sdd1).
> 
> Neat, thanks; looking forward to seeing them.

Here's a bit more info.  I can probably do some more benchmarking later
if you need it (though sometimes getting time on that machine can be
hard).

The benchmark creates a thread for each block device.  Each one opens
the device with O_DIRECT and starts doing I/O.

10 qla2200 fc controllers, 64 cpus, 112 disks

-------------------------------------
stock BK tree: ~43945 I/Os per second

  [root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
  204038 default_idle                             6376.1875
  700889 snidle                                   1825.2318
  234641 cpu_idle                                 488.8354
   25954 blk_run_queues                            45.0590
    4475 dio_bio_end_io                            11.6536
    6137 scsi_end_request                          11.2812
   18002 scsi_request_fn                            9.5350
     447 ia64_spinlock_contention                   6.9844
   19326 __make_request                             5.3446
    3020 sn_dma_flush                               4.4940
     856 generic_unplug_device                      2.4318
     961 scsi_device_unbusy                         2.3101
     360 current_kernel_time                        1.8750
     547 put_io_context                             1.7094
     932 dio_await_one                              1.6181
     409 blk_queue_bounce                           1.5977
    3484 qla2x00_start_scsi                         1.4140
     147 pid_alive                                  1.1484
     436 as_set_request                             1.1354
     283 kmem_cache_alloc                           1.1055

CPU time from vmstat:
 usr sys id wa
   0  15 20 65
   0  19  0 81
   0  17  0 83
   0  19  0 81
   0  16  0 83
   0  18  0 82
   0  18  0 82
   0  13  0 87
   0  16  0 84
   0  12 26 62

-------------------------------------
w/Jens' patch: ~47149 I/Os per second

  [root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
  181993 default_idle                             5687.2812
  624772 snidle                                   1627.0104
  209129 cpu_idle                                 435.6854
    4755 dio_bio_end_io                            12.3828
    6593 scsi_end_request                          12.1195
     435 ia64_spinlock_contention                   6.7969
    2959 sn_dma_flush                               4.4033
    7459 scsi_request_fn                            3.9507
     449 blk_run_backing_dev                        3.5078
    1452 scsi_device_unbusy                         3.4904
     250 kobject_put                                2.6042
     590 put_io_context                             1.8438
     995 dio_await_one                              1.6365
     313 current_kernel_time                        1.6302
     361 kmem_cache_alloc                           1.4102
    3435 qla2x00_start_scsi                         1.3941
    5032 __make_request                             1.3674
     480 as_set_request                             1.2500
     603 scsi_put_command                           1.1085
    6019 schedule                                   1.1064

CPU time from vmstat:
 usr sys id wa
   0   5 60 35
   0   8 28 64
   0   8 20 72
   0   8 19 73
   0   8 17 75
   0   8 15 77
   0   8 15 77
   0   8 12 80
   0   8 11 81
   0   8 11 81


* Re: [PATCH] backing dev unplugging
From: Miquel van Smoorenburg @ 2004-03-10 21:40 UTC
  To: linux-kernel

In article <20040310210207.GL15087@suse.de>,
Jens Axboe  <axboe@suse.de> wrote:
>On Wed, Mar 10 2004, Andrew Morton wrote:
>> Jens Axboe <axboe@suse.de> wrote:
>> >
>> > Here's a first cut at killing global plugging of block devices to reduce
>> > the nasty contention blk_plug_lock caused.
>> 
>> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?
>
>Ugh yes, we certainly should.

With the latest patches from Joe it would be more like

	map = dm_get_table(md);
	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}

No lock ranking issues; you just get a refcounted map (table, really).

Mike.



* Re: [PATCH] backing dev unplugging
From: Nathan Scott @ 2004-03-10 22:22 UTC
  To: Jens Axboe
  Cc: Linux Kernel, kenneth.w.chen, Andrew Morton, thornber, linux-xfs

Hi Jens,

On Wed, Mar 10, 2004 at 01:45:07PM +0100, Jens Axboe wrote:
> ...[snip]...
> diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c
> --- /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c	2004-03-09 13:08:30.000000000 +0100
> +++ linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c	2004-03-10 13:13:49.000000000 +0100
> @@ -1013,7 +1013,7 @@
>  {
>  	PB_TRACE(pb, "lock", 0);
>  	if (atomic_read(&pb->pb_io_remaining))
> -		blk_run_queues();
> +		blk_run_address_space(pb->pb_target->pbr_mapping);
>  	down(&pb->pb_sema);
>  	PB_SET_OWNER(pb);
>  	PB_TRACE(pb, "locked", 0);
> @@ -1109,7 +1109,7 @@
>  		if (atomic_read(&pb->pb_pin_count) == 0)
>  			break;
>  		if (atomic_read(&pb->pb_io_remaining))
> -			blk_run_queues();
> +			blk_run_address_space(pb->pb_target->pbr_mapping);
>  		schedule();
>  	}
>  	remove_wait_queue(&pb->pb_waiters, &wait);
> @@ -1407,7 +1407,7 @@
>  	if (pb->pb_flags & PBF_RUN_QUEUES) {
>  		pb->pb_flags &= ~PBF_RUN_QUEUES;
>  		if (atomic_read(&pb->pb_io_remaining) > 1)
> -			blk_run_queues();
> +			blk_run_address_space(pb->pb_target->pbr_mapping);
>  	}
>  }
>  
> @@ -1471,7 +1471,7 @@
>  {
>  	PB_TRACE(pb, "iowait", 0);
>  	if (atomic_read(&pb->pb_io_remaining))
> -		blk_run_queues();
> +		blk_run_address_space(pb->pb_target->pbr_mapping);
>  	down(&pb->pb_iodonesema);
>  	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
>  	return pb->pb_error;
> @@ -1617,7 +1617,6 @@
>  pagebuf_daemon(
>  	void			*data)
>  {
> -	int			count;
>  	page_buf_t		*pb;
>  	struct list_head	*curr, *next, tmp;
>  
> @@ -1640,7 +1639,6 @@
>  
>  		spin_lock(&pbd_delwrite_lock);
>  
> -		count = 0;
>  		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
>  			pb = list_entry(curr, page_buf_t, pb_list);
>  
> @@ -1657,7 +1655,7 @@
>  				pb->pb_flags &= ~PBF_DELWRI;
>  				pb->pb_flags |= PBF_WRITE;
>  				list_move(&pb->pb_list, &tmp);
> -				count++;
> +				blk_run_address_space(pb->pb_target->pbr_mapping);
>  			}

This moves the blk_run_address_space call to before we submit the
I/O (this bit of code is moving buffers off the delwri queue
onto a temporary queue; buffers on the temporary queue are
then submitted a little further down). I suspect we need to
move this new blk_run_address_space call down into the temp
list processing, just after pagebuf_iostrategy.
>  		}
>  
> @@ -1671,8 +1669,6 @@
>  
>  		if (as_list_len > 0)
>  			purge_addresses();
> -		if (count)
> -			blk_run_queues();
>  
>  		force_flush = 0;
>  	} while (pagebuf_daemon_active);
> @@ -1734,13 +1730,11 @@
>  		pagebuf_lock(pb);
>  		pagebuf_iostrategy(pb);
>  		if (++flush_cnt > 32) {
> -			blk_run_queues();
> +			blk_run_address_space(pb->pb_target->pbr_mapping);
>  			flush_cnt = 0;
>  		}
>  	}
>  
> -	blk_run_queues();
> -
>  	while (!list_empty(&tmp)) {
>  		pb = list_entry(tmp.next, page_buf_t, pb_list);
>  

For this second one, we probably just want to ditch the flush_cnt
there (this change is doing blk_run_address_space on every 32nd
buffer target, and not the intervening ones).  We will be doing a
bunch more blk_run_address_space calls than we probably need to;
not sure if that's going to become an issue or not. Let me prod
some of the other XFS folks for more insight there...
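
i.e. just (sketch):

	pagebuf_lock(pb);
	pagebuf_iostrategy(pb);
	/* with flush_cnt gone, every submitted buffer kicks its queue */
	blk_run_address_space(pb->pb_target->pbr_mapping);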

thanks.

-- 
Nathan


* Re: [PATCH] backing dev unplugging
From: Andrew Morton @ 2004-03-10 23:05 UTC
  To: Miquel van Smoorenburg; +Cc: linux-kernel, Jens Axboe


(Please use reply-to-all)

"Miquel van Smoorenburg" <miquels@cistron.nl> wrote:
>
> In article <20040310210207.GL15087@suse.de>,
> Jens Axboe  <axboe@suse.de> wrote:
> >On Wed, Mar 10 2004, Andrew Morton wrote:
> >> Jens Axboe <axboe@suse.de> wrote:
> >> >
> >> > Here's a first cut at killing global plugging of block devices to reduce
> >> > the nasty contention blk_plug_lock caused.
> >> 
> >> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?
> >
> >Ugh yes, we certainly should.
> 
> With the latest patches from Joe it would be more like
> 
> 	map = dm_get_table(md);
> 	if (map) {
> 		dm_table_unplug_all(map);
> 		dm_table_put(map);
> 	}
> 
> No lock ranking issues, you just get a refcounted map (table, really).

Ah, OK.  Jens, you'll be needing this (on rc2-mm1):

dm.c: protect md->map with a rw spin lock rather than the md->lock
semaphore.  Also ensure that everyone accesses md->map through
dm_get_table(), rather than directly.


---

 25-akpm/drivers/md/dm-table.c |    3 +
 25-akpm/drivers/md/dm.c       |   88 +++++++++++++++++++++++++-----------------
 2 files changed, 57 insertions(+), 34 deletions(-)

diff -puN drivers/md/dm.c~dm-map-rwlock-ng drivers/md/dm.c
--- 25/drivers/md/dm.c~dm-map-rwlock-ng	Mon Mar  8 13:58:42 2004
+++ 25-akpm/drivers/md/dm.c	Mon Mar  8 13:58:42 2004
@@ -49,7 +49,7 @@ struct target_io {
 
 struct mapped_device {
 	struct rw_semaphore lock;
-	rwlock_t maplock;
+	rwlock_t map_lock;
 	atomic_t holders;
 
 	unsigned long flags;
@@ -238,6 +238,24 @@ static int queue_io(struct mapped_device
 	return 0;		/* deferred successfully */
 }
 
+/*
+ * Everyone (including functions in this file), should use this
+ * function to access the md->map field, and make sure they call
+ * dm_table_put() when finished.
+ */
+struct dm_table *dm_get_table(struct mapped_device *md)
+{
+	struct dm_table *t;
+
+	read_lock(&md->map_lock);
+	t = md->map;
+	if (t)
+		dm_table_get(t);
+	read_unlock(&md->map_lock);
+
+	return t;
+}
+
 /*-----------------------------------------------------------------
  * CRUD START:
  *   A more elegant soln is in the works that uses the queue
@@ -346,6 +364,7 @@ static void __map_bio(struct dm_target *
 
 struct clone_info {
 	struct mapped_device *md;
+	struct dm_table *map;
 	struct bio *bio;
 	struct dm_io *io;
 	sector_t sector;
@@ -399,7 +418,7 @@ static struct bio *clone_bio(struct bio 
 static void __clone_and_map(struct clone_info *ci)
 {
 	struct bio *clone, *bio = ci->bio;
-	struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
+	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
 	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
 	struct target_io *tio;
 
@@ -460,7 +479,7 @@ static void __clone_and_map(struct clone
 
 		ci->sector += max;
 		ci->sector_count -= max;
-		ti = dm_table_find_target(ci->md->map, ci->sector);
+		ti = dm_table_find_target(ci->map, ci->sector);
 
 		len = to_sector(bv->bv_len) - max;
 		clone = split_bvec(bio, ci->sector, ci->idx,
@@ -485,6 +504,7 @@ static void __split_bio(struct mapped_de
 	struct clone_info ci;
 
 	ci.md = md;
+	ci.map = dm_get_table(md);
 	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
@@ -501,6 +521,7 @@ static void __split_bio(struct mapped_de
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, 0);
+	dm_table_put(ci.map);
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -563,15 +584,16 @@ static int dm_request(request_queue_t *q
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
-	struct mapped_device *md = congested_data;
+	struct mapped_device *md = (struct mapped_device *) congested_data;
+	struct dm_table *map = dm_get_table(md);
 
-	read_lock(&md->maplock);
-	if (md->map == NULL || test_bit(DMF_BLOCK_IO, &md->flags))
+	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
+		/* FIXME: shouldn't suspended count a congested ? */
 		r = bdi_bits;
 	else
-		r = dm_table_any_congested(md->map, bdi_bits);
-	read_unlock(&md->maplock);
+		r = dm_table_any_congested(map, bdi_bits);
 
+	dm_table_put(map);
 	return r;
 }
 
@@ -646,7 +668,7 @@ static struct mapped_device *alloc_dev(u
 
 	memset(md, 0, sizeof(*md));
 	init_rwsem(&md->lock);
-	rwlock_init(&md->maplock);
+	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -746,12 +768,12 @@ static int __bind(struct mapped_device *
 	if (size == 0)
 		return 0;
 
-	write_lock(&md->maplock);
+	write_lock(&md->map_lock);
 	md->map = t;
-	write_unlock(&md->maplock);
-	dm_table_event_callback(md->map, event_callback, md);
+	write_unlock(&md->map_lock);
 
 	dm_table_get(t);
+	dm_table_event_callback(md->map, event_callback, md);
 	dm_table_set_restrictions(t, q);
 	return 0;
 }
@@ -764,9 +786,9 @@ static void __unbind(struct mapped_devic
 		return;
 
 	dm_table_event_callback(map, NULL, NULL);
-	write_lock(&md->maplock);
+	write_lock(&md->map_lock);
 	md->map = NULL;
-	write_unlock(&md->maplock);
+	write_unlock(&md->map_lock);
 	dm_table_put(map);
 }
 
@@ -803,12 +825,16 @@ void dm_get(struct mapped_device *md)
 
 void dm_put(struct mapped_device *md)
 {
+	struct dm_table *map = dm_get_table(md);
+
 	if (atomic_dec_and_test(&md->holders)) {
-		if (!test_bit(DMF_SUSPENDED, &md->flags) && md->map)
-			dm_table_suspend_targets(md->map);
+		if (!test_bit(DMF_SUSPENDED, &md->flags) && map)
+			dm_table_suspend_targets(map);
 		__unbind(md);
 		free_dev(md);
 	}
+
+	dm_table_put(map);
 }
 
 /*
@@ -859,6 +885,7 @@ int dm_swap_table(struct mapped_device *
  */
 int dm_suspend(struct mapped_device *md)
 {
+	struct dm_table *map;
 	DECLARE_WAITQUEUE(wait, current);
 
 	down_write(&md->lock);
@@ -894,8 +921,11 @@ int dm_suspend(struct mapped_device *md)
 	down_write(&md->lock);
 	remove_wait_queue(&md->wait, &wait);
 	set_bit(DMF_SUSPENDED, &md->flags);
-	if (md->map)
-		dm_table_suspend_targets(md->map);
+
+	map = dm_get_table(md);
+	if (map)
+		dm_table_suspend_targets(map);
+	dm_table_put(map);
 	up_write(&md->lock);
 
 	return 0;
@@ -904,22 +934,25 @@ int dm_suspend(struct mapped_device *md)
 int dm_resume(struct mapped_device *md)
 {
 	struct bio *def;
+	struct dm_table *map = dm_get_table(md);
 
 	down_write(&md->lock);
-	if (!md->map ||
+	if (!map ||
 	    !test_bit(DMF_SUSPENDED, &md->flags) ||
-	    !dm_table_get_size(md->map)) {
+	    !dm_table_get_size(map)) {
 		up_write(&md->lock);
+		dm_table_put(map);
 		return -EINVAL;
 	}
 
-	dm_table_resume_targets(md->map);
+	dm_table_resume_targets(map);
 	clear_bit(DMF_SUSPENDED, &md->flags);
 	clear_bit(DMF_BLOCK_IO, &md->flags);
 
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_put(map);
 
 	blk_run_queues();
 
@@ -971,19 +1004,6 @@ struct gendisk *dm_disk(struct mapped_de
 	return md->disk;
 }
 
-struct dm_table *dm_get_table(struct mapped_device *md)
-{
-	struct dm_table *t;
-
-	down_read(&md->lock);
-	t = md->map;
-	if (t)
-		dm_table_get(t);
-	up_read(&md->lock);
-
-	return t;
-}
-
 int dm_suspended(struct mapped_device *md)
 {
 	return test_bit(DMF_SUSPENDED, &md->flags);
diff -puN drivers/md/dm-table.c~dm-map-rwlock-ng drivers/md/dm-table.c
--- 25/drivers/md/dm-table.c~dm-map-rwlock-ng	Mon Mar  8 13:58:42 2004
+++ 25-akpm/drivers/md/dm-table.c	Mon Mar  8 13:58:42 2004
@@ -279,6 +279,9 @@ void dm_table_get(struct dm_table *t)
 
 void dm_table_put(struct dm_table *t)
 {
+	if (!t)
+		return;
+
 	if (atomic_dec_and_test(&t->holders))
 		table_destroy(t);
 }

_


* Re: [PATCH] backing dev unplugging
  2004-03-10 22:22 ` Nathan Scott
@ 2004-03-10 23:32   ` Steve Lord
  2004-03-11  7:05   ` Jens Axboe
  1 sibling, 0 replies; 31+ messages in thread
From: Steve Lord @ 2004-03-10 23:32 UTC (permalink / raw)
  To: Nathan Scott
  Cc: Jens Axboe, Linux Kernel, kenneth.w.chen, Andrew Morton,
	thornber, linux-xfs

Nathan Scott wrote:
> 
> For this second one, we probably just want to ditch the flush_cnt
> there (this change is doing blk_run_address_space on every 32nd
> buffer target, and not the intervening ones).  We will be doing a
> bunch more blk_run_address_space calls than we probably need to.
> I'm not sure if that's going to become an issue or not; let me prod
> some of the other XFS folks for more insight there...
> 
> thanks.
> 

The concept there was that we were just pushing things down into the
elevator in a batch, then unplugging it afterwards. The "do it every
32 I/Os" logic was added to avoid some request starvation issues, which
are probably historical now.

I was lazy and did not look at the context of the code, but there
are two paths in here. One is unmount, which flushes all the entries
for a specific filesystem; the other is background flushing. I think the
background flush activity can live without the blk_run_address_space
call being there at all. If we need to grab the lock on the metadata
later we will make the call then. The unmount case needs to call it,
but since that specifies a specific target, it can just make one
call.
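
In sketch form, the unmount path could then look something like this
(hypothetical and untested; "target" stands for the buftarg being
flushed, the rest follows the flush loop quoted above):

	/* push everything on the temporary list into the elevator... */
	list_for_each_safe(curr, next, &tmp) {
		pb = list_entry(curr, page_buf_t, pb_list);
		pagebuf_lock(pb);
		pagebuf_iostrategy(pb);
	}

	/* ...then unplug the queue once for the whole target */
	blk_run_address_space(target->pbr_mapping);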

Steve

* Re: [PATCH] backing dev unplugging
  2004-03-10 21:35                 ` Jesse Barnes
@ 2004-03-10 23:54                   ` Andrew Morton
  2004-03-11  0:03                     ` David Mosberger
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Morton @ 2004-03-10 23:54 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: axboe, linux-kernel

jbarnes@sgi.com (Jesse Barnes) wrote:
>
> -------------------------------------
> w/Jens' patch: ~47149 I/Os per second

Happier.

>   [root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
>   181993 default_idle                             5687.2812
>   624772 snidle                                   1627.0104
>   209129 cpu_idle                                 435.6854
>     4755 dio_bio_end_io                            12.3828
>     6593 scsi_end_request                          12.1195
>      435 ia64_spinlock_contention                   6.7969
>     2959 sn_dma_flush                               4.4033

Do you know where that spinlock contention is coming from?

(We have a little patch for x86 which places the spinning code inline in the
caller of spin_lock() so it appears nicely in profiles.)
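
Purely as an illustration of the idea (not the actual patch; the
function name and details here are made up):

	/*
	 * with the spin loop inlined into the caller, contention
	 * ticks show up against the calling function in a profile,
	 * rather than pooling in one shared out-of-line stub.
	 */
	static inline void inlined_spin_lock(spinlock_t *lock)
	{
		while (unlikely(!_raw_spin_trylock(lock)))
			while (spin_is_locked(lock))
				cpu_relax();
	}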

* Re: [PATCH] backing dev unplugging
  2004-03-10 23:54                   ` Andrew Morton
@ 2004-03-11  0:03                     ` David Mosberger
  2004-03-11  6:30                       ` Jesse Barnes
  0 siblings, 1 reply; 31+ messages in thread
From: David Mosberger @ 2004-03-11  0:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jesse Barnes, axboe, linux-kernel

>>>>> On Wed, 10 Mar 2004 15:54:19 -0800, Andrew Morton <akpm@osdl.org> said:

  Andrew> jbarnes@sgi.com (Jesse Barnes) wrote:
  >> 
  >> -------------------------------------
  >> w/Jens' patch: ~47149 I/Os per second

  Andrew> Happier.

  >> [root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
  >> 181993 default_idle                             5687.2812
  >> 624772 snidle                                   1627.0104
  >> 209129 cpu_idle                                 435.6854
  >>   4755 dio_bio_end_io                            12.3828
  >>   6593 scsi_end_request                          12.1195
  >>    435 ia64_spinlock_contention                   6.7969
  >>   2959 sn_dma_flush                               4.4033

  Andrew> Do you know where that spinlock contention is coming from?

  Andrew> (We have a little patch for x86 which places the spinning
  Andrew> code inline in the caller of spin_lock() so it appears
  Andrew> nicely in profiles.)

And real men use profiling tools that provide a call-graph, so this
hack isn't necessary... ;-)

Jesse, if you want to try q-tools and need some help in getting the
per-CPU results merged, let me know (it's something that's planned for
a future release, but there are only so many hours in a day, so this
hasn't been done yet).

	--david

* Re: [PATCH] backing dev unplugging
  2004-03-10 23:05       ` Andrew Morton
@ 2004-03-11  0:05         ` Miquel van Smoorenburg
  2004-03-11  0:17           ` Andrew Morton
  2004-03-11  6:43         ` Jens Axboe
  1 sibling, 1 reply; 31+ messages in thread
From: Miquel van Smoorenburg @ 2004-03-11  0:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Jens Axboe, Joe Thornber

On Thu, 11 Mar 2004 00:05:42, Andrew Morton wrote:
> "Miquel van Smoorenburg" <miquels@cistron.nl> wrote:
> >
> > 
> > With the latest patches from Joe it would be more like
> > 
> > 	map = dm_get_table(md);
> > 	if (map) {
> > 		dm_table_unplug_all(map);
> > 		dm_table_put(map);
> > 	}
> > 
> > No lock ranking issues, you just get a refcounted map (table, really).
> 
> Ah, OK.  Jens, you'll be needing this (on rc2-mm1):
> 
> dm.c: protect md->map with a rw spin lock rather than the md->lock
> semaphore.  Also ensure that everyone accesses md->map through
> dm_get_table(), rather than directly.
> 
>  25-akpm/drivers/md/dm-table.c |    3 +
>  25-akpm/drivers/md/dm.c       |   88 +++++++++++++++++++++++++-----------------

.. and this final one on top of it, presumably.

See https://www.redhat.com/archives/dm-devel/2004-March/msg00036.html

dm.c: remove __dm_request (merge with previous patch).
--- diff/drivers/md/dm.c	2004-03-08 15:48:05.000000000 +0000
+++ source/drivers/md/dm.c	2004-03-09 09:40:37.000000000 +0000
@@ -506,8 +506,13 @@ static void __split_bio(struct mapped_de
 {
 	struct clone_info ci;
 
-	ci.md = md;
 	ci.map = dm_get_table(md);
+	if (!ci.map) {
+		bio_io_error(bio, bio->bi_size);
+		return;
+	}
+
+	ci.md = md;
 	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
@@ -530,17 +535,6 @@ static void __split_bio(struct mapped_de
  * CRUD END
  *---------------------------------------------------------------*/
 
-
-static inline void __dm_request(struct mapped_device *md, struct bio *bio)
-{
-	if (!md->map) {
-		bio_io_error(bio, bio->bi_size);
-		return;
-	}
-
-	__split_bio(md, bio);
-}
-
 /*
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
@@ -579,7 +573,7 @@ static int dm_request(request_queue_t *q
 		down_read(&md->lock);
 	}
 
-	__dm_request(md, bio);
+	__split_bio(md, bio);
 	up_read(&md->lock);
 	return 0;
 }
@@ -591,7 +585,6 @@ static int dm_any_congested(void *conges
 	struct dm_table *map = dm_get_table(md);
 
 	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
-		/* FIXME: shouldn't suspended count a congested ? */
 		r = bdi_bits;
 	else
 		r = dm_table_any_congested(map, bdi_bits);
@@ -850,7 +843,7 @@ static void __flush_deferred_io(struct m
 	while (c) {
 		n = c->bi_next;
 		c->bi_next = NULL;
-		__dm_request(md, c);
+		__split_bio(md, c);
 		c = n;
 	}
 }


Mike.

* Re: [PATCH] backing dev unplugging
  2004-03-11  0:05         ` Miquel van Smoorenburg
@ 2004-03-11  0:17           ` Andrew Morton
  0 siblings, 0 replies; 31+ messages in thread
From: Andrew Morton @ 2004-03-11  0:17 UTC (permalink / raw)
  To: Miquel van Smoorenburg; +Cc: linux-kernel, axboe, thornber

Miquel van Smoorenburg <miquels@cistron.nl> wrote:
>
> > dm.c: protect md->map with a rw spin lock rather than the md->lock
> > semaphore.  Also ensure that everyone accesses md->map through
> > dm_get_table(), rather than directly.
> > 
> >  25-akpm/drivers/md/dm-table.c |    3 +
> >  25-akpm/drivers/md/dm.c       |   88 +++++++++++++++++++++++++-----------------
> 
> .. and this final one on top of it, presumably.
> 
> See https://www.redhat.com/archives/dm-devel/2004-March/msg00036.html

Yup, thanks.  Lots happening.  I'll take a trot through the kernel
maternity ward, see if I can drag out another -mm later today.


* Re: [PATCH] backing dev unplugging
  2004-03-11  0:03                     ` David Mosberger
@ 2004-03-11  6:30                       ` Jesse Barnes
  0 siblings, 0 replies; 31+ messages in thread
From: Jesse Barnes @ 2004-03-11  6:30 UTC (permalink / raw)
  To: davidm; +Cc: Andrew Morton, axboe, linux-kernel

On Wed, Mar 10, 2004 at 04:03:55PM -0800, David Mosberger wrote:
> Jesse, if you want to try q-tools and need some help in getting the
> per-CPU results merged, let me know (it's something that's planned for
> a future release, but there is only so many hours in a day so this
> hasn't been done yet).

Yeah, I've been really wanting to try it out (esp. when you mentioned
that you'd be able to profile interrupts-off code :).  I'll try to get
some more time on the machine and take a look.

Also, wli just informed me that I screwed up another aspect of the
profile too: the readprofile command I grabbed from the manpage (as my
time on the machine ran out) didn't output things in a very useful
order.

Thanks,
Jesse

* Re: [PATCH] backing dev unplugging
  2004-03-10 23:05       ` Andrew Morton
  2004-03-11  0:05         ` Miquel van Smoorenburg
@ 2004-03-11  6:43         ` Jens Axboe
  1 sibling, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2004-03-11  6:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Miquel van Smoorenburg, linux-kernel

On Wed, Mar 10 2004, Andrew Morton wrote:
> 
> (Please use reply-to-all)
> 
> "Miquel van Smoorenburg" <miquels@cistron.nl> wrote:
> >
> > In article <20040310210207.GL15087@suse.de>,
> > Jens Axboe  <axboe@suse.de> wrote:
> > >On Wed, Mar 10 2004, Andrew Morton wrote:
> > >> Jens Axboe <axboe@suse.de> wrote:
> > >> >
> > >> > Here's a first cut at killing global plugging of block devices to reduce
> > >> > the nasty contention blk_plug_lock caused.
> > >> 
> > >> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?
> > >
> > >Ugh yes, we certainly should.
> > 
> > With the latest patches from Joe it would be more like
> > 
> > 	map = dm_get_table(md);
> > 	if (map) {
> > 		dm_table_unplug_all(map);
> > 		dm_table_put(map);
> > 	}
> > 
> > No lock ranking issues, you just get a refcounted map (table, really).
> 
> Ah, OK.  Jens, you'll be needing this (on rc2-mm1):
> 
> dm.c: protect md->map with a rw spin lock rather than the md->lock
> semaphore.  Also ensure that everyone accesses md->map through
> dm_get_table(), rather than directly.

Neato, much better. I'll build on top of that.

-- 
Jens Axboe


* Re: [PATCH] backing dev unplugging
  2004-03-10 22:22 ` Nathan Scott
  2004-03-10 23:32   ` Steve Lord
@ 2004-03-11  7:05   ` Jens Axboe
  1 sibling, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2004-03-11  7:05 UTC (permalink / raw)
  To: Nathan Scott
  Cc: Linux Kernel, kenneth.w.chen, Andrew Morton, thornber, linux-xfs

On Thu, Mar 11 2004, Nathan Scott wrote:
> Hi Jens,
> 
> On Wed, Mar 10, 2004 at 01:45:07PM +0100, Jens Axboe wrote:
> > ...[snip]...
> > diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c
> > --- /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c	2004-03-09 13:08:30.000000000 +0100
> > +++ linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c	2004-03-10 13:13:49.000000000 +0100
> > @@ -1013,7 +1013,7 @@
> >  {
> >  	PB_TRACE(pb, "lock", 0);
> >  	if (atomic_read(&pb->pb_io_remaining))
> > -		blk_run_queues();
> > +		blk_run_address_space(pb->pb_target->pbr_mapping);
> >  	down(&pb->pb_sema);
> >  	PB_SET_OWNER(pb);
> >  	PB_TRACE(pb, "locked", 0);
> > @@ -1109,7 +1109,7 @@
> >  		if (atomic_read(&pb->pb_pin_count) == 0)
> >  			break;
> >  		if (atomic_read(&pb->pb_io_remaining))
> > -			blk_run_queues();
> > +			blk_run_address_space(pb->pb_target->pbr_mapping);
> >  		schedule();
> >  	}
> >  	remove_wait_queue(&pb->pb_waiters, &wait);
> > @@ -1407,7 +1407,7 @@
> >  	if (pb->pb_flags & PBF_RUN_QUEUES) {
> >  		pb->pb_flags &= ~PBF_RUN_QUEUES;
> >  		if (atomic_read(&pb->pb_io_remaining) > 1)
> > -			blk_run_queues();
> > +			blk_run_address_space(pb->pb_target->pbr_mapping);
> >  	}
> >  }
> >  
> > @@ -1471,7 +1471,7 @@
> >  {
> >  	PB_TRACE(pb, "iowait", 0);
> >  	if (atomic_read(&pb->pb_io_remaining))
> > -		blk_run_queues();
> > +		blk_run_address_space(pb->pb_target->pbr_mapping);
> >  	down(&pb->pb_iodonesema);
> >  	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
> >  	return pb->pb_error;
> > @@ -1617,7 +1617,6 @@
> >  pagebuf_daemon(
> >  	void			*data)
> >  {
> > -	int			count;
> >  	page_buf_t		*pb;
> >  	struct list_head	*curr, *next, tmp;
> >  
> > @@ -1640,7 +1639,6 @@
> >  
> >  		spin_lock(&pbd_delwrite_lock);
> >  
> > -		count = 0;
> >  		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
> >  			pb = list_entry(curr, page_buf_t, pb_list);
> >  
> > @@ -1657,7 +1655,7 @@
> >  				pb->pb_flags &= ~PBF_DELWRI;
> >  				pb->pb_flags |= PBF_WRITE;
> >  				list_move(&pb->pb_list, &tmp);
> > -				count++;
> > +				blk_run_address_space(pb->pb_target->pbr_mapping);
> >  			}
> 
> This moves the blk_run_address_space to before we submit the
> I/O (this bit of code is moving buffers off the delwri queue
> onto a temporary queue; buffers on the temporary queue are
> then submitted a little further down) - I suspect we need to
> move this new blk_run_address_space call down into the temp
> list processing, just after pagebuf_iostrategy.

I'm not surprised; the XFS 'conversion' was done quickly and is expected
to be half-assed :)

Thanks a lot for taking a look at this.

> >  		}
> >  
> > @@ -1671,8 +1669,6 @@
> >  
> >  		if (as_list_len > 0)
> >  			purge_addresses();
> > -		if (count)
> > -			blk_run_queues();
> >  
> >  		force_flush = 0;
> >  	} while (pagebuf_daemon_active);
> > @@ -1734,13 +1730,11 @@
> >  		pagebuf_lock(pb);
> >  		pagebuf_iostrategy(pb);
> >  		if (++flush_cnt > 32) {
> > -			blk_run_queues();
> > +			blk_run_address_space(pb->pb_target->pbr_mapping);
> >  			flush_cnt = 0;
> >  		}
> >  	}
> >  
> > -	blk_run_queues();
> > -
> >  	while (!list_empty(&tmp)) {
> >  		pb = list_entry(tmp.next, page_buf_t, pb_list);
> >  
> 
> For this second one, we probably just want to ditch the flush_cnt
> there (this change is doing blk_run_address_space on every 32nd
> buffer target, and not the intervening ones).  We will be doing a
> bunch more blk_run_address_space calls than we probably need to.
> I'm not sure if that's going to become an issue or not; let me prod
> some of the other XFS folks for more insight there...

If any of you could send me a replacement xfs_buf bit, I'd much
appreciate it.

-- 
Jens Axboe


* Re: [PATCH] backing dev unplugging
  2004-03-10 12:45 [PATCH] backing dev unplugging Jens Axboe
                   ` (3 preceding siblings ...)
  2004-03-10 22:22 ` Nathan Scott
@ 2004-03-11  9:14 ` Joe Thornber
  2004-03-11  9:16   ` Jens Axboe
  2004-03-11 12:17 ` Christophe Saout
  5 siblings, 1 reply; 31+ messages in thread
From: Joe Thornber @ 2004-03-11  9:14 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel, kenneth.w.chen, Andrew Morton, thornber

Jens,

Small locking changes to the dm bits.  I've just seen that you've
posted an updated version of your patch to lkml, so I'll post another
version of this patch to that thread too.

- Joe


Fix md->map protection in the global unplug removal patch.
--- diff/drivers/md/dm.c	2004-03-11 08:41:27.000000000 +0000
+++ source/drivers/md/dm.c	2004-03-11 08:57:14.000000000 +0000
@@ -578,8 +578,12 @@ static int dm_request(request_queue_t *q
 static void dm_unplug_all(request_queue_t *q)
 {
 	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
 
-	dm_table_unplug_all(md->map);
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
 }
 
 static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -904,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
 	add_wait_queue(&md->wait, &wait);
 	up_write(&md->lock);
 
+	/* unplug */
+	map = dm_get_table(md);
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+
 	/*
 	 * Then we wait for the already mapped ios to
 	 * complete.
 	 */
-	dm_table_unplug_all(md->map);
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -953,10 +963,9 @@ int dm_resume(struct mapped_device *md)
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_unplug_all(map);
 	dm_table_put(map);
 
-	dm_table_unplug_all(md->map);
-
 	return 0;
 }
 

* Re: [PATCH] backing dev unplugging
  2004-03-11  9:14 ` Joe Thornber
@ 2004-03-11  9:16   ` Jens Axboe
  0 siblings, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2004-03-11  9:16 UTC (permalink / raw)
  To: Joe Thornber; +Cc: Linux Kernel, kenneth.w.chen, Andrew Morton

On Thu, Mar 11 2004, Joe Thornber wrote:
> Jens,
> 
> Small locking changes to the dm bits.  I've just seen that you've
> posted an updated version of your patch to lkml, so I'll post another
> version of this patch to that thread too.

Thanks Joe, you don't have to generate a new diff; I'll just adapt this
version and post an update.

-- 
Jens Axboe


* Re: [PATCH] backing dev unplugging
  2004-03-10 12:45 [PATCH] backing dev unplugging Jens Axboe
                   ` (4 preceding siblings ...)
  2004-03-11  9:14 ` Joe Thornber
@ 2004-03-11 12:17 ` Christophe Saout
  2004-03-11 12:22   ` Jens Axboe
  5 siblings, 1 reply; 31+ messages in thread
From: Christophe Saout @ 2004-03-11 12:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel, Andrew Morton, thornber

On Wed, 10 Mar 2004 at 13:45, Jens Axboe wrote:

> diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c
> --- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c	2004-03-09 13:08:48.000000000 +0100
> +++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c	2004-03-09 15:27:36.000000000 +0100
> @@ -668,7 +668,7 @@
>  
>  		/* out of memory -> run queues */
>  		if (remaining)
> -			blk_run_queues();
> +			blk_congestion_wait(bio_data_dir(clone), HZ/100);

Why did you change this? It was the way I wanted it.

If we were out of memory, the buffers were allocated from a mempool, and
I want to get them out as soon as possible. If we are OOM, the write will
most likely be the VM trying to free some memory, and it would be
counterproductive to wait. It is not unlikely that we are the only
writer to that disk, so there's a chance that the queue is not congested.



* Re: [PATCH] backing dev unplugging
  2004-03-11 12:17 ` Christophe Saout
@ 2004-03-11 12:22   ` Jens Axboe
  2004-03-11 13:11     ` Christophe Saout
  0 siblings, 1 reply; 31+ messages in thread
From: Jens Axboe @ 2004-03-11 12:22 UTC (permalink / raw)
  To: Christophe Saout; +Cc: Linux Kernel, Andrew Morton, thornber

On Thu, Mar 11 2004, Christophe Saout wrote:
> On Wed, 10 Mar 2004 at 13:45, Jens Axboe wrote:
> 
> > diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c
> > --- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c	2004-03-09 13:08:48.000000000 +0100
> > +++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c	2004-03-09 15:27:36.000000000 +0100
> > @@ -668,7 +668,7 @@
> >  
> >  		/* out of memory -> run queues */
> >  		if (remaining)
> > -			blk_run_queues();
> > +			blk_congestion_wait(bio_data_dir(clone), HZ/100);
> 
> Why did you change this? It was the way I wanted it.
> 
> If we were out of memory, the buffers were allocated from a mempool, and
> I want to get them out as soon as possible. If we are OOM, the write will
> most likely be the VM trying to free some memory, and it would be
> counterproductive to wait. It is not unlikely that we are the only
> writer to that disk, so there's a chance that the queue is not congested.

Because it wasn't right, like most of those blk_run_queues(). The VM
should get things going; you basically just want to take a little nap
and retry. The idea with a small wait is to allow io to make some
progress; you gain nothing by retrying right away (except wasting CPU
spinning). It might want to be 1 instead of HZ/100, though. I doubt it
would make much difference, since it's an OOM condition.
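
Roughly, in sketch form (simplified from the dm-crypt loop under
discussion; untested):

	while (remaining) {
		/*
		 * the mempool could not supply the whole clone: nap
		 * briefly so in-flight io can complete and free some
		 * memory, then retry the remainder.
		 */
		blk_congestion_wait(bio_data_dir(clone), HZ/100);
		/* ...retry cloning the rest of the bio... */
	}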

-- 
Jens Axboe


* Re: [PATCH] backing dev unplugging
  2004-03-11 12:22   ` Jens Axboe
@ 2004-03-11 13:11     ` Christophe Saout
  0 siblings, 0 replies; 31+ messages in thread
From: Christophe Saout @ 2004-03-11 13:11 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel, Andrew Morton, thornber

On Thu, 11 Mar 2004 at 13:22, Jens Axboe wrote:

> > If we were out of memory, the buffers were allocated from a mempool, and
> > I want to get them out as soon as possible. If we are OOM, the write will
> > most likely be the VM trying to free some memory, and it would be
> > counterproductive to wait. It is not unlikely that we are the only
> > writer to that disk, so there's a chance that the queue is not congested.
> 
> Because it wasn't right, like most of those blk_run_queues(). The VM
> should get things going; you basically just want to take a little nap
> and retry. The idea with a small wait is to allow io to make some
> progress; you gain nothing by retrying right away (except wasting CPU
> spinning). It might want to be 1 instead of HZ/100, though. I doubt it
> would make much difference, since it's an OOM condition.

Well, okay. You're right then. I think the wait is unnecessary because
the next thing the code tries to do is allocate a new page with waiting
allowed, and the VM will run the queue and wait itself. So if you
think it's safe you can even remove it.

Thanks.



* RE: [PATCH] backing dev unplugging
  2004-03-10 19:55 ` Andrew Morton
  2004-03-10 20:03   ` Kenneth Chen
  2004-03-10 20:17   ` Jens Axboe
@ 2004-03-15  5:53   ` Kenneth Chen
  2 siblings, 0 replies; 31+ messages in thread
From: Kenneth Chen @ 2004-03-15  5:53 UTC (permalink / raw)
  To: 'Andrew Morton', Jens Axboe; +Cc: linux-kernel, thornber

>>>>>> Andrew Morton on Wed March 10, 2004 11:56 AM
>Jens Axboe <axboe@suse.de> wrote:
>> Here's a first cut at killing global plugging of block devices to reduce
>> the nasty contention blk_plug_lock caused. This introduceds per-queue
>> plugging, controlled by the backing_dev_info.
>
>This is such an improvement over what we have now it isn't funny.
>
>Ken, the next -mm is starting to look like linux-3.1.0 so I think it
>would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.


Our latest measurement on the 32P, 1000-disk setup indicates this patch
is working as expected performance-wise.  We saw a 200% improvement in
throughput compared to the global blk_plug_list.  (Compared to the per-cpu
blk_plug_list, performance is the same.)

- Ken



end of thread

Thread overview: 31+ messages
2004-03-10 12:45 [PATCH] backing dev unplugging Jens Axboe
2004-03-10 19:55 ` Andrew Morton
2004-03-10 20:03   ` Kenneth Chen
2004-03-10 20:20     ` Jens Axboe
2004-03-10 20:45       ` Jesse Barnes
2004-03-10 20:49         ` Jens Axboe
     [not found]           ` <20040310205237.GK15087@suse.de>
2004-03-10 21:01             ` Jesse Barnes
2004-03-10 21:02               ` Jens Axboe
2004-03-10 21:35                 ` Jesse Barnes
2004-03-10 23:54                   ` Andrew Morton
2004-03-11  0:03                     ` David Mosberger
2004-03-11  6:30                       ` Jesse Barnes
2004-03-10 20:17   ` Jens Axboe
2004-03-15  5:53   ` Kenneth Chen
2004-03-10 20:14 ` Jeff Garzik
2004-03-10 20:19   ` Jens Axboe
2004-03-10 21:00 ` Andrew Morton
2004-03-10 21:02   ` Jens Axboe
2004-03-10 21:40     ` Miquel van Smoorenburg
2004-03-10 23:05       ` Andrew Morton
2004-03-11  0:05         ` Miquel van Smoorenburg
2004-03-11  0:17           ` Andrew Morton
2004-03-11  6:43         ` Jens Axboe
2004-03-10 22:22 ` Nathan Scott
2004-03-10 23:32   ` Steve Lord
2004-03-11  7:05   ` Jens Axboe
2004-03-11  9:14 ` Joe Thornber
2004-03-11  9:16   ` Jens Axboe
2004-03-11 12:17 ` Christophe Saout
2004-03-11 12:22   ` Jens Axboe
2004-03-11 13:11     ` Christophe Saout
