linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Daniel Phillips <phillips@phunq.net>
To: Jonathan Corbet <corbet@lwn.net>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Peter Zijlstra <peterz@infradead.org>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC] [PATCH] A clean approach to writeout throttling
Date: Mon, 10 Dec 2007 20:21:42 -0800	[thread overview]
Message-ID: <200712102021.42865.phillips@phunq.net> (raw)
In-Reply-To: <25797.1197322312@lwn.net>

[-- Attachment #1: Type: text/plain, Size: 1560 bytes --]

On Monday 10 December 2007 13:31, Jonathan Corbet wrote:
> Hey, Daniel,
>
> I'm just getting around to looking at this.  One thing jumped out at me:
> > +	if (bio->bi_throttle) {
> > +		struct request_queue *q = bio->bi_queue;
> > +		bio->bi_throttle = 0; /* or detect multiple endio and err? */
> > +		atomic_add(bio->bi_throttle, &q->available);
> > +		wake_up(&q->throttle_wait);
> > +	}
>
> I'm feeling like I must be really dumb, but...how can that possibly
> work?  You're zeroing >bi_throttle before adding it back into
> q->available, so the latter will never increase...

Hi Jon,

Don't you know?  These days we optimize all our code for modern
processors with tunnelling instructions and metaphysical cache.
On such processors, setting a register to zero does not entirely
destroy all the data that used to be in the register, so subsequent
instructions can make further use of the overwritten data by
reconstructing it from remnants of bits left attached to the edges of
the register.

Um, yeah, that's it.

Actually, I fat-fingered it in the merge to -mm.  Thanks for the catch,
corrected patch attached.

The offending line isn't even a functional part of the algorithm, it is
just supposed to defend against the possibility that, somehow,
->bi_endio gets called multiple times.  Probably it should really be
something like:

		BUG_ON(bio->bi_throttle == -1);
		if (bio->bi_throttle) {
			...
			bio->bi_throttle = -1;

Or perhaps we should just rely on nobody ever making that mistake
and let somebody else catch it if it does.

Regards,

Daniel

[-- Attachment #2: bio.throttle-2.6.24-rc3-mm --]
[-- Type: text/x-diff, Size: 4217 bytes --]

--- 2.6.24-rc3-mm.clean/block/ll_rw_blk.c	2007-12-04 14:45:25.000000000 -0800
+++ 2.6.24-rc3-mm/block/ll_rw_blk.c	2007-12-10 04:49:56.000000000 -0800
@@ -3210,9 +3210,9 @@ static inline int bio_check_eod(struct b
  */
 static inline void __generic_make_request(struct bio *bio)
 {
-	struct request_queue *q;
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	sector_t old_sector;
-	int ret, nr_sectors = bio_sectors(bio);
+	int nr_sectors = bio_sectors(bio);
 	dev_t old_dev;
 	int err = -EIO;
 
@@ -3221,6 +3221,13 @@ static inline void __generic_make_reques
 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;
 
+	if (q && q->metric && !bio->bi_queue) {
+		int need = bio->bi_throttle = q->metric(bio);
+		bio->bi_queue = q;
+		/* FIXME: potential race if atomic_sub is called in the middle of condition check */
+		wait_event(q->throttle_wait, atomic_read(&q->available) >= need);
+		atomic_sub(need, &q->available);
+	}
 	/*
 	 * Resolve the mapping until finished. (drivers are
 	 * still free to implement/resolve their own stacking
@@ -3231,10 +3238,9 @@ static inline void __generic_make_reques
 	 */
 	old_sector = -1;
 	old_dev = 0;
-	do {
+	while (1) {
 		char b[BDEVNAME_SIZE];
 
-		q = bdev_get_queue(bio->bi_bdev);
 		if (!q) {
 			printk(KERN_ERR
 			       "generic_make_request: Trying to access "
@@ -3282,8 +3288,10 @@ end_io:
 			goto end_io;
 		}
 
-		ret = q->make_request_fn(q, bio);
-	} while (ret);
+		if (!q->make_request_fn(q, bio))
+			return;
+		q = bdev_get_queue(bio->bi_bdev);
+	}
 }
 
 /*
--- 2.6.24-rc3-mm.clean/drivers/md/dm.c	2007-12-04 14:46:04.000000000 -0800
+++ 2.6.24-rc3-mm/drivers/md/dm.c	2007-12-04 23:31:41.000000000 -0800
@@ -889,6 +889,11 @@ static int dm_any_congested(void *conges
 	return r;
 }
 
+static unsigned dm_metric(struct bio *bio)
+{
+	return bio->bi_vcnt;
+}
+
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
@@ -967,6 +972,7 @@ out:
 
 static struct block_device_operations dm_blk_dops;
 
+#define DEFAULT_THROTTLE_CAPACITY 1000
 /*
  * Allocate and initialise a blank device with a given minor.
  */
@@ -1009,6 +1015,11 @@ static struct mapped_device *alloc_dev(i
 		goto bad1_free_minor;
 
 	md->queue->queuedata = md;
+	md->queue->metric = dm_metric;
+	/* A dm device constructor may change the throttle capacity */
+	atomic_set(&md->queue->available, md->queue->capacity = DEFAULT_THROTTLE_CAPACITY);
+	init_waitqueue_head(&md->queue->throttle_wait);
+
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
--- 2.6.24-rc3-mm.clean/fs/bio.c	2007-12-04 14:38:47.000000000 -0800
+++ 2.6.24-rc3-mm/fs/bio.c	2007-12-04 23:31:41.000000000 -0800
@@ -1007,6 +1007,13 @@ void bio_endio(struct bio *bio, int erro
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
+	if (bio->bi_throttle) {
+		struct request_queue *q = bio->bi_queue;
+		atomic_add(bio->bi_throttle, &q->available);
+		bio->bi_throttle = 0; /* or detect multiple endio and err? */
+		wake_up(&q->throttle_wait);
+	}
+
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
--- 2.6.24-rc3-mm.clean/include/linux/bio.h	2007-12-04 14:39:31.000000000 -0800
+++ 2.6.24-rc3-mm/include/linux/bio.h	2007-12-04 23:31:41.000000000 -0800
@@ -111,6 +111,9 @@ struct bio {
 	bio_end_io_t		*bi_end_io;
 	atomic_t		bi_cnt;		/* pin count */
 
+	struct request_queue	*bi_queue;	/* for throttling */
+	unsigned		bi_throttle;	/* throttle metric */
+
 	void			*bi_private;
 
 	bio_destructor_t	*bi_destructor;	/* destructor */
--- 2.6.24-rc3-mm.clean/include/linux/blkdev.h	2007-12-04 14:47:18.000000000 -0800
+++ 2.6.24-rc3-mm/include/linux/blkdev.h	2007-12-04 23:31:41.000000000 -0800
@@ -383,6 +383,10 @@ struct request_queue
 	struct work_struct	unplug_work;
 
 	struct backing_dev_info	backing_dev_info;
+	unsigned (*metric)(struct bio *bio);	/* bio throttle metric */
+	wait_queue_head_t	throttle_wait;
+	atomic_t		available;
+	unsigned		capacity;
 
 	/*
 	 * The queue owner gets to use this for whatever they like.

      parent reply	other threads:[~2007-12-11  4:22 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-12-06  0:03 [RFC] [PATCH] A clean approach to writeout throttling Daniel Phillips
2007-12-06  1:24 ` Andrew Morton
2007-12-06  6:21   ` Daniel Phillips
2007-12-06  7:31     ` Andrew Morton
2007-12-06  9:48       ` Daniel Phillips
2007-12-06 11:55         ` Andrew Morton
2007-12-06 15:52           ` Rik van Riel
2007-12-06 17:34             ` Andrew Morton
2007-12-06 17:48               ` Rik van Riel
2007-12-06 20:04           ` Daniel Phillips
2007-12-06 20:27             ` Andrew Morton
2007-12-06 21:27               ` Daniel Phillips
2007-12-06 21:53     ` Bill Davidsen
2007-12-07  0:04       ` Daniel Phillips
2007-12-07  0:29         ` Andrew Morton
2007-12-07  7:13           ` Daniel Phillips
2007-12-10  9:20             ` Daniel Phillips
2007-12-10 10:47 ` Jens Axboe
2007-12-10 11:23   ` [RFC] [PATCH] A clean aEvgeniy pproach " Daniel Phillips
2007-12-10 11:41     ` Jens Axboe
2007-12-10 12:13       ` Daniel Phillips
2007-12-10 12:16         ` Jens Axboe
2007-12-10 12:27           ` Daniel Phillips
2007-12-10 12:32             ` Jens Axboe
2007-12-10 13:04               ` Daniel Phillips
2007-12-10 13:19                 ` Jens Axboe
2007-12-10 13:26                   ` Daniel Phillips
2007-12-10 13:30                     ` Jens Axboe
2007-12-10 13:43                       ` Daniel Phillips
2007-12-10 13:53                         ` Jens Axboe
2007-12-10 14:17                           ` Daniel Phillips
2007-12-11 13:15                             ` Jens Axboe
2007-12-11 19:38                               ` Daniel Phillips
2007-12-11 20:01                                 ` Jens Axboe
2007-12-11 20:11                                   ` Daniel Phillips
2007-12-11 20:07                               ` Daniel Phillips
2007-12-10 11:33   ` [RFC] [PATCH] A clean approach " Daniel Phillips
2007-12-10 21:31 ` Jonathan Corbet
2007-12-10 22:06   ` Pekka Enberg
2007-12-11  4:21   ` Daniel Phillips [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200712102021.42865.phillips@phunq.net \
    --to=phillips@phunq.net \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).