* [PATCH] mm: don't cap request size based on read-ahead setting
@ 2016-11-16  4:30 Jens Axboe
  2016-11-16  7:17 ` Hillf Danton
  0 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2016-11-16  4:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-block, linux-kernel, Linus Torvalds

Hi,

We ran into a funky issue, where someone doing 256K buffered reads saw
128K requests at the device level. Turns out it is read-ahead capping
the request size, since we use 128K as the default setting. This doesn't
make a lot of sense - if someone is issuing 256K reads, they should see
256K reads, regardless of the read-ahead setting, if the underlying
device can support a 256K read in a single command.

To make matters more confusing, there's an odd interaction with the
fadvise hint setting. If we tell the kernel we're doing sequential IO on
this file descriptor, we can get twice the read-ahead size. But if we
tell the kernel that we are doing random IO, hence disabling read-ahead,
we do get nice 256K requests at the lower level. This is because
ondemand and forced read-ahead behave differently, with the latter doing
the right thing. An application developer will be, rightfully,
scratching his head at this point, wondering wtf is going on. A good one
will dive into the kernel source, and silently weep.
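
For reference, here is a minimal user-space sketch of the workload described
above - plain 256K buffered reads, optionally preceded by an fadvise hint. The
file path and sizes are illustrative assumptions, not part of the patch;
observing the resulting request sizes at the device level is left to the usual
block tracing tools.

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t bufsz = 256 * 1024;		/* 256K buffered reads */
	char *buf = malloc(bufsz);
	int fd = open("/path/to/testfile", O_RDONLY);	/* hypothetical path */

	if (fd < 0 || !buf)
		return 1;

	/*
	 * POSIX_FADV_SEQUENTIAL doubles the read-ahead window;
	 * POSIX_FADV_RANDOM disables read-ahead, which (pre-patch) is
	 * what produced the full 256K requests at the device level.
	 */
	posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);

	while (read(fd, buf, bufsz) > 0)
		;

	close(fd);
	free(buf);
	return 0;
}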

This patch introduces a bdi hint, io_pages. This is the soft max IO size
for the lower level; I've hooked it up to the bdev settings here.
Read-ahead is modified to issue the maximum of the user request size,
and the read-ahead max size, but capped to the max request size on the
device side. The latter is done to avoid reading ahead too much, if the
application asks for a huge read. With this patch, the kernel behaves
like the application expects.
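
As a side note, the unit conversions in the two block-layer hunks below work
out as follows. This is just illustrative arithmetic, assuming 4K pages
(PAGE_SHIFT == 12) and example limits; it is not kernel code.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption for this example: 4K pages */

int main(void)
{
	unsigned long max_sectors = 512;	/* 512-byte sectors -> 256 KiB */
	unsigned long max_sectors_kb = 256;	/* same limit, expressed in KiB */

	/* mirrors the conversions in blk-settings.c and blk-sysfs.c below */
	printf("io_pages from sectors: %lu\n", max_sectors >> (PAGE_SHIFT - 9));
	printf("io_pages from KiB:     %lu\n", max_sectors_kb >> (PAGE_SHIFT - 10));
	return 0;	/* both print 64, i.e. 256 KiB worth of 4K pages */
}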

Signed-off-by: Jens Axboe <axboe@fb.com>

diff --git a/block/blk-settings.c b/block/blk-settings.c
index f679ae122843..65f16cf4f850 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
  	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
  	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
  	limits->max_sectors = max_sectors;
+	q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
  }
  EXPORT_SYMBOL(blk_queue_max_hw_sectors);

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c5439a..ea374e820775 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)

  	spin_lock_irq(q->queue_lock);
  	q->limits.max_sectors = max_sectors_kb << 1;
+	q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
  	spin_unlock_irq(q->queue_lock);

  	return ret;
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c357f27d5483..b8144b2d59ce 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -136,6 +136,7 @@ struct bdi_writeback {
  struct backing_dev_info {
  	struct list_head bdi_list;
  	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
+	unsigned long io_pages;	/* max allowed IO size */
  	unsigned int capabilities; /* Device capabilities */
  	congested_fn *congested_fn; /* Function pointer if device is md/dm */
  	void *congested_data;	/* Pointer to aux data for congested func */
diff --git a/mm/readahead.c b/mm/readahead.c
index c8a955b1297e..4eaec947c7c9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -369,10 +369,25 @@ ondemand_readahead(struct address_space *mapping,
  		   bool hit_readahead_marker, pgoff_t offset,
  		   unsigned long req_size)
  {
-	unsigned long max = ra->ra_pages;
+	unsigned long io_pages, max_pages;
  	pgoff_t prev_offset;

  	/*
+	 * If bdi->io_pages is set, that indicates the (soft) max IO size
+	 * per command for that device. If we have that available, use
+	 * that as the max suitable read-ahead size for this IO. Instead of
+	 * capping read-ahead at ra_pages if req_size is larger, we can go
+	 * up to io_pages. If io_pages isn't set, fall back to using
+	 * ra_pages as a safe max.
+	 */
+	io_pages = inode_to_bdi(mapping->host)->io_pages;
+	if (io_pages) {
+		max_pages = max_t(unsigned long, ra->ra_pages, req_size);
+		io_pages = min(io_pages, max_pages);
+	} else
+		max_pages = ra->ra_pages;
+
+	/*
  	 * start of file
  	 */
  	if (!offset)
@@ -385,7 +400,7 @@ ondemand_readahead(struct address_space *mapping,
  	if ((offset == (ra->start + ra->size - ra->async_size) ||
  	     offset == (ra->start + ra->size))) {
  		ra->start += ra->size;
-		ra->size = get_next_ra_size(ra, max);
+		ra->size = get_next_ra_size(ra, max_pages);
  		ra->async_size = ra->size;
  		goto readit;
  	}
@@ -400,16 +415,16 @@ ondemand_readahead(struct address_space *mapping,
  		pgoff_t start;

  		rcu_read_lock();
-		start = page_cache_next_hole(mapping, offset + 1, max);
+		start = page_cache_next_hole(mapping, offset + 1, max_pages);
  		rcu_read_unlock();

-		if (!start || start - offset > max)
+		if (!start || start - offset > max_pages)
  			return 0;

  		ra->start = start;
  		ra->size = start - offset;	/* old async_size */
  		ra->size += req_size;
-		ra->size = get_next_ra_size(ra, max);
+		ra->size = get_next_ra_size(ra, max_pages);
  		ra->async_size = ra->size;
  		goto readit;
  	}
@@ -417,7 +432,7 @@ ondemand_readahead(struct address_space *mapping,
  	/*
  	 * oversize read
  	 */
-	if (req_size > max)
+	if (req_size > max_pages)
  		goto initial_readahead;

  	/*
@@ -433,7 +448,7 @@ ondemand_readahead(struct address_space *mapping,
  	 * Query the page cache and look for the traces(cached history pages)
  	 * that a sequential stream would leave behind.
  	 */
-	if (try_context_readahead(mapping, ra, offset, req_size, max))
+	if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
  		goto readit;

  	/*
@@ -444,7 +459,7 @@ ondemand_readahead(struct address_space *mapping,

  initial_readahead:
  	ra->start = offset;
-	ra->size = get_init_ra_size(req_size, max);
+	ra->size = get_init_ra_size(req_size, max_pages);
  	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

  readit:
@@ -454,7 +469,7 @@ ondemand_readahead(struct address_space *mapping,
  	 * the resulted next readahead window into the current one.
  	 */
  	if (offset == ra->start && ra->size == ra->async_size) {
-		ra->async_size = get_next_ra_size(ra, max);
+		ra->async_size = get_next_ra_size(ra, max_pages);
  		ra->size += ra->async_size;
  	}


-- 
Jens Axboe


* Re: [PATCH] mm: don't cap request size based on read-ahead setting
  2016-11-16  4:30 [PATCH] mm: don't cap request size based on read-ahead setting Jens Axboe
@ 2016-11-16  7:17 ` Hillf Danton
  2016-11-16 15:12   ` Jens Axboe
  0 siblings, 1 reply; 7+ messages in thread
From: Hillf Danton @ 2016-11-16  7:17 UTC (permalink / raw)
  To: 'Jens Axboe', 'Andrew Morton'
  Cc: linux-mm, linux-block, linux-kernel, 'Linus Torvalds'

On Wednesday, November 16, 2016 12:31 PM Jens Axboe wrote:
> @@ -369,10 +369,25 @@ ondemand_readahead(struct address_space *mapping,
>   		   bool hit_readahead_marker, pgoff_t offset,
>   		   unsigned long req_size)
>   {
> -	unsigned long max = ra->ra_pages;
> +	unsigned long io_pages, max_pages;
>   	pgoff_t prev_offset;
> 
>   	/*
> +	 * If bdi->io_pages is set, that indicates the (soft) max IO size
> +	 * per command for that device. If we have that available, use
> +	 * that as the max suitable read-ahead size for this IO. Instead of
> +	 * capping read-ahead at ra_pages if req_size is larger, we can go
> +	 * up to io_pages. If io_pages isn't set, fall back to using
> +	 * ra_pages as a safe max.
> +	 */
> +	io_pages = inode_to_bdi(mapping->host)->io_pages;
> +	if (io_pages) {
> +		max_pages = max_t(unsigned long, ra->ra_pages, req_size);
> +		io_pages = min(io_pages, max_pages);

Did you mean
		max_pages = min(io_pages, max_pages);
> +	} else
> +		max_pages = ra->ra_pages;
> +


* Re: [PATCH] mm: don't cap request size based on read-ahead setting
  2016-11-16  7:17 ` Hillf Danton
@ 2016-11-16 15:12   ` Jens Axboe
  2016-11-16 18:38     ` Jens Axboe
  0 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2016-11-16 15:12 UTC (permalink / raw)
  To: Hillf Danton, 'Andrew Morton'
  Cc: linux-mm, linux-block, linux-kernel, 'Linus Torvalds'

On 11/16/2016 12:17 AM, Hillf Danton wrote:
> On Wednesday, November 16, 2016 12:31 PM Jens Axboe wrote:
>> @@ -369,10 +369,25 @@ ondemand_readahead(struct address_space *mapping,
>>   		   bool hit_readahead_marker, pgoff_t offset,
>>   		   unsigned long req_size)
>>   {
>> -	unsigned long max = ra->ra_pages;
>> +	unsigned long io_pages, max_pages;
>>   	pgoff_t prev_offset;
>>
>>   	/*
>> +	 * If bdi->io_pages is set, that indicates the (soft) max IO size
>> +	 * per command for that device. If we have that available, use
>> +	 * that as the max suitable read-ahead size for this IO. Instead of
>> +	 * capping read-ahead at ra_pages if req_size is larger, we can go
>> +	 * up to io_pages. If io_pages isn't set, fall back to using
>> +	 * ra_pages as a safe max.
>> +	 */
>> +	io_pages = inode_to_bdi(mapping->host)->io_pages;
>> +	if (io_pages) {
>> +		max_pages = max_t(unsigned long, ra->ra_pages, req_size);
>> +		io_pages = min(io_pages, max_pages);
>
> Did you mean
> 		max_pages = min(io_pages, max_pages);

No, that is what I mean. We want the maximum of the RA setting and the
user IO size, but the minimum of that and the device max command size.
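
Spelled out as a user-space sketch (assuming io_pages is set; this illustrates
the intent, it is not the kernel code itself):

static unsigned long effective_max(unsigned long ra_pages,
				   unsigned long req_size,
				   unsigned long io_pages)
{
	/* max of the RA setting and the user IO size ... */
	unsigned long want = ra_pages > req_size ? ra_pages : req_size;

	/* ... capped at the device max command size */
	return want < io_pages ? want : io_pages;
}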

-- 
Jens Axboe


* Re: [PATCH] mm: don't cap request size based on read-ahead setting
  2016-11-16 15:12   ` Jens Axboe
@ 2016-11-16 18:38     ` Jens Axboe
  0 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2016-11-16 18:38 UTC (permalink / raw)
  To: Hillf Danton, 'Andrew Morton'
  Cc: linux-mm, linux-block, linux-kernel, 'Linus Torvalds'

On 11/16/2016 08:12 AM, Jens Axboe wrote:
> On 11/16/2016 12:17 AM, Hillf Danton wrote:
>> On Wednesday, November 16, 2016 12:31 PM Jens Axboe wrote:
>>> @@ -369,10 +369,25 @@ ondemand_readahead(struct address_space *mapping,
>>>              bool hit_readahead_marker, pgoff_t offset,
>>>              unsigned long req_size)
>>>   {
>>> -    unsigned long max = ra->ra_pages;
>>> +    unsigned long io_pages, max_pages;
>>>       pgoff_t prev_offset;
>>>
>>>       /*
>>> +     * If bdi->io_pages is set, that indicates the (soft) max IO size
>>> +     * per command for that device. If we have that available, use
>>> +     * that as the max suitable read-ahead size for this IO. Instead of
>>> +     * capping read-ahead at ra_pages if req_size is larger, we can go
>>> +     * up to io_pages. If io_pages isn't set, fall back to using
>>> +     * ra_pages as a safe max.
>>> +     */
>>> +    io_pages = inode_to_bdi(mapping->host)->io_pages;
>>> +    if (io_pages) {
>>> +        max_pages = max_t(unsigned long, ra->ra_pages, req_size);
>>> +        io_pages = min(io_pages, max_pages);
>>
>> Did you mean
>>         max_pages = min(io_pages, max_pages);
>
> No, that is what I mean. We want the maximum of the RA setting and the
> user IO size, but the minimum of that and the device max command size.

Johannes pointed out that I'm an idiot - a last-minute edit introduced
this typo, and I was too blind to spot it when you sent that email this
morning. So yes, it should of course be:

	max_pages = min(io_pages, max_pages);

like the first version I posted. I'll post a v3...

-- 
Jens Axboe


* Re: [PATCH] mm: don't cap request size based on read-ahead setting
  2016-11-21  4:44 ` Hillf Danton
@ 2016-11-21 13:12   ` Jens Axboe
  0 siblings, 0 replies; 7+ messages in thread
From: Jens Axboe @ 2016-11-21 13:12 UTC (permalink / raw)
  To: Hillf Danton, akpm; +Cc: hannes, linux-mm, linux-kernel, linux-block

On 11/20/2016 09:44 PM, Hillf Danton wrote:
> On Saturday, November 19, 2016 3:41 AM Jens Axboe wrote:
>> We ran into a funky issue, where someone doing 256K buffered reads saw
>> 128K requests at the device level. Turns out it is read-ahead capping
>> the request size, since we use 128K as the default setting. This doesn't
>> make a lot of sense - if someone is issuing 256K reads, they should see
>> 256K reads, regardless of the read-ahead setting, if the underlying
>> device can support a 256K read in a single command.
>>
> Does it also make sense to see 4M reads for 4M requests if the
> underlying device can support 4M IO?

Depends on the device, but yes. Big raid set? You definitely want larger
requests. Which is why we have the distinction between max hardware and
kernel IO size.

By default we limit the soft IO size to 1280k for a block device. See
also:

commit d2be537c3ba3568acd79cd178327b842e60d035e
Author: Jeff Moyer <jmoyer@redhat.com>
Date:   Thu Aug 13 14:57:57 2015 -0400

     block: bump BLK_DEF_MAX_SECTORS to 2560
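
For reference, the arithmetic behind that default (illustrative only, assuming
512-byte sectors and 4K pages):

#include <stdio.h>

int main(void)
{
	unsigned long def_max_sectors = 2560;	/* BLK_DEF_MAX_SECTORS */

	/* 2560 x 512-byte sectors = 1280 KiB soft cap per request */
	printf("%lu KiB\n", def_max_sectors * 512 / 1024);

	/* which with 4K pages is 2560 >> (12 - 9) = 320 io_pages */
	printf("%lu pages\n", def_max_sectors >> 3);
	return 0;
}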

-- 
Jens Axboe


* Re: [PATCH] mm: don't cap request size based on read-ahead setting
  2016-11-18 19:41 Jens Axboe
@ 2016-11-21  4:44 ` Hillf Danton
  2016-11-21 13:12   ` Jens Axboe
  0 siblings, 1 reply; 7+ messages in thread
From: Hillf Danton @ 2016-11-21  4:44 UTC (permalink / raw)
  To: 'Jens Axboe', akpm; +Cc: hannes, linux-mm, linux-kernel, linux-block

On Saturday, November 19, 2016 3:41 AM Jens Axboe wrote:
> We ran into a funky issue, where someone doing 256K buffered reads saw
> 128K requests at the device level. Turns out it is read-ahead capping
> the request size, since we use 128K as the default setting. This doesn't
> make a lot of sense - if someone is issuing 256K reads, they should see
> 256K reads, regardless of the read-ahead setting, if the underlying
> device can support a 256K read in a single command.
> 
Does it also make sense to see 4M reads for 4M requests if the
underlying device can support 4M IO?

thanks
Hillf

> This patch introduces a bdi hint, io_pages. This is the soft max IO size
> for the lower level; I've hooked it up to the bdev settings here.
> Read-ahead is modified to issue the maximum of the user request size,
> and the read-ahead max size, but capped to the max request size on the
> device side. The latter is done to avoid reading ahead too much, if the
> application asks for a huge read. With this patch, the kernel behaves
> like the application expects.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  block/blk-settings.c             |  1 +
>  block/blk-sysfs.c                |  1 +
>  include/linux/backing-dev-defs.h |  1 +
>  mm/readahead.c                   | 39 ++++++++++++++++++++++++++++-----------
>  4 files changed, 31 insertions(+), 11 deletions(-)
> 
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index f679ae122843..65f16cf4f850 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
>  	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
>  	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
>  	limits->max_sectors = max_sectors;
> +	q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
>  }
>  EXPORT_SYMBOL(blk_queue_max_hw_sectors);
> 
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 9cc8d7c5439a..ea374e820775 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
> 
>  	spin_lock_irq(q->queue_lock);
>  	q->limits.max_sectors = max_sectors_kb << 1;
> +	q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
>  	spin_unlock_irq(q->queue_lock);
> 
>  	return ret;
> diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
> index c357f27d5483..b8144b2d59ce 100644
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -136,6 +136,7 @@ struct bdi_writeback {
>  struct backing_dev_info {
>  	struct list_head bdi_list;
>  	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
> +	unsigned long io_pages;	/* max allowed IO size */
>  	unsigned int capabilities; /* Device capabilities */
>  	congested_fn *congested_fn; /* Function pointer if device is md/dm */
>  	void *congested_data;	/* Pointer to aux data for congested func */
> diff --git a/mm/readahead.c b/mm/readahead.c
> index c8a955b1297e..fb4c99f85618 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -207,12 +207,21 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
>   * memory at once.
>   */
>  int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
> -		pgoff_t offset, unsigned long nr_to_read)
> +		               pgoff_t offset, unsigned long nr_to_read)
>  {
> +	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
> +	struct file_ra_state *ra = &filp->f_ra;
> +	unsigned long max_pages;
> +
>  	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
>  		return -EINVAL;
> 
> -	nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
> +	/*
> +	 * If the request exceeds the readahead window, allow the read to
> +	 * be up to the optimal hardware IO size
> +	 */
> +	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
> +	nr_to_read = min(nr_to_read, max_pages);
>  	while (nr_to_read) {
>  		int err;
> 
> @@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping,
>  		   bool hit_readahead_marker, pgoff_t offset,
>  		   unsigned long req_size)
>  {
> -	unsigned long max = ra->ra_pages;
> +	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
> +	unsigned long max_pages = ra->ra_pages;
>  	pgoff_t prev_offset;
> 
>  	/*
> +	 * If the request exceeds the readahead window, allow the read to
> +	 * be up to the optimal hardware IO size
> +	 */
> +	if (req_size > max_pages && bdi->io_pages > max_pages)
> +		max_pages = min(req_size, bdi->io_pages);
> +
> +	/*
>  	 * start of file
>  	 */
>  	if (!offset)
> @@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
>  	if ((offset == (ra->start + ra->size - ra->async_size) ||
>  	     offset == (ra->start + ra->size))) {
>  		ra->start += ra->size;
> -		ra->size = get_next_ra_size(ra, max);
> +		ra->size = get_next_ra_size(ra, max_pages);
>  		ra->async_size = ra->size;
>  		goto readit;
>  	}
> @@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
>  		pgoff_t start;
> 
>  		rcu_read_lock();
> -		start = page_cache_next_hole(mapping, offset + 1, max);
> +		start = page_cache_next_hole(mapping, offset + 1, max_pages);
>  		rcu_read_unlock();
> 
> -		if (!start || start - offset > max)
> +		if (!start || start - offset > max_pages)
>  			return 0;
> 
>  		ra->start = start;
>  		ra->size = start - offset;	/* old async_size */
>  		ra->size += req_size;
> -		ra->size = get_next_ra_size(ra, max);
> +		ra->size = get_next_ra_size(ra, max_pages);
>  		ra->async_size = ra->size;
>  		goto readit;
>  	}
> @@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
>  	/*
>  	 * oversize read
>  	 */
> -	if (req_size > max)
> +	if (req_size > max_pages)
>  		goto initial_readahead;
> 
>  	/*
> @@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
>  	 * Query the page cache and look for the traces(cached history pages)
>  	 * that a sequential stream would leave behind.
>  	 */
> -	if (try_context_readahead(mapping, ra, offset, req_size, max))
> +	if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
>  		goto readit;
> 
>  	/*
> @@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
> 
>  initial_readahead:
>  	ra->start = offset;
> -	ra->size = get_init_ra_size(req_size, max);
> +	ra->size = get_init_ra_size(req_size, max_pages);
>  	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
> 
>  readit:
> @@ -454,7 +471,7 @@ ondemand_readahead(struct address_space *mapping,
>  	 * the resulted next readahead window into the current one.
>  	 */
>  	if (offset == ra->start && ra->size == ra->async_size) {
> -		ra->async_size = get_next_ra_size(ra, max);
> +		ra->async_size = get_next_ra_size(ra, max_pages);
>  		ra->size += ra->async_size;
>  	}
> 
> --
> 2.7.4


* [PATCH] mm: don't cap request size based on read-ahead setting
@ 2016-11-18 19:41 Jens Axboe
  2016-11-21  4:44 ` Hillf Danton
  0 siblings, 1 reply; 7+ messages in thread
From: Jens Axboe @ 2016-11-18 19:41 UTC (permalink / raw)
  To: akpm; +Cc: hannes, linux-mm, linux-kernel, linux-block, Jens Axboe

We ran into a funky issue, where someone doing 256K buffered reads saw
128K requests at the device level. Turns out it is read-ahead capping
the request size, since we use 128K as the default setting. This doesn't
make a lot of sense - if someone is issuing 256K reads, they should see
256K reads, regardless of the read-ahead setting, if the underlying
device can support a 256K read in a single command.

This patch introduces a bdi hint, io_pages. This is the soft max IO size
for the lower level; I've hooked it up to the bdev settings here.
Read-ahead is modified to issue the maximum of the user request size,
and the read-ahead max size, but capped to the max request size on the
device side. The latter is done to avoid reading ahead too much, if the
application asks for a huge read. With this patch, the kernel behaves
like the application expects.
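
The sizing rule added to ondemand_readahead() below can be restated in
user-space terms roughly as follows (a sketch for illustration, not the kernel
code itself):

static unsigned long readahead_max(unsigned long ra_pages,
				   unsigned long req_size,
				   unsigned long io_pages)
{
	unsigned long max_pages = ra_pages;

	/* grow past the RA window only if both request and device allow it */
	if (req_size > max_pages && io_pages > max_pages)
		max_pages = req_size < io_pages ? req_size : io_pages;

	return max_pages;
}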

Signed-off-by: Jens Axboe <axboe@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 block/blk-settings.c             |  1 +
 block/blk-sysfs.c                |  1 +
 include/linux/backing-dev-defs.h |  1 +
 mm/readahead.c                   | 39 ++++++++++++++++++++++++++++-----------
 4 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index f679ae122843..65f16cf4f850 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
 	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
 	limits->max_sectors = max_sectors;
+	q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c5439a..ea374e820775 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 	spin_lock_irq(q->queue_lock);
 	q->limits.max_sectors = max_sectors_kb << 1;
+	q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c357f27d5483..b8144b2d59ce 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -136,6 +136,7 @@ struct bdi_writeback {
 struct backing_dev_info {
 	struct list_head bdi_list;
 	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
+	unsigned long io_pages;	/* max allowed IO size */
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
diff --git a/mm/readahead.c b/mm/readahead.c
index c8a955b1297e..fb4c99f85618 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -207,12 +207,21 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  * memory at once.
  */
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
-		pgoff_t offset, unsigned long nr_to_read)
+		               pgoff_t offset, unsigned long nr_to_read)
 {
+	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	struct file_ra_state *ra = &filp->f_ra;
+	unsigned long max_pages;
+
 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 		return -EINVAL;
 
-	nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
+	/*
+	 * If the request exceeds the readahead window, allow the read to
+	 * be up to the optimal hardware IO size
+	 */
+	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+	nr_to_read = min(nr_to_read, max_pages);
 	while (nr_to_read) {
 		int err;
 
@@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping,
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	unsigned long max = ra->ra_pages;
+	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	unsigned long max_pages = ra->ra_pages;
 	pgoff_t prev_offset;
 
 	/*
+	 * If the request exceeds the readahead window, allow the read to
+	 * be up to the optimal hardware IO size
+	 */
+	if (req_size > max_pages && bdi->io_pages > max_pages)
+		max_pages = min(req_size, bdi->io_pages);
+
+	/*
 	 * start of file
 	 */
 	if (!offset)
@@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
 	if ((offset == (ra->start + ra->size - ra->async_size) ||
 	     offset == (ra->start + ra->size))) {
 		ra->start += ra->size;
-		ra->size = get_next_ra_size(ra, max);
+		ra->size = get_next_ra_size(ra, max_pages);
 		ra->async_size = ra->size;
 		goto readit;
 	}
@@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
 		pgoff_t start;
 
 		rcu_read_lock();
-		start = page_cache_next_hole(mapping, offset + 1, max);
+		start = page_cache_next_hole(mapping, offset + 1, max_pages);
 		rcu_read_unlock();
 
-		if (!start || start - offset > max)
+		if (!start || start - offset > max_pages)
 			return 0;
 
 		ra->start = start;
 		ra->size = start - offset;	/* old async_size */
 		ra->size += req_size;
-		ra->size = get_next_ra_size(ra, max);
+		ra->size = get_next_ra_size(ra, max_pages);
 		ra->async_size = ra->size;
 		goto readit;
 	}
@@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
 	/*
 	 * oversize read
 	 */
-	if (req_size > max)
+	if (req_size > max_pages)
 		goto initial_readahead;
 
 	/*
@@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
 	 * Query the page cache and look for the traces(cached history pages)
 	 * that a sequential stream would leave behind.
 	 */
-	if (try_context_readahead(mapping, ra, offset, req_size, max))
+	if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
 		goto readit;
 
 	/*
@@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
 
 initial_readahead:
 	ra->start = offset;
-	ra->size = get_init_ra_size(req_size, max);
+	ra->size = get_init_ra_size(req_size, max_pages);
 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
 readit:
@@ -454,7 +471,7 @@ ondemand_readahead(struct address_space *mapping,
 	 * the resulted next readahead window into the current one.
 	 */
 	if (offset == ra->start && ra->size == ra->async_size) {
-		ra->async_size = get_next_ra_size(ra, max);
+		ra->async_size = get_next_ra_size(ra, max_pages);
 		ra->size += ra->async_size;
 	}
 
-- 
2.7.4


