From: Peng Tao <bergwolf@gmail.com>
To: Boaz Harrosh <bharrosh@panasas.com>
Cc: linux-nfs@vger.kernel.org
Subject: Re: [PATCH RFC 2/3] NFS41: send real write size in layoutget
Date: Mon, 13 Aug 2012 17:44:45 +0800
Message-ID: <CA+a=Yy5cG74RhVUYVQTYOCrwOWfuz2OfsCGpuMXW-8hoOgfKLw@mail.gmail.com>
In-Reply-To: <5027F63F.8070107@panasas.com>

On Mon, Aug 13, 2012 at 2:30 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/08/2012 05:03 AM, Peng Tao wrote:
>
>> From: Peng Tao <tao.peng@emc.com>
>>
>> For buffered writes, scan the dirty pages to find the longest run of
>> contiguous dirty pages. Also allow the layout driver to specify a
>> maximum layoutget size, which is useful for avoiding busy scanning of
>> dirty pages in the block layout client.
>>
>> For direct write, just use dreq->bytes_left.
>>
>> Signed-off-by: Peng Tao <tao.peng@emc.com>
>> ---
>>  fs/nfs/direct.c   |    7 ++++++
>>  fs/nfs/internal.h |    1 +
>>  fs/nfs/pnfs.c     |   58 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 64 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>> index c39f775..c1899dd 100644
>> --- a/fs/nfs/direct.c
>> +++ b/fs/nfs/direct.c
>> @@ -46,6 +46,7 @@
>>  #include <linux/kref.h>
>>  #include <linux/slab.h>
>>  #include <linux/task_io_accounting_ops.h>
>> +#include <linux/module.h>
>>
>>  #include <linux/nfs_fs.h>
>>  #include <linux/nfs_page.h>
>> @@ -191,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
>>       kref_put(&dreq->kref, nfs_direct_req_free);
>>  }
>>
>> +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
>> +{
>> +     return dreq->bytes_left;
>> +}
>> +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
>> +
>>  /*
>>   * Collects and returns the final error value/byte-count.
>>   */
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index 31fdb03..e68d329 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
>>  {
>>       inode_dio_wait(inode);
>>  }
>> +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
>>
>
>
> Why is this an EXPORT_SYMBOL_GPL in a .c file? Why not just an inline
> here?
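
FWIW, struct nfs_direct_req is defined inside fs/nfs/direct.c, so an inline
accessor in internal.h would only work if that definition moved into the
header. Under that assumption it would simply be (untested):

	static inline ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
	{
		return dreq->bytes_left;
	}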
>
>>  /* nfs4proc.c */
>>  extern void __nfs4_read_done_cb(struct nfs_read_data *);
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 2e00fea..e61a373 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -29,6 +29,7 @@
>>
>>  #include <linux/nfs_fs.h>
>>  #include <linux/nfs_page.h>
>> +#include <linux/pagevec.h>
>>  #include <linux/module.h>
>>  #include "internal.h"
>>  #include "pnfs.h"
>> @@ -1172,19 +1173,72 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
>>  }
>>  EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
>>
>> +/*
>> + * Return the number of contiguous bytes in dirty pages for a given inode
>> + * starting at page frame idx.
>> + */
>> +static u64 pnfs_num_dirty_bytes(struct inode *inode, pgoff_t idx)
>> +{
>> +     struct address_space *mapping = inode->i_mapping;
>> +     pgoff_t index;
>> +     struct pagevec pvec;
>> +     pgoff_t num = 1; /* self */
>> +     int i, done = 0;
>> +
>> +     pagevec_init(&pvec, 0);
>> +     idx++; /* self */
>> +     while (!done) {
>> +             index = idx;
>> +             pagevec_lookup_tag(&pvec, mapping, &index,
>> +                                PAGECACHE_TAG_DIRTY, (pgoff_t)PAGEVEC_SIZE);
>> +             if (pagevec_count(&pvec) == 0)
>> +                     break;
>> +
>> +             for (i = 0; i < pagevec_count(&pvec); i++) {
>> +                     struct page *page = pvec.pages[i];
>> +
>> +                     lock_page(page);
>> +                     if (unlikely(page->mapping != mapping) ||
>> +                         !PageDirty(page) ||
>> +                         PageWriteback(page) ||
>> +                         page->index != idx) {
>> +                             done = 1;
>> +                             unlock_page(page);
>> +                             break;
>> +                     }
>> +                     unlock_page(page);
>> +                     if (done)
>> +                             break;
>> +                     idx++;
>> +                     num++;
>> +             }
>> +             pagevec_release(&pvec);
>> +     }
>> +     return num << PAGE_CACHE_SHIFT;
>> +}
>> +
>
>
> Same as what Trond said: radix_tree_next_hole() would be nicer. We never
> need any locking; this is just a hint, not a transactional guarantee. A
> best first-guess approximation is all we need.
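
Something like the following rough, untested sketch is what I understand
you and Trond to mean. It treats a gap in the page cache as the end of the
dirty range, caps the walk at i_size, and takes no locks beyond RCU:

	static u64 pnfs_num_dirty_bytes(struct inode *inode, pgoff_t idx)
	{
		struct address_space *mapping = inode->i_mapping;
		pgoff_t max_idx = i_size_read(inode) >> PAGE_CACHE_SHIFT;
		pgoff_t hole, max_scan;

		max_scan = max_idx > idx ? max_idx - idx + 1 : 1;

		/* only a hint: the first hole in the page cache at or
		 * after idx marks the end of the contiguous range */
		rcu_read_lock();
		hole = radix_tree_next_hole(&mapping->page_tree, idx, max_scan);
		rcu_read_unlock();

		return (u64)(hole - idx) << PAGE_CACHE_SHIFT;
	}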
>
> Also you might want to optimize for the most common case of a linear
> write from zero. You can do that by comparing i_size / PAGE_SIZE with
> the number of dirty pages; if they are the same, you know there are no
> holes and need not scan at all.
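
We don't seem to have a cheap per-inode count of dirty pages, so the
closest approximation I can see (rough, untested, and it over-approximates
because nrpages counts all cached pages, not only dirty ones) would be, at
the top of the helper above:

	/* linear write from zero: if every page up to i_size is cached,
	 * assume there are no holes and skip the scan entirely */
	if (idx == 0 &&
	    mapping->nrpages >= (i_size_read(inode) >> PAGE_CACHE_SHIFT))
		return i_size_read(inode);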
>
>>  void
>> -pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
>> +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
>> +                        struct nfs_page *req)
>
>
> Nothing actually changed here (only the line wrapping), please don't do this.
>
>>  {
>> +     u64 wb_size;
>> +
>>       BUG_ON(pgio->pg_lseg != NULL);
>>
>>       if (req->wb_offset != req->wb_pgbase) {
>>               nfs_pageio_reset_write_mds(pgio);
>>               return;
>>       }
>> +
>> +     if (pgio->pg_dreq == NULL)
>> +             wb_size = pnfs_num_dirty_bytes(pgio->pg_inode, req->wb_index);
>> +     else
>> +             wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
>> +
>>       pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
>>                                          req->wb_context,
>>                                          req_offset(req),
>> -                                        req->wb_bytes,
>> +                                        wb_size?:req->wb_bytes,
>
>
> wb_size is always set above in the if() or else. No need to check here.
>
>>                                          IOMODE_RW,
>>                                          GFP_NOFS);
>>       /* If no lseg, fall back to write through mds */
>
>
>
> But No!
>
> This is much, much better than last time, thank you for working on it,
> but it is not optimal for objects and files
> (once "files" supports segments).
>
> For blocks, yes, radix_tree_next_hole() is the perfect fit. But for
> objects (and files) the right hint is i_size_read(). The objects/files
> server usually determines its topology according to file size, and it
> does not use any more resources whether there are holes or not. (The
> files example I am thinking of is Ceph.)
>
> So for objects the wasteful factor is the actual extension of i_size
> caused by layout_get, not the number of pages served. The gain for us is
> if the client, which has much fresher information about i_size, sends it
> on the first layout_get, so that the file size is extended only once, on
> the first layout_get, and not on every layout_get.
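
Right. For an i_size based layout driver the hint in pg_init_write would
then reduce to something like (untested):

	/* objects/files: the client's i_size is the useful hint, not the
	 * length of the dirty page run */
	wb_size = i_size_read(pgio->pg_inode) - req_offset(req);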
>
> So the small change I want is:
>
> +enum pnfs_layout_get_strategy {
> +       PLGS_USE_ISIZE,
> +       PLGS_SEARCH_FIRST_HOLE,
> +       PLGS_ALL_FILE,
> +};
>
On second thought, since each layout driver would only ever use one
strategy, it seems more reasonable to set the strategy in
pnfs_curr_ld->flags rather than change the pg_init API to pass it in. I
will do it that way.
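
In other words, roughly (untested, and the flag names below are only
placeholders added next to the existing layoutdriver flags in
fs/nfs/pnfs.h):

	/* fs/nfs/pnfs.h, in the existing layoutdriver flags enum */
	PNFS_LAYOUTGET_USE_ISIZE	= 1 << 2,
	PNFS_LAYOUTGET_SEARCH_HOLE	= 1 << 3,
	PNFS_LAYOUTGET_WHOLE_FILE	= 1 << 4,

	/* e.g. fs/nfs/blocklayout/blocklayout.c */
	static struct pnfs_layoutdriver_type blocklayout_type = {
		.id	= LAYOUT_BLOCK_VOLUME,
		.flags	= PNFS_LAYOUTGET_SEARCH_HOLE,
		...
	};

	/* and in pnfs_generic_pg_init_write() */
	if (pgio->pg_dreq)
		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
	else if (NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->flags &
		 PNFS_LAYOUTGET_USE_ISIZE)
		wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
	else
		wb_size = pnfs_num_dirty_bytes(pgio->pg_inode, req->wb_index);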

-- 
Thanks,
Tao
