* [PATCH V3 6/6] ceph: scattered page writeback
@ 2016-01-19 13:30 Yan, Zheng
  2016-01-26 22:15 ` Ilya Dryomov
  0 siblings, 1 reply; 7+ messages in thread
From: Yan, Zheng @ 2016-01-19 13:30 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, Yan, Zheng

This patch makes ceph_writepages_start() try using a single OSD request
to write all dirty pages within a strip. When a nonconsecutive dirty
page is found, ceph_writepages_start() tries adding a new write
operation to the existing OSD request. If that succeeds, it uses the new
operation to write back the dirty page.
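
A rough sketch of the strategy, as a toy user-space model (the page
indices and the MAX_OPS limit are invented; this is not the kernel code
in the diff below): consecutive dirty pages extend the current write op,
a gap starts a new op in the same request, and we stop once the op limit
is reached.

#include <stdio.h>

#define MAX_OPS 16      /* illustrative limit, not CEPH_OSD_MAX_OP */

int main(void)
{
        unsigned long dirty[] = { 4, 5, 6, 9, 10, 30 };
        int num_ops = 0;
        size_t i;

        for (i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++) {
                if (i > 0 && dirty[i] == dirty[i - 1] + 1)
                        continue;       /* consecutive page: same write op */
                if (num_ops == MAX_OPS)
                        break;          /* request is full: stop, flush later */
                num_ops++;              /* gap: start a new write op */
        }
        printf("pages covered by %d write ops\n", num_ops);
        return 0;
}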

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c | 263 ++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 174 insertions(+), 89 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c222137..ac14d02 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_data *osd_data;
-	unsigned wrote;
 	struct page *page;
-	int num_pages;
-	int i;
+	int num_pages, total_pages = 0;
+	int i, j;
+	int rc = req->r_result;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
-	int rc = req->r_result;
-	u64 bytes = req->r_ops[0].extent.length;
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	long writeback_stat;
-	unsigned issued = ceph_caps_issued(ci);
+	bool remove_page;
 
-	osd_data = osd_req_op_extent_osd_data(req, 0);
-	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
-	num_pages = calc_pages_for((u64)osd_data->alignment,
-					(u64)osd_data->length);
-	if (rc >= 0) {
-		/*
-		 * Assume we wrote the pages we originally sent.  The
-		 * osd might reply with fewer pages if our writeback
-		 * raced with a truncation and was adjusted at the osd,
-		 * so don't believe the reply.
-		 */
-		wrote = num_pages;
-	} else {
-		wrote = 0;
+
+	dout("writepages_finish %p rc %d\n", inode, rc);
+	if (rc < 0)
 		mapping_set_error(mapping, rc);
-	}
-	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
-	     inode, rc, bytes, wrote);
 
-	/* clean all pages */
-	for (i = 0; i < num_pages; i++) {
-		page = osd_data->pages[i];
-		BUG_ON(!page);
-		WARN_ON(!PageUptodate(page));
+	/*
+	 * We lost the cache cap, need to truncate the page before
+	 * it is unlocked, otherwise we'd truncate it later in the
+	 * page truncation thread, possibly losing some data that
+	 * raced its way in
+	 */
+	remove_page = !(ceph_caps_issued(ci) &
+			(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
 
-		writeback_stat =
-			atomic_long_dec_return(&fsc->writeback_count);
-		if (writeback_stat <
-		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
-			clear_bdi_congested(&fsc->backing_dev_info,
-					    BLK_RW_ASYNC);
+	/* clean all pages */
+	for (i = 0; i < req->r_num_ops; i++) {
+		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+			break;
 
-		ceph_put_snap_context(page_snap_context(page));
-		page->private = 0;
-		ClearPagePrivate(page);
-		dout("unlocking %d %p\n", i, page);
-		end_page_writeback(page);
+		osd_data = osd_req_op_extent_osd_data(req, i);
+		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+		num_pages = calc_pages_for((u64)osd_data->alignment,
+					   (u64)osd_data->length);
+		total_pages += num_pages;
+		for (j = 0; j < num_pages; j++) {
+			page = osd_data->pages[j];
+			BUG_ON(!page);
+			WARN_ON(!PageUptodate(page));
+
+			if (atomic_long_dec_return(&fsc->writeback_count) <
+			     CONGESTION_OFF_THRESH(
+					fsc->mount_options->congestion_kb))
+				clear_bdi_congested(&fsc->backing_dev_info,
+						    BLK_RW_ASYNC);
+
+			ceph_put_snap_context(page_snap_context(page));
+			page->private = 0;
+			ClearPagePrivate(page);
+			dout("unlocking %p\n", page);
+			end_page_writeback(page);
+
+			if (remove_page)
+				generic_error_remove_page(inode->i_mapping,
+							  page);
 
-		/*
-		 * We lost the cache cap, need to truncate the page before
-		 * it is unlocked, otherwise we'd truncate it later in the
-		 * page truncation thread, possibly losing some data that
-		 * raced its way in
-		 */
-		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
-			generic_error_remove_page(inode->i_mapping, page);
+			unlock_page(page);
+		}
+		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
+		     inode, osd_data->length, rc >= 0 ? num_pages : 0);
 
-		unlock_page(page);
+		ceph_release_pages(osd_data->pages, num_pages);
 	}
-	dout("%p wrote+cleaned %d pages\n", inode, wrote);
-	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
 
-	ceph_release_pages(osd_data->pages, num_pages);
+	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
+
+	osd_data = osd_req_op_extent_osd_data(req, 0);
 	if (osd_data->pages_from_pool)
 		mempool_free(osd_data->pages,
 			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
@@ -778,17 +778,15 @@ retry:
 	while (!done && index <= end) {
 		unsigned i;
 		int first;
-		pgoff_t next;
-		int pvec_pages, locked_pages;
-		struct page **pages = NULL;
+		pgoff_t strip_end = 0;
+		int num_ops = 0, max_ops = 0;
+		int pvec_pages, locked_pages = 0;
+		struct page **pages = NULL, **data_pages = NULL;
 		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
 		struct page *page;
 		int want;
-		u64 offset, len;
-		long writeback_stat;
+		u64 offset = 0, len = 0;
 
-		next = 0;
-		locked_pages = 0;
 		max_pages = max_pages_ever;
 
 get_more_pages:
@@ -824,8 +822,8 @@ get_more_pages:
 				unlock_page(page);
 				break;
 			}
-			if (next && (page->index != next)) {
-				dout("not consecutive %p\n", page);
+			if (strip_end && (page->index > strip_end)) {
+				dout("end of strip %p\n", page);
 				unlock_page(page);
 				break;
 			}
@@ -871,28 +869,60 @@ get_more_pages:
 			 * that it will use.
 			 */
 			if (locked_pages == 0) {
+				int j;
 				BUG_ON(pages);
 				/* prepare async write request */
 				offset = (u64)page_offset(page);
 				len = wsize;
-				req = ceph_osdc_new_request(&fsc->client->osdc,
+
+				max_ops = 1 + do_sync;
+				strip_end = page->index +
+					    ((len - 1) >> PAGE_CACHE_SHIFT);
+				for (j = i + 1; j < pvec_pages; j++) {
+					if (pvec.pages[j]->index > strip_end)
+						break;
+					if (pvec.pages[j]->index - 1 !=
+					    pvec.pages[j - 1]->index)
+						max_ops++;
+				}
+
+				if (max_ops > CEPH_OSD_INITIAL_OP) {
+					max_ops = min(max_ops, CEPH_OSD_MAX_OP);
+					req = ceph_osdc_new_request(
+							&fsc->client->osdc,
 							&ci->i_layout, vino,
-							offset, &len, 0,
-							do_sync ? 2 : 1,
+							offset, &len,
+							0, max_ops,
+							CEPH_OSD_OP_WRITE,
+							CEPH_OSD_FLAG_WRITE |
+							CEPH_OSD_FLAG_ONDISK,
+							snapc, truncate_seq,
+							truncate_size, false);
+					if (IS_ERR(req))
+						req = NULL;
+				}
+				if (!req) {
+					max_ops = CEPH_OSD_INITIAL_OP;
+					req = ceph_osdc_new_request(
+							&fsc->client->osdc,
+							&ci->i_layout, vino,
+							offset, &len,
+							0, max_ops,
 							CEPH_OSD_OP_WRITE,
 							CEPH_OSD_FLAG_WRITE |
 							CEPH_OSD_FLAG_ONDISK,
 							snapc, truncate_seq,
 							truncate_size, true);
-				if (IS_ERR(req)) {
-					rc = PTR_ERR(req);
-					unlock_page(page);
-					break;
+					if (IS_ERR(req)) {
+						unlock_page(page);
+						rc = PTR_ERR(req);
+						req = NULL;
+						break;
+					}
 				}
 
-				if (do_sync)
-					osd_req_op_init(req, 1,
-							CEPH_OSD_OP_STARTSYNC, 0);
+				strip_end = page->index +
+					    ((len - 1) >> PAGE_CACHE_SHIFT);
 
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
@@ -905,6 +935,60 @@ get_more_pages:
 					pages = mempool_alloc(pool, GFP_NOFS);
 					BUG_ON(!pages);
 				}
+
+				num_ops = 1;
+				data_pages = pages;
+				len = 0;
+			} else if (page->index !=
+				   (offset + len) >> PAGE_CACHE_SHIFT) {
+				u64 offset_inc;
+				bool new_op = true;
+
+				if (num_ops + do_sync == CEPH_OSD_MAX_OP) {
+					new_op = false;
+				} else if (num_ops + do_sync == max_ops) {
+					int new_max = max_ops;
+					int j;
+					for (j = i + 1; j < pvec_pages; j++) {
+						if (pvec.pages[j]->index >
+						    strip_end)
+							break;
+						if (pvec.pages[j]->index - 1 !=
+						    pvec.pages[j - 1]->index)
+							new_max++;
+					}
+					new_max++;
+					new_max = min(new_max, CEPH_OSD_MAX_OP);
+					if (ceph_osdc_extend_request(req,
+								new_max,
+								GFP_NOFS) >= 0)
+						max_ops = new_max;
+					else
+						new_op = false;
+				}
+				if (!new_op) {
+					redirty_page_for_writepage(wbc, page);
+					unlock_page(page);
+					break;
+				}
+
+				offset_inc = (u64)page_offset(page) - offset;
+				osd_req_op_extent_dup_last(req, offset_inc);
+
+				dout("writepages got pages at %llu~%llu\n",
+				      offset, len);
+
+				osd_req_op_extent_update(req, num_ops - 1, len);
+				osd_req_op_extent_osd_data_pages(req,
+								num_ops - 1,
+								data_pages,
+								len, 0,
+								!!pool, false);
+
+				num_ops++;
+				data_pages = pages + locked_pages;
+				offset = (u64)page_offset(page);
+				len = 0;
 			}
 
 			/* note position of first page in pvec */
@@ -913,9 +997,8 @@ get_more_pages:
 			dout("%p will write page %p idx %lu\n",
 			     inode, page, page->index);
 
-			writeback_stat =
-			       atomic_long_inc_return(&fsc->writeback_count);
-			if (writeback_stat > CONGESTION_ON_THRESH(
+			if (atomic_long_inc_return(&fsc->writeback_count) >
+			    CONGESTION_ON_THRESH(
 				    fsc->mount_options->congestion_kb)) {
 				set_bdi_congested(&fsc->backing_dev_info,
 						  BLK_RW_ASYNC);
@@ -924,7 +1007,7 @@ get_more_pages:
 			set_page_writeback(page);
 			pages[locked_pages] = page;
 			locked_pages++;
-			next = page->index + 1;
+			len += PAGE_CACHE_SIZE;
 		}
 
 		/* did we get anything? */
@@ -952,31 +1035,34 @@ get_more_pages:
 		}
 
 		/* Format the osd request message and submit the write */
-		offset = page_offset(pages[0]);
-		len = (u64)locked_pages << PAGE_CACHE_SHIFT;
 		if (snap_size == -1) {
+			/* writepages_finish() clears writeback pages
+			 * according to the data length, so make sure
+			 * data length covers all locked pages */
+			u64 min_len = len + 1 - PAGE_CACHE_SIZE;
 			len = min(len, (u64)i_size_read(inode) - offset);
-			 /* writepages_finish() clears writeback pages
-			  * according to the data length, so make sure
-			  * data length covers all locked pages */
-			len = max(len, 1 +
-				((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
+			len = max(len, min_len);
 		} else {
 			len = min(len, snap_size - offset);
 		}
-		dout("writepages got %d pages at %llu~%llu\n",
-		     locked_pages, offset, len);
+		dout("writepages got pages at %llu~%llu\n", offset, len);
 
-		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
-							!!pool, false);
+		osd_req_op_extent_osd_data_pages(req, num_ops - 1, data_pages,
+						 len, 0, !!pool, false);
+		osd_req_op_extent_update(req, num_ops - 1, len);
 
+		if (do_sync) {
+			osd_req_op_init(req, num_ops, CEPH_OSD_OP_STARTSYNC, 0);
+			num_ops++;
+		}
+		BUG_ON(num_ops > max_ops);
+
+		index = pages[locked_pages - 1]->index + 1;
 		pages = NULL;	/* request message now owns the pages array */
 		pool = NULL;
 
 		/* Update the write op length in case we changed it */
 
-		osd_req_op_extent_update(req, 0, len);
-
 		vino = ceph_vino(inode);
 		ceph_osdc_build_request(req, offset, snapc, vino.snap,
 					&inode->i_mtime);
@@ -986,7 +1072,6 @@ get_more_pages:
 		req = NULL;
 
 		/* continue? */
-		index = next;
 		wbc->nr_to_write -= locked_pages;
 		if (wbc->nr_to_write <= 0)
 			done = 1;
-- 
2.5.0



* Re: [PATCH V3 6/6] ceph: scattered page writeback
  2016-01-19 13:30 [PATCH V3 6/6] ceph: scattered page writeback Yan, Zheng
@ 2016-01-26 22:15 ` Ilya Dryomov
  2016-01-27  1:52   ` Yan, Zheng
  0 siblings, 1 reply; 7+ messages in thread
From: Ilya Dryomov @ 2016-01-26 22:15 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

On Tue, Jan 19, 2016 at 2:30 PM, Yan, Zheng <zyan@redhat.com> wrote:
> This patch makes ceph_writepages_start() try using a single OSD request
> to write all dirty pages within a strip. When a nonconsecutive dirty
> page is found, ceph_writepages_start() tries adding a new write
> operation to the existing OSD request. If that succeeds, it uses the new
> operation to write back the dirty page.
>
> [...]

Hi Zheng,

I know I mentioned off-list that v3 looked good, but I've taken
a deeper look and have to retract that.  This version still has a call
to ceph_osdc_extend_request(), which suggests to me that the number of
ops is calculated incorrectly.

A single op within an OSD request can be used for a single region of
contiguous dirty pages, so what this comes down to is calculating the
number of such dirty contiguous regions.  What you appear to be doing
in this version is calculating this number for the *first* gang of
pages we get from pagevec_lookup_tag() and allocating an OSD request
based on that.  If that's not enough (i.e. there are more dirty pages
in the stripe unit), we move on, you do the calculation for the next
gang and enlarge the OSD request by reallocating parts of it.  This
goes on until we either run out of dirty pages in the stripe unit or
reach CEPH_OSD_MAX_OP ops in the request.

What I don't understand is what prevents us from pre-calculating the
number of dirty contiguous regions for the entire stripe unit in one go
and trying to allocate the OSD request based on that number, possibly
falling back to a smaller request if we are short on memory.  The page
pointer array can hold all of the pages in the stripe unit - I don't see
the need for ceph_osdc_extend_request() at all.  Am I missing something?
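
A minimal sketch of that pre-calculation, with invented stand-ins for
the request type and allocator (not the actual libceph API): count the
contiguous regions once, size the request for them, and shrink the
request only if the allocation fails.

#include <stdio.h>
#include <stdlib.h>

struct osd_req { int max_ops; };        /* stand-in for the real request */

static struct osd_req *alloc_osd_req(int num_ops)
{
        struct osd_req *req = malloc(sizeof(*req));

        if (req)
                req->max_ops = num_ops;
        return req;
}

/*
 * Count every contiguous dirty region in the stripe unit up front,
 * allocate one request sized for that many ops, and shrink the
 * request if the allocation fails.
 */
static struct osd_req *alloc_req_for_regions(const unsigned long *idx,
                                             size_t n, int op_limit)
{
        struct osd_req *req;
        int ops = 0;
        size_t i;

        for (i = 0; i < n && ops < op_limit; i++)
                if (i == 0 || idx[i] != idx[i - 1] + 1)
                        ops++;

        while (ops > 0) {
                req = alloc_osd_req(ops);
                if (req)
                        return req;
                ops /= 2;               /* fall back to a smaller request */
        }
        return NULL;
}

int main(void)
{
        unsigned long dirty[] = { 0, 1, 2, 7, 8, 20 };  /* three regions */
        struct osd_req *req = alloc_req_for_regions(dirty, 6, 16);

        printf("allocated a request for %d ops\n", req ? req->max_ops : 0);
        free(req);
        return 0;
}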

Also, what you call a "strip" is really a stripe unit (in ceph file
layout terms).  It's important to use the same terminology throughout
the codebase, please rename.

Thanks,

                Ilya


* Re: [PATCH V3 6/6] ceph: scattered page writeback
  2016-01-26 22:15 ` Ilya Dryomov
@ 2016-01-27  1:52   ` Yan, Zheng
  2016-01-27  9:29     ` Ilya Dryomov
  0 siblings, 1 reply; 7+ messages in thread
From: Yan, Zheng @ 2016-01-27  1:52 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Yan, Zheng, Ceph Development

On Wed, Jan 27, 2016 at 6:15 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Tue, Jan 19, 2016 at 2:30 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> [...]
>
> Hi Zheng,
>
> I know I mentioned off-list that v3 looked good, but I've taken
> a deeper look and have to retract that.  This version still has a call
> to ceph_osdc_extend_request(), which suggests to me that the number of
> ops is calculated incorrectly.
>
> A single op within an OSD request can be used for a single region of
> contiguous dirty pages, so what this comes down to is calculating the
> number of such dirty contiguous regions.  What you appear to be doing
> in this version is calculating this number for the *first* gang of
> pages we get from pagevec_lookup_tag() and allocating an OSD request
> based on that.  If that's not enough (i.e. there are more dirty pages
> in the stripe unit), we move on, you do the calculation for the next
> gang and enlarge the OSD request by reallocating parts of it.  This
> goes on until we either run out of dirty pages in the stripe unit or
> reach CEPH_OSD_MAX_OP ops in the request.
>
> What I don't understand is what prevents us from pre-calculating the
> number of dirty contiguous regions for the entire stripe unit in one go
> and trying to allocate the OSD request based on that number, possibly
> falling back to a smaller request if we are short on memory.  The page
> pointer array can hold all of the pages in the stripe unit - I don't see
> the need for ceph_osdc_extend_request() at all.  Am I missing something?
>

Pre-calculating the number of regions in the stripe unit is ideal for
random writes, but it imposes overhead on the most common sequential
write case (that's why I don't allocate CEPH_OSD_MAX_OP ops in the first
place). Besides, pre-calculating the number of regions requires big
modifications to both ceph_writepages_start() and
ceph_osdc_new_request() (we don't know where a stripe unit ends until
calling ceph_osdc_new_request()).

Regards
Yan, Zheng

> Also, what you call a "strip" is really a stripe unit (in ceph file
> layout terms).  It's important to use the same terminology throughout
> the codebase, please rename.
>
> Thanks,
>
>                 Ilya


* Re: [PATCH V3 6/6] ceph: scattered page writeback
  2016-01-27  1:52   ` Yan, Zheng
@ 2016-01-27  9:29     ` Ilya Dryomov
  2016-01-27  9:42       ` Yan, Zheng
  0 siblings, 1 reply; 7+ messages in thread
From: Ilya Dryomov @ 2016-01-27  9:29 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Yan, Zheng, Ceph Development

On Wed, Jan 27, 2016 at 2:52 AM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Wed, Jan 27, 2016 at 6:15 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
>> On Tue, Jan 19, 2016 at 2:30 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>> [...]
>>> +               for (j = 0; j < num_pages; j++) {
>>> +                       page = osd_data->pages[j];
>>> +                       BUG_ON(!page);
>>> +                       WARN_ON(!PageUptodate(page));
>>> +
>>> +                       if (atomic_long_dec_return(&fsc->writeback_count) <
>>> +                            CONGESTION_OFF_THRESH(
>>> +                                       fsc->mount_options->congestion_kb))
>>> +                               clear_bdi_congested(&fsc->backing_dev_info,
>>> +                                                   BLK_RW_ASYNC);
>>> +
>>> +                       ceph_put_snap_context(page_snap_context(page));
>>> +                       page->private = 0;
>>> +                       ClearPagePrivate(page);
>>> +                       dout("unlocking %p\n", page);
>>> +                       end_page_writeback(page);
>>> +
>>> +                       if (remove_page)
>>> +                               generic_error_remove_page(inode->i_mapping,
>>> +                                                         page);
>>>
>>> -               /*
>>> -                * We lost the cache cap, need to truncate the page before
>>> -                * it is unlocked, otherwise we'd truncate it later in the
>>> -                * page truncation thread, possibly losing some data that
>>> -                * raced its way in
>>> -                */
>>> -               if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
>>> -                       generic_error_remove_page(inode->i_mapping, page);
>>> +                       unlock_page(page);
>>> +               }
>>> +               dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
>>> +                    inode, osd_data->length, rc >= 0 ? num_pages : 0);
>>>
>>> -               unlock_page(page);
>>> +               ceph_release_pages(osd_data->pages, num_pages);
>>>         }
>>> -       dout("%p wrote+cleaned %d pages\n", inode, wrote);
>>> -       ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
>>>
>>> -       ceph_release_pages(osd_data->pages, num_pages);
>>> +       ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
>>> +
>>> +       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>         if (osd_data->pages_from_pool)
>>>                 mempool_free(osd_data->pages,
>>>                              ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
>>> @@ -778,17 +778,15 @@ retry:
>>>         while (!done && index <= end) {
>>>                 unsigned i;
>>>                 int first;
>>> -               pgoff_t next;
>>> -               int pvec_pages, locked_pages;
>>> -               struct page **pages = NULL;
>>> +               pgoff_t strip_end = 0;
>>> +               int num_ops = 0, max_ops = 0;
>>> +               int pvec_pages, locked_pages = 0;
>>> +               struct page **pages = NULL, **data_pages = NULL;
>>>                 mempool_t *pool = NULL; /* Becomes non-null if mempool used */
>>>                 struct page *page;
>>>                 int want;
>>> -               u64 offset, len;
>>> -               long writeback_stat;
>>> +               u64 offset = 0, len = 0;
>>>
>>> -               next = 0;
>>> -               locked_pages = 0;
>>>                 max_pages = max_pages_ever;
>>>
>>>  get_more_pages:
>>> @@ -824,8 +822,8 @@ get_more_pages:
>>>                                 unlock_page(page);
>>>                                 break;
>>>                         }
>>> -                       if (next && (page->index != next)) {
>>> -                               dout("not consecutive %p\n", page);
>>> +                       if (strip_end && (page->index > strip_end)) {
>>> +                               dout("end of strip %p\n", page);
>>>                                 unlock_page(page);
>>>                                 break;
>>>                         }
>>> @@ -871,28 +869,60 @@ get_more_pages:
>>>                          * that it will use.
>>>                          */
>>>                         if (locked_pages == 0) {
>>> +                               int j;
>>>                                 BUG_ON(pages);
>>>                                 /* prepare async write request */
>>>                                 offset = (u64)page_offset(page);
>>>                                 len = wsize;
>>> -                               req = ceph_osdc_new_request(&fsc->client->osdc,
>>> +
>>> +                               max_ops = 1 + do_sync;
>>> +                               strip_end = page->index +
>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>> +                               for (j = i + 1; j < pvec_pages; j++) {
>>> +                                       if (pvec.pages[j]->index > strip_end)
>>> +                                               break;
>>> +                                       if (pvec.pages[j]->index - 1 !=
>>> +                                           pvec.pages[j - 1]->index)
>>> +                                               max_ops++;
>>> +                               }
>>> +
>>> +                               if (max_ops > CEPH_OSD_INITIAL_OP) {
>>> +                                       max_ops = min(max_ops, CEPH_OSD_MAX_OP);
>>> +                                       req = ceph_osdc_new_request(
>>> +                                                       &fsc->client->osdc,
>>>                                                         &ci->i_layout, vino,
>>> -                                                       offset, &len, 0,
>>> -                                                       do_sync ? 2 : 1,
>>> +                                                       offset, &len,
>>> +                                                       0, max_ops,
>>> +                                                       CEPH_OSD_OP_WRITE,
>>> +                                                       CEPH_OSD_FLAG_WRITE |
>>> +                                                       CEPH_OSD_FLAG_ONDISK,
>>> +                                                       snapc, truncate_seq,
>>> +                                                       truncate_size, false);
>>> +                                       if (IS_ERR(req))
>>> +                                               req = NULL;
>>> +                               }
>>> +                               if (!req) {
>>> +                                       max_ops = CEPH_OSD_INITIAL_OP;
>>> +                                       req = ceph_osdc_new_request(
>>> +                                                       &fsc->client->osdc,
>>> +                                                       &ci->i_layout, vino,
>>> +                                                       offset, &len,
>>> +                                                       0, max_ops,
>>>                                                         CEPH_OSD_OP_WRITE,
>>>                                                         CEPH_OSD_FLAG_WRITE |
>>>                                                         CEPH_OSD_FLAG_ONDISK,
>>>                                                         snapc, truncate_seq,
>>>                                                         truncate_size, true);
>>> -                               if (IS_ERR(req)) {
>>> -                                       rc = PTR_ERR(req);
>>> -                                       unlock_page(page);
>>> -                                       break;
>>> +                                       if (IS_ERR(req)) {
>>> +                                               unlock_page(page);
>>> +                                               rc = PTR_ERR(req);
>>> +                                               req = NULL;
>>> +                                               break;
>>> +                                       }
>>>                                 }
>>>
>>> -                               if (do_sync)
>>> -                                       osd_req_op_init(req, 1,
>>> -                                                       CEPH_OSD_OP_STARTSYNC, 0);
>>> +                               strip_end = page->index +
>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>
>>>                                 req->r_callback = writepages_finish;
>>>                                 req->r_inode = inode;
>>> @@ -905,6 +935,60 @@ get_more_pages:
>>>                                         pages = mempool_alloc(pool, GFP_NOFS);
>>>                                         BUG_ON(!pages);
>>>                                 }
>>> +
>>> +                               num_ops = 1;
>>> +                               data_pages = pages;
>>> +                               len = 0;
>>> +                       } else if (page->index !=
>>> +                                  (offset + len) >> PAGE_CACHE_SHIFT) {
>>> +                               u64 offset_inc;
>>> +                               bool new_op = true;
>>> +
>>> +                               if (num_ops + do_sync == CEPH_OSD_MAX_OP) {
>>> +                                       new_op = false;
>>> +                               } else if (num_ops + do_sync == max_ops) {
>>> +                                       int new_max = max_ops;
>>> +                                       int j;
>>> +                                       for (j = i + 1; j < pvec_pages; j++) {
>>> +                                               if (pvec.pages[j]->index >
>>> +                                                   strip_end)
>>> +                                                       break;
>>> +                                               if (pvec.pages[j]->index - 1 !=
>>> +                                                   pvec.pages[j - 1]->index)
>>> +                                                       new_max++;
>>> +                                       }
>>> +                                       new_max++;
>>> +                                       new_max = min(new_max, CEPH_OSD_MAX_OP);
>>> +                                       if (ceph_osdc_extend_request(req,
>>> +                                                               new_max,
>>> +                                                               GFP_NOFS) >= 0)
>>> +                                               max_ops = new_max;
>>> +                                       else
>>> +                                               new_op = false;
>>> +                               }
>>> +                               if (!new_op) {
>>> +                                       redirty_page_for_writepage(wbc, page);
>>> +                                       unlock_page(page);
>>> +                                       break;
>>> +                               }
>>> +
>>> +                               offset_inc = (u64)page_offset(page) - offset;
>>> +                               osd_req_op_extent_dup_last(req, offset_inc);
>>> +
>>> +                               dout("writepages got pages at %llu~%llu\n",
>>> +                                     offset, len);
>>> +
>>> +                               osd_req_op_extent_update(req, num_ops - 1, len);
>>> +                               osd_req_op_extent_osd_data_pages(req,
>>> +                                                               num_ops - 1,
>>> +                                                               data_pages,
>>> +                                                               len, 0,
>>> +                                                               !!pool, false);
>>> +
>>> +                               num_ops++;
>>> +                               data_pages = pages + locked_pages;
>>> +                               offset = (u64)page_offset(page);
>>> +                               len = 0;
>>>                         }
>>>
>>>                         /* note position of first page in pvec */
>>> @@ -913,9 +997,8 @@ get_more_pages:
>>>                         dout("%p will write page %p idx %lu\n",
>>>                              inode, page, page->index);
>>>
>>> -                       writeback_stat =
>>> -                              atomic_long_inc_return(&fsc->writeback_count);
>>> -                       if (writeback_stat > CONGESTION_ON_THRESH(
>>> +                       if (atomic_long_inc_return(&fsc->writeback_count) >
>>> +                           CONGESTION_ON_THRESH(
>>>                                     fsc->mount_options->congestion_kb)) {
>>>                                 set_bdi_congested(&fsc->backing_dev_info,
>>>                                                   BLK_RW_ASYNC);
>>> @@ -924,7 +1007,7 @@ get_more_pages:
>>>                         set_page_writeback(page);
>>>                         pages[locked_pages] = page;
>>>                         locked_pages++;
>>> -                       next = page->index + 1;
>>> +                       len += PAGE_CACHE_SIZE;
>>>                 }
>>>
>>>                 /* did we get anything? */
>>> @@ -952,31 +1035,34 @@ get_more_pages:
>>>                 }
>>>
>>>                 /* Format the osd request message and submit the write */
>>> -               offset = page_offset(pages[0]);
>>> -               len = (u64)locked_pages << PAGE_CACHE_SHIFT;
>>>                 if (snap_size == -1) {
>>> +                       /* writepages_finish() clears writeback pages
>>> +                        * according to the data length, so make sure
>>> +                        * data length covers all locked pages */
>>> +                       u64 min_len = len + 1 - PAGE_CACHE_SIZE;
>>>                         len = min(len, (u64)i_size_read(inode) - offset);
>>> -                        /* writepages_finish() clears writeback pages
>>> -                         * according to the data length, so make sure
>>> -                         * data length covers all locked pages */
>>> -                       len = max(len, 1 +
>>> -                               ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
>>> +                       len = max(len, min_len);
>>>                 } else {
>>>                         len = min(len, snap_size - offset);
>>>                 }
>>> -               dout("writepages got %d pages at %llu~%llu\n",
>>> -                    locked_pages, offset, len);
>>> +               dout("writepages got pages at %llu~%llu\n", offset, len);
>>>
>>> -               osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
>>> -                                                       !!pool, false);
>>> +               osd_req_op_extent_osd_data_pages(req, num_ops - 1, data_pages,
>>> +                                                len, 0, !!pool, false);
>>> +               osd_req_op_extent_update(req, num_ops - 1, len);
>>>
>>> +               if (do_sync) {
>>> +                       osd_req_op_init(req, num_ops, CEPH_OSD_OP_STARTSYNC, 0);
>>> +                       num_ops++;
>>> +               }
>>> +               BUG_ON(num_ops > max_ops);
>>> +
>>> +               index = pages[locked_pages - 1]->index + 1;
>>>                 pages = NULL;   /* request message now owns the pages array */
>>>                 pool = NULL;
>>>
>>>                 /* Update the write op length in case we changed it */
>>>
>>> -               osd_req_op_extent_update(req, 0, len);
>>> -
>>>                 vino = ceph_vino(inode);
>>>                 ceph_osdc_build_request(req, offset, snapc, vino.snap,
>>>                                         &inode->i_mtime);
>>> @@ -986,7 +1072,6 @@ get_more_pages:
>>>                 req = NULL;
>>>
>>>                 /* continue? */
>>> -               index = next;
>>>                 wbc->nr_to_write -= locked_pages;
>>>                 if (wbc->nr_to_write <= 0)
>>>                         done = 1;
>>
>> Hi Zheng,
>>
>> I know I mentioned off-list that v3 looked good, but I've taken
>> a deeper look and have to retract that.  This version still has a call
>> to ceph_osdc_extend_request(), which suggests to me that the number of
>> ops is calculated incorrectly.
>>
>> A single op within an OSD request can be used for a single region of
>> contiguous dirty pages, so what this comes down to is calculating the
>> number of such dirty contiguous regions.  What you appear to be doing
>> in this version is calculating this number for the *first* gang of
>> pages we get from pagevec_lookup_tag() and allocating an OSD request
>> based on that.  If that's not enough (i.e. there are more dirty pages
>> in the stripe unit), we move on, you do the calculation for the next
>> gang and enlarge the OSD request by reallocating parts of it.  This
>> goes on until we either run out of dirty pages in the stripe unit or
>> reach CEPH_OSD_MAX_OP ops in the request.
>>
>> What I don't understand is what prevents us from pre-calculating the
>> number of dirty contiguous regions for the entire stripe unit in one go
>> and trying to allocate the OSD request based on that number, possibly
>> falling back to a smaller request if we are short on memory.  The page
>> pointer array can hold all of the pages in the stripe unit - I don't see
>> the need for ceph_osdc_extend_request() at all.  Am I missing something?
>>
>
> Pre-calculating the number of regions in the stripe unit is ideal for
> random writes, but it imposes overhead on the most common sequential
> writes (that's why I don't allocate CEPH_OSD_MAX_OP ops in the first
> place).  Besides, pre-calculating the number of regions requires big

Care to be more specific about the overhead?  You have to go through
enough dirty pages to either fill up the OSD request or move past
the stripe unit no matter which type of write it is.  Allocating
CEPH_OSD_MAX_OP right off the bat is of course a bad idea, especially
in the sequential case, but that's not what I'm advocating for.  What
I'm proposing would result in allocating a request with exactly the
number of ops needed, in both sequential and random cases - no more, no
less.

> modifications to both ceph_writepages_start() and
> ceph_osdc_new_request() (we don't know where a stripe unit ends until
> calling ceph_osdc_new_request()).

Well, calling calc_layout() to learn where the first page's stripe unit
ends is trivial enough.  I get your reluctance - ceph_writepages_start()
is not pleasant to deal with - but in my opinion that's not reason
enough for reallocating memory in the middle of it, possibly more than
once (!).
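
(As an aside on the stripe-unit boundary itself, the arithmetic is tiny.
A simplified userspace sketch, assuming a 4 MB stripe unit and 4 KB pages;
the real code would take the stripe unit from the inode's ceph file layout
via calc_layout()/ceph_osdc_new_request():)

#include <stdio.h>

int main(void)
{
        unsigned long long stripe_unit = 4ULL << 20;    /* assumed 4 MB */
        unsigned long long offset = 5ULL << 20;         /* first dirty byte */
        unsigned page_shift = 12;                       /* assumed 4 KB pages */

        unsigned long long su_start = offset - (offset % stripe_unit);
        unsigned long long su_end = su_start + stripe_unit; /* one past the unit */
        unsigned long long last_index = (su_end - 1) >> page_shift;

        printf("stripe unit spans %llu-%llu, last page index %llu\n",
               su_start, su_end - 1, last_index);
        return 0;
}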

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V3 6/6] ceph: scattered page writeback
  2016-01-27  9:29     ` Ilya Dryomov
@ 2016-01-27  9:42       ` Yan, Zheng
  2016-01-27 10:45         ` Ilya Dryomov
  0 siblings, 1 reply; 7+ messages in thread
From: Yan, Zheng @ 2016-01-27  9:42 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Zheng Yan, Ceph Development


> On Jan 27, 2016, at 17:29, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Jan 27, 2016 at 2:52 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>> On Wed, Jan 27, 2016 at 6:15 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> On Tue, Jan 19, 2016 at 2:30 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> This patch makes ceph_writepages_start() try using single OSD request
>>>> to write all dirty pages within a strip. When a nonconsecutive dirty
>>>> page is found, ceph_writepages_start() tries starting a new write
>>>> operation to existing OSD request. If it succeeds, it uses the new
>>>> operation to writeback the dirty page.
>>>> 
>>>> Signed-off-by: Yan, Zheng <zyan@redhat.com>
>>>> ---
>>>> fs/ceph/addr.c | 263 ++++++++++++++++++++++++++++++++++++++-------------------
>>>> 1 file changed, 174 insertions(+), 89 deletions(-)
>>>> 
>>>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>>>> index c222137..ac14d02 100644
>>>> --- a/fs/ceph/addr.c
>>>> +++ b/fs/ceph/addr.c
>>>> @@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
>>>>        struct inode *inode = req->r_inode;
>>>>        struct ceph_inode_info *ci = ceph_inode(inode);
>>>>        struct ceph_osd_data *osd_data;
>>>> -       unsigned wrote;
>>>>        struct page *page;
>>>> -       int num_pages;
>>>> -       int i;
>>>> +       int num_pages, total_pages = 0;
>>>> +       int i, j;
>>>> +       int rc = req->r_result;
>>>>        struct ceph_snap_context *snapc = req->r_snapc;
>>>>        struct address_space *mapping = inode->i_mapping;
>>>> -       int rc = req->r_result;
>>>> -       u64 bytes = req->r_ops[0].extent.length;
>>>>        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
>>>> -       long writeback_stat;
>>>> -       unsigned issued = ceph_caps_issued(ci);
>>>> +       bool remove_page;
>>>> 
>>>> -       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>> -       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
>>>> -       num_pages = calc_pages_for((u64)osd_data->alignment,
>>>> -                                       (u64)osd_data->length);
>>>> -       if (rc >= 0) {
>>>> -               /*
>>>> -                * Assume we wrote the pages we originally sent.  The
>>>> -                * osd might reply with fewer pages if our writeback
>>>> -                * raced with a truncation and was adjusted at the osd,
>>>> -                * so don't believe the reply.
>>>> -                */
>>>> -               wrote = num_pages;
>>>> -       } else {
>>>> -               wrote = 0;
>>>> +
>>>> +       dout("writepages_finish %p rc %d\n", inode, rc);
>>>> +       if (rc < 0)
>>>>                mapping_set_error(mapping, rc);
>>>> -       }
>>>> -       dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
>>>> -            inode, rc, bytes, wrote);
>>>> 
>>>> -       /* clean all pages */
>>>> -       for (i = 0; i < num_pages; i++) {
>>>> -               page = osd_data->pages[i];
>>>> -               BUG_ON(!page);
>>>> -               WARN_ON(!PageUptodate(page));
>>>> +       /*
>>>> +        * We lost the cache cap, need to truncate the page before
>>>> +        * it is unlocked, otherwise we'd truncate it later in the
>>>> +        * page truncation thread, possibly losing some data that
>>>> +        * raced its way in
>>>> +        */
>>>> +       remove_page = !(ceph_caps_issued(ci) &
>>>> +                       (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
>>>> 
>>>> -               writeback_stat =
>>>> -                       atomic_long_dec_return(&fsc->writeback_count);
>>>> -               if (writeback_stat <
>>>> -                   CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
>>>> -                       clear_bdi_congested(&fsc->backing_dev_info,
>>>> -                                           BLK_RW_ASYNC);
>>>> +       /* clean all pages */
>>>> +       for (i = 0; i < req->r_num_ops; i++) {
>>>> +               if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
>>>> +                       break;
>>>> 
>>>> -               ceph_put_snap_context(page_snap_context(page));
>>>> -               page->private = 0;
>>>> -               ClearPagePrivate(page);
>>>> -               dout("unlocking %d %p\n", i, page);
>>>> -               end_page_writeback(page);
>>>> +               osd_data = osd_req_op_extent_osd_data(req, i);
>>>> +               BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
>>>> +               num_pages = calc_pages_for((u64)osd_data->alignment,
>>>> +                                          (u64)osd_data->length);
>>>> +               total_pages += num_pages;
>>>> +               for (j = 0; j < num_pages; j++) {
>>>> +                       page = osd_data->pages[j];
>>>> +                       BUG_ON(!page);
>>>> +                       WARN_ON(!PageUptodate(page));
>>>> +
>>>> +                       if (atomic_long_dec_return(&fsc->writeback_count) <
>>>> +                            CONGESTION_OFF_THRESH(
>>>> +                                       fsc->mount_options->congestion_kb))
>>>> +                               clear_bdi_congested(&fsc->backing_dev_info,
>>>> +                                                   BLK_RW_ASYNC);
>>>> +
>>>> +                       ceph_put_snap_context(page_snap_context(page));
>>>> +                       page->private = 0;
>>>> +                       ClearPagePrivate(page);
>>>> +                       dout("unlocking %p\n", page);
>>>> +                       end_page_writeback(page);
>>>> +
>>>> +                       if (remove_page)
>>>> +                               generic_error_remove_page(inode->i_mapping,
>>>> +                                                         page);
>>>> 
>>>> -               /*
>>>> -                * We lost the cache cap, need to truncate the page before
>>>> -                * it is unlocked, otherwise we'd truncate it later in the
>>>> -                * page truncation thread, possibly losing some data that
>>>> -                * raced its way in
>>>> -                */
>>>> -               if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
>>>> -                       generic_error_remove_page(inode->i_mapping, page);
>>>> +                       unlock_page(page);
>>>> +               }
>>>> +               dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
>>>> +                    inode, osd_data->length, rc >= 0 ? num_pages : 0);
>>>> 
>>>> -               unlock_page(page);
>>>> +               ceph_release_pages(osd_data->pages, num_pages);
>>>>        }
>>>> -       dout("%p wrote+cleaned %d pages\n", inode, wrote);
>>>> -       ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
>>>> 
>>>> -       ceph_release_pages(osd_data->pages, num_pages);
>>>> +       ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
>>>> +
>>>> +       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>>        if (osd_data->pages_from_pool)
>>>>                mempool_free(osd_data->pages,
>>>>                             ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
>>>> @@ -778,17 +778,15 @@ retry:
>>>>        while (!done && index <= end) {
>>>>                unsigned i;
>>>>                int first;
>>>> -               pgoff_t next;
>>>> -               int pvec_pages, locked_pages;
>>>> -               struct page **pages = NULL;
>>>> +               pgoff_t strip_end = 0;
>>>> +               int num_ops = 0, max_ops = 0;
>>>> +               int pvec_pages, locked_pages = 0;
>>>> +               struct page **pages = NULL, **data_pages = NULL;
>>>>                mempool_t *pool = NULL; /* Becomes non-null if mempool used */
>>>>                struct page *page;
>>>>                int want;
>>>> -               u64 offset, len;
>>>> -               long writeback_stat;
>>>> +               u64 offset = 0, len = 0;
>>>> 
>>>> -               next = 0;
>>>> -               locked_pages = 0;
>>>>                max_pages = max_pages_ever;
>>>> 
>>>> get_more_pages:
>>>> @@ -824,8 +822,8 @@ get_more_pages:
>>>>                                unlock_page(page);
>>>>                                break;
>>>>                        }
>>>> -                       if (next && (page->index != next)) {
>>>> -                               dout("not consecutive %p\n", page);
>>>> +                       if (strip_end && (page->index > strip_end)) {
>>>> +                               dout("end of strip %p\n", page);
>>>>                                unlock_page(page);
>>>>                                break;
>>>>                        }
>>>> @@ -871,28 +869,60 @@ get_more_pages:
>>>>                         * that it will use.
>>>>                         */
>>>>                        if (locked_pages == 0) {
>>>> +                               int j;
>>>>                                BUG_ON(pages);
>>>>                                /* prepare async write request */
>>>>                                offset = (u64)page_offset(page);
>>>>                                len = wsize;
>>>> -                               req = ceph_osdc_new_request(&fsc->client->osdc,
>>>> +
>>>> +                               max_ops = 1 + do_sync;
>>>> +                               strip_end = page->index +
>>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>> +                               for (j = i + 1; j < pvec_pages; j++) {
>>>> +                                       if (pvec.pages[j]->index > strip_end)
>>>> +                                               break;
>>>> +                                       if (pvec.pages[j]->index - 1 !=
>>>> +                                           pvec.pages[j - 1]->index)
>>>> +                                               max_ops++;
>>>> +                               }
>>>> +
>>>> +                               if (max_ops > CEPH_OSD_INITIAL_OP) {
>>>> +                                       max_ops = min(max_ops, CEPH_OSD_MAX_OP);
>>>> +                                       req = ceph_osdc_new_request(
>>>> +                                                       &fsc->client->osdc,
>>>>                                                        &ci->i_layout, vino,
>>>> -                                                       offset, &len, 0,
>>>> -                                                       do_sync ? 2 : 1,
>>>> +                                                       offset, &len,
>>>> +                                                       0, max_ops,
>>>> +                                                       CEPH_OSD_OP_WRITE,
>>>> +                                                       CEPH_OSD_FLAG_WRITE |
>>>> +                                                       CEPH_OSD_FLAG_ONDISK,
>>>> +                                                       snapc, truncate_seq,
>>>> +                                                       truncate_size, false);
>>>> +                                       if (IS_ERR(req))
>>>> +                                               req = NULL;
>>>> +                               }
>>>> +                               if (!req) {
>>>> +                                       max_ops = CEPH_OSD_INITIAL_OP;
>>>> +                                       req = ceph_osdc_new_request(
>>>> +                                                       &fsc->client->osdc,
>>>> +                                                       &ci->i_layout, vino,
>>>> +                                                       offset, &len,
>>>> +                                                       0, max_ops,
>>>>                                                        CEPH_OSD_OP_WRITE,
>>>>                                                        CEPH_OSD_FLAG_WRITE |
>>>>                                                        CEPH_OSD_FLAG_ONDISK,
>>>>                                                        snapc, truncate_seq,
>>>>                                                        truncate_size, true);
>>>> -                               if (IS_ERR(req)) {
>>>> -                                       rc = PTR_ERR(req);
>>>> -                                       unlock_page(page);
>>>> -                                       break;
>>>> +                                       if (IS_ERR(req)) {
>>>> +                                               unlock_page(page);
>>>> +                                               rc = PTR_ERR(req);
>>>> +                                               req = NULL;
>>>> +                                               break;
>>>> +                                       }
>>>>                                }
>>>> 
>>>> -                               if (do_sync)
>>>> -                                       osd_req_op_init(req, 1,
>>>> -                                                       CEPH_OSD_OP_STARTSYNC, 0);
>>>> +                               strip_end = page->index +
>>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>> 
>>>>                                req->r_callback = writepages_finish;
>>>>                                req->r_inode = inode;
>>>> @@ -905,6 +935,60 @@ get_more_pages:
>>>>                                        pages = mempool_alloc(pool, GFP_NOFS);
>>>>                                        BUG_ON(!pages);
>>>>                                }
>>>> +
>>>> +                               num_ops = 1;
>>>> +                               data_pages = pages;
>>>> +                               len = 0;
>>>> +                       } else if (page->index !=
>>>> +                                  (offset + len) >> PAGE_CACHE_SHIFT) {
>>>> +                               u64 offset_inc;
>>>> +                               bool new_op = true;
>>>> +
>>>> +                               if (num_ops + do_sync == CEPH_OSD_MAX_OP) {
>>>> +                                       new_op = false;
>>>> +                               } else if (num_ops + do_sync == max_ops) {
>>>> +                                       int new_max = max_ops;
>>>> +                                       int j;
>>>> +                                       for (j = i + 1; j < pvec_pages; j++) {
>>>> +                                               if (pvec.pages[j]->index >
>>>> +                                                   strip_end)
>>>> +                                                       break;
>>>> +                                               if (pvec.pages[j]->index - 1 !=
>>>> +                                                   pvec.pages[j - 1]->index)
>>>> +                                                       new_max++;
>>>> +                                       }
>>>> +                                       new_max++;
>>>> +                                       new_max = min(new_max, CEPH_OSD_MAX_OP);
>>>> +                                       if (ceph_osdc_extend_request(req,
>>>> +                                                               new_max,
>>>> +                                                               GFP_NOFS) >= 0)
>>>> +                                               max_ops = new_max;
>>>> +                                       else
>>>> +                                               new_op = false;
>>>> +                               }
>>>> +                               if (!new_op) {
>>>> +                                       redirty_page_for_writepage(wbc, page);
>>>> +                                       unlock_page(page);
>>>> +                                       break;
>>>> +                               }
>>>> +
>>>> +                               offset_inc = (u64)page_offset(page) - offset;
>>>> +                               osd_req_op_extent_dup_last(req, offset_inc);
>>>> +
>>>> +                               dout("writepages got pages at %llu~%llu\n",
>>>> +                                     offset, len);
>>>> +
>>>> +                               osd_req_op_extent_update(req, num_ops - 1, len);
>>>> +                               osd_req_op_extent_osd_data_pages(req,
>>>> +                                                               num_ops - 1,
>>>> +                                                               data_pages,
>>>> +                                                               len, 0,
>>>> +                                                               !!pool, false);
>>>> +
>>>> +                               num_ops++;
>>>> +                               data_pages = pages + locked_pages;
>>>> +                               offset = (u64)page_offset(page);
>>>> +                               len = 0;
>>>>                        }
>>>> 
>>>>                        /* note position of first page in pvec */
>>>> @@ -913,9 +997,8 @@ get_more_pages:
>>>>                        dout("%p will write page %p idx %lu\n",
>>>>                             inode, page, page->index);
>>>> 
>>>> -                       writeback_stat =
>>>> -                              atomic_long_inc_return(&fsc->writeback_count);
>>>> -                       if (writeback_stat > CONGESTION_ON_THRESH(
>>>> +                       if (atomic_long_inc_return(&fsc->writeback_count) >
>>>> +                           CONGESTION_ON_THRESH(
>>>>                                    fsc->mount_options->congestion_kb)) {
>>>>                                set_bdi_congested(&fsc->backing_dev_info,
>>>>                                                  BLK_RW_ASYNC);
>>>> @@ -924,7 +1007,7 @@ get_more_pages:
>>>>                        set_page_writeback(page);
>>>>                        pages[locked_pages] = page;
>>>>                        locked_pages++;
>>>> -                       next = page->index + 1;
>>>> +                       len += PAGE_CACHE_SIZE;
>>>>                }
>>>> 
>>>>                /* did we get anything? */
>>>> @@ -952,31 +1035,34 @@ get_more_pages:
>>>>                }
>>>> 
>>>>                /* Format the osd request message and submit the write */
>>>> -               offset = page_offset(pages[0]);
>>>> -               len = (u64)locked_pages << PAGE_CACHE_SHIFT;
>>>>                if (snap_size == -1) {
>>>> +                       /* writepages_finish() clears writeback pages
>>>> +                        * according to the data length, so make sure
>>>> +                        * data length covers all locked pages */
>>>> +                       u64 min_len = len + 1 - PAGE_CACHE_SIZE;
>>>>                        len = min(len, (u64)i_size_read(inode) - offset);
>>>> -                        /* writepages_finish() clears writeback pages
>>>> -                         * according to the data length, so make sure
>>>> -                         * data length covers all locked pages */
>>>> -                       len = max(len, 1 +
>>>> -                               ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
>>>> +                       len = max(len, min_len);
>>>>                } else {
>>>>                        len = min(len, snap_size - offset);
>>>>                }
>>>> -               dout("writepages got %d pages at %llu~%llu\n",
>>>> -                    locked_pages, offset, len);
>>>> +               dout("writepages got pages at %llu~%llu\n", offset, len);
>>>> 
>>>> -               osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
>>>> -                                                       !!pool, false);
>>>> +               osd_req_op_extent_osd_data_pages(req, num_ops - 1, data_pages,
>>>> +                                                len, 0, !!pool, false);
>>>> +               osd_req_op_extent_update(req, num_ops - 1, len);
>>>> 
>>>> +               if (do_sync) {
>>>> +                       osd_req_op_init(req, num_ops, CEPH_OSD_OP_STARTSYNC, 0);
>>>> +                       num_ops++;
>>>> +               }
>>>> +               BUG_ON(num_ops > max_ops);
>>>> +
>>>> +               index = pages[locked_pages - 1]->index + 1;
>>>>                pages = NULL;   /* request message now owns the pages array */
>>>>                pool = NULL;
>>>> 
>>>>                /* Update the write op length in case we changed it */
>>>> 
>>>> -               osd_req_op_extent_update(req, 0, len);
>>>> -
>>>>                vino = ceph_vino(inode);
>>>>                ceph_osdc_build_request(req, offset, snapc, vino.snap,
>>>>                                        &inode->i_mtime);
>>>> @@ -986,7 +1072,6 @@ get_more_pages:
>>>>                req = NULL;
>>>> 
>>>>                /* continue? */
>>>> -               index = next;
>>>>                wbc->nr_to_write -= locked_pages;
>>>>                if (wbc->nr_to_write <= 0)
>>>>                        done = 1;
>>> 
>>> Hi Zheng,
>>> 
>>> I know I mentioned off-list that v3 looked good, but I've taken
>>> a deeper look and have to retract that.  This version still has a call
>>> to ceph_osdc_extend_request(), which suggests to me that the number of
>>> ops is calculated incorrectly.
>>> 
>>> A single op within an OSD request can be used for a single region of
>>> contiguous dirty pages, so what this comes down to is calculating the
>>> number of such dirty contiguous regions.  What you appear to be doing
>>> in this version is calculating this number for the *first* gang of
>>> pages we get from pagevec_lookup_tag() and allocating an OSD request
>>> based on that.  If that's not enough (i.e. there are more dirty pages
>>> in the stripe unit), we move on, you do the calculation for the next
>>> gang and enlarge the OSD request by reallocating parts of it.  This
>>> goes on until we either run out of dirty pages in the stripe unit or
>>> reach CEPH_OSD_MAX_OP ops in the request.
>>> 
>>> What I don't understand is what prevents us from pre-calculating the
>>> number of dirty contiguous regions for the entire stripe unit in one go
>>> and trying to allocate the OSD request based on that number, possibly
>>> falling back to a smaller request if we are short on memory.  The page
>>> pointer array can hold all of the pages in the stripe unit - I don't see
>>> the need for ceph_osdc_extend_request() at all.  Am I missing something?
>>> 
>> 
>> Pre-calculating the number of regions in the stripe unit is ideal for
>> random writes, but it imposes overhead on the most common sequential
>> writes (that's why I don't allocate CEPH_OSD_MAX_OP ops in the first
>> place).  Besides, pre-calculating the number of regions requires big
> 
> Care to be more specific about the overhead?  You have to go through
> enough dirty pages to either fill up the OSD request or move past
> the stripe unit no matter which type of write it is.  Allocating
> CEPH_OSD_MAX_OP right off the bat is of course a bad idea, especially
> in the sequential case, but that's not what I'm advocating for.  What
> I'm proposing would result in allocating a request with exactly the
> number of ops needed, in both sequential and random cases - no more, no
> less.

Scanning pages within a stripe unit requires calling pagevec_lookup_tag() multiple times. There are about 1000 pages in the worst case, which means we need to get/release pages 1000 times. In my opinion, that's non-trivial overhead.
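
(Back-of-the-envelope for that worst case, assuming the common 4 MB stripe
unit, 4 KB pages and a 14-entry pagevec: roughly 74 pagevec_lookup_tag()
calls, each taking and then dropping references on up to 14 pages.)

#include <stdio.h>

int main(void)
{
        unsigned long stripe_unit = 4UL << 20;  /* assumed 4 MB */
        unsigned long page_size = 4096;         /* assumed 4 KB pages */
        unsigned long pvec_size = 14;           /* assumed PAGEVEC_SIZE */

        unsigned long pages = stripe_unit / page_size;                /* 1024 */
        unsigned long lookups = (pages + pvec_size - 1) / pvec_size;  /* 74 */

        printf("%lu pages, ~%lu pagevec lookups per stripe unit\n",
               pages, lookups);
        return 0;
}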

Regards
Yan, Zheng

>  
>> modifications to both ceph_writepages_start() and
>> ceph_osdc_new_request() (we don't know where a stripe unit ends until
>> calling ceph_osdc_new_request()).
> 
> Well, calling calc_layout() to learn where the first page's stripe unit
> ends is trivial enough.  I get your reluctance - ceph_writepages_start()
> is not pleasant to deal with - but in my opinion that's not reason
> enough for reallocating memory in the middle of it, possibly more than
> once (!).
> 
> Thanks,
> 
>                Ilya

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V3 6/6] ceph: scattered page writeback
  2016-01-27  9:42       ` Yan, Zheng
@ 2016-01-27 10:45         ` Ilya Dryomov
  2016-01-27 11:00           ` Yan, Zheng
  0 siblings, 1 reply; 7+ messages in thread
From: Ilya Dryomov @ 2016-01-27 10:45 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Zheng Yan, Ceph Development

On Wed, Jan 27, 2016 at 10:42 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Jan 27, 2016, at 17:29, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Jan 27, 2016 at 2:52 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>> On Wed, Jan 27, 2016 at 6:15 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>> On Tue, Jan 19, 2016 at 2:30 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>> This patch makes ceph_writepages_start() try using single OSD request
>>>>> to write all dirty pages within a strip. When a nonconsecutive dirty
>>>>> page is found, ceph_writepages_start() tries starting a new write
>>>>> operation to existing OSD request. If it succeeds, it uses the new
>>>>> operation to writeback the dirty page.
>>>>>
>>>>> Signed-off-by: Yan, Zheng <zyan@redhat.com>
>>>>> ---
>>>>> fs/ceph/addr.c | 263 ++++++++++++++++++++++++++++++++++++++-------------------
>>>>> 1 file changed, 174 insertions(+), 89 deletions(-)
>>>>>
>>>>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>>>>> index c222137..ac14d02 100644
>>>>> --- a/fs/ceph/addr.c
>>>>> +++ b/fs/ceph/addr.c
>>>>> @@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
>>>>>        struct inode *inode = req->r_inode;
>>>>>        struct ceph_inode_info *ci = ceph_inode(inode);
>>>>>        struct ceph_osd_data *osd_data;
>>>>> -       unsigned wrote;
>>>>>        struct page *page;
>>>>> -       int num_pages;
>>>>> -       int i;
>>>>> +       int num_pages, total_pages = 0;
>>>>> +       int i, j;
>>>>> +       int rc = req->r_result;
>>>>>        struct ceph_snap_context *snapc = req->r_snapc;
>>>>>        struct address_space *mapping = inode->i_mapping;
>>>>> -       int rc = req->r_result;
>>>>> -       u64 bytes = req->r_ops[0].extent.length;
>>>>>        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
>>>>> -       long writeback_stat;
>>>>> -       unsigned issued = ceph_caps_issued(ci);
>>>>> +       bool remove_page;
>>>>>
>>>>> -       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>>> -       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
>>>>> -       num_pages = calc_pages_for((u64)osd_data->alignment,
>>>>> -                                       (u64)osd_data->length);
>>>>> -       if (rc >= 0) {
>>>>> -               /*
>>>>> -                * Assume we wrote the pages we originally sent.  The
>>>>> -                * osd might reply with fewer pages if our writeback
>>>>> -                * raced with a truncation and was adjusted at the osd,
>>>>> -                * so don't believe the reply.
>>>>> -                */
>>>>> -               wrote = num_pages;
>>>>> -       } else {
>>>>> -               wrote = 0;
>>>>> +
>>>>> +       dout("writepages_finish %p rc %d\n", inode, rc);
>>>>> +       if (rc < 0)
>>>>>                mapping_set_error(mapping, rc);
>>>>> -       }
>>>>> -       dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
>>>>> -            inode, rc, bytes, wrote);
>>>>>
>>>>> -       /* clean all pages */
>>>>> -       for (i = 0; i < num_pages; i++) {
>>>>> -               page = osd_data->pages[i];
>>>>> -               BUG_ON(!page);
>>>>> -               WARN_ON(!PageUptodate(page));
>>>>> +       /*
>>>>> +        * We lost the cache cap, need to truncate the page before
>>>>> +        * it is unlocked, otherwise we'd truncate it later in the
>>>>> +        * page truncation thread, possibly losing some data that
>>>>> +        * raced its way in
>>>>> +        */
>>>>> +       remove_page = !(ceph_caps_issued(ci) &
>>>>> +                       (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
>>>>>
>>>>> -               writeback_stat =
>>>>> -                       atomic_long_dec_return(&fsc->writeback_count);
>>>>> -               if (writeback_stat <
>>>>> -                   CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
>>>>> -                       clear_bdi_congested(&fsc->backing_dev_info,
>>>>> -                                           BLK_RW_ASYNC);
>>>>> +       /* clean all pages */
>>>>> +       for (i = 0; i < req->r_num_ops; i++) {
>>>>> +               if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
>>>>> +                       break;
>>>>>
>>>>> -               ceph_put_snap_context(page_snap_context(page));
>>>>> -               page->private = 0;
>>>>> -               ClearPagePrivate(page);
>>>>> -               dout("unlocking %d %p\n", i, page);
>>>>> -               end_page_writeback(page);
>>>>> +               osd_data = osd_req_op_extent_osd_data(req, i);
>>>>> +               BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
>>>>> +               num_pages = calc_pages_for((u64)osd_data->alignment,
>>>>> +                                          (u64)osd_data->length);
>>>>> +               total_pages += num_pages;
>>>>> +               for (j = 0; j < num_pages; j++) {
>>>>> +                       page = osd_data->pages[j];
>>>>> +                       BUG_ON(!page);
>>>>> +                       WARN_ON(!PageUptodate(page));
>>>>> +
>>>>> +                       if (atomic_long_dec_return(&fsc->writeback_count) <
>>>>> +                            CONGESTION_OFF_THRESH(
>>>>> +                                       fsc->mount_options->congestion_kb))
>>>>> +                               clear_bdi_congested(&fsc->backing_dev_info,
>>>>> +                                                   BLK_RW_ASYNC);
>>>>> +
>>>>> +                       ceph_put_snap_context(page_snap_context(page));
>>>>> +                       page->private = 0;
>>>>> +                       ClearPagePrivate(page);
>>>>> +                       dout("unlocking %p\n", page);
>>>>> +                       end_page_writeback(page);
>>>>> +
>>>>> +                       if (remove_page)
>>>>> +                               generic_error_remove_page(inode->i_mapping,
>>>>> +                                                         page);
>>>>>
>>>>> -               /*
>>>>> -                * We lost the cache cap, need to truncate the page before
>>>>> -                * it is unlocked, otherwise we'd truncate it later in the
>>>>> -                * page truncation thread, possibly losing some data that
>>>>> -                * raced its way in
>>>>> -                */
>>>>> -               if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
>>>>> -                       generic_error_remove_page(inode->i_mapping, page);
>>>>> +                       unlock_page(page);
>>>>> +               }
>>>>> +               dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
>>>>> +                    inode, osd_data->length, rc >= 0 ? num_pages : 0);
>>>>>
>>>>> -               unlock_page(page);
>>>>> +               ceph_release_pages(osd_data->pages, num_pages);
>>>>>        }
>>>>> -       dout("%p wrote+cleaned %d pages\n", inode, wrote);
>>>>> -       ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
>>>>>
>>>>> -       ceph_release_pages(osd_data->pages, num_pages);
>>>>> +       ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
>>>>> +
>>>>> +       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>>>        if (osd_data->pages_from_pool)
>>>>>                mempool_free(osd_data->pages,
>>>>>                             ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
>>>>> @@ -778,17 +778,15 @@ retry:
>>>>>        while (!done && index <= end) {
>>>>>                unsigned i;
>>>>>                int first;
>>>>> -               pgoff_t next;
>>>>> -               int pvec_pages, locked_pages;
>>>>> -               struct page **pages = NULL;
>>>>> +               pgoff_t strip_end = 0;
>>>>> +               int num_ops = 0, max_ops = 0;
>>>>> +               int pvec_pages, locked_pages = 0;
>>>>> +               struct page **pages = NULL, **data_pages = NULL;
>>>>>                mempool_t *pool = NULL; /* Becomes non-null if mempool used */
>>>>>                struct page *page;
>>>>>                int want;
>>>>> -               u64 offset, len;
>>>>> -               long writeback_stat;
>>>>> +               u64 offset = 0, len = 0;
>>>>>
>>>>> -               next = 0;
>>>>> -               locked_pages = 0;
>>>>>                max_pages = max_pages_ever;
>>>>>
>>>>> get_more_pages:
>>>>> @@ -824,8 +822,8 @@ get_more_pages:
>>>>>                                unlock_page(page);
>>>>>                                break;
>>>>>                        }
>>>>> -                       if (next && (page->index != next)) {
>>>>> -                               dout("not consecutive %p\n", page);
>>>>> +                       if (strip_end && (page->index > strip_end)) {
>>>>> +                               dout("end of strip %p\n", page);
>>>>>                                unlock_page(page);
>>>>>                                break;
>>>>>                        }
>>>>> @@ -871,28 +869,60 @@ get_more_pages:
>>>>>                         * that it will use.
>>>>>                         */
>>>>>                        if (locked_pages == 0) {
>>>>> +                               int j;
>>>>>                                BUG_ON(pages);
>>>>>                                /* prepare async write request */
>>>>>                                offset = (u64)page_offset(page);
>>>>>                                len = wsize;
>>>>> -                               req = ceph_osdc_new_request(&fsc->client->osdc,
>>>>> +
>>>>> +                               max_ops = 1 + do_sync;
>>>>> +                               strip_end = page->index +
>>>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>>> +                               for (j = i + 1; j < pvec_pages; j++) {
>>>>> +                                       if (pvec.pages[j]->index > strip_end)
>>>>> +                                               break;
>>>>> +                                       if (pvec.pages[j]->index - 1 !=
>>>>> +                                           pvec.pages[j - 1]->index)
>>>>> +                                               max_ops++;
>>>>> +                               }
>>>>> +
>>>>> +                               if (max_ops > CEPH_OSD_INITIAL_OP) {
>>>>> +                                       max_ops = min(max_ops, CEPH_OSD_MAX_OP);
>>>>> +                                       req = ceph_osdc_new_request(
>>>>> +                                                       &fsc->client->osdc,
>>>>>                                                        &ci->i_layout, vino,
>>>>> -                                                       offset, &len, 0,
>>>>> -                                                       do_sync ? 2 : 1,
>>>>> +                                                       offset, &len,
>>>>> +                                                       0, max_ops,
>>>>> +                                                       CEPH_OSD_OP_WRITE,
>>>>> +                                                       CEPH_OSD_FLAG_WRITE |
>>>>> +                                                       CEPH_OSD_FLAG_ONDISK,
>>>>> +                                                       snapc, truncate_seq,
>>>>> +                                                       truncate_size, false);
>>>>> +                                       if (IS_ERR(req))
>>>>> +                                               req = NULL;
>>>>> +                               }
>>>>> +                               if (!req) {
>>>>> +                                       max_ops = CEPH_OSD_INITIAL_OP;
>>>>> +                                       req = ceph_osdc_new_request(
>>>>> +                                                       &fsc->client->osdc,
>>>>> +                                                       &ci->i_layout, vino,
>>>>> +                                                       offset, &len,
>>>>> +                                                       0, max_ops,
>>>>>                                                        CEPH_OSD_OP_WRITE,
>>>>>                                                        CEPH_OSD_FLAG_WRITE |
>>>>>                                                        CEPH_OSD_FLAG_ONDISK,
>>>>>                                                        snapc, truncate_seq,
>>>>>                                                        truncate_size, true);
>>>>> -                               if (IS_ERR(req)) {
>>>>> -                                       rc = PTR_ERR(req);
>>>>> -                                       unlock_page(page);
>>>>> -                                       break;
>>>>> +                                       if (IS_ERR(req)) {
>>>>> +                                               unlock_page(page);
>>>>> +                                               rc = PTR_ERR(req);
>>>>> +                                               req = NULL;
>>>>> +                                               break;
>>>>> +                                       }
>>>>>                                }
>>>>>
>>>>> -                               if (do_sync)
>>>>> -                                       osd_req_op_init(req, 1,
>>>>> -                                                       CEPH_OSD_OP_STARTSYNC, 0);
>>>>> +                               strip_end = page->index +
>>>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>>>
>>>>>                                req->r_callback = writepages_finish;
>>>>>                                req->r_inode = inode;
>>>>> @@ -905,6 +935,60 @@ get_more_pages:
>>>>>                                        pages = mempool_alloc(pool, GFP_NOFS);
>>>>>                                        BUG_ON(!pages);
>>>>>                                }
>>>>> +
>>>>> +                               num_ops = 1;
>>>>> +                               data_pages = pages;
>>>>> +                               len = 0;
>>>>> +                       } else if (page->index !=
>>>>> +                                  (offset + len) >> PAGE_CACHE_SHIFT) {
>>>>> +                               u64 offset_inc;
>>>>> +                               bool new_op = true;
>>>>> +
>>>>> +                               if (num_ops + do_sync == CEPH_OSD_MAX_OP) {
>>>>> +                                       new_op = false;
>>>>> +                               } else if (num_ops + do_sync == max_ops) {
>>>>> +                                       int new_max = max_ops;
>>>>> +                                       int j;
>>>>> +                                       for (j = i + 1; j < pvec_pages; j++) {
>>>>> +                                               if (pvec.pages[j]->index >
>>>>> +                                                   strip_end)
>>>>> +                                                       break;
>>>>> +                                               if (pvec.pages[j]->index - 1 !=
>>>>> +                                                   pvec.pages[j - 1]->index)
>>>>> +                                                       new_max++;
>>>>> +                                       }
>>>>> +                                       new_max++;
>>>>> +                                       new_max = min(new_max, CEPH_OSD_MAX_OP);
>>>>> +                                       if (ceph_osdc_extend_request(req,
>>>>> +                                                               new_max,
>>>>> +                                                               GFP_NOFS) >= 0)
>>>>> +                                               max_ops = new_max;
>>>>> +                                       else
>>>>> +                                               new_op = false;
>>>>> +                               }
>>>>> +                               if (!new_op) {
>>>>> +                                       redirty_page_for_writepage(wbc, page);
>>>>> +                                       unlock_page(page);
>>>>> +                                       break;
>>>>> +                               }
>>>>> +
>>>>> +                               offset_inc = (u64)page_offset(page) - offset;
>>>>> +                               osd_req_op_extent_dup_last(req, offset_inc);
>>>>> +
>>>>> +                               dout("writepages got pages at %llu~%llu\n",
>>>>> +                                     offset, len);
>>>>> +
>>>>> +                               osd_req_op_extent_update(req, num_ops - 1, len);
>>>>> +                               osd_req_op_extent_osd_data_pages(req,
>>>>> +                                                               num_ops - 1,
>>>>> +                                                               data_pages,
>>>>> +                                                               len, 0,
>>>>> +                                                               !!pool, false);
>>>>> +
>>>>> +                               num_ops++;
>>>>> +                               data_pages = pages + locked_pages;
>>>>> +                               offset = (u64)page_offset(page);
>>>>> +                               len = 0;
>>>>>                        }
>>>>>
>>>>>                        /* note position of first page in pvec */
>>>>> @@ -913,9 +997,8 @@ get_more_pages:
>>>>>                        dout("%p will write page %p idx %lu\n",
>>>>>                             inode, page, page->index);
>>>>>
>>>>> -                       writeback_stat =
>>>>> -                              atomic_long_inc_return(&fsc->writeback_count);
>>>>> -                       if (writeback_stat > CONGESTION_ON_THRESH(
>>>>> +                       if (atomic_long_inc_return(&fsc->writeback_count) >
>>>>> +                           CONGESTION_ON_THRESH(
>>>>>                                    fsc->mount_options->congestion_kb)) {
>>>>>                                set_bdi_congested(&fsc->backing_dev_info,
>>>>>                                                  BLK_RW_ASYNC);
>>>>> @@ -924,7 +1007,7 @@ get_more_pages:
>>>>>                        set_page_writeback(page);
>>>>>                        pages[locked_pages] = page;
>>>>>                        locked_pages++;
>>>>> -                       next = page->index + 1;
>>>>> +                       len += PAGE_CACHE_SIZE;
>>>>>                }
>>>>>
>>>>>                /* did we get anything? */
>>>>> @@ -952,31 +1035,34 @@ get_more_pages:
>>>>>                }
>>>>>
>>>>>                /* Format the osd request message and submit the write */
>>>>> -               offset = page_offset(pages[0]);
>>>>> -               len = (u64)locked_pages << PAGE_CACHE_SHIFT;
>>>>>                if (snap_size == -1) {
>>>>> +                       /* writepages_finish() clears writeback pages
>>>>> +                        * according to the data length, so make sure
>>>>> +                        * data length covers all locked pages */
>>>>> +                       u64 min_len = len + 1 - PAGE_CACHE_SIZE;
>>>>>                        len = min(len, (u64)i_size_read(inode) - offset);
>>>>> -                        /* writepages_finish() clears writeback pages
>>>>> -                         * according to the data length, so make sure
>>>>> -                         * data length covers all locked pages */
>>>>> -                       len = max(len, 1 +
>>>>> -                               ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
>>>>> +                       len = max(len, min_len);
>>>>>                } else {
>>>>>                        len = min(len, snap_size - offset);
>>>>>                }
>>>>> -               dout("writepages got %d pages at %llu~%llu\n",
>>>>> -                    locked_pages, offset, len);
>>>>> +               dout("writepages got pages at %llu~%llu\n", offset, len);
>>>>>
>>>>> -               osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
>>>>> -                                                       !!pool, false);
>>>>> +               osd_req_op_extent_osd_data_pages(req, num_ops - 1, data_pages,
>>>>> +                                                len, 0, !!pool, false);
>>>>> +               osd_req_op_extent_update(req, num_ops - 1, len);
>>>>>
>>>>> +               if (do_sync) {
>>>>> +                       osd_req_op_init(req, num_ops, CEPH_OSD_OP_STARTSYNC, 0);
>>>>> +                       num_ops++;
>>>>> +               }
>>>>> +               BUG_ON(num_ops > max_ops);
>>>>> +
>>>>> +               index = pages[locked_pages - 1]->index + 1;
>>>>>                pages = NULL;   /* request message now owns the pages array */
>>>>>                pool = NULL;
>>>>>
>>>>>                /* Update the write op length in case we changed it */
>>>>>
>>>>> -               osd_req_op_extent_update(req, 0, len);
>>>>> -
>>>>>                vino = ceph_vino(inode);
>>>>>                ceph_osdc_build_request(req, offset, snapc, vino.snap,
>>>>>                                        &inode->i_mtime);
>>>>> @@ -986,7 +1072,6 @@ get_more_pages:
>>>>>                req = NULL;
>>>>>
>>>>>                /* continue? */
>>>>> -               index = next;
>>>>>                wbc->nr_to_write -= locked_pages;
>>>>>                if (wbc->nr_to_write <= 0)
>>>>>                        done = 1;
>>>>
>>>> Hi Zheng,
>>>>
>>>> I know I mentioned off-list that v3 looked good, but I've taken
>>>> a deeper look and have to retract that.  This version still has a call
>>>> to ceph_osdc_extend_request(), which suggests to me that the number of
>>>> ops is calculated incorrectly.
>>>>
>>>> A single op within an OSD request can be used for a single region of
>>>> contiguous dirty pages, so what this comes down to is calculating the
>>>> number of such dirty contiguous regions.  What you appear to be doing
>>>> in this version is calculating this number for the *first* gang of
>>>> pages we get from pagevec_lookup_tag() and allocating an OSD request
>>>> based on that.  If that's not enough (i.e. there are more dirty pages
>>>> in the stripe unit), we move on, you do the calculation for the next
>>>> gang and enlarge the OSD request by reallocating parts of it.  This
>>>> goes on until we either run out of dirty pages in the stripe unit or
>>>> reach CEPH_OSD_MAX_OP ops in the request.
>>>>
>>>> What I don't understand is what prevents us from pre-calculating the
>>>> number of dirty contiguous regions for the entire stripe unit in one go
>>>> and trying to allocate the OSD request based on that number, possibly
>>>> falling back to a smaller request if we are short on memory.  The page
>>>> pointer array can hold all of the pages in the stripe unit - I don't see
>>>> the need for ceph_osdc_extend_request() at all.  Am I missing something?
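To make the sizing concrete, here is a minimal standalone C sketch (not
kernel code) of the approach described above: count the contiguous runs
of dirty pages in a stripe unit, ask for exactly that many write ops,
and fall back to a smaller request if the allocation fails.  Both
count_dirty_regions() and try_alloc_request() are invented stand-ins
for illustration, not functions from the patch or libceph.

/*
 * Illustrative sketch only: size an OSD request from the number of
 * contiguous dirty-page runs in one stripe unit.  try_alloc_request()
 * stands in for ceph_osdc_new_request(); the real call also takes the
 * layout, vino, snap context, truncate bounds, etc.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* count contiguous runs in a sorted list of dirty page indices */
static unsigned int count_dirty_regions(const unsigned long *index, size_t n)
{
	unsigned int regions = 0;

	for (size_t i = 0; i < n; i++) {
		/* a run starts at the first page and after every gap */
		if (i == 0 || index[i] != index[i - 1] + 1)
			regions++;
	}
	return regions;
}

/* pretend allocator: a large op count may fail under memory pressure */
static bool try_alloc_request(unsigned int num_ops)
{
	printf("would allocate an OSD request with %u write op(s)\n", num_ops);
	return true;
}

int main(void)
{
	/* hypothetical dirty pages in one stripe unit: three runs */
	const unsigned long dirty[] = { 10, 11, 12, 20, 21, 40 };
	unsigned int ops = count_dirty_regions(dirty,
					       sizeof(dirty) / sizeof(dirty[0]));

	if (!try_alloc_request(ops))	/* exact fit, no later extension */
		try_alloc_request(1);	/* smaller fallback if memory is short */
	return 0;
}

On the example indices the sketch reports three ops, one per contiguous
run, which is what an exact-fit allocation would request up front.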
>>>>
>>>
>>> pre-calculating the number of regions in the stripe unit is ideal for
>>> random writes, but it imposes overhead on the most common sequential
>>> write (that's why I don't allocate CEPH_OSD_MAX_OP ops in the first
>>> place).  Besides, pre-calculating the number of regions requires big
>>
>> Care to be more specific about the overhead?  You have to go through
>> enough of the dirty pages to either fill up the OSD request or move past
>> the stripe unit no matter which type of write it is.  Allocating
>> CEPH_OSD_MAX_OP right off the bat is of course a bad idea, especially
>> in the sequential case, but that's not what I'm advocating for.  What
>> I'm proposing would result in allocating a request with exactly the
>> number of ops needed, in both sequential and random cases - no more, no
>> less.
>
> Scanning the pages within a stripe unit requires calling pagevec_lookup_tag() multiple times. There are about 1000 pages in the worst case. It means we need to get/release pages 1000 times. In my opinion, it’s non-trivial overhead.
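(For scale: assuming the common 4 MiB stripe unit and 4 KiB pages,
values not stated in this thread, a full stripe unit is
4 MiB / 4 KiB = 1024 pages, which matches the "about 1000 pages" worst
case above.)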

Which page would have to be gotten/released 1000 times?  We already
call pagevec_lookup_tag() more than once, until we either move past the
stripe unit or fill up the OSD request.  Pre-calculating the number of
dirty regions properly doesn't entail any additional calls to it.
Whether we lock a page or release it, it happens once.  Locked pages
are stashed in the 'pages' page pointer array - is there a reason we
can't defer the OSD request allocation until after we lock and stash
all of the pages we are interested in?  Unless I'm misreading the code,
the page pointer array is always big enough to accommodate them.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V3 6/6] ceph: scattered page writeback
  2016-01-27 10:45         ` Ilya Dryomov
@ 2016-01-27 11:00           ` Yan, Zheng
  0 siblings, 0 replies; 7+ messages in thread
From: Yan, Zheng @ 2016-01-27 11:00 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Zheng Yan, Ceph Development


> On Jan 27, 2016, at 18:45, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Jan 27, 2016 at 10:42 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Jan 27, 2016, at 17:29, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Wed, Jan 27, 2016 at 2:52 AM, Yan, Zheng <ukernel@gmail.com> wrote:
>>>> On Wed, Jan 27, 2016 at 6:15 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> On Tue, Jan 19, 2016 at 2:30 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> This patch makes ceph_writepages_start() try using single OSD request
>>>>>> to write all dirty pages within a strip. When a nonconsecutive dirty
>>>>>> page is found, ceph_writepages_start() tries starting a new write
>>>>>> operation to existing OSD request. If it succeeds, it uses the new
>>>>>> operation to writeback the dirty page.
>>>>>> 
>>>>>> Signed-off-by: Yan, Zheng <zyan@redhat.com>
>>>>>> ---
>>>>>> fs/ceph/addr.c | 263 ++++++++++++++++++++++++++++++++++++++-------------------
>>>>>> 1 file changed, 174 insertions(+), 89 deletions(-)
>>>>>> 
>>>>>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>>>>>> index c222137..ac14d02 100644
>>>>>> --- a/fs/ceph/addr.c
>>>>>> +++ b/fs/ceph/addr.c
>>>>>> @@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
>>>>>>       struct inode *inode = req->r_inode;
>>>>>>       struct ceph_inode_info *ci = ceph_inode(inode);
>>>>>>       struct ceph_osd_data *osd_data;
>>>>>> -       unsigned wrote;
>>>>>>       struct page *page;
>>>>>> -       int num_pages;
>>>>>> -       int i;
>>>>>> +       int num_pages, total_pages = 0;
>>>>>> +       int i, j;
>>>>>> +       int rc = req->r_result;
>>>>>>       struct ceph_snap_context *snapc = req->r_snapc;
>>>>>>       struct address_space *mapping = inode->i_mapping;
>>>>>> -       int rc = req->r_result;
>>>>>> -       u64 bytes = req->r_ops[0].extent.length;
>>>>>>       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
>>>>>> -       long writeback_stat;
>>>>>> -       unsigned issued = ceph_caps_issued(ci);
>>>>>> +       bool remove_page;
>>>>>> 
>>>>>> -       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>>>> -       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
>>>>>> -       num_pages = calc_pages_for((u64)osd_data->alignment,
>>>>>> -                                       (u64)osd_data->length);
>>>>>> -       if (rc >= 0) {
>>>>>> -               /*
>>>>>> -                * Assume we wrote the pages we originally sent.  The
>>>>>> -                * osd might reply with fewer pages if our writeback
>>>>>> -                * raced with a truncation and was adjusted at the osd,
>>>>>> -                * so don't believe the reply.
>>>>>> -                */
>>>>>> -               wrote = num_pages;
>>>>>> -       } else {
>>>>>> -               wrote = 0;
>>>>>> +
>>>>>> +       dout("writepages_finish %p rc %d\n", inode, rc);
>>>>>> +       if (rc < 0)
>>>>>>               mapping_set_error(mapping, rc);
>>>>>> -       }
>>>>>> -       dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
>>>>>> -            inode, rc, bytes, wrote);
>>>>>> 
>>>>>> -       /* clean all pages */
>>>>>> -       for (i = 0; i < num_pages; i++) {
>>>>>> -               page = osd_data->pages[i];
>>>>>> -               BUG_ON(!page);
>>>>>> -               WARN_ON(!PageUptodate(page));
>>>>>> +       /*
>>>>>> +        * We lost the cache cap, need to truncate the page before
>>>>>> +        * it is unlocked, otherwise we'd truncate it later in the
>>>>>> +        * page truncation thread, possibly losing some data that
>>>>>> +        * raced its way in
>>>>>> +        */
>>>>>> +       remove_page = !(ceph_caps_issued(ci) &
>>>>>> +                       (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
>>>>>> 
>>>>>> -               writeback_stat =
>>>>>> -                       atomic_long_dec_return(&fsc->writeback_count);
>>>>>> -               if (writeback_stat <
>>>>>> -                   CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
>>>>>> -                       clear_bdi_congested(&fsc->backing_dev_info,
>>>>>> -                                           BLK_RW_ASYNC);
>>>>>> +       /* clean all pages */
>>>>>> +       for (i = 0; i < req->r_num_ops; i++) {
>>>>>> +               if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
>>>>>> +                       break;
>>>>>> 
>>>>>> -               ceph_put_snap_context(page_snap_context(page));
>>>>>> -               page->private = 0;
>>>>>> -               ClearPagePrivate(page);
>>>>>> -               dout("unlocking %d %p\n", i, page);
>>>>>> -               end_page_writeback(page);
>>>>>> +               osd_data = osd_req_op_extent_osd_data(req, i);
>>>>>> +               BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
>>>>>> +               num_pages = calc_pages_for((u64)osd_data->alignment,
>>>>>> +                                          (u64)osd_data->length);
>>>>>> +               total_pages += num_pages;
>>>>>> +               for (j = 0; j < num_pages; j++) {
>>>>>> +                       page = osd_data->pages[j];
>>>>>> +                       BUG_ON(!page);
>>>>>> +                       WARN_ON(!PageUptodate(page));
>>>>>> +
>>>>>> +                       if (atomic_long_dec_return(&fsc->writeback_count) <
>>>>>> +                            CONGESTION_OFF_THRESH(
>>>>>> +                                       fsc->mount_options->congestion_kb))
>>>>>> +                               clear_bdi_congested(&fsc->backing_dev_info,
>>>>>> +                                                   BLK_RW_ASYNC);
>>>>>> +
>>>>>> +                       ceph_put_snap_context(page_snap_context(page));
>>>>>> +                       page->private = 0;
>>>>>> +                       ClearPagePrivate(page);
>>>>>> +                       dout("unlocking %p\n", page);
>>>>>> +                       end_page_writeback(page);
>>>>>> +
>>>>>> +                       if (remove_page)
>>>>>> +                               generic_error_remove_page(inode->i_mapping,
>>>>>> +                                                         page);
>>>>>> 
>>>>>> -               /*
>>>>>> -                * We lost the cache cap, need to truncate the page before
>>>>>> -                * it is unlocked, otherwise we'd truncate it later in the
>>>>>> -                * page truncation thread, possibly losing some data that
>>>>>> -                * raced its way in
>>>>>> -                */
>>>>>> -               if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
>>>>>> -                       generic_error_remove_page(inode->i_mapping, page);
>>>>>> +                       unlock_page(page);
>>>>>> +               }
>>>>>> +               dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
>>>>>> +                    inode, osd_data->length, rc >= 0 ? num_pages : 0);
>>>>>> 
>>>>>> -               unlock_page(page);
>>>>>> +               ceph_release_pages(osd_data->pages, num_pages);
>>>>>>       }
>>>>>> -       dout("%p wrote+cleaned %d pages\n", inode, wrote);
>>>>>> -       ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
>>>>>> 
>>>>>> -       ceph_release_pages(osd_data->pages, num_pages);
>>>>>> +       ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
>>>>>> +
>>>>>> +       osd_data = osd_req_op_extent_osd_data(req, 0);
>>>>>>       if (osd_data->pages_from_pool)
>>>>>>               mempool_free(osd_data->pages,
>>>>>>                            ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
>>>>>> @@ -778,17 +778,15 @@ retry:
>>>>>>       while (!done && index <= end) {
>>>>>>               unsigned i;
>>>>>>               int first;
>>>>>> -               pgoff_t next;
>>>>>> -               int pvec_pages, locked_pages;
>>>>>> -               struct page **pages = NULL;
>>>>>> +               pgoff_t strip_end = 0;
>>>>>> +               int num_ops = 0, max_ops = 0;
>>>>>> +               int pvec_pages, locked_pages = 0;
>>>>>> +               struct page **pages = NULL, **data_pages = NULL;
>>>>>>               mempool_t *pool = NULL; /* Becomes non-null if mempool used */
>>>>>>               struct page *page;
>>>>>>               int want;
>>>>>> -               u64 offset, len;
>>>>>> -               long writeback_stat;
>>>>>> +               u64 offset = 0, len = 0;
>>>>>> 
>>>>>> -               next = 0;
>>>>>> -               locked_pages = 0;
>>>>>>               max_pages = max_pages_ever;
>>>>>> 
>>>>>> get_more_pages:
>>>>>> @@ -824,8 +822,8 @@ get_more_pages:
>>>>>>                               unlock_page(page);
>>>>>>                               break;
>>>>>>                       }
>>>>>> -                       if (next && (page->index != next)) {
>>>>>> -                               dout("not consecutive %p\n", page);
>>>>>> +                       if (strip_end && (page->index > strip_end)) {
>>>>>> +                               dout("end of strip %p\n", page);
>>>>>>                               unlock_page(page);
>>>>>>                               break;
>>>>>>                       }
>>>>>> @@ -871,28 +869,60 @@ get_more_pages:
>>>>>>                        * that it will use.
>>>>>>                        */
>>>>>>                       if (locked_pages == 0) {
>>>>>> +                               int j;
>>>>>>                               BUG_ON(pages);
>>>>>>                               /* prepare async write request */
>>>>>>                               offset = (u64)page_offset(page);
>>>>>>                               len = wsize;
>>>>>> -                               req = ceph_osdc_new_request(&fsc->client->osdc,
>>>>>> +
>>>>>> +                               max_ops = 1 + do_sync;
>>>>>> +                               strip_end = page->index +
>>>>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>>>> +                               for (j = i + 1; j < pvec_pages; j++) {
>>>>>> +                                       if (pvec.pages[j]->index > strip_end)
>>>>>> +                                               break;
>>>>>> +                                       if (pvec.pages[j]->index - 1 !=
>>>>>> +                                           pvec.pages[j - 1]->index)
>>>>>> +                                               max_ops++;
>>>>>> +                               }
>>>>>> +
>>>>>> +                               if (max_ops > CEPH_OSD_INITIAL_OP) {
>>>>>> +                                       max_ops = min(max_ops, CEPH_OSD_MAX_OP);
>>>>>> +                                       req = ceph_osdc_new_request(
>>>>>> +                                                       &fsc->client->osdc,
>>>>>>                                                       &ci->i_layout, vino,
>>>>>> -                                                       offset, &len, 0,
>>>>>> -                                                       do_sync ? 2 : 1,
>>>>>> +                                                       offset, &len,
>>>>>> +                                                       0, max_ops,
>>>>>> +                                                       CEPH_OSD_OP_WRITE,
>>>>>> +                                                       CEPH_OSD_FLAG_WRITE |
>>>>>> +                                                       CEPH_OSD_FLAG_ONDISK,
>>>>>> +                                                       snapc, truncate_seq,
>>>>>> +                                                       truncate_size, false);
>>>>>> +                                       if (IS_ERR(req))
>>>>>> +                                               req = NULL;
>>>>>> +                               }
>>>>>> +                               if (!req) {
>>>>>> +                                       max_ops = CEPH_OSD_INITIAL_OP;
>>>>>> +                                       req = ceph_osdc_new_request(
>>>>>> +                                                       &fsc->client->osdc,
>>>>>> +                                                       &ci->i_layout, vino,
>>>>>> +                                                       offset, &len,
>>>>>> +                                                       0, max_ops,
>>>>>>                                                       CEPH_OSD_OP_WRITE,
>>>>>>                                                       CEPH_OSD_FLAG_WRITE |
>>>>>>                                                       CEPH_OSD_FLAG_ONDISK,
>>>>>>                                                       snapc, truncate_seq,
>>>>>>                                                       truncate_size, true);
>>>>>> -                               if (IS_ERR(req)) {
>>>>>> -                                       rc = PTR_ERR(req);
>>>>>> -                                       unlock_page(page);
>>>>>> -                                       break;
>>>>>> +                                       if (IS_ERR(req)) {
>>>>>> +                                               unlock_page(page);
>>>>>> +                                               rc = PTR_ERR(req);
>>>>>> +                                               req = NULL;
>>>>>> +                                               break;
>>>>>> +                                       }
>>>>>>                               }
>>>>>> 
>>>>>> -                               if (do_sync)
>>>>>> -                                       osd_req_op_init(req, 1,
>>>>>> -                                                       CEPH_OSD_OP_STARTSYNC, 0);
>>>>>> +                               strip_end = page->index +
>>>>>> +                                           ((len - 1) >> PAGE_CACHE_SHIFT);
>>>>>> 
>>>>>>                               req->r_callback = writepages_finish;
>>>>>>                               req->r_inode = inode;
>>>>>> @@ -905,6 +935,60 @@ get_more_pages:
>>>>>>                                       pages = mempool_alloc(pool, GFP_NOFS);
>>>>>>                                       BUG_ON(!pages);
>>>>>>                               }
>>>>>> +
>>>>>> +                               num_ops = 1;
>>>>>> +                               data_pages = pages;
>>>>>> +                               len = 0;
>>>>>> +                       } else if (page->index !=
>>>>>> +                                  (offset + len) >> PAGE_CACHE_SHIFT) {
>>>>>> +                               u64 offset_inc;
>>>>>> +                               bool new_op = true;
>>>>>> +
>>>>>> +                               if (num_ops + do_sync == CEPH_OSD_MAX_OP) {
>>>>>> +                                       new_op = false;
>>>>>> +                               } else if (num_ops + do_sync == max_ops) {
>>>>>> +                                       int new_max = max_ops;
>>>>>> +                                       int j;
>>>>>> +                                       for (j = i + 1; j < pvec_pages; j++) {
>>>>>> +                                               if (pvec.pages[j]->index >
>>>>>> +                                                   strip_end)
>>>>>> +                                                       break;
>>>>>> +                                               if (pvec.pages[j]->index - 1 !=
>>>>>> +                                                   pvec.pages[j - 1]->index)
>>>>>> +                                                       new_max++;
>>>>>> +                                       }
>>>>>> +                                       new_max++;
>>>>>> +                                       new_max = min(new_max, CEPH_OSD_MAX_OP);
>>>>>> +                                       if (ceph_osdc_extend_request(req,
>>>>>> +                                                               new_max,
>>>>>> +                                                               GFP_NOFS) >= 0)
>>>>>> +                                               max_ops = new_max;
>>>>>> +                                       else
>>>>>> +                                               new_op = false;
>>>>>> +                               }
>>>>>> +                               if (!new_op) {
>>>>>> +                                       redirty_page_for_writepage(wbc, page);
>>>>>> +                                       unlock_page(page);
>>>>>> +                                       break;
>>>>>> +                               }
>>>>>> +
>>>>>> +                               offset_inc = (u64)page_offset(page) - offset;
>>>>>> +                               osd_req_op_extent_dup_last(req, offset_inc);
>>>>>> +
>>>>>> +                               dout("writepages got pages at %llu~%llu\n",
>>>>>> +                                     offset, len);
>>>>>> +
>>>>>> +                               osd_req_op_extent_update(req, num_ops - 1, len);
>>>>>> +                               osd_req_op_extent_osd_data_pages(req,
>>>>>> +                                                               num_ops - 1,
>>>>>> +                                                               data_pages,
>>>>>> +                                                               len, 0,
>>>>>> +                                                               !!pool, false);
>>>>>> +
>>>>>> +                               num_ops++;
>>>>>> +                               data_pages = pages + locked_pages;
>>>>>> +                               offset = (u64)page_offset(page);
>>>>>> +                               len = 0;
>>>>>>                       }
>>>>>> 
>>>>>>                       /* note position of first page in pvec */
>>>>>> @@ -913,9 +997,8 @@ get_more_pages:
>>>>>>                       dout("%p will write page %p idx %lu\n",
>>>>>>                            inode, page, page->index);
>>>>>> 
>>>>>> -                       writeback_stat =
>>>>>> -                              atomic_long_inc_return(&fsc->writeback_count);
>>>>>> -                       if (writeback_stat > CONGESTION_ON_THRESH(
>>>>>> +                       if (atomic_long_inc_return(&fsc->writeback_count) >
>>>>>> +                           CONGESTION_ON_THRESH(
>>>>>>                                   fsc->mount_options->congestion_kb)) {
>>>>>>                               set_bdi_congested(&fsc->backing_dev_info,
>>>>>>                                                 BLK_RW_ASYNC);
>>>>>> @@ -924,7 +1007,7 @@ get_more_pages:
>>>>>>                       set_page_writeback(page);
>>>>>>                       pages[locked_pages] = page;
>>>>>>                       locked_pages++;
>>>>>> -                       next = page->index + 1;
>>>>>> +                       len += PAGE_CACHE_SIZE;
>>>>>>               }
>>>>>> 
>>>>>>               /* did we get anything? */
>>>>>> @@ -952,31 +1035,34 @@ get_more_pages:
>>>>>>               }
>>>>>> 
>>>>>>               /* Format the osd request message and submit the write */
>>>>>> -               offset = page_offset(pages[0]);
>>>>>> -               len = (u64)locked_pages << PAGE_CACHE_SHIFT;
>>>>>>               if (snap_size == -1) {
>>>>>> +                       /* writepages_finish() clears writeback pages
>>>>>> +                        * according to the data length, so make sure
>>>>>> +                        * data length covers all locked pages */
>>>>>> +                       u64 min_len = len + 1 - PAGE_CACHE_SIZE;
>>>>>>                       len = min(len, (u64)i_size_read(inode) - offset);
>>>>>> -                        /* writepages_finish() clears writeback pages
>>>>>> -                         * according to the data length, so make sure
>>>>>> -                         * data length covers all locked pages */
>>>>>> -                       len = max(len, 1 +
>>>>>> -                               ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
>>>>>> +                       len = max(len, min_len);
>>>>>>               } else {
>>>>>>                       len = min(len, snap_size - offset);
>>>>>>               }
>>>>>> -               dout("writepages got %d pages at %llu~%llu\n",
>>>>>> -                    locked_pages, offset, len);
>>>>>> +               dout("writepages got pages at %llu~%llu\n", offset, len);
>>>>>> 
>>>>>> -               osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
>>>>>> -                                                       !!pool, false);
>>>>>> +               osd_req_op_extent_osd_data_pages(req, num_ops - 1, data_pages,
>>>>>> +                                                len, 0, !!pool, false);
>>>>>> +               osd_req_op_extent_update(req, num_ops - 1, len);
>>>>>> 
>>>>>> +               if (do_sync) {
>>>>>> +                       osd_req_op_init(req, num_ops, CEPH_OSD_OP_STARTSYNC, 0);
>>>>>> +                       num_ops++;
>>>>>> +               }
>>>>>> +               BUG_ON(num_ops > max_ops);
>>>>>> +
>>>>>> +               index = pages[locked_pages - 1]->index + 1;
>>>>>>               pages = NULL;   /* request message now owns the pages array */
>>>>>>               pool = NULL;
>>>>>> 
>>>>>>               /* Update the write op length in case we changed it */
>>>>>> 
>>>>>> -               osd_req_op_extent_update(req, 0, len);
>>>>>> -
>>>>>>               vino = ceph_vino(inode);
>>>>>>               ceph_osdc_build_request(req, offset, snapc, vino.snap,
>>>>>>                                       &inode->i_mtime);
>>>>>> @@ -986,7 +1072,6 @@ get_more_pages:
>>>>>>               req = NULL;
>>>>>> 
>>>>>>               /* continue? */
>>>>>> -               index = next;
>>>>>>               wbc->nr_to_write -= locked_pages;
>>>>>>               if (wbc->nr_to_write <= 0)
>>>>>>                       done = 1;
>>>>> 
>>>>> Hi Zheng,
>>>>> 
>>>>> I know I mentioned off-list that v3 looked good, but I've taken
>>>>> a deeper look and have to retract that.  This version still has a call
>>>>> to ceph_osdc_extend_request(), which suggests to me that the number of
>>>>> ops is calculated incorrectly.
>>>>> 
>>>>> A single op within an OSD request can be used for a single region of
>>>>> contiguous dirty pages, so what this comes down to is calculating the
>>>>> number of such dirty contiguous regions.  What you appear to be doing
>>>>> in this version is calculating this number for the *first* gang of
>>>>> pages we get from pagevec_lookup_tag() and allocating an OSD request
>>>>> based on that.  If that's not enough (i.e. there are more dirty pages
>>>>> in the stripe unit), we move on, you do the calculation for the next
>>>>> gang and enlarge the OSD request by reallocating parts of it.  This
>>>>> goes on until we either run out of dirty pages in the stripe unit or
>>>>> reach CEPH_OSD_MAX_OP ops in the request.
>>>>> 
>>>>> What I don't understand is what prevents us from pre-calculating the
>>>>> number of dirty contiguous regions for the entire stripe unit in one go
>>>>> and trying to allocate the OSD request based on that number, possibly
>>>>> falling back to a smaller request if we are short on memory.  The page
>>>>> pointer array can hold all of the pages in the stripe unit - I don't see
>>>>> the need for ceph_osdc_extend_request() at all.  Am I missing something?
>>>>> 
>>>> 
>>>> pre-calculating the number of regions in the stripe unit is ideal for
>>>> random writes, but it imposes overhead on the most common sequential
>>>> write (that's why I don't allocate CEPH_OSD_MAX_OP ops in the first
>>>> place).  Besides, pre-calculating the number of regions requires big
>>> 
>>> Care to be more specific about the overhead?  You have to go through
>>> enough of the dirty pages to either fill up the OSD request or move past
>>> the stripe unit no matter which type of write it is.  Allocating
>>> CEPH_OSD_MAX_OP right off the bat is of course a bad idea, especially
>>> in the sequential case, but that's not what I'm advocating for.  What
>>> I'm proposing would result in allocating a request with exactly the
>>> number of ops needed, in both sequential and random cases - no more, no
>>> less.
>> 
>> Scanning the pages within a stripe unit requires calling pagevec_lookup_tag() multiple times. There are about 1000 pages in the worst case. It means we need to get/release pages 1000 times. In my opinion, it’s non-trivial overhead.
> 
> Which page would have to be gotten/released 1000 times?  We already
> call pagevec_lookup_tag() more than once, until we either move past the
> stripe unit or fill up the OSD request.  Pre-calculating the number of
> dirty regions properly doesn't entail any additional calls to it.
> Whether we lock a page or release it, it happens once.  Locked pages
> are stashed in the 'pages' page pointer array - is there a reason we
> can't defer the OSD request allocation until after we lock and stash
> all of the pages we are interested in?  Unless I'm misreading the code,
> the page pointer array is always big enough to accommodate them.

Makes sense, I will update my patches.

Regards
Yan, Zheng

> 
> Thanks,
> 
>                Ilya

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-01-27 11:00 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-19 13:30 [PATCH V3 6/6] ceph: scattered page writeback Yan, Zheng
2016-01-26 22:15 ` Ilya Dryomov
2016-01-27  1:52   ` Yan, Zheng
2016-01-27  9:29     ` Ilya Dryomov
2016-01-27  9:42       ` Yan, Zheng
2016-01-27 10:45         ` Ilya Dryomov
2016-01-27 11:00           ` Yan, Zheng
