All of lore.kernel.org
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
	Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 06/20] lustre: brw: log T10 GRD tags during checksum calcs
Date: Mon, 11 Oct 2021 13:40:35 -0400	[thread overview]
Message-ID: <1633974049-26490-7-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1633974049-26490-1-git-send-email-jsimmons@infradead.org>

From: Andreas Dilger <adilger@whamcloud.com>

Log the T10 guard tags during checksum calculation on the client and
target to help identify where checksum errors are being introduced.
The added debugging is only active on RPC resend, so it will not add
overhead during the normal IO path.

WC-bug-id: https://jira.whamcloud.com/browse/LU-14895
Lustre-commit: c628b1b441d0ee191 ("LU-14895 brw: log T10 GRD tags during checksum calcs")
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/44655
Reviewed-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Li Xi <lixi@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/osc/osc_request.c | 83 ++++++++++++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 34 deletions(-)

diff --git a/fs/lustre/osc/osc_request.c b/fs/lustre/osc/osc_request.c
index db73fce..def2ee7 100644
--- a/fs/lustre/osc/osc_request.c
+++ b/fs/lustre/osc/osc_request.c
@@ -1186,7 +1186,7 @@ static int osc_checksum_bulk_t10pi(const char *obd_name, int nob,
 				   size_t pg_count, struct brw_page **pga,
 				   int opc, obd_dif_csum_fn *fn,
 				   int sector_size,
-				   u32 *check_sum)
+				   u32 *check_sum, bool resend)
 {
 	struct ahash_request *hdesc;
 	/* Used Adler as the default checksum type on top of DIF tags */
@@ -1219,6 +1219,10 @@ static int osc_checksum_bulk_t10pi(const char *obd_name, int nob,
 	buffer = kmap(__page);
 	guard_start = (u16 *)buffer;
 	guard_number = PAGE_SIZE / sizeof(*guard_start);
+	CDEBUG(D_PAGE | (resend ? D_HA : 0),
+	       "GRD tags per page=%u, resend=%u, bytes=%u, pages=%zu\n",
+	       guard_number, resend, nob, pg_count);
+
 	while (nob > 0 && pg_count > 0) {
 		unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
 
@@ -1245,6 +1249,12 @@ static int osc_checksum_bulk_t10pi(const char *obd_name, int nob,
 						  guard_number - used_number,
 						  &used, sector_size,
 						  fn);
+		if (unlikely(resend))
+			CDEBUG(D_PAGE | D_HA,
+			       "pga[%u]: used %u off %llu+%u gen checksum: %*phN\n",
+			       i, used, pga[i]->off & ~PAGE_MASK, count,
+			       (int)(used * sizeof(*guard_start)),
+			       guard_start + used_number);
 		if (rc)
 			break;
 
@@ -1346,7 +1356,7 @@ static int osc_checksum_bulk_rw(const char *obd_name,
 				enum cksum_types cksum_type,
 				int nob, size_t pg_count,
 				struct brw_page **pga, int opc,
-				u32 *check_sum)
+				u32 *check_sum, bool resend)
 {
 	obd_dif_csum_fn *fn = NULL;
 	int sector_size = 0;
@@ -1356,7 +1366,8 @@ static int osc_checksum_bulk_rw(const char *obd_name,
 
 	if (fn)
 		rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga,
-					     opc, fn, sector_size, check_sum);
+					     opc, fn, sector_size, check_sum,
+					     resend);
 	else
 		rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type,
 				       check_sum);
@@ -1727,14 +1738,15 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,
 			rc = osc_checksum_bulk_rw(obd_name, cksum_type,
 						  requested_nob, page_count,
 						  pga, OST_WRITE,
-						  &body->oa.o_cksum);
+						  &body->oa.o_cksum, resend);
 			if (rc < 0) {
-				CDEBUG(D_PAGE, "failed to checksum, rc = %d\n",
+				CDEBUG(D_PAGE, "failed to checksum: rc = %d\n",
 				       rc);
 				goto out;
 			}
-			CDEBUG(D_PAGE, "checksum at write origin: %x\n",
-			       body->oa.o_cksum);
+			CDEBUG(D_PAGE | (resend ? D_HA : 0),
+			       "checksum at write origin: %x (%x)\n",
+			       body->oa.o_cksum, cksum_type);
 
 			/* save this in 'oa', too, for later checking */
 			oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
@@ -1814,6 +1826,7 @@ static void dump_all_bulk_pages(struct obdo *oa, u32 page_count,
 		 pga[0]->off,
 		 pga[page_count - 1]->off + pga[page_count - 1]->count - 1,
 		 client_cksum, server_cksum);
+	CWARN("dumping checksum data to %s\n", dbgcksum_file_name);
 	filp = filp_open(dbgcksum_file_name,
 			 O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
 	if (IS_ERR(filp)) {
@@ -1840,8 +1853,6 @@ static void dump_all_bulk_pages(struct obdo *oa, u32 page_count,
 			}
 			len -= rc;
 			buf += rc;
-			CDEBUG(D_INFO, "%s: wrote %d bytes\n",
-			       dbgcksum_file_name, rc);
 		}
 		kunmap(pga[i]->pg);
 	}
@@ -1850,6 +1861,8 @@ static void dump_all_bulk_pages(struct obdo *oa, u32 page_count,
 	if (rc)
 		CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc);
 	filp_close(filp, NULL);
+
+	libcfs_debug_dumplog();
 }
 
 static int check_write_checksum(struct obdo *oa,
@@ -1902,7 +1915,7 @@ static int check_write_checksum(struct obdo *oa,
 		rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob,
 					     aa->aa_page_count, aa->aa_ppga,
 					     OST_WRITE, fn, sector_size,
-					     &new_cksum);
+					     &new_cksum, true);
 	else
 		rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count,
 				       aa->aa_ppga, OST_WRITE, cksum_type,
@@ -2067,17 +2080,18 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
 	if (body->oa.o_valid & OBD_MD_FLCKSUM) {
 		static int cksum_counter;
 		u32 server_cksum = body->oa.o_cksum;
+		int nob = rc;
 		char *via = "";
 		char *router = "";
 		enum cksum_types cksum_type;
 		u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ?
-			body->oa.o_flags : 0;
+			      body->oa.o_flags : 0;
 
 		cksum_type = obd_cksum_type_unpack(o_flags);
 
-		rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc,
+		rc = osc_checksum_bulk_rw(obd_name, cksum_type, nob,
 					  aa->aa_page_count, aa->aa_ppga,
-					  OST_READ, &client_cksum);
+					  OST_READ, &client_cksum, false);
 		if (rc < 0)
 			goto out;
 
@@ -2090,7 +2104,11 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
 		if (server_cksum != client_cksum) {
 			u32 page_count = aa->aa_page_count;
 			struct ost_body *clbody;
+			u32 client_cksum2;
 
+			osc_checksum_bulk_rw(obd_name, cksum_type, nob,
+					     page_count, aa->aa_ppga,
+					     OST_READ, &client_cksum2, true);
 			clbody = req_capsule_client_get(&req->rq_pill,
 							&RMF_OST_BODY);
 			if (cli->cl_checksum_dump)
@@ -2098,26 +2116,23 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
 						    aa->aa_ppga, server_cksum,
 						    client_cksum);
 
-			LCONSOLE_ERROR_MSG(
-				0x133,
-				"%s: BAD READ CHECKSUM: from %s%s%s inode " DFID
-				" object " DOSTID
-				" extent [%llu-%llu], client %x, server %x, cksum_type %x\n",
-				obd_name,
-				libcfs_nid2str(peer->nid),
-				via, router,
-				clbody->oa.o_valid & OBD_MD_FLFID ?
-				clbody->oa.o_parent_seq : (u64)0,
-				clbody->oa.o_valid & OBD_MD_FLFID ?
-				clbody->oa.o_parent_oid : 0,
-				clbody->oa.o_valid & OBD_MD_FLFID ?
-				clbody->oa.o_parent_ver : 0,
-				POSTID(&body->oa.o_oi),
-				aa->aa_ppga[0]->off,
-				aa->aa_ppga[page_count - 1]->off +
-				aa->aa_ppga[page_count - 1]->count - 1,
-				client_cksum, server_cksum,
-				cksum_type);
+			LCONSOLE_ERROR_MSG(0x133,
+					   "%s: BAD READ CHECKSUM: from %s%s%s inode "DFID" object "DOSTID" extent [%llu-%llu], client %x/%x, server %x, cksum_type %x\n",
+					   obd_name,
+					   libcfs_nid2str(peer->nid),
+					   via, router,
+					   clbody->oa.o_valid & OBD_MD_FLFID ?
+					   clbody->oa.o_parent_seq : (u64)0,
+					   clbody->oa.o_valid & OBD_MD_FLFID ?
+					   clbody->oa.o_parent_oid : 0,
+					   clbody->oa.o_valid & OBD_MD_FLFID ?
+					   clbody->oa.o_parent_ver : 0,
+					   POSTID(&body->oa.o_oi),
+					   aa->aa_ppga[0]->off,
+					   aa->aa_ppga[page_count - 1]->off +
+					   aa->aa_ppga[page_count - 1]->count - 1,
+					   client_cksum, client_cksum2,
+					   server_cksum, cksum_type);
 			cksum_counter = 0;
 			aa->aa_oa->o_cksum = client_cksum;
 			rc = -EAGAIN;
@@ -2356,7 +2371,7 @@ static int brw_interpret(const struct lu_env *env,
 			       req->rq_import->imp_obd->obd_name,
 			       POSTID(&aa->aa_oa->o_oi), rc);
 		} else if (rc == -EINPROGRESS ||
-		    client_should_resend(aa->aa_resends, aa->aa_cli)) {
+			   client_should_resend(aa->aa_resends, aa->aa_cli)) {
 			rc = osc_brw_redo_request(req, aa, rc);
 		} else {
 			CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n",
-- 
1.8.3.1

_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

  parent reply	other threads:[~2021-10-11 17:41 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-10-11 17:40 [lustre-devel] [PATCH 00/20] lustre: sync to OpenSFS Oct 11, 2021 James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 01/20] lustre: nfs: don't store parent fid James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 02/20] lustre: sec: filename encryption - symlink support James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 03/20] lustre: llite: support fallocate() on selected mirror James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 04/20] lustre: llite: move env contexts to ll_inode_info level James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 05/20] lustre: sec: do not expose security.c to listxattr/getxattr James Simmons
2021-10-11 17:40 ` James Simmons [this message]
2021-10-11 17:40 ` [lustre-devel] [PATCH 07/20] lustre: lov: prefer mirrors on non-rotational OSTs James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 08/20] lustre: sec: access to enc file's xattrs James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 09/20] lustre: update version to 2.14.55 James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 10/20] lustre: osc: Do not attempt sending empty pages James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 11/20] lustre: ptlrpc: handle reply and resend reorder James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 12/20] lustre: ptlrpc: use wait_woken() in ptlrpcd() James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 13/20] lustre: quota: fix quota with root squash enabled James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 14/20] lustre: llite: harden ll_sbi ll_flags James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 15/20] lustre: osc: use original cli for osc_lru_reclaim for debug msg James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 16/20] lustre: obdclass: lu_ref_add() called in atomic context James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 17/20] lnet: Ensure round robin selection of local NIs James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 18/20] lnet: Ensure round robin selection of peer NIs James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 19/20] lustre: mdc: update max_easize on reconnect James Simmons
2021-10-11 17:40 ` [lustre-devel] [PATCH 20/20] lnet: include linux/ethtool.h James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1633974049-26490-7-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=adilger@whamcloud.com \
    --cc=green@whamcloud.com \
    --cc=lustre-devel@lists.lustre.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.