lustre-devel-lustre.org archive mirror
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
	Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Vitaly Fertman <c17818@cray.com>,
	Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 34/49] lustre: lov: cancel layout lock on replay deadlock
Date: Thu, 15 Apr 2021 00:02:26 -0400	[thread overview]
Message-ID: <1618459361-17909-35-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1618459361-17909-1-git-send-email-jsimmons@infradead.org>

From: Vitaly Fertman <c17818@cray.com>

Layout locks are not replayed; instead they are cancelled as unused,
which requires taking lov_conf_lock. That semaphore may already be held
by cl_lock_flush(), which prepares a new IO that cannot be sent to the
MDS while it is in recovery, so the cancel and the IO deadlock.

HPE-bug-id: LUS-9232
WC-bug-id: https://jira.whamcloud.com/browse/LU-14182
Lustre-commit: 68fb53ad4bb2dbc ("LU-14182 lov: cancel layout lock on replay deadlock")
Signed-off-by: Vitaly Fertman <c17818@cray.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Andriy Skulysh <c17819@cray.com>
Reviewed-on: https://review.whamcloud.com/40867
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Andriy Skulysh <askulysh@gmail.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/include/obd_support.h |  1 +
 fs/lustre/ldlm/ldlm_request.c   |  2 ++
 fs/lustre/llite/namei.c         |  2 ++
 fs/lustre/lov/lov_cl_internal.h | 10 +++++++---
 fs/lustre/lov/lov_object.c      | 44 ++++++++++++++++++++++++-----------------
 5 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/fs/lustre/include/obd_support.h b/fs/lustre/include/obd_support.h
index 152f95c..b2f97f1 100644
--- a/fs/lustre/include/obd_support.h
+++ b/fs/lustre/include/obd_support.h
@@ -308,6 +308,7 @@
 
 #define OBD_FAIL_LDLM_GRANT_CHECK			0x32a
 #define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE		0x32c
+#define OBD_FAIL_LDLM_REPLAY_PAUSE			0x32e
 
 /* LOCKLESS IO */
 #define OBD_FAIL_LDLM_SET_CONTENTION			0x385
diff --git a/fs/lustre/ldlm/ldlm_request.c b/fs/lustre/ldlm/ldlm_request.c
index d8ca744..3527678 100644
--- a/fs/lustre/ldlm/ldlm_request.c
+++ b/fs/lustre/ldlm/ldlm_request.c
@@ -2220,6 +2220,8 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
 	       "Dropping as many unused locks as possible before replay for namespace %s (%d)\n",
 	       ldlm_ns_name(ns), ns->ns_nr_unused);
 
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val);
+
 	/*
 	 * We don't need to care whether or not LRU resize is enabled
 	 * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the
diff --git a/fs/lustre/llite/namei.c b/fs/lustre/llite/namei.c
index 1095fa9..654d065 100644
--- a/fs/lustre/llite/namei.c
+++ b/fs/lustre/llite/namei.c
@@ -204,6 +204,8 @@ static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
 	if (IS_ERR(env))
 		return PTR_ERR(env);
 
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val);
+
 	/* reach MDC layer to flush data under  the DoM ldlm lock */
 	rc = cl_object_flush(env, lli->lli_clob, lock);
 	if (rc == -ENODATA) {
diff --git a/fs/lustre/lov/lov_cl_internal.h b/fs/lustre/lov/lov_cl_internal.h
index e9ef5aa..f231be9 100644
--- a/fs/lustre/lov/lov_cl_internal.h
+++ b/fs/lustre/lov/lov_cl_internal.h
@@ -251,6 +251,11 @@ struct lov_mirror_entry {
 	unsigned short	lre_end;	/* end index of this mirror */
 };
 
+enum lov_object_flags {
+	/* Layout is invalid, set when layout lock is lost */
+	LO_LAYOUT_INVALID	= 0x1,
+};
+
 /**
  * lov-specific file state.
  *
@@ -281,10 +286,9 @@ struct lov_object {
 	 */
 	enum lov_layout_type	lo_type;
 	/**
-	 * True if layout is invalid. This bit is cleared when layout lock
-	 * is lost.
+	 * Object flags.
 	 */
-	bool			lo_layout_invalid;
+	unsigned long		lo_obj_flags;
 	/**
 	 * How many IOs are on going on this object. Layout can be changed
 	 * only if there is no active IO.
diff --git a/fs/lustre/lov/lov_object.c b/fs/lustre/lov/lov_object.c
index abe1cee..db4070f 100644
--- a/fs/lustre/lov/lov_object.c
+++ b/fs/lustre/lov/lov_object.c
@@ -177,7 +177,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
 		old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
 		LASSERT(old_obj);
 		old_lov = cl2lov(lu2cl(old_obj));
-		if (old_lov->lo_layout_invalid) {
+		if (test_bit(LO_LAYOUT_INVALID, &old_lov->lo_obj_flags)) {
 			/* the object's layout has already changed but isn't
 			 * refreshed
 			 */
@@ -628,7 +628,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
 	LASSERT(lsm->lsm_entry_count > 0);
 	LASSERT(!lov->lo_lsm);
 	lov->lo_lsm = lsm_addref(lsm);
-	lov->lo_layout_invalid = true;
+	set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 
 	dump_lsm(D_INODE, lsm);
 
@@ -910,7 +910,8 @@ static void lov_fini_released(const struct lu_env *env, struct lov_object *lov,
 static int lov_print_empty(const struct lu_env *env, void *cookie,
 			   lu_printer_t p, const struct lu_object *o)
 {
-	(*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+	(*p)(env, cookie, "empty %d\n",
+	     test_bit(LO_LAYOUT_INVALID, &lu2lov(o)->lo_obj_flags));
 	return 0;
 }
 
@@ -923,8 +924,8 @@ static int lov_print_composite(const struct lu_env *env, void *cookie,
 
 	(*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n",
 	     lsm->lsm_entry_count,
-	     lov->lo_layout_invalid ? "invalid" : "valid", lsm,
-	     lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+	     test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+	     "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
 	     lsm->lsm_layout_gen);
 
 	for (i = 0; i < lsm->lsm_entry_count; i++) {
@@ -953,8 +954,8 @@ static int lov_print_released(const struct lu_env *env, void *cookie,
 
 	(*p)(env, cookie,
 	     "released: %s, lsm{%p 0x%08X %d %u}:\n",
-	     lov->lo_layout_invalid ? "invalid" : "valid", lsm,
-	     lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+	     test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+	     "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
 	     lsm->lsm_layout_gen);
 	return 0;
 }
@@ -967,7 +968,8 @@ static int lov_print_foreign(const struct lu_env *env, void *cookie,
 
 	(*p)(env, cookie,
 		"foreign: %s, lsm{%p 0x%08X %d %u}:\n",
-		lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+		test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ?
+		"invalid" : "valid", lsm,
 		lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
 		lsm->lsm_layout_gen);
 	(*p)(env, cookie,
@@ -1352,15 +1354,15 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
 		dump_lsm(D_INODE, lsm);
 	}
 
-	lov_conf_lock(lov);
 	if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
-		lov->lo_layout_invalid = true;
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 		result = 0;
-		goto out;
+		goto out_lsm;
 	}
 
+	lov_conf_lock(lov);
 	if (conf->coc_opc == OBJECT_CONF_WAIT) {
-		if (lov->lo_layout_invalid &&
+		if (test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) &&
 		    atomic_read(&lov->lo_active_ios) > 0) {
 			lov_conf_unlock(lov);
 			result = lov_layout_wait(env, lov);
@@ -1378,26 +1380,31 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
 	     (lov->lo_lsm->lsm_entries[0]->lsme_pattern ==
 	      lsm->lsm_entries[0]->lsme_pattern))) {
 		/* same version of layout */
-		lov->lo_layout_invalid = false;
+		clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 		result = 0;
 		goto out;
 	}
 
 	/* will change layout - check if there still exists active IO. */
 	if (atomic_read(&lov->lo_active_ios) > 0) {
-		lov->lo_layout_invalid = true;
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 		result = -EBUSY;
 		goto out;
 	}
 
 	result = lov_layout_change(env, lov, lsm, conf);
-	lov->lo_layout_invalid = result != 0;
+	if (result)
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+	else
+		clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 
 out:
 	lov_conf_unlock(lov);
+out_lsm:
 	lov_lsm_put(lsm);
-	CDEBUG(D_INODE, DFID " lo_layout_invalid=%d\n",
-	       PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid);
+	CDEBUG(D_INODE, DFID " lo_layout_invalid=%u\n",
+	       PFID(lu_object_fid(lov2lu(lov))),
+	       test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags));
 	return result;
 }
 
@@ -2254,7 +2261,8 @@ static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
 		lsm = lsm_addref(lov->lo_lsm);
 		CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
 		       lsm, atomic_read(&lsm->lsm_refc),
-		       lov->lo_layout_invalid, current);
+		       test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags),
+		       current);
 	}
 	lov_conf_thaw(lov);
 	return lsm;
-- 
1.8.3.1

_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

  parent reply	other threads:[~2021-04-15  4:05 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-15  4:01 [lustre-devel] [PATCH 00/49] lustre: sync to OpenSFS as of March 30 2021 James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 01/49] lnet: libcfs: Fix for unconfigured arch_stackwalk James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 02/49] lustre: lmv: iput() can safely be passed NULL James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 03/49] lustre: llite: mark extended attr and inode flags James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 04/49] lnet: lnet_notify sets route aliveness incorrectly James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 05/49] lnet: Prevent discovery on peer marked deletion James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 06/49] lnet: Prevent discovery on deleted peer James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 07/49] lnet: Transfer disc src NID when merging peers James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 08/49] lnet: Lookup lpni after discovery James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 09/49] lustre: llite: update and fix module loading bug in mounting code James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 10/49] lnet: socklnd: change various ints to bool James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 11/49] lnet: Correct asymmetric route detection James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 12/49] lustre: fixup ldlm_pool and lu_object shrinker failure cases James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 13/49] lustre: log: Add ending newline for some messages James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 14/49] lustre: use with_imp_locked() more broadly James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 15/49] lnet: o2iblnd: change some ints to bool James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 16/49] lustre: lmv: striped directory as subdirectory mount James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 17/49] lustre: llite: create file_operations registration function James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 18/49] lustre: osc: fix performance regression in osc_extent_merge() James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 19/49] lustre: mds: add enums for MDS_ATTR flags James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 20/49] lustre: uapi: remove OBD_IOC_LOV_GET_CONFIG James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 21/49] lustre: sec: fix migrate for encrypted dir James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 22/49] lnet: libcfs: restore LNET_DUMP_ON_PANIC functionality James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 23/49] lustre: ptlrpc: fix ASSERTION on scp_rqbd_posted James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 24/49] lustre: ldlm: not freed req on enqueue James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 25/49] lnet: uapi: move userland only nidstr.h handling James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 26/49] lnet: libcfs: don't depend on sysctl support for debugfs James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 27/49] lustre: ptlrpc: Add a binary heap implementation James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 28/49] lustre: ptlrpc: Implement NRS Delay Policy James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 29/49] lustre: ptlrpc: rename cfs_binheap to simply binheap James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 30/49] lustre: ptlrpc: mark some functions as static James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 31/49] lustre: use tgt_pool for lov layer James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 32/49] lustre: quota: make used for pool correct James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 33/49] lustre: quota: call rhashtable_lookup near params decl James Simmons
2021-04-15  4:02 ` James Simmons [this message]
2021-04-15  4:02 ` [lustre-devel] [PATCH 35/49] lustre: obdclass: Protect cl_env_percpu[] James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 36/49] lnet: libcfs: discard cfs_trace_console_buffers[] James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 37/49] lnet: libcfs: discard cfs_trace_copyin_string() James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 38/49] lustre: lmv: don't use lqr_alloc spinlock in lmv James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 39/49] lustre: lov: fault page update cp_lov_index James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 40/49] lustre: update version to 2.14.51 James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 41/49] lustre: llite: mirror extend/copy keeps sparseness James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 42/49] lustre: ptlrpc: don't use list_for_each_entry_safe unnecessarily James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 43/49] lnet: Age peer NI out of recovery James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 44/49] lnet: Only recover known good peer NIs James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 45/49] lnet: Recover peer NI w/exponential backoff interval James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 46/49] lustre: lov: return valid stripe_count/size for PFL files James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 47/49] lnet: convert lpni_refcount to a kref James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 48/49] lustre: lmv: handle default stripe_count=-1 properly James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 49/49] lnet: libcfs: discard cfs_array_alloc() James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1618459361-17909-35-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=adilger@whamcloud.com \
    --cc=c17818@cray.com \
    --cc=green@whamcloud.com \
    --cc=lustre-devel@lists.lustre.org \
    --cc=neilb@suse.de \
    --subject='Re: [lustre-devel] [PATCH 34/49] lustre: lov: cancel layout lock on replay deadlock' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).