All of lore.kernel.org
 help / color / mirror / Atom feed
From: James Simmons <jsimmons@infradead.org>
To: Andreas Dilger <adilger@whamcloud.com>,
	Oleg Drokin <green@whamcloud.com>, NeilBrown <neilb@suse.de>
Cc: Vitaly Fertman <c17818@cray.com>,
	Lustre Development List <lustre-devel@lists.lustre.org>
Subject: [lustre-devel] [PATCH 34/49] lustre: lov: cancel layout lock on replay deadlock
Date: Thu, 15 Apr 2021 00:02:26 -0400	[thread overview]
Message-ID: <1618459361-17909-35-git-send-email-jsimmons@infradead.org> (raw)
In-Reply-To: <1618459361-17909-1-git-send-email-jsimmons@infradead.org>

From: Vitaly Fertman <c17818@cray.com>

layout locks are not replayed and instead cancelled as unused, what
requires to take lov_conf_lock. the semaphore may be already taken by
cl_lock_flush() which prepares a new IO which is not be able to be
sent to MDS as it is in the recovery.

HPE-bug-id: LUS-9232
WC-bug-id: https://jira.whamcloud.com/browse/LU-14182
Lustre-commit: 68fb53ad4bb2dbc ("LU-14182 lov: cancel layout lock on replay deadlock")
Signed-off-by: Vitaly Fertman <c17818@cray.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Andriy Skulysh <c17819@cray.com>
Reviewed-on: https://review.whamcloud.com/40867
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Andriy Skulysh <askulysh@gmail.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/include/obd_support.h |  1 +
 fs/lustre/ldlm/ldlm_request.c   |  2 ++
 fs/lustre/llite/namei.c         |  2 ++
 fs/lustre/lov/lov_cl_internal.h | 10 +++++++---
 fs/lustre/lov/lov_object.c      | 44 ++++++++++++++++++++++++-----------------
 5 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/fs/lustre/include/obd_support.h b/fs/lustre/include/obd_support.h
index 152f95c..b2f97f1 100644
--- a/fs/lustre/include/obd_support.h
+++ b/fs/lustre/include/obd_support.h
@@ -308,6 +308,7 @@
 
 #define OBD_FAIL_LDLM_GRANT_CHECK			0x32a
 #define OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE		0x32c
+#define OBD_FAIL_LDLM_REPLAY_PAUSE			0x32e
 
 /* LOCKLESS IO */
 #define OBD_FAIL_LDLM_SET_CONTENTION			0x385
diff --git a/fs/lustre/ldlm/ldlm_request.c b/fs/lustre/ldlm/ldlm_request.c
index d8ca744..3527678 100644
--- a/fs/lustre/ldlm/ldlm_request.c
+++ b/fs/lustre/ldlm/ldlm_request.c
@@ -2220,6 +2220,8 @@ static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
 	       "Dropping as many unused locks as possible before replay for namespace %s (%d)\n",
 	       ldlm_ns_name(ns), ns->ns_nr_unused);
 
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val);
+
 	/*
 	 * We don't need to care whether or not LRU resize is enabled
 	 * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the
diff --git a/fs/lustre/llite/namei.c b/fs/lustre/llite/namei.c
index 1095fa9..654d065 100644
--- a/fs/lustre/llite/namei.c
+++ b/fs/lustre/llite/namei.c
@@ -204,6 +204,8 @@ static int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
 	if (IS_ERR(env))
 		return PTR_ERR(env);
 
+	OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_REPLAY_PAUSE, cfs_fail_val);
+
 	/* reach MDC layer to flush data under  the DoM ldlm lock */
 	rc = cl_object_flush(env, lli->lli_clob, lock);
 	if (rc == -ENODATA) {
diff --git a/fs/lustre/lov/lov_cl_internal.h b/fs/lustre/lov/lov_cl_internal.h
index e9ef5aa..f231be9 100644
--- a/fs/lustre/lov/lov_cl_internal.h
+++ b/fs/lustre/lov/lov_cl_internal.h
@@ -251,6 +251,11 @@ struct lov_mirror_entry {
 	unsigned short	lre_end;	/* end index of this mirror */
 };
 
+enum lov_object_flags {
+	/* Layout is invalid, set when layout lock is lost */
+	LO_LAYOUT_INVALID	= 0x1,
+};
+
 /**
  * lov-specific file state.
  *
@@ -281,10 +286,9 @@ struct lov_object {
 	 */
 	enum lov_layout_type	lo_type;
 	/**
-	 * True if layout is invalid. This bit is cleared when layout lock
-	 * is lost.
+	 * Object flags.
 	 */
-	bool			lo_layout_invalid;
+	unsigned long		lo_obj_flags;
 	/**
 	 * How many IOs are on going on this object. Layout can be changed
 	 * only if there is no active IO.
diff --git a/fs/lustre/lov/lov_object.c b/fs/lustre/lov/lov_object.c
index abe1cee..db4070f 100644
--- a/fs/lustre/lov/lov_object.c
+++ b/fs/lustre/lov/lov_object.c
@@ -177,7 +177,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
 		old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
 		LASSERT(old_obj);
 		old_lov = cl2lov(lu2cl(old_obj));
-		if (old_lov->lo_layout_invalid) {
+		if (test_bit(LO_LAYOUT_INVALID, &old_lov->lo_obj_flags)) {
 			/* the object's layout has already changed but isn't
 			 * refreshed
 			 */
@@ -628,7 +628,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
 	LASSERT(lsm->lsm_entry_count > 0);
 	LASSERT(!lov->lo_lsm);
 	lov->lo_lsm = lsm_addref(lsm);
-	lov->lo_layout_invalid = true;
+	set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 
 	dump_lsm(D_INODE, lsm);
 
@@ -910,7 +910,8 @@ static void lov_fini_released(const struct lu_env *env, struct lov_object *lov,
 static int lov_print_empty(const struct lu_env *env, void *cookie,
 			   lu_printer_t p, const struct lu_object *o)
 {
-	(*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+	(*p)(env, cookie, "empty %d\n",
+	     test_bit(LO_LAYOUT_INVALID, &lu2lov(o)->lo_obj_flags));
 	return 0;
 }
 
@@ -923,8 +924,8 @@ static int lov_print_composite(const struct lu_env *env, void *cookie,
 
 	(*p)(env, cookie, "entries: %d, %s, lsm{%p 0x%08X %d %u}:\n",
 	     lsm->lsm_entry_count,
-	     lov->lo_layout_invalid ? "invalid" : "valid", lsm,
-	     lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+	     test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+	     "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
 	     lsm->lsm_layout_gen);
 
 	for (i = 0; i < lsm->lsm_entry_count; i++) {
@@ -953,8 +954,8 @@ static int lov_print_released(const struct lu_env *env, void *cookie,
 
 	(*p)(env, cookie,
 	     "released: %s, lsm{%p 0x%08X %d %u}:\n",
-	     lov->lo_layout_invalid ? "invalid" : "valid", lsm,
-	     lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+	     test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ? "invalid" :
+	     "valid", lsm, lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
 	     lsm->lsm_layout_gen);
 	return 0;
 }
@@ -967,7 +968,8 @@ static int lov_print_foreign(const struct lu_env *env, void *cookie,
 
 	(*p)(env, cookie,
 		"foreign: %s, lsm{%p 0x%08X %d %u}:\n",
-		lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+		test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) ?
+		"invalid" : "valid", lsm,
 		lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
 		lsm->lsm_layout_gen);
 	(*p)(env, cookie,
@@ -1352,15 +1354,15 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
 		dump_lsm(D_INODE, lsm);
 	}
 
-	lov_conf_lock(lov);
 	if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
-		lov->lo_layout_invalid = true;
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 		result = 0;
-		goto out;
+		goto out_lsm;
 	}
 
+	lov_conf_lock(lov);
 	if (conf->coc_opc == OBJECT_CONF_WAIT) {
-		if (lov->lo_layout_invalid &&
+		if (test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags) &&
 		    atomic_read(&lov->lo_active_ios) > 0) {
 			lov_conf_unlock(lov);
 			result = lov_layout_wait(env, lov);
@@ -1378,26 +1380,31 @@ static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
 	     (lov->lo_lsm->lsm_entries[0]->lsme_pattern ==
 	      lsm->lsm_entries[0]->lsme_pattern))) {
 		/* same version of layout */
-		lov->lo_layout_invalid = false;
+		clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 		result = 0;
 		goto out;
 	}
 
 	/* will change layout - check if there still exists active IO. */
 	if (atomic_read(&lov->lo_active_ios) > 0) {
-		lov->lo_layout_invalid = true;
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 		result = -EBUSY;
 		goto out;
 	}
 
 	result = lov_layout_change(env, lov, lsm, conf);
-	lov->lo_layout_invalid = result != 0;
+	if (result)
+		set_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
+	else
+		clear_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags);
 
 out:
 	lov_conf_unlock(lov);
+out_lsm:
 	lov_lsm_put(lsm);
-	CDEBUG(D_INODE, DFID " lo_layout_invalid=%d\n",
-	       PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid);
+	CDEBUG(D_INODE, DFID " lo_layout_invalid=%u\n",
+	       PFID(lu_object_fid(lov2lu(lov))),
+	       test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags));
 	return result;
 }
 
@@ -2254,7 +2261,8 @@ static struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
 		lsm = lsm_addref(lov->lo_lsm);
 		CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
 		       lsm, atomic_read(&lsm->lsm_refc),
-		       lov->lo_layout_invalid, current);
+		       test_bit(LO_LAYOUT_INVALID, &lov->lo_obj_flags),
+		       current);
 	}
 	lov_conf_thaw(lov);
 	return lsm;
-- 
1.8.3.1

_______________________________________________
lustre-devel mailing list
lustre-devel@lists.lustre.org
http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

  parent reply	other threads:[~2021-04-15  4:05 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-15  4:01 [lustre-devel] [PATCH 00/49] lustre: sync to OpenSFS as of March 30 2021 James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 01/49] lnet: libcfs: Fix for unconfigured arch_stackwalk James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 02/49] lustre: lmv: iput() can safely be passed NULL James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 03/49] lustre: llite: mark extended attr and inode flags James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 04/49] lnet: lnet_notify sets route aliveness incorrectly James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 05/49] lnet: Prevent discovery on peer marked deletion James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 06/49] lnet: Prevent discovery on deleted peer James Simmons
2021-04-15  4:01 ` [lustre-devel] [PATCH 07/49] lnet: Transfer disc src NID when merging peers James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 08/49] lnet: Lookup lpni after discovery James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 09/49] lustre: llite: update and fix module loading bug in mounting code James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 10/49] lnet: socklnd: change various ints to bool James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 11/49] lnet: Correct asymmetric route detection James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 12/49] lustre: fixup ldlm_pool and lu_object shrinker failure cases James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 13/49] lustre: log: Add ending newline for some messages James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 14/49] lustre: use with_imp_locked() more broadly James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 15/49] lnet: o2iblnd: change some ints to bool James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 16/49] lustre: lmv: striped directory as subdirectory mount James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 17/49] lustre: llite: create file_operations registration function James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 18/49] lustre: osc: fix performance regression in osc_extent_merge() James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 19/49] lustre: mds: add enums for MDS_ATTR flags James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 20/49] lustre: uapi: remove OBD_IOC_LOV_GET_CONFIG James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 21/49] lustre: sec: fix migrate for encrypted dir James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 22/49] lnet: libcfs: restore LNET_DUMP_ON_PANIC functionality James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 23/49] lustre: ptlrpc: fix ASSERTION on scp_rqbd_posted James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 24/49] lustre: ldlm: not freed req on enqueue James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 25/49] lnet: uapi: move userland only nidstr.h handling James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 26/49] lnet: libcfs: don't depend on sysctl support for debugfs James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 27/49] lustre: ptlrpc: Add a binary heap implementation James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 28/49] lustre: ptlrpc: Implement NRS Delay Policy James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 29/49] lustre: ptlrpc: rename cfs_binheap to simply binheap James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 30/49] lustre: ptlrpc: mark some functions as static James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 31/49] lustre: use tgt_pool for lov layer James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 32/49] lustre: quota: make used for pool correct James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 33/49] lustre: quota: call rhashtable_lookup near params decl James Simmons
2021-04-15  4:02 ` James Simmons [this message]
2021-04-15  4:02 ` [lustre-devel] [PATCH 35/49] lustre: obdclass: Protect cl_env_percpu[] James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 36/49] lnet: libcfs: discard cfs_trace_console_buffers[] James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 37/49] lnet: libcfs: discard cfs_trace_copyin_string() James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 38/49] lustre: lmv: don't use lqr_alloc spinlock in lmv James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 39/49] lustre: lov: fault page update cp_lov_index James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 40/49] lustre: update version to 2.14.51 James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 41/49] lustre: llite: mirror extend/copy keeps sparseness James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 42/49] lustre: ptlrpc: don't use list_for_each_entry_safe unnecessarily James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 43/49] lnet: Age peer NI out of recovery James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 44/49] lnet: Only recover known good peer NIs James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 45/49] lnet: Recover peer NI w/exponential backoff interval James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 46/49] lustre: lov: return valid stripe_count/size for PFL files James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 47/49] lnet: convert lpni_refcount to a kref James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 48/49] lustre: lmv: handle default stripe_count=-1 properly James Simmons
2021-04-15  4:02 ` [lustre-devel] [PATCH 49/49] lnet: libcfs: discard cfs_array_alloc() James Simmons

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1618459361-17909-35-git-send-email-jsimmons@infradead.org \
    --to=jsimmons@infradead.org \
    --cc=adilger@whamcloud.com \
    --cc=c17818@cray.com \
    --cc=green@whamcloud.com \
    --cc=lustre-devel@lists.lustre.org \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.