Linux-NFS Archive on lore.kernel.org
 help / Atom feed
* [PATCH v2 0/3] un-deprecate nfsdcld
@ 2018-12-18 14:29 Scott Mayhew
  2018-12-18 14:29 ` [PATCH v2 1/3] nfsd: make nfs4_client_reclaim use an xdr_netobj instead of a fixed char array Scott Mayhew
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Scott Mayhew @ 2018-12-18 14:29 UTC (permalink / raw)
  To: bfields, jlayton; +Cc: linux-nfs

When nfsdcld was released, it was quickly deprecated in favor of the
nfsdcltrack usermodehelper, so as to not require another running daemon.
That prevents NFSv4 clients from reclaiming locks from nfsd's running in
containers, since neither nfsdcltrack nor the legacy client tracking
code work in containers.  These patches un-deprecate the use of nfsdcld
for NFSv4 client tracking.

These patches are intended to go alongside some nfs-utils patches that
introduce an enhancement that allows nfsd to "slurp" up the client
records during client tracking initialization and store them internally
in a hash table.  This enables nfsd to check whether an NFSv4 client is
allowed to reclaim without having to do an upcall to nfsdcld.  It also
allows nfsd to decide to end the v4 grace period early if the number of
RECLAIM_COMPLETE operations it has received from "known" clients is
equal to the number of entries in the hash table.  It also allows nfsd
to skip the v4 grace period altogether if it knows there are no clients
allowed to reclaim.

There is a fallback to allow nfsd to continue to work with older nfsdcld
daemons in the event that any are out in the wild (unlikely).
Everything should work fine except nfsd will not be able to exit the
grace period early or skip the grace period altogether.

v2:
- Addressed some coding style issues in nfsd4_create_clid_dir() &
  nfsd4_remove_clid_dir()

Scott Mayhew (3):
  nfsd: make nfs4_client_reclaim use an xdr_netobj instead of a fixed
    char array
  nfsd: un-deprecate nfsdcld
  nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld

 fs/nfsd/netns.h               |   3 +
 fs/nfsd/nfs4recover.c         | 339 +++++++++++++++++++++++++++++++---
 fs/nfsd/nfs4state.c           |  82 ++++++--
 fs/nfsd/nfsctl.c              |   1 +
 fs/nfsd/state.h               |   8 +-
 include/uapi/linux/nfsd/cld.h |   1 +
 6 files changed, 389 insertions(+), 45 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v2 1/3] nfsd: make nfs4_client_reclaim use an xdr_netobj instead of a fixed char array
  2018-12-18 14:29 [PATCH v2 0/3] un-deprecate nfsdcld Scott Mayhew
@ 2018-12-18 14:29 ` Scott Mayhew
  2018-12-18 14:29 ` [PATCH v2 2/3] nfsd: un-deprecate nfsdcld Scott Mayhew
  2018-12-18 14:29 ` [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld Scott Mayhew
  2 siblings, 0 replies; 23+ messages in thread
From: Scott Mayhew @ 2018-12-18 14:29 UTC (permalink / raw)
  To: bfields, jlayton; +Cc: linux-nfs

This will allow the reclaim_str_hashtbl to store either the recovery
directory names used by the legacy client tracking code or the full
client strings used by the nfsdcld client tracking code.

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
---
 fs/nfsd/nfs4recover.c | 109 +++++++++++++++++++++++++++++++++++-------
 fs/nfsd/nfs4state.c   |  30 ++++++------
 fs/nfsd/state.h       |   8 ++--
 3 files changed, 110 insertions(+), 37 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5188f9f70c78..2243b909b407 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -169,13 +169,34 @@ legacy_recdir_name_error(struct nfs4_client *clp, int error)
 	}
 }
 
+static void
+__nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
+		const char *dname, int len, struct nfsd_net *nn)
+{
+	struct xdr_netobj name;
+	struct nfs4_client_reclaim *crp;
+
+	name.data = kmemdup(dname, len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return;
+	}
+	name.len = len;
+	crp = nfs4_client_to_reclaim(name, nn);
+	if (!crp) {
+		kfree(name.data);
+		return;
+	}
+	crp->cr_clp = clp;
+}
+
 static void
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char dname[HEXDIR_LEN];
 	struct dentry *dir, *dentry;
-	struct nfs4_client_reclaim *crp;
 	int status;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
@@ -221,11 +242,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 out_unlock:
 	inode_unlock(d_inode(dir));
 	if (status == 0) {
-		if (nn->in_grace) {
-			crp = nfs4_client_to_reclaim(dname, nn);
-			if (crp)
-				crp->cr_clp = clp;
-		}
+		if (nn->in_grace)
+			__nfsd4_create_reclaim_record_grace(clp, dname,
+					HEXDIR_LEN, nn);
 		vfs_fsync(nn->rec_file, 0);
 	} else {
 		printk(KERN_ERR "NFSD: failed to write recovery record"
@@ -345,11 +364,30 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 	return status;
 }
 
+static void
+__nfsd4_remove_reclaim_record_grace(const char *dname, int len,
+		struct nfsd_net *nn)
+{
+	struct xdr_netobj name;
+	struct nfs4_client_reclaim *crp;
+
+	name.data = kmemdup(dname, len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		return;
+	}
+	name.len = len;
+	crp = nfsd4_find_reclaim_client(name, nn);
+	kfree(name.data);
+	if (crp)
+		nfs4_remove_reclaim_record(crp, nn);
+}
+
 static void
 nfsd4_remove_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
-	struct nfs4_client_reclaim *crp;
 	char dname[HEXDIR_LEN];
 	int status;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -374,12 +412,9 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	nfs4_reset_creds(original_cred);
 	if (status == 0) {
 		vfs_fsync(nn->rec_file, 0);
-		if (nn->in_grace) {
-			/* remove reclaim record */
-			crp = nfsd4_find_reclaim_client(dname, nn);
-			if (crp)
-				nfs4_remove_reclaim_record(crp, nn);
-		}
+		if (nn->in_grace)
+			__nfsd4_remove_reclaim_record_grace(dname,
+					HEXDIR_LEN, nn);
 	}
 out_drop_write:
 	mnt_drop_write_file(nn->rec_file);
@@ -393,14 +428,31 @@ static int
 purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	int status;
+	struct xdr_netobj name;
 
-	if (nfs4_has_reclaimed_state(child->d_name.name, nn))
+	if (child->d_name.len != HEXDIR_LEN - 1) {
+		printk("%s: illegal name %pd in recovery directory\n",
+				__func__, child);
+		/* Keep trying; maybe the others are OK: */
 		return 0;
+	}
+	name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		goto out;
+	}
+	name.len = HEXDIR_LEN;
+	if (nfs4_has_reclaimed_state(name, nn))
+		goto out_free;
 
 	status = vfs_rmdir(d_inode(parent), child);
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
+out_free:
+	kfree(name.data);
+out:
 	/* Keep trying, success or failure: */
 	return 0;
 }
@@ -430,13 +482,24 @@ nfsd4_recdir_purge_old(struct nfsd_net *nn)
 static int
 load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
+	struct xdr_netobj name;
+
 	if (child->d_name.len != HEXDIR_LEN - 1) {
-		printk("nfsd4: illegal name %pd in recovery directory\n",
-				child);
+		printk("%s: illegal name %pd in recovery directory\n",
+				__func__, child);
 		/* Keep trying; maybe the others are OK: */
 		return 0;
 	}
-	nfs4_client_to_reclaim(child->d_name.name, nn);
+	name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		goto out;
+	}
+	name.len = HEXDIR_LEN;
+	if (!nfs4_client_to_reclaim(name, nn))
+		kfree(name.data);
+out:
 	return 0;
 }
 
@@ -616,6 +679,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
 	char dname[HEXDIR_LEN];
 	struct nfs4_client_reclaim *crp;
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct xdr_netobj name;
 
 	/* did we already find that this client is stable? */
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
@@ -628,13 +692,22 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
 	}
 
 	/* look for it in the reclaim hashtable otherwise */
-	crp = nfsd4_find_reclaim_client(dname, nn);
+	name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+	if (!name.data) {
+		dprintk("%s: failed to allocate memory for name.data!\n",
+			__func__);
+		goto out_enoent;
+	}
+	name.len = HEXDIR_LEN;
+	crp = nfsd4_find_reclaim_client(name, nn);
+	kfree(name.data);
 	if (crp) {
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 		crp->cr_clp = clp;
 		return 0;
 	}
 
+out_enoent:
 	return -ENOENT;
 }
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f093fbe47133..777fbb0d2761 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1058,9 +1058,9 @@ static unsigned int clientid_hashval(u32 id)
 	return id & CLIENT_HASH_MASK;
 }
 
-static unsigned int clientstr_hashval(const char *name)
+static unsigned int clientstr_hashval(struct xdr_netobj name)
 {
-	return opaque_hashval(name, 8) & CLIENT_HASH_MASK;
+	return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK;
 }
 
 /*
@@ -2039,11 +2039,6 @@ compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
 	return memcmp(o1->data, o2->data, o1->len);
 }
 
-static int same_name(const char *n1, const char *n2)
-{
-	return 0 == memcmp(n1, n2, HEXDIR_LEN);
-}
-
 static int
 same_verf(nfs4_verifier *v1, nfs4_verifier *v2)
 {
@@ -6449,7 +6444,7 @@ alloc_reclaim(void)
 }
 
 bool
-nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
+nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn)
 {
 	struct nfs4_client_reclaim *crp;
 
@@ -6459,20 +6454,24 @@ nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
 
 /*
  * failure => all reset bets are off, nfserr_no_grace...
+ *
+ * The caller is responsible for freeing name.data if NULL is returned (it
+ * will be freed in nfs4_remove_reclaim_record in the normal case).
  */
 struct nfs4_client_reclaim *
-nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
+nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp;
 
-	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
+	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", name.len, name.data);
 	crp = alloc_reclaim();
 	if (crp) {
 		strhashval = clientstr_hashval(name);
 		INIT_LIST_HEAD(&crp->cr_strhash);
 		list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
-		memcpy(crp->cr_recdir, name, HEXDIR_LEN);
+		crp->cr_name.data = name.data;
+		crp->cr_name.len = name.len;
 		crp->cr_clp = NULL;
 		nn->reclaim_str_hashtbl_size++;
 	}
@@ -6483,6 +6482,7 @@ void
 nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
 {
 	list_del(&crp->cr_strhash);
+	kfree(crp->cr_name.data);
 	kfree(crp);
 	nn->reclaim_str_hashtbl_size--;
 }
@@ -6506,16 +6506,16 @@ nfs4_release_reclaim(struct nfsd_net *nn)
 /*
  * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
 struct nfs4_client_reclaim *
-nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
+nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp = NULL;
 
-	dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
+	dprintk("NFSD: nfs4_find_reclaim_client for name %.*s\n", name.len, name.data);
 
-	strhashval = clientstr_hashval(recdir);
+	strhashval = clientstr_hashval(name);
 	list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
-		if (same_name(crp->cr_recdir, recdir)) {
+		if (compare_blob(&crp->cr_name, &name) == 0) {
 			return crp;
 		}
 	}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6aacb325b6a0..95dd298c4620 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -367,7 +367,7 @@ struct nfs4_client {
 struct nfs4_client_reclaim {
 	struct list_head	cr_strhash;	/* hash by cr_name */
 	struct nfs4_client	*cr_clp;	/* pointer to associated clp */
-	char			cr_recdir[HEXDIR_LEN]; /* recover dir */
+	struct xdr_netobj	cr_name;	/* recovery dir name */
 };
 
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
@@ -619,7 +619,7 @@ void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
-extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name,
 							struct nfsd_net *nn);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
 		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
@@ -634,9 +634,9 @@ extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
 extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
-extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
 							struct nfsd_net *nn);
-extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
 
 struct nfs4_file *find_file(struct knfsd_fh *fh);
 void put_nfs4_file(struct nfs4_file *fi);
-- 
2.17.1


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v2 2/3] nfsd: un-deprecate nfsdcld
  2018-12-18 14:29 [PATCH v2 0/3] un-deprecate nfsdcld Scott Mayhew
  2018-12-18 14:29 ` [PATCH v2 1/3] nfsd: make nfs4_client_reclaim use an xdr_netobj instead of a fixed char array Scott Mayhew
@ 2018-12-18 14:29 ` Scott Mayhew
  2018-12-19 21:23   ` Jeff Layton
  2018-12-18 14:29 ` [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld Scott Mayhew
  2 siblings, 1 reply; 23+ messages in thread
From: Scott Mayhew @ 2018-12-18 14:29 UTC (permalink / raw)
  To: bfields, jlayton; +Cc: linux-nfs

When nfsdcld was released, it was quickly deprecated in favor of the
nfsdcltrack usermodehelper, so as to not require another running daemon.
That prevents NFSv4 clients from reclaiming locks from nfsd's running in
containers, since neither nfsdcltrack nor the legacy client tracking
code work in containers.

This commit un-deprecates the use of nfsdcld, with one twist: we will
populate the reclaim_str_hashtbl on startup.

During client tracking initialization, do an upcall ("GraceStart") to
nfsdcld to get a list of clients from the database.  nfsdcld will do
one downcall with a status of -EINPROGRESS for each client record in
the database, which in turn will cause an nfs4_client_reclaim to be
added to the reclaim_str_hashtbl.  When complete, nfsdcld will do a
final downcall with a status of 0.

This will save nfsd from having to do an upcall to the daemon during
nfs4_check_open_reclaim() processing.

Even though nfsdcld was quickly deprecated, there is a very small chance
of old nfsdcld daemons running in the wild.  These will respond to the
new "GraceStart" upcall with -EOPNOTSUPP, in which case we will log a
message and fall back to the original nfsdcld tracking ops (now called
nfsd4_cld_tracking_ops_v0).

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
---
 fs/nfsd/nfs4recover.c         | 225 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/nfsd/cld.h |   1 +
 2 files changed, 218 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 2243b909b407..89c2a27956d0 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -779,6 +779,34 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
 	return ret;
 }
 
+static ssize_t
+__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
+		struct nfsd_net *nn)
+{
+	uint8_t cmd;
+	struct xdr_netobj name;
+	uint16_t namelen;
+
+	if (get_user(cmd, &cmsg->cm_cmd)) {
+		dprintk("%s: error when copying cmd from userspace", __func__);
+		return -EFAULT;
+	}
+	if (cmd == Cld_GraceStart) {
+		if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
+			return -EFAULT;
+		name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
+		if (IS_ERR_OR_NULL(name.data))
+			return -EFAULT;
+		name.len = namelen;
+		if (!nfs4_client_to_reclaim(name, nn)) {
+			kfree(name.data);
+			return -EFAULT;
+		}
+		return sizeof(*cmsg);
+	}
+	return -EFAULT;
+}
+
 static ssize_t
 cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
@@ -788,6 +816,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
 						nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
+	int16_t status;
 
 	if (mlen != sizeof(*cmsg)) {
 		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
@@ -801,13 +830,24 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 		return -EFAULT;
 	}
 
+	/*
+	 * copy the status so we know whether to remove the upcall from the
+	 * list (for -EINPROGRESS, we just want to make sure the xid is
+	 * valid, not remove the upcall from the list)
+	 */
+	if (get_user(status, &cmsg->cm_status)) {
+		dprintk("%s: error when copying status from userspace", __func__);
+		return -EFAULT;
+	}
+
 	/* walk the list and find corresponding xid */
 	cup = NULL;
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
 		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
 			cup = tmp;
-			list_del_init(&cup->cu_list);
+			if (status != -EINPROGRESS)
+				list_del_init(&cup->cu_list);
 			break;
 		}
 	}
@@ -819,6 +859,9 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 		return -EINVAL;
 	}
 
+	if (status == -EINPROGRESS)
+		return __cld_pipe_inprogress_downcall(cmsg, nn);
+
 	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
 		return -EFAULT;
 
@@ -1065,9 +1108,14 @@ nfsd4_cld_remove(struct nfs4_client *clp)
 				"record from stable storage: %d\n", ret);
 }
 
-/* Check for presence of a record, and update its timestamp */
+/*
+ * For older nfsdcld's that do not allow us to "slurp" the clients
+ * from the tracking database during startup.
+ *
+ * Check for presence of a record, and update its timestamp
+ */
 static int
-nfsd4_cld_check(struct nfs4_client *clp)
+nfsd4_cld_check_v0(struct nfs4_client *clp)
 {
 	int ret;
 	struct cld_upcall *cup;
@@ -1100,8 +1148,61 @@ nfsd4_cld_check(struct nfs4_client *clp)
 	return ret;
 }
 
+/*
+ * For newer nfsdcld's that allow us to "slurp" the clients
+ * from the tracking database during startup.
+ *
+ * Check for presence of a record in the reclaim_str_hashtbl
+ */
+static int
+nfsd4_cld_check(struct nfs4_client *clp)
+{
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+	if (crp) {
+		crp->cr_clp = clp;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int
+nfsd4_cld_grace_start(struct nfsd_net *nn)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+
+	cup = alloc_cld_upcall(cn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_msg.cm_cmd = Cld_GraceStart;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	if (!ret)
+		ret = cup->cu_msg.cm_status;
+
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		dprintk("%s: Unable to get clients from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
+/* For older nfsdcld's that need cm_gracetime */
 static void
-nfsd4_cld_grace_done(struct nfsd_net *nn)
+nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
 {
 	int ret;
 	struct cld_upcall *cup;
@@ -1125,11 +1226,118 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
 		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
 }
 
-static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+/*
+ * For newer nfsdcld's that do not need cm_gracetime.  We also need to call
+ * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl.
+ */
+static void
+nfsd4_cld_grace_done(struct nfsd_net *nn)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+
+	cup = alloc_cld_upcall(cn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cup->cu_msg.cm_cmd = Cld_GraceDone;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	if (!ret)
+		ret = cup->cu_msg.cm_status;
+
+	free_cld_upcall(cup);
+out_err:
+	nfs4_release_reclaim(nn);
+	if (ret)
+		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
+}
+
+static int
+nfs4_cld_state_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int i;
+
+	nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
+						sizeof(struct list_head),
+						GFP_KERNEL);
+	if (!nn->reclaim_str_hashtbl)
+		return -ENOMEM;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
+	nn->reclaim_str_hashtbl_size = 0;
+
+	return 0;
+}
+
+static void
+nfs4_cld_state_shutdown(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	kfree(nn->reclaim_str_hashtbl);
+}
+
+static int
+nfsd4_cld_tracking_init(struct net *net)
+{
+	int status;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	status = nfs4_cld_state_init(net);
+	if (status)
+		return status;
+
+	status = nfsd4_init_cld_pipe(net);
+	if (status)
+		goto err_shutdown;
+
+	status = nfsd4_cld_grace_start(nn);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
+		nfs4_release_reclaim(nn);
+		goto err_remove;
+	}
+	return 0;
+
+err_remove:
+	nfsd4_remove_cld_pipe(net);
+err_shutdown:
+	nfs4_cld_state_shutdown(net);
+	return status;
+}
+
+static void
+nfsd4_cld_tracking_exit(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nfs4_release_reclaim(nn);
+	nfsd4_remove_cld_pipe(net);
+	nfs4_cld_state_shutdown(net);
+}
+
+/* For older nfsdcld's */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
 	.init		= nfsd4_init_cld_pipe,
 	.exit		= nfsd4_remove_cld_pipe,
 	.create		= nfsd4_cld_create,
 	.remove		= nfsd4_cld_remove,
+	.check		= nfsd4_cld_check_v0,
+	.grace_done	= nfsd4_cld_grace_done_v0,
+};
+
+/* For newer nfsdcld's */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+	.init		= nfsd4_cld_tracking_init,
+	.exit		= nfsd4_cld_tracking_exit,
+	.create		= nfsd4_cld_create,
+	.remove		= nfsd4_cld_remove,
 	.check		= nfsd4_cld_check,
 	.grace_done	= nfsd4_cld_grace_done,
 };
@@ -1514,9 +1722,10 @@ nfsd4_client_tracking_init(struct net *net)
 
 	/* Finally, try to use nfsdcld */
 	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
-	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
-			"removed in 3.10. Please transition to using "
-			"nfsdcltrack.\n");
+	status = nn->client_tracking_ops->init(net);
+	if (!status)
+		return status;
+	nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
 do_init:
 	status = nn->client_tracking_ops->init(net);
 	if (status) {
diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
index f8f5cccad749..b1e9de4f07d5 100644
--- a/include/uapi/linux/nfsd/cld.h
+++ b/include/uapi/linux/nfsd/cld.h
@@ -36,6 +36,7 @@ enum cld_command {
 	Cld_Remove,		/* remove record of this cm_id */
 	Cld_Check,		/* is this cm_id allowed? */
 	Cld_GraceDone,		/* grace period is complete */
+	Cld_GraceStart,
 };
 
 /* representation of long-form NFSv4 client ID */
-- 
2.17.1


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-18 14:29 [PATCH v2 0/3] un-deprecate nfsdcld Scott Mayhew
  2018-12-18 14:29 ` [PATCH v2 1/3] nfsd: make nfs4_client_reclaim use an xdr_netobj instead of a fixed char array Scott Mayhew
  2018-12-18 14:29 ` [PATCH v2 2/3] nfsd: un-deprecate nfsdcld Scott Mayhew
@ 2018-12-18 14:29 ` Scott Mayhew
  2018-12-19 17:46   ` J. Bruce Fields
                     ` (2 more replies)
  2 siblings, 3 replies; 23+ messages in thread
From: Scott Mayhew @ 2018-12-18 14:29 UTC (permalink / raw)
  To: bfields, jlayton; +Cc: linux-nfs

When using nfsdcld for NFSv4 client tracking, track the number of
RECLAIM_COMPLETE operations we receive from "known" clients to help in
deciding if we can lift the grace period early (or whether we need to
start a v4 grace period at all).

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
---
 fs/nfsd/netns.h       |  3 +++
 fs/nfsd/nfs4recover.c |  5 +++++
 fs/nfsd/nfs4state.c   | 52 +++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfsctl.c      |  1 +
 4 files changed, 61 insertions(+)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 32cb8c027483..b42aaa22fba2 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -104,6 +104,9 @@ struct nfsd_net {
 	time_t nfsd4_grace;
 	bool somebody_reclaimed;
 
+	bool track_reclaim_completes;
+	atomic_t nr_reclaim_complete;
+
 	bool nfsd_net_up;
 	bool lockd_up;
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 89c2a27956d0..ae74814b2397 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
 	free_cld_upcall(cup);
 out_err:
 	nfs4_release_reclaim(nn);
+	atomic_set(&nn->nr_reclaim_complete, 0);
 	if (ret)
 		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
 }
@@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
 	for (i = 0; i < CLIENT_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
 	nn->reclaim_str_hashtbl_size = 0;
+	nn->track_reclaim_completes = true;
+	atomic_set(&nn->nr_reclaim_complete, 0);
 
 	return 0;
 }
@@ -1279,6 +1282,7 @@ nfs4_cld_state_shutdown(struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	nn->track_reclaim_completes = false;
 	kfree(nn->reclaim_str_hashtbl);
 }
 
@@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	nfs4_release_reclaim(nn);
+	atomic_set(&nn->nr_reclaim_complete, 0);
 	nfsd4_remove_cld_pipe(net);
 	nfs4_cld_state_shutdown(net);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 777fbb0d2761..5dfc3cd51d08 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -77,6 +77,7 @@ static u64 current_sessionid = 1;
 /* forward declarations */
 static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
 static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
+void nfsd4_end_grace(struct nfsd_net *nn);
 
 /* Locking: */
 
@@ -1988,10 +1989,41 @@ destroy_client(struct nfs4_client *clp)
 	__destroy_client(clp);
 }
 
+static void inc_reclaim_complete(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->track_reclaim_completes)
+		return;
+	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+		return;
+	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
+		return;
+	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size) {
+		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
+				clp->net->ns.inum);
+		nfsd4_end_grace(nn);
+	}
+}
+
+static void dec_reclaim_complete(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->track_reclaim_completes)
+		return;
+	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+		return;
+	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
+		atomic_dec(&nn->nr_reclaim_complete);
+}
+
 static void expire_client(struct nfs4_client *clp)
 {
 	unhash_client(clp);
 	nfsd4_client_record_remove(clp);
+	dec_reclaim_complete(clp);
 	__destroy_client(clp);
 }
 
@@ -3339,6 +3371,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
 
 	status = nfs_ok;
 	nfsd4_client_record_create(cstate->session->se_client);
+	inc_reclaim_complete(cstate->session->se_client);
 out:
 	return status;
 }
@@ -4734,6 +4767,10 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
 	unsigned long double_grace_period_end = nn->boot_time +
 						2 * nn->nfsd4_lease;
 
+	if (nn->track_reclaim_completes &&
+			atomic_read(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size)
+		return false;
 	if (!nn->somebody_reclaimed)
 		return false;
 	nn->somebody_reclaimed = false;
@@ -7253,10 +7290,25 @@ nfs4_state_start_net(struct net *net)
 		return ret;
 	locks_start_grace(net, &nn->nfsd4_manager);
 	nfsd4_client_tracking_init(net);
+	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
+		goto skip_grace;
 	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
 	       nn->nfsd4_grace, net->ns.inum);
 	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
 	return 0;
+
+skip_grace:
+	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
+			net->ns.inum);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
+	/*
+	 * we could call nfsd4_end_grace() here, but it has a dprintk()
+	 * that would be confusing if debug logging is enabled
+	 */
+	nn->grace_ended = true;
+	nfsd4_record_grace_done(nn);
+	locks_end_grace(&nn->nfsd4_manager);
+	return 0;
 }
 
 /* initialization to perform when the nfsd service is started: */
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6384c9b94898..950ac6683be9 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
 	nn->nfsd4_lease = 45;	/* default lease time */
 	nn->nfsd4_grace = 45;
 	nn->somebody_reclaimed = false;
+	nn->track_reclaim_completes = false;
 	nn->clverifier_counter = prandom_u32();
 	nn->clientid_counter = prandom_u32();
 	nn->s2s_cp_cl_id = nn->clientid_counter++;
-- 
2.17.1


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-18 14:29 ` [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld Scott Mayhew
@ 2018-12-19 17:46   ` J. Bruce Fields
  2018-12-19 21:57     ` Scott Mayhew
  2018-12-19 18:28   ` J. Bruce Fields
  2018-12-19 18:36   ` J. Bruce Fields
  2 siblings, 1 reply; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-19 17:46 UTC (permalink / raw)
  To: Scott Mayhew; +Cc: jlayton, linux-nfs

On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> +skip_grace:
> +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> +			net->ns.inum);
> +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> +	/*
> +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> +	 * that would be confusing if debug logging is enabled
> +	 */

In that case, I'd rather pull the dprintk out of nfsd4_end_grace into
its only other caller (nfs4_laundromat), instead of duplicating this
stuff.

> +	nn->grace_ended = true;
> +	nfsd4_record_grace_done(nn);
> +	locks_end_grace(&nn->nfsd4_manager);

(Yes, it's only three lines, but it's a little subtle, I'd rather have
it all in one place.)

--b.

> +	return 0;
>  }
>  
>  /* initialization to perform when the nfsd service is started: */
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 6384c9b94898..950ac6683be9 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
>  	nn->nfsd4_lease = 45;	/* default lease time */
>  	nn->nfsd4_grace = 45;
>  	nn->somebody_reclaimed = false;
> +	nn->track_reclaim_completes = false;
>  	nn->clverifier_counter = prandom_u32();
>  	nn->clientid_counter = prandom_u32();
>  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> -- 
> 2.17.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-18 14:29 ` [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld Scott Mayhew
  2018-12-19 17:46   ` J. Bruce Fields
@ 2018-12-19 18:28   ` J. Bruce Fields
  2018-12-19 22:01     ` Scott Mayhew
  2018-12-19 18:36   ` J. Bruce Fields
  2 siblings, 1 reply; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-19 18:28 UTC (permalink / raw)
  To: Scott Mayhew; +Cc: jlayton, linux-nfs

On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 89c2a27956d0..ae74814b2397 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
>  	free_cld_upcall(cup);
>  out_err:
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	if (ret)
>  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
>  }
> @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
>  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
>  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
>  	nn->reclaim_str_hashtbl_size = 0;
> +	nn->track_reclaim_completes = true;
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  
>  	return 0;
>  }
...
> @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	nfsd4_remove_cld_pipe(net);
>  	nfs4_cld_state_shutdown(net);
>  }

We're initializing nr_reclaim_complete in 3 different places, probably
only one of those is really necessary?

--b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-18 14:29 ` [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld Scott Mayhew
  2018-12-19 17:46   ` J. Bruce Fields
  2018-12-19 18:28   ` J. Bruce Fields
@ 2018-12-19 18:36   ` J. Bruce Fields
  2018-12-19 22:05     ` Scott Mayhew
  2 siblings, 1 reply; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-19 18:36 UTC (permalink / raw)
  To: Scott Mayhew; +Cc: jlayton, linux-nfs

On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> When using nfsdcld for NFSv4 client tracking, track the number of
> RECLAIM_COMPLETE operations we receive from "known" clients to help in
> deciding if we can lift the grace period early (or whether we need to
> start a v4 grace period at all).
> 
> Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> ---
>  fs/nfsd/netns.h       |  3 +++
>  fs/nfsd/nfs4recover.c |  5 +++++
>  fs/nfsd/nfs4state.c   | 52 +++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfsctl.c      |  1 +
>  4 files changed, 61 insertions(+)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index 32cb8c027483..b42aaa22fba2 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -104,6 +104,9 @@ struct nfsd_net {
>  	time_t nfsd4_grace;
>  	bool somebody_reclaimed;
>  
> +	bool track_reclaim_completes;
> +	atomic_t nr_reclaim_complete;
> +
>  	bool nfsd_net_up;
>  	bool lockd_up;
>  
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 89c2a27956d0..ae74814b2397 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
>  	free_cld_upcall(cup);
>  out_err:
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	if (ret)
>  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
>  }
> @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
>  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
>  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
>  	nn->reclaim_str_hashtbl_size = 0;
> +	nn->track_reclaim_completes = true;
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  
>  	return 0;
>  }
> @@ -1279,6 +1282,7 @@ nfs4_cld_state_shutdown(struct net *net)
>  {
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
> +	nn->track_reclaim_completes = false;
>  	kfree(nn->reclaim_str_hashtbl);
>  }
>  
> @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	nfsd4_remove_cld_pipe(net);
>  	nfs4_cld_state_shutdown(net);
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 777fbb0d2761..5dfc3cd51d08 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -77,6 +77,7 @@ static u64 current_sessionid = 1;
>  /* forward declarations */
>  static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
>  static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
> +void nfsd4_end_grace(struct nfsd_net *nn);
>  
>  /* Locking: */
>  
> @@ -1988,10 +1989,41 @@ destroy_client(struct nfs4_client *clp)
>  	__destroy_client(clp);
>  }
>  
> +static void inc_reclaim_complete(struct nfs4_client *clp)
> +{
> +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> +
> +	if (!nn->track_reclaim_completes)
> +		return;
> +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> +		return;

Looking at the code....  This test will never fail, will it?  We could
make it a WARN_ON_ONCE(!test_bit(...)) but it doesn't look like a likely
bug to me.

> +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> +		return;
> +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> +			nn->reclaim_str_hashtbl_size) {
> +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> +				clp->net->ns.inum);
> +		nfsd4_end_grace(nn);
> +	}
> +}
> +
> +static void dec_reclaim_complete(struct nfs4_client *clp)
> +{
> +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> +
> +	if (!nn->track_reclaim_completes)
> +		return;
> +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> +		return;
> +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> +		atomic_dec(&nn->nr_reclaim_complete);
> +}
> +
>  static void expire_client(struct nfs4_client *clp)
>  {
>  	unhash_client(clp);
>  	nfsd4_client_record_remove(clp);
> +	dec_reclaim_complete(clp);
>  	__destroy_client(clp);
>  }

This doesn't look right to me.  If a client reclaims and then
immediately calls DESTROY_CLIENTID or something--that should still count
as a reclaim, and that shouldn't prevent us from ending the grace period
early.

I think dec_reclaim_complete is unnecessary.

--b.

>  
> @@ -3339,6 +3371,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
>  
>  	status = nfs_ok;
>  	nfsd4_client_record_create(cstate->session->se_client);
> +	inc_reclaim_complete(cstate->session->se_client);
>  out:
>  	return status;
>  }
> @@ -4734,6 +4767,10 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
>  	unsigned long double_grace_period_end = nn->boot_time +
>  						2 * nn->nfsd4_lease;
>  
> +	if (nn->track_reclaim_completes &&
> +			atomic_read(&nn->nr_reclaim_complete) ==
> +			nn->reclaim_str_hashtbl_size)
> +		return false;
>  	if (!nn->somebody_reclaimed)
>  		return false;
>  	nn->somebody_reclaimed = false;
> @@ -7253,10 +7290,25 @@ nfs4_state_start_net(struct net *net)
>  		return ret;
>  	locks_start_grace(net, &nn->nfsd4_manager);
>  	nfsd4_client_tracking_init(net);
> +	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
> +		goto skip_grace;
>  	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
>  	       nn->nfsd4_grace, net->ns.inum);
>  	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
>  	return 0;
> +
> +skip_grace:
> +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> +			net->ns.inum);
> +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> +	/*
> +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> +	 * that would be confusing if debug logging is enabled
> +	 */
> +	nn->grace_ended = true;
> +	nfsd4_record_grace_done(nn);
> +	locks_end_grace(&nn->nfsd4_manager);
> +	return 0;
>  }
>  
>  /* initialization to perform when the nfsd service is started: */
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 6384c9b94898..950ac6683be9 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
>  	nn->nfsd4_lease = 45;	/* default lease time */
>  	nn->nfsd4_grace = 45;
>  	nn->somebody_reclaimed = false;
> +	nn->track_reclaim_completes = false;
>  	nn->clverifier_counter = prandom_u32();
>  	nn->clientid_counter = prandom_u32();
>  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> -- 
> 2.17.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 2/3] nfsd: un-deprecate nfsdcld
  2018-12-18 14:29 ` [PATCH v2 2/3] nfsd: un-deprecate nfsdcld Scott Mayhew
@ 2018-12-19 21:23   ` Jeff Layton
  2018-12-19 22:11     ` Scott Mayhew
  0 siblings, 1 reply; 23+ messages in thread
From: Jeff Layton @ 2018-12-19 21:23 UTC (permalink / raw)
  To: Scott Mayhew, bfields; +Cc: linux-nfs

On Tue, 2018-12-18 at 09:29 -0500, Scott Mayhew wrote:
> When nfsdcld was released, it was quickly deprecated in favor of the
> nfsdcltrack usermodehelper, so as to not require another running daemon.
> That prevents NFSv4 clients from reclaiming locks from nfsd's running in
> containers, since neither nfsdcltrack nor the legacy client tracking
> code work in containers.
> 
> This commit un-deprecates the use of nfsdcld, with one twist: we will
> populate the reclaim_str_hashtbl on startup.
> 
> During client tracking initialization, do an upcall ("GraceStart") to
> nfsdcld to get a list of clients from the database.  nfsdcld will do
> one downcall with a status of -EINPROGRESS for each client record in
> the database, which in turn will cause an nfs4_client_reclaim to be
> added to the reclaim_str_hashtbl.  When complete, nfsdcld will do a
> final downcall with a status of 0.
> 
> This will save nfsd from having to do an upcall to the daemon during
> nfs4_check_open_reclaim() processing.
> 
> Even though nfsdcld was quickly deprecated, there is a very small chance
> of old nfsdcld daemons running in the wild.  These will respond to the
> new "GraceStart" upcall with -EOPNOTSUPP, in which case we will log a
> message and fall back to the original nfsdcld tracking ops (now called
> nfsd4_cld_tracking_ops_v0).
> 
> Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> ---
>  fs/nfsd/nfs4recover.c         | 225 ++++++++++++++++++++++++++++++++--
>  include/uapi/linux/nfsd/cld.h |   1 +
>  2 files changed, 218 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 2243b909b407..89c2a27956d0 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -779,6 +779,34 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
>  	return ret;
>  }
>  
> +static ssize_t
> +__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
> +		struct nfsd_net *nn)
> +{
> +	uint8_t cmd;
> +	struct xdr_netobj name;
> +	uint16_t namelen;
> +
> +	if (get_user(cmd, &cmsg->cm_cmd)) {
> +		dprintk("%s: error when copying cmd from userspace", __func__);
> +		return -EFAULT;
> +	}
> +	if (cmd == Cld_GraceStart) {
> +		if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
> +			return -EFAULT;
> +		name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
> +		if (IS_ERR_OR_NULL(name.data))
> +			return -EFAULT;
> +		name.len = namelen;
> +		if (!nfs4_client_to_reclaim(name, nn)) {
> +			kfree(name.data);
> +			return -EFAULT;
> +		}
> +		return sizeof(*cmsg);
> +	}
> +	return -EFAULT;
> +}
> +
>  static ssize_t
>  cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
>  {
> @@ -788,6 +816,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
>  	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
>  						nfsd_net_id);
>  	struct cld_net *cn = nn->cld_net;
> +	int16_t status;
>  
>  	if (mlen != sizeof(*cmsg)) {
>  		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
> @@ -801,13 +830,24 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
>  		return -EFAULT;
>  	}
>  
> +	/*
> +	 * copy the status so we know whether to remove the upcall from the
> +	 * list (for -EINPROGRESS, we just want to make sure the xid is
> +	 * valid, not remove the upcall from the list)
> +	 */
> +	if (get_user(status, &cmsg->cm_status)) {
> +		dprintk("%s: error when copying status from userspace", __func__);
> +		return -EFAULT;
> +	}
> +
>  	/* walk the list and find corresponding xid */
>  	cup = NULL;
>  	spin_lock(&cn->cn_lock);
>  	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
>  		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
>  			cup = tmp;
> -			list_del_init(&cup->cu_list);
> +			if (status != -EINPROGRESS)
> +				list_del_init(&cup->cu_list);
>  			break;
>  		}
>  	}
> @@ -819,6 +859,9 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
>  		return -EINVAL;
>  	}
>  
> +	if (status == -EINPROGRESS)
> +		return __cld_pipe_inprogress_downcall(cmsg, nn);
> +
>  	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
>  		return -EFAULT;
>  
> @@ -1065,9 +1108,14 @@ nfsd4_cld_remove(struct nfs4_client *clp)
>  				"record from stable storage: %d\n", ret);
>  }
>  
> -/* Check for presence of a record, and update its timestamp */
> +/*
> + * For older nfsdcld's that do not allow us to "slurp" the clients
> + * from the tracking database during startup.
> + *
> + * Check for presence of a record, and update its timestamp
> + */
>  static int
> -nfsd4_cld_check(struct nfs4_client *clp)
> +nfsd4_cld_check_v0(struct nfs4_client *clp)
>  {
>  	int ret;
>  	struct cld_upcall *cup;
> @@ -1100,8 +1148,61 @@ nfsd4_cld_check(struct nfs4_client *clp)
>  	return ret;
>  }
>  
> +/*
> + * For newer nfsdcld's that allow us to "slurp" the clients
> + * from the tracking database during startup.
> + *
> + * Check for presence of a record in the reclaim_str_hashtbl
> + */
> +static int
> +nfsd4_cld_check(struct nfs4_client *clp)
> +{
> +	struct nfs4_client_reclaim *crp;
> +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> +
> +	/* did we already find that this client is stable? */
> +	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> +		return 0;
> +
> +	/* look for it in the reclaim hashtable otherwise */
> +	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
> +	if (crp) {
> +		crp->cr_clp = clp;
> +		return 0;
> +	}
> +
> +	return -ENOENT;
> +}
> +
> +static int
> +nfsd4_cld_grace_start(struct nfsd_net *nn)
> +{
> +	int ret;
> +	struct cld_upcall *cup;
> +	struct cld_net *cn = nn->cld_net;
> +
> +	cup = alloc_cld_upcall(cn);
> +	if (!cup) {
> +		ret = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	cup->cu_msg.cm_cmd = Cld_GraceStart;
> +	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
> +	if (!ret)
> +		ret = cup->cu_msg.cm_status;
> +
> +	free_cld_upcall(cup);
> +out_err:
> +	if (ret)
> +		dprintk("%s: Unable to get clients from userspace: %d\n",
> +			__func__, ret);
> +	return ret;
> +}
> +
> +/* For older nfsdcld's that need cm_gracetime */
>  static void
> -nfsd4_cld_grace_done(struct nfsd_net *nn)
> +nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
>  {
>  	int ret;
>  	struct cld_upcall *cup;
> @@ -1125,11 +1226,118 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
>  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
>  }
>  
> -static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> +/*
> + * For newer nfsdcld's that do not need cm_gracetime.  We also need to call
> + * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl.
> + */
> +static void
> +nfsd4_cld_grace_done(struct nfsd_net *nn)
> +{
> +	int ret;
> +	struct cld_upcall *cup;
> +	struct cld_net *cn = nn->cld_net;
> +
> +	cup = alloc_cld_upcall(cn);
> +	if (!cup) {
> +		ret = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	cup->cu_msg.cm_cmd = Cld_GraceDone;
> +	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
> +	if (!ret)
> +		ret = cup->cu_msg.cm_status;
> +
> +	free_cld_upcall(cup);
> +out_err:
> +	nfs4_release_reclaim(nn);
> +	if (ret)
> +		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> +}
> +
> +static int
> +nfs4_cld_state_init(struct net *net)
> +{
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	int i;
> +
> +	nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
> +						sizeof(struct list_head),
> +						GFP_KERNEL);
> +	if (!nn->reclaim_str_hashtbl)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> +		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> +	nn->reclaim_str_hashtbl_size = 0;
> +
> +	return 0;
> +}
> +
> +static void
> +nfs4_cld_state_shutdown(struct net *net)
> +{
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +
> +	kfree(nn->reclaim_str_hashtbl);
> +}
> +
> +static int
> +nfsd4_cld_tracking_init(struct net *net)
> +{
> +	int status;
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +
> +	status = nfs4_cld_state_init(net);
> +	if (status)
> +		return status;
> +
> +	status = nfsd4_init_cld_pipe(net);
> +	if (status)
> +		goto err_shutdown;
> +
> +	status = nfsd4_cld_grace_start(nn);
> +	if (status) {
> +		if (status == -EOPNOTSUPP)
> +			printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
> +		nfs4_release_reclaim(nn);
> +		goto err_remove;
> +	}
> +	return 0;
> +
> +err_remove:
> +	nfsd4_remove_cld_pipe(net);
> +err_shutdown:
> +	nfs4_cld_state_shutdown(net);
> +	return status;
> +}
> +
> +static void
> +nfsd4_cld_tracking_exit(struct net *net)
> +{
> +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +
> +	nfs4_release_reclaim(nn);
> +	nfsd4_remove_cld_pipe(net);
> +	nfs4_cld_state_shutdown(net);
> +}
> +
> +/* For older nfsdcld's */
> +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
>  	.init		= nfsd4_init_cld_pipe,
>  	.exit		= nfsd4_remove_cld_pipe,
>  	.create		= nfsd4_cld_create,
>  	.remove		= nfsd4_cld_remove,
> +	.check		= nfsd4_cld_check_v0,
> +	.grace_done	= nfsd4_cld_grace_done_v0,
> +};
> +
> +/* For newer nfsdcld's */
> +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> +	.init		= nfsd4_cld_tracking_init,
> +	.exit		= nfsd4_cld_tracking_exit,
> +	.create		= nfsd4_cld_create,
> +	.remove		= nfsd4_cld_remove,
>  	.check		= nfsd4_cld_check,
>  	.grace_done	= nfsd4_cld_grace_done,
>  };
> @@ -1514,9 +1722,10 @@ nfsd4_client_tracking_init(struct net *net)
>  
>  	/* Finally, try to use nfsdcld */
>  	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
> -	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
> -			"removed in 3.10. Please transition to using "
> -			"nfsdcltrack.\n");
> +	status = nn->client_tracking_ops->init(net);
> +	if (!status)
> +		return status;
> +	nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;

It seems like we probably ought to check to see if the daemon is up
before attempting a UMH upcall now? If someone starts up the daemon
they'll probably be surprised that it didn't get used because there was
a UMH upcall program present.

>  do_init:
>  	status = nn->client_tracking_ops->init(net);
>  	if (status) {
> diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
> index f8f5cccad749..b1e9de4f07d5 100644
> --- a/include/uapi/linux/nfsd/cld.h
> +++ b/include/uapi/linux/nfsd/cld.h
> @@ -36,6 +36,7 @@ enum cld_command {
>  	Cld_Remove,		/* remove record of this cm_id */
>  	Cld_Check,		/* is this cm_id allowed? */
>  	Cld_GraceDone,		/* grace period is complete */
> +	Cld_GraceStart,
>  };
>  
>  /* representation of long-form NFSv4 client ID */

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 17:46   ` J. Bruce Fields
@ 2018-12-19 21:57     ` Scott Mayhew
  0 siblings, 0 replies; 23+ messages in thread
From: Scott Mayhew @ 2018-12-19 21:57 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: jlayton, linux-nfs

On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > +skip_grace:
> > +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> > +			net->ns.inum);
> > +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> > +	/*
> > +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> > +	 * that would be confusing if debug logging is enabled
> > +	 */
> 
> In that case, I'd rather pull the dprintk out of nfsd4_end_grace into
> its only other caller (nfs4_laundromat), instead of duplicating this
> stuff.

Okay, will do.

-Scott
> 
> > +	nn->grace_ended = true;
> > +	nfsd4_record_grace_done(nn);
> > +	locks_end_grace(&nn->nfsd4_manager);
> 
> (Yes, it's only three lines, but it's a little subtle, I'd rather have
> it all in one place.)
> 
> --b.
> 
> > +	return 0;
> >  }
> >  
> >  /* initialization to perform when the nfsd service is started: */
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 6384c9b94898..950ac6683be9 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
> >  	nn->nfsd4_lease = 45;	/* default lease time */
> >  	nn->nfsd4_grace = 45;
> >  	nn->somebody_reclaimed = false;
> > +	nn->track_reclaim_completes = false;
> >  	nn->clverifier_counter = prandom_u32();
> >  	nn->clientid_counter = prandom_u32();
> >  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> > -- 
> > 2.17.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 18:28   ` J. Bruce Fields
@ 2018-12-19 22:01     ` Scott Mayhew
  0 siblings, 0 replies; 23+ messages in thread
From: Scott Mayhew @ 2018-12-19 22:01 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: jlayton, linux-nfs

On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 89c2a27956d0..ae74814b2397 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
> >  	free_cld_upcall(cup);
> >  out_err:
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	if (ret)
> >  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> >  }
> > @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
> >  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> >  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> >  	nn->reclaim_str_hashtbl_size = 0;
> > +	nn->track_reclaim_completes = true;
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  
> >  	return 0;
> >  }
> ...
> > @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >  
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	nfsd4_remove_cld_pipe(net);
> >  	nfs4_cld_state_shutdown(net);
> >  }
> 
> We're initializing nr_reclaim_complete in 3 different places, probably
> only one of those is really necessary?

Yes, only the one in nfs4_cld_state_init() is really necessary.  If the
ability to put a running server into grace is ever added, then I think
the counter would need to be reset there too.

-Scott
> 
> --b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 18:36   ` J. Bruce Fields
@ 2018-12-19 22:05     ` Scott Mayhew
  2018-12-19 22:21       ` J. Bruce Fields
  0 siblings, 1 reply; 23+ messages in thread
From: Scott Mayhew @ 2018-12-19 22:05 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: jlayton, linux-nfs

On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > When using nfsdcld for NFSv4 client tracking, track the number of
> > RECLAIM_COMPLETE operations we receive from "known" clients to help in
> > deciding if we can lift the grace period early (or whether we need to
> > start a v4 grace period at all).
> > 
> > Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> > ---
> >  fs/nfsd/netns.h       |  3 +++
> >  fs/nfsd/nfs4recover.c |  5 +++++
> >  fs/nfsd/nfs4state.c   | 52 +++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfsctl.c      |  1 +
> >  4 files changed, 61 insertions(+)
> > 
> > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > index 32cb8c027483..b42aaa22fba2 100644
> > --- a/fs/nfsd/netns.h
> > +++ b/fs/nfsd/netns.h
> > @@ -104,6 +104,9 @@ struct nfsd_net {
> >  	time_t nfsd4_grace;
> >  	bool somebody_reclaimed;
> >  
> > +	bool track_reclaim_completes;
> > +	atomic_t nr_reclaim_complete;
> > +
> >  	bool nfsd_net_up;
> >  	bool lockd_up;
> >  
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 89c2a27956d0..ae74814b2397 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
> >  	free_cld_upcall(cup);
> >  out_err:
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	if (ret)
> >  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> >  }
> > @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
> >  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> >  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> >  	nn->reclaim_str_hashtbl_size = 0;
> > +	nn->track_reclaim_completes = true;
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  
> >  	return 0;
> >  }
> > @@ -1279,6 +1282,7 @@ nfs4_cld_state_shutdown(struct net *net)
> >  {
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >  
> > +	nn->track_reclaim_completes = false;
> >  	kfree(nn->reclaim_str_hashtbl);
> >  }
> >  
> > @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >  
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	nfsd4_remove_cld_pipe(net);
> >  	nfs4_cld_state_shutdown(net);
> >  }
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 777fbb0d2761..5dfc3cd51d08 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -77,6 +77,7 @@ static u64 current_sessionid = 1;
> >  /* forward declarations */
> >  static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
> >  static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
> > +void nfsd4_end_grace(struct nfsd_net *nn);
> >  
> >  /* Locking: */
> >  
> > @@ -1988,10 +1989,41 @@ destroy_client(struct nfs4_client *clp)
> >  	__destroy_client(clp);
> >  }
> >  
> > +static void inc_reclaim_complete(struct nfs4_client *clp)
> > +{
> > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > +
> > +	if (!nn->track_reclaim_completes)
> > +		return;
> > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > +		return;
> 
> Looking at the code....  This test will never fail, will it?  We could
> make it a WARN_ON_ONCE(!test_bit(...)) but it doesn't look like a likely
> bug to me.

No, that test should never fail... I think I was being overly cautious.

> 
> > +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> > +		return;
> > +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> > +			nn->reclaim_str_hashtbl_size) {
> > +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> > +				clp->net->ns.inum);
> > +		nfsd4_end_grace(nn);
> > +	}
> > +}
> > +
> > +static void dec_reclaim_complete(struct nfs4_client *clp)
> > +{
> > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > +
> > +	if (!nn->track_reclaim_completes)
> > +		return;
> > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > +		return;
> > +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> > +		atomic_dec(&nn->nr_reclaim_complete);
> > +}
> > +
> >  static void expire_client(struct nfs4_client *clp)
> >  {
> >  	unhash_client(clp);
> >  	nfsd4_client_record_remove(clp);
> > +	dec_reclaim_complete(clp);
> >  	__destroy_client(clp);
> >  }
> 
> This doesn't look right to me.  If a client reclaims and then
> immediately calls DESTROY_CLIENTID or something--that should still count
> as a reclaim, and that shouldn't prevent us from ending the grace period
> early.
> 
> I think dec_reclaim_complete is unnecessary.

What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
still in grace?  The count would be too high then and the server could
exit grace before all the clients have reclaimed.  I actually added
that at Jeff's suggestion because he was seeing it with nfs-ganesha.  

-Scott
> 
> --b.
> 
> >  
> > @@ -3339,6 +3371,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
> >  
> >  	status = nfs_ok;
> >  	nfsd4_client_record_create(cstate->session->se_client);
> > +	inc_reclaim_complete(cstate->session->se_client);
> >  out:
> >  	return status;
> >  }
> > @@ -4734,6 +4767,10 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
> >  	unsigned long double_grace_period_end = nn->boot_time +
> >  						2 * nn->nfsd4_lease;
> >  
> > +	if (nn->track_reclaim_completes &&
> > +			atomic_read(&nn->nr_reclaim_complete) ==
> > +			nn->reclaim_str_hashtbl_size)
> > +		return false;
> >  	if (!nn->somebody_reclaimed)
> >  		return false;
> >  	nn->somebody_reclaimed = false;
> > @@ -7253,10 +7290,25 @@ nfs4_state_start_net(struct net *net)
> >  		return ret;
> >  	locks_start_grace(net, &nn->nfsd4_manager);
> >  	nfsd4_client_tracking_init(net);
> > +	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
> > +		goto skip_grace;
> >  	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
> >  	       nn->nfsd4_grace, net->ns.inum);
> >  	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
> >  	return 0;
> > +
> > +skip_grace:
> > +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> > +			net->ns.inum);
> > +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> > +	/*
> > +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> > +	 * that would be confusing if debug logging is enabled
> > +	 */
> > +	nn->grace_ended = true;
> > +	nfsd4_record_grace_done(nn);
> > +	locks_end_grace(&nn->nfsd4_manager);
> > +	return 0;
> >  }
> >  
> >  /* initialization to perform when the nfsd service is started: */
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 6384c9b94898..950ac6683be9 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
> >  	nn->nfsd4_lease = 45;	/* default lease time */
> >  	nn->nfsd4_grace = 45;
> >  	nn->somebody_reclaimed = false;
> > +	nn->track_reclaim_completes = false;
> >  	nn->clverifier_counter = prandom_u32();
> >  	nn->clientid_counter = prandom_u32();
> >  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> > -- 
> > 2.17.1

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 2/3] nfsd: un-deprecate nfsdcld
  2018-12-19 21:23   ` Jeff Layton
@ 2018-12-19 22:11     ` Scott Mayhew
  2018-12-20  0:19       ` Jeff Layton
  0 siblings, 1 reply; 23+ messages in thread
From: Scott Mayhew @ 2018-12-19 22:11 UTC (permalink / raw)
  To: Jeff Layton; +Cc: bfields, linux-nfs

On Wed, 19 Dec 2018, Jeff Layton wrote:

> On Tue, 2018-12-18 at 09:29 -0500, Scott Mayhew wrote:
> > When nfsdcld was released, it was quickly deprecated in favor of the
> > nfsdcltrack usermodehelper, so as to not require another running daemon.
> > That prevents NFSv4 clients from reclaiming locks from nfsd's running in
> > containers, since neither nfsdcltrack nor the legacy client tracking
> > code work in containers.
> > 
> > This commit un-deprecates the use of nfsdcld, with one twist: we will
> > populate the reclaim_str_hashtbl on startup.
> > 
> > During client tracking initialization, do an upcall ("GraceStart") to
> > nfsdcld to get a list of clients from the database.  nfsdcld will do
> > one downcall with a status of -EINPROGRESS for each client record in
> > the database, which in turn will cause an nfs4_client_reclaim to be
> > added to the reclaim_str_hashtbl.  When complete, nfsdcld will do a
> > final downcall with a status of 0.
> > 
> > This will save nfsd from having to do an upcall to the daemon during
> > nfs4_check_open_reclaim() processing.
> > 
> > Even though nfsdcld was quickly deprecated, there is a very small chance
> > of old nfsdcld daemons running in the wild.  These will respond to the
> > new "GraceStart" upcall with -EOPNOTSUPP, in which case we will log a
> > message and fall back to the original nfsdcld tracking ops (now called
> > nfsd4_cld_tracking_ops_v0).
> > 
> > Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> > ---
> >  fs/nfsd/nfs4recover.c         | 225 ++++++++++++++++++++++++++++++++--
> >  include/uapi/linux/nfsd/cld.h |   1 +
> >  2 files changed, 218 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 2243b909b407..89c2a27956d0 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -779,6 +779,34 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
> >  	return ret;
> >  }
> >  
> > +static ssize_t
> > +__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
> > +		struct nfsd_net *nn)
> > +{
> > +	uint8_t cmd;
> > +	struct xdr_netobj name;
> > +	uint16_t namelen;
> > +
> > +	if (get_user(cmd, &cmsg->cm_cmd)) {
> > +		dprintk("%s: error when copying cmd from userspace", __func__);
> > +		return -EFAULT;
> > +	}
> > +	if (cmd == Cld_GraceStart) {
> > +		if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
> > +			return -EFAULT;
> > +		name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
> > +		if (IS_ERR_OR_NULL(name.data))
> > +			return -EFAULT;
> > +		name.len = namelen;
> > +		if (!nfs4_client_to_reclaim(name, nn)) {
> > +			kfree(name.data);
> > +			return -EFAULT;
> > +		}
> > +		return sizeof(*cmsg);
> > +	}
> > +	return -EFAULT;
> > +}
> > +
> >  static ssize_t
> >  cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> >  {
> > @@ -788,6 +816,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> >  	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
> >  						nfsd_net_id);
> >  	struct cld_net *cn = nn->cld_net;
> > +	int16_t status;
> >  
> >  	if (mlen != sizeof(*cmsg)) {
> >  		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
> > @@ -801,13 +830,24 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> >  		return -EFAULT;
> >  	}
> >  
> > +	/*
> > +	 * copy the status so we know whether to remove the upcall from the
> > +	 * list (for -EINPROGRESS, we just want to make sure the xid is
> > +	 * valid, not remove the upcall from the list)
> > +	 */
> > +	if (get_user(status, &cmsg->cm_status)) {
> > +		dprintk("%s: error when copying status from userspace", __func__);
> > +		return -EFAULT;
> > +	}
> > +
> >  	/* walk the list and find corresponding xid */
> >  	cup = NULL;
> >  	spin_lock(&cn->cn_lock);
> >  	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
> >  		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
> >  			cup = tmp;
> > -			list_del_init(&cup->cu_list);
> > +			if (status != -EINPROGRESS)
> > +				list_del_init(&cup->cu_list);
> >  			break;
> >  		}
> >  	}
> > @@ -819,6 +859,9 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> >  		return -EINVAL;
> >  	}
> >  
> > +	if (status == -EINPROGRESS)
> > +		return __cld_pipe_inprogress_downcall(cmsg, nn);
> > +
> >  	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
> >  		return -EFAULT;
> >  
> > @@ -1065,9 +1108,14 @@ nfsd4_cld_remove(struct nfs4_client *clp)
> >  				"record from stable storage: %d\n", ret);
> >  }
> >  
> > -/* Check for presence of a record, and update its timestamp */
> > +/*
> > + * For older nfsdcld's that do not allow us to "slurp" the clients
> > + * from the tracking database during startup.
> > + *
> > + * Check for presence of a record, and update its timestamp
> > + */
> >  static int
> > -nfsd4_cld_check(struct nfs4_client *clp)
> > +nfsd4_cld_check_v0(struct nfs4_client *clp)
> >  {
> >  	int ret;
> >  	struct cld_upcall *cup;
> > @@ -1100,8 +1148,61 @@ nfsd4_cld_check(struct nfs4_client *clp)
> >  	return ret;
> >  }
> >  
> > +/*
> > + * For newer nfsdcld's that allow us to "slurp" the clients
> > + * from the tracking database during startup.
> > + *
> > + * Check for presence of a record in the reclaim_str_hashtbl
> > + */
> > +static int
> > +nfsd4_cld_check(struct nfs4_client *clp)
> > +{
> > +	struct nfs4_client_reclaim *crp;
> > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > +
> > +	/* did we already find that this client is stable? */
> > +	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > +		return 0;
> > +
> > +	/* look for it in the reclaim hashtable otherwise */
> > +	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
> > +	if (crp) {
> > +		crp->cr_clp = clp;
> > +		return 0;
> > +	}
> > +
> > +	return -ENOENT;
> > +}
> > +
> > +static int
> > +nfsd4_cld_grace_start(struct nfsd_net *nn)
> > +{
> > +	int ret;
> > +	struct cld_upcall *cup;
> > +	struct cld_net *cn = nn->cld_net;
> > +
> > +	cup = alloc_cld_upcall(cn);
> > +	if (!cup) {
> > +		ret = -ENOMEM;
> > +		goto out_err;
> > +	}
> > +
> > +	cup->cu_msg.cm_cmd = Cld_GraceStart;
> > +	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
> > +	if (!ret)
> > +		ret = cup->cu_msg.cm_status;
> > +
> > +	free_cld_upcall(cup);
> > +out_err:
> > +	if (ret)
> > +		dprintk("%s: Unable to get clients from userspace: %d\n",
> > +			__func__, ret);
> > +	return ret;
> > +}
> > +
> > +/* For older nfsdcld's that need cm_gracetime */
> >  static void
> > -nfsd4_cld_grace_done(struct nfsd_net *nn)
> > +nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
> >  {
> >  	int ret;
> >  	struct cld_upcall *cup;
> > @@ -1125,11 +1226,118 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
> >  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> >  }
> >  
> > -static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> > +/*
> > + * For newer nfsdcld's that do not need cm_gracetime.  We also need to call
> > + * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl.
> > + */
> > +static void
> > +nfsd4_cld_grace_done(struct nfsd_net *nn)
> > +{
> > +	int ret;
> > +	struct cld_upcall *cup;
> > +	struct cld_net *cn = nn->cld_net;
> > +
> > +	cup = alloc_cld_upcall(cn);
> > +	if (!cup) {
> > +		ret = -ENOMEM;
> > +		goto out_err;
> > +	}
> > +
> > +	cup->cu_msg.cm_cmd = Cld_GraceDone;
> > +	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
> > +	if (!ret)
> > +		ret = cup->cu_msg.cm_status;
> > +
> > +	free_cld_upcall(cup);
> > +out_err:
> > +	nfs4_release_reclaim(nn);
> > +	if (ret)
> > +		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> > +}
> > +
> > +static int
> > +nfs4_cld_state_init(struct net *net)
> > +{
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	int i;
> > +
> > +	nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
> > +						sizeof(struct list_head),
> > +						GFP_KERNEL);
> > +	if (!nn->reclaim_str_hashtbl)
> > +		return -ENOMEM;
> > +
> > +	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> > +		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> > +	nn->reclaim_str_hashtbl_size = 0;
> > +
> > +	return 0;
> > +}
> > +
> > +static void
> > +nfs4_cld_state_shutdown(struct net *net)
> > +{
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +
> > +	kfree(nn->reclaim_str_hashtbl);
> > +}
> > +
> > +static int
> > +nfsd4_cld_tracking_init(struct net *net)
> > +{
> > +	int status;
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +
> > +	status = nfs4_cld_state_init(net);
> > +	if (status)
> > +		return status;
> > +
> > +	status = nfsd4_init_cld_pipe(net);
> > +	if (status)
> > +		goto err_shutdown;
> > +
> > +	status = nfsd4_cld_grace_start(nn);
> > +	if (status) {
> > +		if (status == -EOPNOTSUPP)
> > +			printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
> > +		nfs4_release_reclaim(nn);
> > +		goto err_remove;
> > +	}
> > +	return 0;
> > +
> > +err_remove:
> > +	nfsd4_remove_cld_pipe(net);
> > +err_shutdown:
> > +	nfs4_cld_state_shutdown(net);
> > +	return status;
> > +}
> > +
> > +static void
> > +nfsd4_cld_tracking_exit(struct net *net)
> > +{
> > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +
> > +	nfs4_release_reclaim(nn);
> > +	nfsd4_remove_cld_pipe(net);
> > +	nfs4_cld_state_shutdown(net);
> > +}
> > +
> > +/* For older nfsdcld's */
> > +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
> >  	.init		= nfsd4_init_cld_pipe,
> >  	.exit		= nfsd4_remove_cld_pipe,
> >  	.create		= nfsd4_cld_create,
> >  	.remove		= nfsd4_cld_remove,
> > +	.check		= nfsd4_cld_check_v0,
> > +	.grace_done	= nfsd4_cld_grace_done_v0,
> > +};
> > +
> > +/* For newer nfsdcld's */
> > +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> > +	.init		= nfsd4_cld_tracking_init,
> > +	.exit		= nfsd4_cld_tracking_exit,
> > +	.create		= nfsd4_cld_create,
> > +	.remove		= nfsd4_cld_remove,
> >  	.check		= nfsd4_cld_check,
> >  	.grace_done	= nfsd4_cld_grace_done,
> >  };
> > @@ -1514,9 +1722,10 @@ nfsd4_client_tracking_init(struct net *net)
> >  
> >  	/* Finally, try to use nfsdcld */
> >  	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
> > -	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
> > -			"removed in 3.10. Please transition to using "
> > -			"nfsdcltrack.\n");
> > +	status = nn->client_tracking_ops->init(net);
> > +	if (!status)
> > +		return status;
> > +	nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
> 
> It seems like we probably ought to check to see if the daemon is up
> before attempting a UMH upcall now? If someone starts up the daemon
> they'll probably be surprised that it didn't get used because there was
> a UMH upcall program present.

I figured that the UMH upcall program would still be the default and
that the admin would have to do some extra configuration to use nfsdcld,
namely remove the /var/lib/nfs/v4recovery directory and set an empty
value for nfsd's 'cltrack_prog' module option.  Do you think that's a
bad idea?

-Scott

> 
> >  do_init:
> >  	status = nn->client_tracking_ops->init(net);
> >  	if (status) {
> > diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
> > index f8f5cccad749..b1e9de4f07d5 100644
> > --- a/include/uapi/linux/nfsd/cld.h
> > +++ b/include/uapi/linux/nfsd/cld.h
> > @@ -36,6 +36,7 @@ enum cld_command {
> >  	Cld_Remove,		/* remove record of this cm_id */
> >  	Cld_Check,		/* is this cm_id allowed? */
> >  	Cld_GraceDone,		/* grace period is complete */
> > +	Cld_GraceStart,
> >  };
> >  
> >  /* representation of long-form NFSv4 client ID */
> 
> -- 
> Jeff Layton <jlayton@kernel.org>
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 22:05     ` Scott Mayhew
@ 2018-12-19 22:21       ` J. Bruce Fields
  2018-12-19 22:43         ` J. Bruce Fields
  2018-12-20 17:29         ` Jeff Layton
  0 siblings, 2 replies; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-19 22:21 UTC (permalink / raw)
  To: Scott Mayhew; +Cc: jlayton, linux-nfs

On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> On Wed, 19 Dec 2018, J. Bruce Fields wrote:
> 
> > On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > > +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > +		return;
> > > +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> > > +			nn->reclaim_str_hashtbl_size) {
> > > +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> > > +				clp->net->ns.inum);
> > > +		nfsd4_end_grace(nn);
> > > +	}
> > > +}
> > > +
> > > +static void dec_reclaim_complete(struct nfs4_client *clp)
> > > +{
> > > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > > +
> > > +	if (!nn->track_reclaim_completes)
> > > +		return;
> > > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > > +		return;
> > > +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > +		atomic_dec(&nn->nr_reclaim_complete);
> > > +}
> > > +
> > >  static void expire_client(struct nfs4_client *clp)
> > >  {
> > >  	unhash_client(clp);
> > >  	nfsd4_client_record_remove(clp);
> > > +	dec_reclaim_complete(clp);
> > >  	__destroy_client(clp);
> > >  }
> > 
> > This doesn't look right to me.  If a client reclaims and then
> > immediately calls DESTROY_CLIENTID or something--that should still count
> > as a reclaim, and that shouldn't prevent us from ending the grace period
> > early.
> > 
> > I think dec_reclaim_complete is unnecessary.
> 
> What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> still in grace?  The count would be too high then and the server could
> exit grace before all the clients have reclaimed.  I actually added
> that at Jeff's suggestion because he was seeing it with nfs-ganesha.  

Oh boy.

(Thinks.)

Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
previous client instance's state, it's got no locks to reclaim any more.
(It can't have gotten any *new* ones, since we're still in the grace
period.)

It's effectively a brand new client.  Only reclaiming clients should
bump that counter.

We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
grace period, that client just doesn't matter any more.

I think?

--b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 22:21       ` J. Bruce Fields
@ 2018-12-19 22:43         ` J. Bruce Fields
  2018-12-20 16:36           ` Scott Mayhew
  2018-12-20 17:29         ` Jeff Layton
  1 sibling, 1 reply; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-19 22:43 UTC (permalink / raw)
  To: Scott Mayhew; +Cc: jlayton, linux-nfs

On Wed, Dec 19, 2018 at 05:21:47PM -0500, J. Bruce Fields wrote:
> On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > still in grace?  The count would be too high then and the server could
> > exit grace before all the clients have reclaimed.  I actually added
> > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> 
> Oh boy.
> 
> (Thinks.)
> 
> Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> previous client instance's state, it's got no locks to reclaim any more.
> (It can't have gotten any *new* ones, since we're still in the grace
> period.)
> 
> It's effectively a brand new client.  Only reclaiming clients should
> bump that counter.
> 
> We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> grace period, that client just doesn't matter any more.

Actually, once the client's destroyed, it shouldn't matter whether the
previous incarnation of the client reclaimed or not.  It's never going
to reclaim now....  So expire_client should probably just be removing
the client from the table of reclaimable clients at the same time that
it removes its stable storage record.

--b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 2/3] nfsd: un-deprecate nfsdcld
  2018-12-19 22:11     ` Scott Mayhew
@ 2018-12-20  0:19       ` Jeff Layton
  2018-12-20  1:59         ` J. Bruce Fields
  0 siblings, 1 reply; 23+ messages in thread
From: Jeff Layton @ 2018-12-20  0:19 UTC (permalink / raw)
  To: Scott Mayhew; +Cc: bfields, linux-nfs

On Wed, 2018-12-19 at 17:11 -0500, Scott Mayhew wrote:
> On Wed, 19 Dec 2018, Jeff Layton wrote:
> 
> > On Tue, 2018-12-18 at 09:29 -0500, Scott Mayhew wrote:
> > > When nfsdcld was released, it was quickly deprecated in favor of the
> > > nfsdcltrack usermodehelper, so as to not require another running daemon.
> > > That prevents NFSv4 clients from reclaiming locks from nfsd's running in
> > > containers, since neither nfsdcltrack nor the legacy client tracking
> > > code work in containers.
> > > 
> > > This commit un-deprecates the use of nfsdcld, with one twist: we will
> > > populate the reclaim_str_hashtbl on startup.
> > > 
> > > During client tracking initialization, do an upcall ("GraceStart") to
> > > nfsdcld to get a list of clients from the database.  nfsdcld will do
> > > one downcall with a status of -EINPROGRESS for each client record in
> > > the database, which in turn will cause an nfs4_client_reclaim to be
> > > added to the reclaim_str_hashtbl.  When complete, nfsdcld will do a
> > > final downcall with a status of 0.
> > > 
> > > This will save nfsd from having to do an upcall to the daemon during
> > > nfs4_check_open_reclaim() processing.
> > > 
> > > Even though nfsdcld was quickly deprecated, there is a very small chance
> > > of old nfsdcld daemons running in the wild.  These will respond to the
> > > new "GraceStart" upcall with -EOPNOTSUPP, in which case we will log a
> > > message and fall back to the original nfsdcld tracking ops (now called
> > > nfsd4_cld_tracking_ops_v0).
> > > 
> > > Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> > > ---
> > >  fs/nfsd/nfs4recover.c         | 225 ++++++++++++++++++++++++++++++++--
> > >  include/uapi/linux/nfsd/cld.h |   1 +
> > >  2 files changed, 218 insertions(+), 8 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > > index 2243b909b407..89c2a27956d0 100644
> > > --- a/fs/nfsd/nfs4recover.c
> > > +++ b/fs/nfsd/nfs4recover.c
> > > @@ -779,6 +779,34 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
> > >  	return ret;
> > >  }
> > >  
> > > +static ssize_t
> > > +__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
> > > +		struct nfsd_net *nn)
> > > +{
> > > +	uint8_t cmd;
> > > +	struct xdr_netobj name;
> > > +	uint16_t namelen;
> > > +
> > > +	if (get_user(cmd, &cmsg->cm_cmd)) {
> > > +		dprintk("%s: error when copying cmd from userspace", __func__);
> > > +		return -EFAULT;
> > > +	}
> > > +	if (cmd == Cld_GraceStart) {
> > > +		if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
> > > +			return -EFAULT;
> > > +		name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
> > > +		if (IS_ERR_OR_NULL(name.data))
> > > +			return -EFAULT;
> > > +		name.len = namelen;
> > > +		if (!nfs4_client_to_reclaim(name, nn)) {
> > > +			kfree(name.data);
> > > +			return -EFAULT;
> > > +		}
> > > +		return sizeof(*cmsg);
> > > +	}
> > > +	return -EFAULT;
> > > +}
> > > +
> > >  static ssize_t
> > >  cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> > >  {
> > > @@ -788,6 +816,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> > >  	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
> > >  						nfsd_net_id);
> > >  	struct cld_net *cn = nn->cld_net;
> > > +	int16_t status;
> > >  
> > >  	if (mlen != sizeof(*cmsg)) {
> > >  		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
> > > @@ -801,13 +830,24 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> > >  		return -EFAULT;
> > >  	}
> > >  
> > > +	/*
> > > +	 * copy the status so we know whether to remove the upcall from the
> > > +	 * list (for -EINPROGRESS, we just want to make sure the xid is
> > > +	 * valid, not remove the upcall from the list)
> > > +	 */
> > > +	if (get_user(status, &cmsg->cm_status)) {
> > > +		dprintk("%s: error when copying status from userspace", __func__);
> > > +		return -EFAULT;
> > > +	}
> > > +
> > >  	/* walk the list and find corresponding xid */
> > >  	cup = NULL;
> > >  	spin_lock(&cn->cn_lock);
> > >  	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
> > >  		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
> > >  			cup = tmp;
> > > -			list_del_init(&cup->cu_list);
> > > +			if (status != -EINPROGRESS)
> > > +				list_del_init(&cup->cu_list);
> > >  			break;
> > >  		}
> > >  	}
> > > @@ -819,6 +859,9 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
> > >  		return -EINVAL;
> > >  	}
> > >  
> > > +	if (status == -EINPROGRESS)
> > > +		return __cld_pipe_inprogress_downcall(cmsg, nn);
> > > +
> > >  	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
> > >  		return -EFAULT;
> > >  
> > > @@ -1065,9 +1108,14 @@ nfsd4_cld_remove(struct nfs4_client *clp)
> > >  				"record from stable storage: %d\n", ret);
> > >  }
> > >  
> > > -/* Check for presence of a record, and update its timestamp */
> > > +/*
> > > + * For older nfsdcld's that do not allow us to "slurp" the clients
> > > + * from the tracking database during startup.
> > > + *
> > > + * Check for presence of a record, and update its timestamp
> > > + */
> > >  static int
> > > -nfsd4_cld_check(struct nfs4_client *clp)
> > > +nfsd4_cld_check_v0(struct nfs4_client *clp)
> > >  {
> > >  	int ret;
> > >  	struct cld_upcall *cup;
> > > @@ -1100,8 +1148,61 @@ nfsd4_cld_check(struct nfs4_client *clp)
> > >  	return ret;
> > >  }
> > >  
> > > +/*
> > > + * For newer nfsdcld's that allow us to "slurp" the clients
> > > + * from the tracking database during startup.
> > > + *
> > > + * Check for presence of a record in the reclaim_str_hashtbl
> > > + */
> > > +static int
> > > +nfsd4_cld_check(struct nfs4_client *clp)
> > > +{
> > > +	struct nfs4_client_reclaim *crp;
> > > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > > +
> > > +	/* did we already find that this client is stable? */
> > > +	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
> > > +		return 0;
> > > +
> > > +	/* look for it in the reclaim hashtable otherwise */
> > > +	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
> > > +	if (crp) {
> > > +		crp->cr_clp = clp;
> > > +		return 0;
> > > +	}
> > > +
> > > +	return -ENOENT;
> > > +}
> > > +
> > > +static int
> > > +nfsd4_cld_grace_start(struct nfsd_net *nn)
> > > +{
> > > +	int ret;
> > > +	struct cld_upcall *cup;
> > > +	struct cld_net *cn = nn->cld_net;
> > > +
> > > +	cup = alloc_cld_upcall(cn);
> > > +	if (!cup) {
> > > +		ret = -ENOMEM;
> > > +		goto out_err;
> > > +	}
> > > +
> > > +	cup->cu_msg.cm_cmd = Cld_GraceStart;
> > > +	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
> > > +	if (!ret)
> > > +		ret = cup->cu_msg.cm_status;
> > > +
> > > +	free_cld_upcall(cup);
> > > +out_err:
> > > +	if (ret)
> > > +		dprintk("%s: Unable to get clients from userspace: %d\n",
> > > +			__func__, ret);
> > > +	return ret;
> > > +}
> > > +
> > > +/* For older nfsdcld's that need cm_gracetime */
> > >  static void
> > > -nfsd4_cld_grace_done(struct nfsd_net *nn)
> > > +nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
> > >  {
> > >  	int ret;
> > >  	struct cld_upcall *cup;
> > > @@ -1125,11 +1226,118 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
> > >  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> > >  }
> > >  
> > > -static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> > > +/*
> > > + * For newer nfsdcld's that do not need cm_gracetime.  We also need to call
> > > + * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl.
> > > + */
> > > +static void
> > > +nfsd4_cld_grace_done(struct nfsd_net *nn)
> > > +{
> > > +	int ret;
> > > +	struct cld_upcall *cup;
> > > +	struct cld_net *cn = nn->cld_net;
> > > +
> > > +	cup = alloc_cld_upcall(cn);
> > > +	if (!cup) {
> > > +		ret = -ENOMEM;
> > > +		goto out_err;
> > > +	}
> > > +
> > > +	cup->cu_msg.cm_cmd = Cld_GraceDone;
> > > +	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
> > > +	if (!ret)
> > > +		ret = cup->cu_msg.cm_status;
> > > +
> > > +	free_cld_upcall(cup);
> > > +out_err:
> > > +	nfs4_release_reclaim(nn);
> > > +	if (ret)
> > > +		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> > > +}
> > > +
> > > +static int
> > > +nfs4_cld_state_init(struct net *net)
> > > +{
> > > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +	int i;
> > > +
> > > +	nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE,
> > > +						sizeof(struct list_head),
> > > +						GFP_KERNEL);
> > > +	if (!nn->reclaim_str_hashtbl)
> > > +		return -ENOMEM;
> > > +
> > > +	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> > > +		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> > > +	nn->reclaim_str_hashtbl_size = 0;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void
> > > +nfs4_cld_state_shutdown(struct net *net)
> > > +{
> > > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +
> > > +	kfree(nn->reclaim_str_hashtbl);
> > > +}
> > > +
> > > +static int
> > > +nfsd4_cld_tracking_init(struct net *net)
> > > +{
> > > +	int status;
> > > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +
> > > +	status = nfs4_cld_state_init(net);
> > > +	if (status)
> > > +		return status;
> > > +
> > > +	status = nfsd4_init_cld_pipe(net);
> > > +	if (status)
> > > +		goto err_shutdown;
> > > +
> > > +	status = nfsd4_cld_grace_start(nn);
> > > +	if (status) {
> > > +		if (status == -EOPNOTSUPP)
> > > +			printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
> > > +		nfs4_release_reclaim(nn);
> > > +		goto err_remove;
> > > +	}
> > > +	return 0;
> > > +
> > > +err_remove:
> > > +	nfsd4_remove_cld_pipe(net);
> > > +err_shutdown:
> > > +	nfs4_cld_state_shutdown(net);
> > > +	return status;
> > > +}
> > > +
> > > +static void
> > > +nfsd4_cld_tracking_exit(struct net *net)
> > > +{
> > > +	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > > +
> > > +	nfs4_release_reclaim(nn);
> > > +	nfsd4_remove_cld_pipe(net);
> > > +	nfs4_cld_state_shutdown(net);
> > > +}
> > > +
> > > +/* For older nfsdcld's */
> > > +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
> > >  	.init		= nfsd4_init_cld_pipe,
> > >  	.exit		= nfsd4_remove_cld_pipe,
> > >  	.create		= nfsd4_cld_create,
> > >  	.remove		= nfsd4_cld_remove,
> > > +	.check		= nfsd4_cld_check_v0,
> > > +	.grace_done	= nfsd4_cld_grace_done_v0,
> > > +};
> > > +
> > > +/* For newer nfsdcld's */
> > > +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
> > > +	.init		= nfsd4_cld_tracking_init,
> > > +	.exit		= nfsd4_cld_tracking_exit,
> > > +	.create		= nfsd4_cld_create,
> > > +	.remove		= nfsd4_cld_remove,
> > >  	.check		= nfsd4_cld_check,
> > >  	.grace_done	= nfsd4_cld_grace_done,
> > >  };
> > > @@ -1514,9 +1722,10 @@ nfsd4_client_tracking_init(struct net *net)
> > >  
> > >  	/* Finally, try to use nfsdcld */
> > >  	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
> > > -	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
> > > -			"removed in 3.10. Please transition to using "
> > > -			"nfsdcltrack.\n");
> > > +	status = nn->client_tracking_ops->init(net);
> > > +	if (!status)
> > > +		return status;
> > > +	nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
> > 
> > It seems like we probably ought to check to see if the daemon is up
> > before attempting a UMH upcall now? If someone starts up the daemon
> > they'll probably be surprised that it didn't get used because there was
> > a UMH upcall program present.
> 
> I figured that the UMH upcall program would still be the default and
> that the admin would have to do some extra configuration to use nfsdcld,
> namely remove the /var/lib/nfs/v4recovery directory and set an empty
> value for nfsd's 'cltrack_prog' module option.  Do you think that's a
> bad idea?
> 
> -Scott
> 

The only real issue there is that that is several steps, any of which
someone could screw up. I think we probably do want to aim for allowing
someone to enable nfsdcld (via systemd or whatever) and have it all
"just work" without needing to do anything else.

Assuming that daemon works better and in more places than the umh
helper, should we aim for it to eventually become the default?


> > >  do_init:
> > >  	status = nn->client_tracking_ops->init(net);
> > >  	if (status) {
> > > diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
> > > index f8f5cccad749..b1e9de4f07d5 100644
> > > --- a/include/uapi/linux/nfsd/cld.h
> > > +++ b/include/uapi/linux/nfsd/cld.h
> > > @@ -36,6 +36,7 @@ enum cld_command {
> > >  	Cld_Remove,		/* remove record of this cm_id */
> > >  	Cld_Check,		/* is this cm_id allowed? */
> > >  	Cld_GraceDone,		/* grace period is complete */
> > > +	Cld_GraceStart,
> > >  };
> > >  
> > >  /* representation of long-form NFSv4 client ID */
> > 
> > -- 
> > Jeff Layton <jlayton@kernel.org>
> > 

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 2/3] nfsd: un-deprecate nfsdcld
  2018-12-20  0:19       ` Jeff Layton
@ 2018-12-20  1:59         ` J. Bruce Fields
  2018-12-20 15:24           ` Jeff Layton
  0 siblings, 1 reply; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-20  1:59 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Scott Mayhew, linux-nfs

On Wed, Dec 19, 2018 at 07:19:53PM -0500, Jeff Layton wrote:
> On Wed, 2018-12-19 at 17:11 -0500, Scott Mayhew wrote:
> > On Wed, 19 Dec 2018, Jeff Layton wrote:
> > > It seems like we probably ought to check to see if the daemon is up
> > > before attempting a UMH upcall now? If someone starts up the daemon
> > > they'll probably be surprised that it didn't get used because there was
> > > a UMH upcall program present.
> > 
> > I figured that the UMH upcall program would still be the default and
> > that the admin would have to do some extra configuration to use nfsdcld,
> > namely remove the /var/lib/nfs/v4recovery directory and set an empty
> > value for nfsd's 'cltrack_prog' module option.  Do you think that's a
> > bad idea?
> > 
> > -Scott
> > 
> 
> The only real issue there is that that is several steps, any of which
> someone could screw up. I think we probably do want to aim for allowing
> someone to enable nfsdcld (via systemd or whatever) and have it all
> "just work" without needing to do anything else.

If we just care about being able to set this up for users, the rpm (or
other package) install could remove /var/lib/nfs/v4recovery, drop a
configuration file in /etc/modprobe.d to set cltrack_prog, and enable a
systemd unit.

But you're thinking somebody might want to switch a system back and
forth?  I guess that could be useful for debugging and, yeah the
multiple steps would be more error prone.

> Assuming that daemon works better and in more places than the umh
> helper, should we aim for it to eventually become the default?

I hope so.  If we have to switch this again I'm going to quit and go
into some other business....

--b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 2/3] nfsd: un-deprecate nfsdcld
  2018-12-20  1:59         ` J. Bruce Fields
@ 2018-12-20 15:24           ` Jeff Layton
  0 siblings, 0 replies; 23+ messages in thread
From: Jeff Layton @ 2018-12-20 15:24 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Scott Mayhew, linux-nfs

On Wed, 2018-12-19 at 20:59 -0500, J. Bruce Fields wrote:
> On Wed, Dec 19, 2018 at 07:19:53PM -0500, Jeff Layton wrote:
> > On Wed, 2018-12-19 at 17:11 -0500, Scott Mayhew wrote:
> > > On Wed, 19 Dec 2018, Jeff Layton wrote:
> > > > It seems like we probably ought to check to see if the daemon is up
> > > > before attempting a UMH upcall now? If someone starts up the daemon
> > > > they'll probably be surprised that it didn't get used because there was
> > > > a UMH upcall program present.
> > > 
> > > I figured that the UMH upcall program would still be the default and
> > > that the admin would have to do some extra configuration to use nfsdcld,
> > > namely remove the /var/lib/nfs/v4recovery directory and set an empty
> > > value for nfsd's 'cltrack_prog' module option.  Do you think that's a
> > > bad idea?
> > > 
> > > -Scott
> > > 
> > 
> > The only real issue there is that that is several steps, any of which
> > someone could screw up. I think we probably do want to aim for allowing
> > someone to enable nfsdcld (via systemd or whatever) and have it all
> > "just work" without needing to do anything else.
> 
> If we just care about being able to set this up for users, the rpm (or
> other package) install could remove /var/lib/nfs/v4recovery, drop a
> configuration file in /etc/modprobe.d to set cltrack_prog, and enable a
> systemd unit.
> 
> But you're thinking somebody might want to switch a system back and
> forth?  I guess that could be useful for debugging and, yeah the
> multiple steps would be more error prone.
> 

Yes.

That said, you wouldn't have compatible databases when switching back
(and I don't think we want to try to handle that case automatically
anyhow). At the very least though, I think we need to shoot for seamless
migration from legacy or umh upcalls to nfsdcld.

> > Assuming that daemon works better and in more places than the umh
> > helper, should we aim for it to eventually become the default?
> 
> I hope so.  If we have to switch this again I'm going to quit and go
> into some other business....
> 

Indeed!

Given that, we should be quite careful about introducing this. How do we
make that seamless for the vast majority of users who are just doing
simple serving and won't likely be downgrading?

Part of that may mean reworking the upcall method selection, so I'd like
to get that part hashed out before we merge this.
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 22:43         ` J. Bruce Fields
@ 2018-12-20 16:36           ` Scott Mayhew
  2018-12-20 17:32             ` Jeff Layton
  0 siblings, 1 reply; 23+ messages in thread
From: Scott Mayhew @ 2018-12-20 16:36 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: jlayton, linux-nfs

On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Wed, Dec 19, 2018 at 05:21:47PM -0500, J. Bruce Fields wrote:
> > On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > > still in grace?  The count would be too high then and the server could
> > > exit grace before all the clients have reclaimed.  I actually added
> > > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> > 
> > Oh boy.
> > 
> > (Thinks.)
> > 
> > Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> > previous client instance's state, it's got no locks to reclaim any more.
> > (It can't have gotten any *new* ones, since we're still in the grace
> > period.)
> > 
> > It's effectively a brand new client.  Only reclaiming clients should
> > bump that counter.
> > 
> > We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> > grace period, that client just doesn't matter any more.
> 
> Actually, once the client's destroyed, it shouldn't matter whether the
> previous incarnation of the client reclaimed or not.  It's never going
> to reclaim now....  So expire_client should probably just be removing
> the client from the table of reclaimable clients at the same time that
> it removes its stable storage record.

Okay, come to think of it, if we're in grace then nfsdcld should be
removing the client record from both the current and recovery epoch's db
tables too...
> 
> --b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-19 22:21       ` J. Bruce Fields
  2018-12-19 22:43         ` J. Bruce Fields
@ 2018-12-20 17:29         ` Jeff Layton
  2018-12-20 18:05           ` J. Bruce Fields
  1 sibling, 1 reply; 23+ messages in thread
From: Jeff Layton @ 2018-12-20 17:29 UTC (permalink / raw)
  To: J. Bruce Fields, Scott Mayhew; +Cc: linux-nfs

On Wed, 2018-12-19 at 17:21 -0500, J. Bruce Fields wrote:
> On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > On Wed, 19 Dec 2018, J. Bruce Fields wrote:
> > 
> > > On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > > > +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > > +		return;
> > > > +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> > > > +			nn->reclaim_str_hashtbl_size) {
> > > > +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> > > > +				clp->net->ns.inum);
> > > > +		nfsd4_end_grace(nn);
> > > > +	}
> > > > +}
> > > > +
> > > > +static void dec_reclaim_complete(struct nfs4_client *clp)
> > > > +{
> > > > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > > > +
> > > > +	if (!nn->track_reclaim_completes)
> > > > +		return;
> > > > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > > > +		return;
> > > > +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > > +		atomic_dec(&nn->nr_reclaim_complete);
> > > > +}
> > > > +
> > > >  static void expire_client(struct nfs4_client *clp)
> > > >  {
> > > >  	unhash_client(clp);
> > > >  	nfsd4_client_record_remove(clp);
> > > > +	dec_reclaim_complete(clp);
> > > >  	__destroy_client(clp);
> > > >  }
> > > 
> > > This doesn't look right to me.  If a client reclaims and then
> > > immediately calls DESTROY_CLIENTID or something--that should still count
> > > as a reclaim, and that shouldn't prevent us from ending the grace period
> > > early.
> > > 
> > > I think dec_reclaim_complete is unnecessary.
> > 
> > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > still in grace?  The count would be too high then and the server could
> > exit grace before all the clients have reclaimed.  I actually added
> > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> 
> Oh boy.
> 
> (Thinks.)
> 
> Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> previous client instance's state, it's got no locks to reclaim any more.
> (It can't have gotten any *new* ones, since we're still in the grace
> period.)
> 
> It's effectively a brand new client.  Only reclaiming clients should
> bump that counter.
> 
> We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> grace period, that client just doesn't matter any more.
> 
> I think?
> 

That wasn't my thinking here.

Suppose we have a client that holds some locks. Server reboots and we do
EXCHANGE_ID and start reclaiming, and eventually send a
RECLAIM_COMPLETE.

Now, there is a network partition and we lose contact with the server
for more than a lease period. The client record gets tossed out. Client
eventually reestablishes the connection before the grace period ends and
attempts to reclaim.

That reclaim should succeed, IMO, as there is no reason that it
shouldn't. Nothing can have claimed competing state since we're still in
the grace period.

The thing you don't want to do here is to double count the
RECLAIM_COMPLETE for this client. So decrementing the counter when you
tear down a client is reasonable.

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-20 16:36           ` Scott Mayhew
@ 2018-12-20 17:32             ` Jeff Layton
  0 siblings, 0 replies; 23+ messages in thread
From: Jeff Layton @ 2018-12-20 17:32 UTC (permalink / raw)
  To: Scott Mayhew, J. Bruce Fields; +Cc: linux-nfs

On Thu, 2018-12-20 at 11:36 -0500, Scott Mayhew wrote:
> On Wed, 19 Dec 2018, J. Bruce Fields wrote:
> 
> > On Wed, Dec 19, 2018 at 05:21:47PM -0500, J. Bruce Fields wrote:
> > > On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > > > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > > > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > > > still in grace?  The count would be too high then and the server could
> > > > exit grace before all the clients have reclaimed.  I actually added
> > > > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> > > 
> > > Oh boy.
> > > 
> > > (Thinks.)
> > > 
> > > Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> > > previous client instance's state, it's got no locks to reclaim any more.
> > > (It can't have gotten any *new* ones, since we're still in the grace
> > > period.)
> > > 
> > > It's effectively a brand new client.  Only reclaiming clients should
> > > bump that counter.
> > > 
> > > We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> > > grace period, that client just doesn't matter any more.
> > 
> > Actually, once the client's destroyed, it shouldn't matter whether the
> > previous incarnation of the client reclaimed or not.  It's never going
> > to reclaim now....  So expire_client should probably just be removing
> > the client from the table of reclaimable clients at the same time that
> > it removes its stable storage record.
> 
> Okay, come to think of it, if we're in grace then nfsdcld should be
> removing the client record from both the current and recovery epoch's db
> tables too...

No!

The recovery table is a record of who held state at the end of the prior
epoch. That does not change just because the client went away during the
current epoch.

You should only ever alter the table for the epoch that you are in. Once
the grace period is lifted, you can delete the tables for any prior
epoch.

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-20 17:29         ` Jeff Layton
@ 2018-12-20 18:05           ` J. Bruce Fields
  2018-12-20 18:26             ` Jeff Layton
  0 siblings, 1 reply; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-20 18:05 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Scott Mayhew, linux-nfs

On Thu, Dec 20, 2018 at 12:29:43PM -0500, Jeff Layton wrote:
> That wasn't my thinking here.
> 
> Suppose we have a client that holds some locks. Server reboots and we do
> EXCHANGE_ID and start reclaiming, and eventually send a
> RECLAIM_COMPLETE.
> 
> Now, there is a network partition and we lose contact with the server
> for more than a lease period. The client record gets tossed out. Client
> eventually reestablishes the connection before the grace period ends and
> attempts to reclaim.
> 
> That reclaim should succeed, IMO, as there is no reason that it
> shouldn't. Nothing can have claimed competing state since we're still in
> the grace period.

That scenario requires a grace period longer than the lease period,
which isn't impossible but sounds rare?  I guess you're thinking in the
cluster case about the possibility of a second node failure extending
the grace period.

Still, that's different from the case where the client explicitly
destroys its own state.  That could happen in less than a lease period
and in that case there won't be a reclaim.  I think that case could
happen if a client rebooted quickly or maybe just unmounted.

Hm.

--b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-20 18:05           ` J. Bruce Fields
@ 2018-12-20 18:26             ` Jeff Layton
  2018-12-20 19:02               ` J. Bruce Fields
  0 siblings, 1 reply; 23+ messages in thread
From: Jeff Layton @ 2018-12-20 18:26 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Scott Mayhew, linux-nfs

On Thu, 2018-12-20 at 13:05 -0500, J. Bruce Fields wrote:
> On Thu, Dec 20, 2018 at 12:29:43PM -0500, Jeff Layton wrote:
> > That wasn't my thinking here.
> > 
> > Suppose we have a client that holds some locks. Server reboots and we do
> > EXCHANGE_ID and start reclaiming, and eventually send a
> > RECLAIM_COMPLETE.
> > 
> > Now, there is a network partition and we lose contact with the server
> > for more than a lease period. The client record gets tossed out. Client
> > eventually reestablishes the connection before the grace period ends and
> > attempts to reclaim.
> > 
> > That reclaim should succeed, IMO, as there is no reason that it
> > shouldn't. Nothing can have claimed competing state since we're still in
> > the grace period.
> 
> That scenario requires a grace period longer than the lease period,
> which isn't impossible but sounds rare?  I guess you're thinking in the
> cluster case about the possibility of a second node failure extending
> the grace period.
> 

Isn't our grace period twice the lease period by default? I think we do
have to assume that it may take an entire lease period before the
client notices that the server has rebooted. If grace period == lease
period then you aren't leaving much time for reclaim to occur.

> Still, that's different from the case where the client explicitly
> destroys its own state.  That could happen in less than a lease period
> and in that case there won't be a reclaim.  I think that case could
> happen if a client rebooted quickly or maybe just unmounted.
> 
> Hm.
> 

True. You're right that we don't want to delay lifting the grace period
because we're waiting for clients that have unmounted and aren't coming
back. Unfortunately, it's difficult to distinguish the two cases. Could
> we just decrement the counter when we're tearing down a clientid
> because of lease expiration and not on DESTROY_CLIENTID?

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld
  2018-12-20 18:26             ` Jeff Layton
@ 2018-12-20 19:02               ` J. Bruce Fields
  0 siblings, 0 replies; 23+ messages in thread
From: J. Bruce Fields @ 2018-12-20 19:02 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Scott Mayhew, linux-nfs

On Thu, Dec 20, 2018 at 01:26:34PM -0500, Jeff Layton wrote:
> On Thu, 2018-12-20 at 13:05 -0500, J. Bruce Fields wrote:
> > On Thu, Dec 20, 2018 at 12:29:43PM -0500, Jeff Layton wrote:
> > > That wasn't my thinking here.
> > > 
> > > Suppose we have a client that holds some locks. Server reboots and we do
> > > EXCHANGE_ID and start reclaiming, and eventually send a
> > > RECLAIM_COMPLETE.
> > > 
> > > Now, there is a network partition and we lose contact with the server
> > > for more than a lease period. The client record gets tossed out. Client
> > > eventually reestablishes the connection before the grace period ends and
> > > attempts to reclaim.
> > > 
> > > That reclaim should succeed, IMO, as there is no reason that it
> > > shouldn't. Nothing can have claimed competing state since we're still in
> > > the grace period.
> > 
> > That scenario requires a grace period longer than the lease period,
> > which isn't impossible but sounds rare?  I guess you're thinking in the
> > cluster case about the possibility of a second node failure extending
> > the grace period.
> 
> Isn't our grace period twice the lease period by default?

Reminding myself.... Upstream now it will end the grace period after one
lease period, but will extend it up to two lease periods if someone has
reclaimed in the last second.

> I think we do
> have to assume that it may take an entire lease period before the
> client notices that the server has rebooted. If grace period == lease
> period then you aren't leaving much time for reclaim to occur.

My assumption is that it's mainly the client's responsibility to allow
enough time, by renewing its lease somewhat more frequently than once
per lease period.

That may be wrong--there's some support for that assumption in
https://tools.ietf.org/html/rfc7530#section-9.5, but that's talking only
about network delays, not about allowing additional time for the
recovery.

> > Still, that's different from the case where the client explicitly
> > destroys its own state.  That could happen in less than a lease period
> > and in that case there won't be a reclaim.  I think that case could
> > happen if a client rebooted quickly or maybe just unmounted.
> > 
> > Hm.
> > 
> 
> True. You're right that we don't want to delay lifting the grace period
> because we're waiting for clients that have unmounted and aren't coming
> back. Unfortunately, it's difficult to distinguish the two cases. Could
> we just decrement the counter when we're tearing down a clientid
> because of lease expiration and not on DESTROY_CLIENT?

Right, either DESTROY_CLIENTID or (in the 4.0 case) a
SETCLIENTID_CONFIRM.  So those two cases wouldn't be difficult to treat
differently.  OK, maybe that's the best choice.

--b.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, back to index

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-18 14:29 [PATCH v2 0/3] un-deprecate nfsdcld Scott Mayhew
2018-12-18 14:29 ` [PATCH v2 1/3] nfsd: make nfs4_client_reclaim use an xdr_netobj instead of a fixed char array Scott Mayhew
2018-12-18 14:29 ` [PATCH v2 2/3] nfsd: un-deprecate nfsdcld Scott Mayhew
2018-12-19 21:23   ` Jeff Layton
2018-12-19 22:11     ` Scott Mayhew
2018-12-20  0:19       ` Jeff Layton
2018-12-20  1:59         ` J. Bruce Fields
2018-12-20 15:24           ` Jeff Layton
2018-12-18 14:29 ` [PATCH v2 3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld Scott Mayhew
2018-12-19 17:46   ` J. Bruce Fields
2018-12-19 21:57     ` Scott Mayhew
2018-12-19 18:28   ` J. Bruce Fields
2018-12-19 22:01     ` Scott Mayhew
2018-12-19 18:36   ` J. Bruce Fields
2018-12-19 22:05     ` Scott Mayhew
2018-12-19 22:21       ` J. Bruce Fields
2018-12-19 22:43         ` J. Bruce Fields
2018-12-20 16:36           ` Scott Mayhew
2018-12-20 17:32             ` Jeff Layton
2018-12-20 17:29         ` Jeff Layton
2018-12-20 18:05           ` J. Bruce Fields
2018-12-20 18:26             ` Jeff Layton
2018-12-20 19:02               ` J. Bruce Fields

Linux-NFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-nfs/0 linux-nfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-nfs linux-nfs/ https://lore.kernel.org/linux-nfs \
		linux-nfs@vger.kernel.org linux-nfs@archiver.kernel.org
	public-inbox-index linux-nfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-nfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox