linux-nfs.vger.kernel.org archive mirror
* [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed.
@ 2021-04-23 20:59 Dai Ngo
  2021-04-23 20:59 ` [PATCH v4 1/2] " Dai Ngo
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Dai Ngo @ 2021-04-23 20:59 UTC (permalink / raw)
  To: olga.kornievskaia; +Cc: trondmy, bfields, chuck.lever, linux-nfs

Hi Olga,

Currently the source's export is mounted and unmounted on every
inter-server copy operation. This causes unnecessary overhead
for each copy.

This patch series is an enhancement to allow the export to remain
mounted for a configurable period (defaulting to 15 minutes). If the
export is not used within the configured time, it is unmounted by a
delayed task. If it is used again, its expiration time is extended
for another period.

Since mount and unmount are no longer done on every copy request,
the copy-size restriction (14*rsize) in __nfs4_copy_file_range is
removed.
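
For readers skimming the series, the idle-timeout pattern it builds on
can be summarized by the generic sketch below. This is a standalone
illustration with placeholder names (cache_get()/cache_put(), a single
cached resource), not the patch's actual data structures or locking:

#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>

static DEFINE_SPINLOCK(cache_lock);
static unsigned int cache_users;
static unsigned int cache_timeout_ms = 900000;  /* 15 minutes */

static void cache_release(struct work_struct *work);
static DECLARE_DELAYED_WORK(cache_release_work, cache_release);

/* Runs only after the resource has been idle for the full timeout. */
static void cache_release(struct work_struct *work)
{
        spin_lock(&cache_lock);
        if (cache_users == 0) {
                /* actually release the cached resource, e.g. mntput() */
        }
        spin_unlock(&cache_lock);
}

/* A new user reuses the cached resource and disarms the release. */
static void cache_get(void)
{
        spin_lock(&cache_lock);
        cache_users++;
        cancel_delayed_work(&cache_release_work);
        spin_unlock(&cache_lock);
}

/* The last user arms the delayed release instead of dropping it now. */
static void cache_put(void)
{
        spin_lock(&cache_lock);
        if (--cache_users == 0)
                schedule_delayed_work(&cache_release_work,
                                      msecs_to_jiffies(cache_timeout_ms));
        spin_unlock(&cache_lock);
}

Patch 1 below applies the same idea per source server, keyed by the
server's IP address and protected by a spinlock and a wait queue.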

-Dai

v2: fix compiler warning about a missing prototype.
v3: remove the use of the semaphore.
    eliminate all RPC calls for subsequent mounts by allowing
       all exports from one server to share one vfsmount.
    make the inter-server threshold a module configuration parameter.
v4: convert nsui_refcnt to use refcount_t.
    add a note about the 20-second wait in nfsd4_interssc_connect.
    remove the (14*rsize) restriction from __nfs4_copy_file_range.





* [PATCH v4 1/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-04-23 20:59 [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed Dai Ngo
@ 2021-04-23 20:59 ` Dai Ngo
  2021-05-14 18:12   ` Olga Kornievskaia
  2021-04-23 20:59 ` [PATCH v4 2/2] NFSv4.2: remove restriction of copy size for inter-server copy Dai Ngo
  2021-05-03 16:37 ` [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed dai.ngo
  2 siblings, 1 reply; 9+ messages in thread
From: Dai Ngo @ 2021-04-23 20:59 UTC (permalink / raw)
  To: olga.kornievskaia; +Cc: trondmy, bfields, chuck.lever, linux-nfs

Currently the source's export is mounted and unmounted on every
inter-server copy operation. This patch is an enhancement to delay
the unmount of the source export for a certain period of time to
eliminate the mount and unmount overhead on subsequent copy operations.

After a copy operation completes, a delayed task is scheduled to
unmount the export after a configurable idle time. Each time the
export is used again, its expiry time is extended to allow the
export to remain mounted.

The unmount task and the mount operation of the copy request are
synced to make sure the export is not unmounted while it's being
used.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/nfs4proc.c      | 178 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/nfsd/nfsd.h          |   4 ++
 fs/nfsd/nfssvc.c        |   3 +
 include/linux/nfs_ssc.h |  20 ++++++
 4 files changed, 201 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index dd9f38d072dd..a4b110cbcab5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -55,6 +55,81 @@ module_param(inter_copy_offload_enable, bool, 0644);
 MODULE_PARM_DESC(inter_copy_offload_enable,
 		 "Enable inter server to server copy offload. Default: false");
 
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+static int nfsd4_ssc_umount_timeout = 900000;		/* default to 15 mins */
+module_param(nfsd4_ssc_umount_timeout, int, 0644);
+MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
+		"idle msecs before unmount export from source server");
+
+static void nfsd4_ssc_expire_umount(struct work_struct *work);
+static struct nfsd4_ssc_umount nfsd4_ssc_umount;
+
+/* nfsd4_ssc_umount.nsu_lock must be held */
+static void nfsd4_scc_update_umnt_timo(void)
+{
+	struct nfsd4_ssc_umount_item *ni = 0;
+
+	cancel_delayed_work(&nfsd4_ssc_umount.nsu_umount_work);
+	if (!list_empty(&nfsd4_ssc_umount.nsu_list)) {
+		ni = list_first_entry(&nfsd4_ssc_umount.nsu_list,
+			struct nfsd4_ssc_umount_item, nsui_list);
+		nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
+		schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
+			ni->nsui_expire - jiffies);
+	} else
+		nfsd4_ssc_umount.nsu_expire = 0;
+}
+
+static void nfsd4_ssc_expire_umount(struct work_struct *work)
+{
+	bool do_wakeup = false;
+	struct nfsd4_ssc_umount_item *ni = 0;
+	struct nfsd4_ssc_umount_item *tmp;
+
+	spin_lock(&nfsd4_ssc_umount.nsu_lock);
+	list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
+		if (time_after(jiffies, ni->nsui_expire)) {
+			if (refcount_read(&ni->nsui_refcnt) > 0)
+				continue;
+
+			/* mark being unmount */
+			ni->nsui_busy = true;
+			spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+			mntput(ni->nsui_vfsmount);
+			spin_lock(&nfsd4_ssc_umount.nsu_lock);
+
+			/* waiters need to start from begin of list */
+			list_del(&ni->nsui_list);
+			kfree(ni);
+
+			/* wakeup ssc_connect waiters */
+			do_wakeup = true;
+			continue;
+		}
+		break;
+	}
+	nfsd4_scc_update_umnt_timo();
+	if (do_wakeup)
+		wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
+	spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+}
+
+static DECLARE_DELAYED_WORK(nfsd4, nfsd4_ssc_expire_umount);
+
+void nfsd4_ssc_init_umount_work(void)
+{
+	if (nfsd4_ssc_umount.nsu_inited)
+		return;
+	INIT_DELAYED_WORK(&nfsd4_ssc_umount.nsu_umount_work,
+		nfsd4_ssc_expire_umount);
+	INIT_LIST_HEAD(&nfsd4_ssc_umount.nsu_list);
+	spin_lock_init(&nfsd4_ssc_umount.nsu_lock);
+	init_waitqueue_head(&nfsd4_ssc_umount.nsu_waitq);
+	nfsd4_ssc_umount.nsu_inited = true;
+}
+EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
+#endif
+
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
 
@@ -1181,6 +1256,9 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
 	char *ipaddr, *dev_name, *raw_data;
 	int len, raw_len;
 	__be32 status = nfserr_inval;
+	struct nfsd4_ssc_umount_item *ni = 0;
+	struct nfsd4_ssc_umount_item *work, *tmp;
+	DEFINE_WAIT(wait);
 
 	naddr = &nss->u.nl4_addr;
 	tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
@@ -1229,12 +1307,68 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
 		goto out_free_rawdata;
 	snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
 
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+try_again:
+	spin_lock(&nfsd4_ssc_umount.nsu_lock);
+	list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
+		if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
+			continue;
+		/* found a match */
+		if (ni->nsui_busy) {
+			/*  wait - and try again */
+			prepare_to_wait(&nfsd4_ssc_umount.nsu_waitq, &wait,
+				TASK_INTERRUPTIBLE);
+			spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+
+			/* allow 20secs for mount/unmount for now - revisit */
+			if (signal_pending(current) ||
+					(schedule_timeout(20*HZ) == 0)) {
+				status = nfserr_eagain;
+				kfree(work);
+				goto out_free_devname;
+			}
+			finish_wait(&nfsd4_ssc_umount.nsu_waitq, &wait);
+			goto try_again;
+		}
+		ss_mnt = ni->nsui_vfsmount;
+		if (refcount_read(&ni->nsui_refcnt) == 0)
+			refcount_set(&ni->nsui_refcnt, 1);
+		else
+			refcount_inc(&ni->nsui_refcnt);
+		spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+		kfree(work);
+		goto out_done;
+	}
+	/* create new entry, set busy, insert list, clear busy after mount */
+	if (work) {
+		strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+		refcount_set(&work->nsui_refcnt, 1);
+		work->nsui_busy = true;
+		list_add_tail(&work->nsui_list, &nfsd4_ssc_umount.nsu_list);
+	}
+	spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+
 	/* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
 	ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
 	module_put(type->owner);
-	if (IS_ERR(ss_mnt))
+	if (IS_ERR(ss_mnt)) {
+		if (work) {
+			spin_lock(&nfsd4_ssc_umount.nsu_lock);
+			list_del(&work->nsui_list);
+			wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
+			spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+			kfree(work);
+		}
 		goto out_free_devname;
-
+	}
+	if (work) {
+		spin_lock(&nfsd4_ssc_umount.nsu_lock);
+		work->nsui_vfsmount = ss_mnt;
+		work->nsui_busy = false;
+		wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
+		spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+	}
+out_done:
 	status = 0;
 	*mount = ss_mnt;
 
@@ -1301,10 +1435,46 @@ static void
 nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
 			struct nfsd_file *dst)
 {
+	bool found = false;
+	bool resched = false;
+	long timeout;
+	struct nfsd4_ssc_umount_item *tmp;
+	struct nfsd4_ssc_umount_item *ni = 0;
+
 	nfs42_ssc_close(src->nf_file);
-	fput(src->nf_file);
 	nfsd_file_put(dst);
-	mntput(ss_mnt);
+	fput(src->nf_file);
+
+	timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
+	spin_lock(&nfsd4_ssc_umount.nsu_lock);
+	list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list,
+		nsui_list) {
+		if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
+			list_del(&ni->nsui_list);
+			/*
+			 * vfsmount can be shared by multiple exports,
+			 * decrement refcnt and schedule delayed task
+			 * if it drops to 0.
+			 */
+			if (refcount_dec_and_test(&ni->nsui_refcnt))
+				resched = true;
+			ni->nsui_expire = jiffies + timeout;
+			list_add_tail(&ni->nsui_list, &nfsd4_ssc_umount.nsu_list);
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		spin_unlock(&nfsd4_ssc_umount.nsu_lock);
+		mntput(ss_mnt);
+		return;
+	}
+	if (resched && !nfsd4_ssc_umount.nsu_expire) {
+		nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
+		schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
+				timeout);
+	}
+	spin_unlock(&nfsd4_ssc_umount.nsu_lock);
 }
 
 #else /* CONFIG_NFSD_V4_2_INTER_SSC */
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 8bdc37aa2c2e..b3bf8a5f4472 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -483,6 +483,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
 extern int nfsd4_is_junction(struct dentry *dentry);
 extern int register_cld_notifier(void);
 extern void unregister_cld_notifier(void);
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+extern void nfsd4_ssc_init_umount_work(void);
+#endif
+
 #else /* CONFIG_NFSD_V4 */
 static inline int nfsd4_is_junction(struct dentry *dentry)
 {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 6de406322106..2558db55b88b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -322,6 +322,9 @@ static int nfsd_startup_generic(int nrservs)
 	ret = nfs4_state_start();
 	if (ret)
 		goto out_file_cache;
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+	nfsd4_ssc_init_umount_work();
+#endif
 	return 0;
 
 out_file_cache:
diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
index f5ba0fbff72f..1e07be2a89fa 100644
--- a/include/linux/nfs_ssc.h
+++ b/include/linux/nfs_ssc.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/nfs_fs.h>
+#include <linux/sunrpc/svc.h>
 
 extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
 
@@ -52,6 +53,25 @@ static inline void nfs42_ssc_close(struct file *filep)
 	if (nfs_ssc_client_tbl.ssc_nfs4_ops)
 		(*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep);
 }
+
+struct nfsd4_ssc_umount_item {
+	struct list_head nsui_list;
+	bool nsui_busy;
+	refcount_t nsui_refcnt;
+	unsigned long nsui_expire;
+	struct vfsmount *nsui_vfsmount;
+	char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
+};
+
+struct nfsd4_ssc_umount {
+	struct list_head nsu_list;
+	struct delayed_work nsu_umount_work;
+	spinlock_t nsu_lock;
+	unsigned long nsu_expire;
+	wait_queue_head_t nsu_waitq;
+	bool nsu_inited;
+};
+
 #endif
 
 /*
-- 
2.9.5



* [PATCH v4 2/2] NFSv4.2: remove restriction of copy size for inter-server copy.
  2021-04-23 20:59 [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed Dai Ngo
  2021-04-23 20:59 ` [PATCH v4 1/2] " Dai Ngo
@ 2021-04-23 20:59 ` Dai Ngo
  2021-05-03 16:37 ` [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed dai.ngo
  2 siblings, 0 replies; 9+ messages in thread
From: Dai Ngo @ 2021-04-23 20:59 UTC (permalink / raw)
  To: olga.kornievskaia; +Cc: trondmy, bfields, chuck.lever, linux-nfs

Currently inter-server copy is allowed only if the copy size is larger
than (14*rsize), which corresponds to the overhead of mounting the
source export. This patch, relying on the delayed-unmount feature,
removes this restriction since the mount and unmount overhead no
longer applies to every inter-server copy.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfs/nfs4file.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 441a2fa073c8..b5821ed46994 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -158,13 +158,11 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 		sync = true;
 retry:
 	if (!nfs42_files_from_same_server(file_in, file_out)) {
-		/* for inter copy, if copy size if smaller than 12 RPC
-		 * payloads, fallback to traditional copy. There are
-		 * 14 RPCs during an NFSv4.x mount between source/dest
-		 * servers.
+		/*
+		 * for inter copy, if copy size is too small
+		 * then fallback to generic copy.
 		 */
-		if (sync ||
-			count <= 14 * NFS_SERVER(file_inode(file_in))->rsize)
+		if (sync)
 			return -EOPNOTSUPP;
 		cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
 				GFP_NOFS);
-- 
2.9.5



* Re: [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-04-23 20:59 [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed Dai Ngo
  2021-04-23 20:59 ` [PATCH v4 1/2] " Dai Ngo
  2021-04-23 20:59 ` [PATCH v4 2/2] NFSv4.2: remove restriction of copy size for inter-server copy Dai Ngo
@ 2021-05-03 16:37 ` dai.ngo
  2021-05-12 16:53   ` dai.ngo
  2 siblings, 1 reply; 9+ messages in thread
From: dai.ngo @ 2021-05-03 16:37 UTC (permalink / raw)
  To: olga.kornievskaia; +Cc: trondmy, bfields, chuck.lever, linux-nfs

Hi Olga,

Just a reminder that the v4 patch is available for your review.

Thanks,

-Dai

On 4/23/21 1:59 PM, Dai Ngo wrote:
> Hi Olga,
>
> Currently the source's export is mounted and unmounted on every
> inter-server copy operation. This causes unnecessary overhead
> for each copy.
>
> This patch series is an enhancement to allow the export to remain
> mounted for a configurable period (default to 15 minutes). If the
> export is not being used for the configured time it will be unmounted
> by a delayed task. If it's used again then its expiration time is
> extended for another period.
>
> Since mount and unmount are no longer done on every copy request,
> the restriction of copy size (14*rsize), in __nfs4_copy_file_range,
> is removed.
>
> -Dai
>
> v2: fix compiler warning of missing prototype.
> v3: remove the used of semaphore.
>      eliminated all RPC calls for subsequence mount by allowing
>         all exports from one server to share one vfsmount.
>      make inter-server threshold a module configuration parameter.
> v4: convert nsui_refcnt to use refcount_t.
>      add note about 20secs wait in nfsd4_interssc_connect.
>      removed (14*rsize) restriction from __nfs4_copy_file_range.
>
>
>


* Re: [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-05-03 16:37 ` [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed dai.ngo
@ 2021-05-12 16:53   ` dai.ngo
  0 siblings, 0 replies; 9+ messages in thread
From: dai.ngo @ 2021-05-12 16:53 UTC (permalink / raw)
  To: olga.kornievskaia; +Cc: trondmy, bfields, chuck.lever, linux-nfs

On 5/3/21 9:37 AM, dai.ngo@oracle.com wrote:

> Hi Olga,
>
> Just a reminder that v4 patch is available for your review.

Ping again in case you did not get the last reminder.

-Dai

>
> Thanks,
>
> -Dai
>
> On 4/23/21 1:59 PM, Dai Ngo wrote:
>> Hi Olga,
>>
>> Currently the source's export is mounted and unmounted on every
>> inter-server copy operation. This causes unnecessary overhead
>> for each copy.
>>
>> This patch series is an enhancement to allow the export to remain
>> mounted for a configurable period (default to 15 minutes). If the
>> export is not being used for the configured time it will be unmounted
>> by a delayed task. If it's used again then its expiration time is
>> extended for another period.
>>
>> Since mount and unmount are no longer done on every copy request,
>> the restriction of copy size (14*rsize), in __nfs4_copy_file_range,
>> is removed.
>>
>> -Dai
>>
>> v2: fix compiler warning of missing prototype.
>> v3: remove the used of semaphore.
>>      eliminated all RPC calls for subsequence mount by allowing
>>         all exports from one server to share one vfsmount.
>>      make inter-server threshold a module configuration parameter.
>> v4: convert nsui_refcnt to use refcount_t.
>>      add note about 20secs wait in nfsd4_interssc_connect.
>>      removed (14*rsize) restriction from __nfs4_copy_file_range.
>>
>>
>>


* Re: [PATCH v4 1/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-04-23 20:59 ` [PATCH v4 1/2] " Dai Ngo
@ 2021-05-14 18:12   ` Olga Kornievskaia
  2021-05-14 18:42     ` J. Bruce Fields
  2021-05-14 20:08     ` dai.ngo
  0 siblings, 2 replies; 9+ messages in thread
From: Olga Kornievskaia @ 2021-05-14 18:12 UTC (permalink / raw)
  To: Dai Ngo; +Cc: Trond Myklebust, J. Bruce Fields, Chuck Lever, linux-nfs

On Fri, Apr 23, 2021 at 4:59 PM Dai Ngo <dai.ngo@oracle.com> wrote:
>
> Currently the source's export is mounted and unmounted on every
> inter-server copy operation. This patch is an enhancement to delay
> the unmount of the source export for a certain period of time to
> eliminate the mount and unmount overhead on subsequent copy operations.
>
> After a copy operation completes, a delayed task is scheduled to
> unmount the export after a configurable idle time. Each time the
> export is being used again, its expire time is extended to allow
> the export to remain mounted.
>
> The unmount task and the mount operation of the copy request are
> synced to make sure the export is not unmounted while it's being
> used.
>
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>  fs/nfsd/nfs4proc.c      | 178 ++++++++++++++++++++++++++++++++++++++++++++++--
>  fs/nfsd/nfsd.h          |   4 ++
>  fs/nfsd/nfssvc.c        |   3 +
>  include/linux/nfs_ssc.h |  20 ++++++
>  4 files changed, 201 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index dd9f38d072dd..a4b110cbcab5 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -55,6 +55,81 @@ module_param(inter_copy_offload_enable, bool, 0644);
>  MODULE_PARM_DESC(inter_copy_offload_enable,
>                  "Enable inter server to server copy offload. Default: false");
>
> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
> +static int nfsd4_ssc_umount_timeout = 900000;          /* default to 15 mins */
> +module_param(nfsd4_ssc_umount_timeout, int, 0644);
> +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
> +               "idle msecs before unmount export from source server");
> +
> +static void nfsd4_ssc_expire_umount(struct work_struct *work);
> +static struct nfsd4_ssc_umount nfsd4_ssc_umount;
> +
> +/* nfsd4_ssc_umount.nsu_lock must be held */
> +static void nfsd4_scc_update_umnt_timo(void)
> +{
> +       struct nfsd4_ssc_umount_item *ni = 0;
> +
> +       cancel_delayed_work(&nfsd4_ssc_umount.nsu_umount_work);
> +       if (!list_empty(&nfsd4_ssc_umount.nsu_list)) {
> +               ni = list_first_entry(&nfsd4_ssc_umount.nsu_list,
> +                       struct nfsd4_ssc_umount_item, nsui_list);
> +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
> +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
> +                       ni->nsui_expire - jiffies);
> +       } else
> +               nfsd4_ssc_umount.nsu_expire = 0;
> +}
> +
> +static void nfsd4_ssc_expire_umount(struct work_struct *work)
> +{
> +       bool do_wakeup = false;
> +       struct nfsd4_ssc_umount_item *ni = 0;
> +       struct nfsd4_ssc_umount_item *tmp;
> +
> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
> +               if (time_after(jiffies, ni->nsui_expire)) {
> +                       if (refcount_read(&ni->nsui_refcnt) > 0)
> +                               continue;
> +
> +                       /* mark being unmount */
> +                       ni->nsui_busy = true;
> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +                       mntput(ni->nsui_vfsmount);
> +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> +
> +                       /* waiters need to start from begin of list */
> +                       list_del(&ni->nsui_list);
> +                       kfree(ni);
> +
> +                       /* wakeup ssc_connect waiters */
> +                       do_wakeup = true;
> +                       continue;
> +               }
> +               break;

Wouldn't you be exiting your iteration loop as soon as you find a
delayed item that hasn't expired? What if other items on the list have
expired?

It looks like the general design is that the work for doing the umount
is triggered by doing some copy, right? And then it just keeps
rescheduling itself? Shouldn't we just be using the laundromat instead,
which wakes up and goes through the list of mounts that are due to be
unmounted and does so? Bruce previously had suggested using the
laundromat for cleaning up copynotify states on the source server. But
if the existing approach works for Bruce, I'm ok with it too.
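
For comparison, a laundromat-style version would be a periodic sweep
that is re-armed unconditionally while the server runs, instead of
being driven from the copy path. A rough, generic sketch only, not
nfsd's actual laundromat code; the ssc_laundromat name, the 60-second
interval and the omission of the waiter wakeup are all assumptions:

static void ssc_laundromat(struct work_struct *work);
static DECLARE_DELAYED_WORK(ssc_laundromat_work, ssc_laundromat);

static void ssc_laundromat(struct work_struct *work)
{
        struct nfsd4_ssc_umount_item *ni, *tmp;
        LIST_HEAD(expired);

        /* collect every idle, expired entry in one pass */
        spin_lock(&nfsd4_ssc_umount.nsu_lock);
        list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list,
                                 nsui_list) {
                if (refcount_read(&ni->nsui_refcnt) == 0 &&
                    time_after(jiffies, ni->nsui_expire))
                        list_move(&ni->nsui_list, &expired);
        }
        spin_unlock(&nfsd4_ssc_umount.nsu_lock);

        /* unmount outside the spinlock */
        list_for_each_entry_safe(ni, tmp, &expired, nsui_list) {
                list_del(&ni->nsui_list);
                mntput(ni->nsui_vfsmount);
                kfree(ni);
        }

        /* always re-arm; the scan is cheap when the list is empty */
        schedule_delayed_work(&ssc_laundromat_work, 60 * HZ);
}

The tradeoff is a periodic wakeup even when the list is empty, versus
the posted patch's approach of only having work scheduled while cached
mounts exist.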

> +       }
> +       nfsd4_scc_update_umnt_timo();
> +       if (do_wakeup)
> +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +}
> +
> +static DECLARE_DELAYED_WORK(nfsd4, nfsd4_ssc_expire_umount);
> +
> +void nfsd4_ssc_init_umount_work(void)
> +{
> +       if (nfsd4_ssc_umount.nsu_inited)
> +               return;
> +       INIT_DELAYED_WORK(&nfsd4_ssc_umount.nsu_umount_work,
> +               nfsd4_ssc_expire_umount);
> +       INIT_LIST_HEAD(&nfsd4_ssc_umount.nsu_list);
> +       spin_lock_init(&nfsd4_ssc_umount.nsu_lock);
> +       init_waitqueue_head(&nfsd4_ssc_umount.nsu_waitq);
> +       nfsd4_ssc_umount.nsu_inited = true;
> +}
> +EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
> +#endif
> +
>  #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
>  #include <linux/security.h>
>
> @@ -1181,6 +1256,9 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
>         char *ipaddr, *dev_name, *raw_data;
>         int len, raw_len;
>         __be32 status = nfserr_inval;
> +       struct nfsd4_ssc_umount_item *ni = 0;
> +       struct nfsd4_ssc_umount_item *work, *tmp;
> +       DEFINE_WAIT(wait);
>
>         naddr = &nss->u.nl4_addr;
>         tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
> @@ -1229,12 +1307,68 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
>                 goto out_free_rawdata;
>         snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
>
> +       work = kzalloc(sizeof(*work), GFP_KERNEL);
> +try_again:
> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
> +               if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
> +                       continue;
> +               /* found a match */
> +               if (ni->nsui_busy) {
> +                       /*  wait - and try again */
> +                       prepare_to_wait(&nfsd4_ssc_umount.nsu_waitq, &wait,
> +                               TASK_INTERRUPTIBLE);
> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +
> +                       /* allow 20secs for mount/unmount for now - revisit */
> +                       if (signal_pending(current) ||
> +                                       (schedule_timeout(20*HZ) == 0)) {
> +                               status = nfserr_eagain;
> +                               kfree(work);
> +                               goto out_free_devname;
> +                       }
> +                       finish_wait(&nfsd4_ssc_umount.nsu_waitq, &wait);
> +                       goto try_again;
> +               }
> +               ss_mnt = ni->nsui_vfsmount;
> +               if (refcount_read(&ni->nsui_refcnt) == 0)
> +                       refcount_set(&ni->nsui_refcnt, 1);
> +               else
> +                       refcount_inc(&ni->nsui_refcnt);
> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +               kfree(work);
> +               goto out_done;
> +       }
> +       /* create new entry, set busy, insert list, clear busy after mount */
> +       if (work) {
> +               strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
> +               refcount_set(&work->nsui_refcnt, 1);
> +               work->nsui_busy = true;
> +               list_add_tail(&work->nsui_list, &nfsd4_ssc_umount.nsu_list);
> +       }
> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +
>         /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
>         ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
>         module_put(type->owner);
> -       if (IS_ERR(ss_mnt))
> +       if (IS_ERR(ss_mnt)) {
> +               if (work) {
> +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> +                       list_del(&work->nsui_list);
> +                       wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +                       kfree(work);
> +               }
>                 goto out_free_devname;
> -
> +       }
> +       if (work) {
> +               spin_lock(&nfsd4_ssc_umount.nsu_lock);
> +               work->nsui_vfsmount = ss_mnt;
> +               work->nsui_busy = false;
> +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +       }
> +out_done:
>         status = 0;
>         *mount = ss_mnt;
>
> @@ -1301,10 +1435,46 @@ static void
>  nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
>                         struct nfsd_file *dst)
>  {
> +       bool found = false;
> +       bool resched = false;
> +       long timeout;
> +       struct nfsd4_ssc_umount_item *tmp;
> +       struct nfsd4_ssc_umount_item *ni = 0;
> +
>         nfs42_ssc_close(src->nf_file);
> -       fput(src->nf_file);
>         nfsd_file_put(dst);
> -       mntput(ss_mnt);
> +       fput(src->nf_file);
> +
> +       timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list,
> +               nsui_list) {
> +               if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
> +                       list_del(&ni->nsui_list);
> +                       /*
> +                        * vfsmount can be shared by multiple exports,
> +                        * decrement refcnt and schedule delayed task
> +                        * if it drops to 0.
> +                        */
> +                       if (refcount_dec_and_test(&ni->nsui_refcnt))
> +                               resched = true;
> +                       ni->nsui_expire = jiffies + timeout;
> +                       list_add_tail(&ni->nsui_list, &nfsd4_ssc_umount.nsu_list);
> +                       found = true;
> +                       break;
> +               }
> +       }
> +       if (!found) {
> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> +               mntput(ss_mnt);
> +               return;
> +       }
> +       if (resched && !nfsd4_ssc_umount.nsu_expire) {
> +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
> +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
> +                               timeout);
> +       }
> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>  }
>
>  #else /* CONFIG_NFSD_V4_2_INTER_SSC */
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 8bdc37aa2c2e..b3bf8a5f4472 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -483,6 +483,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
>  extern int nfsd4_is_junction(struct dentry *dentry);
>  extern int register_cld_notifier(void);
>  extern void unregister_cld_notifier(void);
> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
> +extern void nfsd4_ssc_init_umount_work(void);
> +#endif
> +
>  #else /* CONFIG_NFSD_V4 */
>  static inline int nfsd4_is_junction(struct dentry *dentry)
>  {
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 6de406322106..2558db55b88b 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -322,6 +322,9 @@ static int nfsd_startup_generic(int nrservs)
>         ret = nfs4_state_start();
>         if (ret)
>                 goto out_file_cache;
> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
> +       nfsd4_ssc_init_umount_work();
> +#endif
>         return 0;
>
>  out_file_cache:
> diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
> index f5ba0fbff72f..1e07be2a89fa 100644
> --- a/include/linux/nfs_ssc.h
> +++ b/include/linux/nfs_ssc.h
> @@ -8,6 +8,7 @@
>   */
>
>  #include <linux/nfs_fs.h>
> +#include <linux/sunrpc/svc.h>
>
>  extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
>
> @@ -52,6 +53,25 @@ static inline void nfs42_ssc_close(struct file *filep)
>         if (nfs_ssc_client_tbl.ssc_nfs4_ops)
>                 (*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep);
>  }
> +
> +struct nfsd4_ssc_umount_item {
> +       struct list_head nsui_list;
> +       bool nsui_busy;
> +       refcount_t nsui_refcnt;
> +       unsigned long nsui_expire;
> +       struct vfsmount *nsui_vfsmount;
> +       char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
> +};
> +
> +struct nfsd4_ssc_umount {
> +       struct list_head nsu_list;
> +       struct delayed_work nsu_umount_work;
> +       spinlock_t nsu_lock;
> +       unsigned long nsu_expire;
> +       wait_queue_head_t nsu_waitq;
> +       bool nsu_inited;
> +};
> +
>  #endif
>
>  /*
> --
> 2.9.5
>


* Re: [PATCH v4 1/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-05-14 18:12   ` Olga Kornievskaia
@ 2021-05-14 18:42     ` J. Bruce Fields
  2021-05-14 20:19       ` dai.ngo
  2021-05-14 20:08     ` dai.ngo
  1 sibling, 1 reply; 9+ messages in thread
From: J. Bruce Fields @ 2021-05-14 18:42 UTC (permalink / raw)
  To: Olga Kornievskaia; +Cc: Dai Ngo, Trond Myklebust, Chuck Lever, linux-nfs

On Fri, May 14, 2021 at 02:12:38PM -0400, Olga Kornievskaia wrote:
> On Fri, Apr 23, 2021 at 4:59 PM Dai Ngo <dai.ngo@oracle.com> wrote:
> >
> > Currently the source's export is mounted and unmounted on every
> > inter-server copy operation. This patch is an enhancement to delay
> > the unmount of the source export for a certain period of time to
> > eliminate the mount and unmount overhead on subsequent copy operations.
> >
> > After a copy operation completes, a delayed task is scheduled to
> > unmount the export after a configurable idle time. Each time the
> > export is being used again, its expire time is extended to allow
> > the export to remain mounted.
> >
> > The unmount task and the mount operation of the copy request are
> > synced to make sure the export is not unmounted while it's being
> > used.
> >
> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> > ---
> >  fs/nfsd/nfs4proc.c      | 178 ++++++++++++++++++++++++++++++++++++++++++++++--
> >  fs/nfsd/nfsd.h          |   4 ++
> >  fs/nfsd/nfssvc.c        |   3 +
> >  include/linux/nfs_ssc.h |  20 ++++++
> >  4 files changed, 201 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > index dd9f38d072dd..a4b110cbcab5 100644
> > --- a/fs/nfsd/nfs4proc.c
> > +++ b/fs/nfsd/nfs4proc.c
> > @@ -55,6 +55,81 @@ module_param(inter_copy_offload_enable, bool, 0644);
> >  MODULE_PARM_DESC(inter_copy_offload_enable,
> >                  "Enable inter server to server copy offload. Default: false");
> >
> > +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
> > +static int nfsd4_ssc_umount_timeout = 900000;          /* default to 15 mins */
> > +module_param(nfsd4_ssc_umount_timeout, int, 0644);
> > +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
> > +               "idle msecs before unmount export from source server");
> > +
> > +static void nfsd4_ssc_expire_umount(struct work_struct *work);
> > +static struct nfsd4_ssc_umount nfsd4_ssc_umount;
> > +
> > +/* nfsd4_ssc_umount.nsu_lock must be held */
> > +static void nfsd4_scc_update_umnt_timo(void)
> > +{
> > +       struct nfsd4_ssc_umount_item *ni = 0;
> > +
> > +       cancel_delayed_work(&nfsd4_ssc_umount.nsu_umount_work);
> > +       if (!list_empty(&nfsd4_ssc_umount.nsu_list)) {
> > +               ni = list_first_entry(&nfsd4_ssc_umount.nsu_list,
> > +                       struct nfsd4_ssc_umount_item, nsui_list);
> > +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
> > +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
> > +                       ni->nsui_expire - jiffies);
> > +       } else
> > +               nfsd4_ssc_umount.nsu_expire = 0;
> > +}
> > +
> > +static void nfsd4_ssc_expire_umount(struct work_struct *work)
> > +{
> > +       bool do_wakeup = false;
> > +       struct nfsd4_ssc_umount_item *ni = 0;
> > +       struct nfsd4_ssc_umount_item *tmp;
> > +
> > +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> > +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
> > +               if (time_after(jiffies, ni->nsui_expire)) {
> > +                       if (refcount_read(&ni->nsui_refcnt) > 0)
> > +                               continue;
> > +
> > +                       /* mark being unmount */
> > +                       ni->nsui_busy = true;
> > +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +                       mntput(ni->nsui_vfsmount);
> > +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> > +
> > +                       /* waiters need to start from begin of list */
> > +                       list_del(&ni->nsui_list);
> > +                       kfree(ni);
> > +
> > +                       /* wakeup ssc_connect waiters */
> > +                       do_wakeup = true;
> > +                       continue;
> > +               }
> > +               break;
> 
> Wouldn't you be exiting your iteration loop as soon as you find a
> delayed item that hasn't expired? What if other items on the list have
> expired?
> 
> It looks like the general design is that work for doing umount is
> triggered by doing some copy, right? And then it just keeps
> rescheduling itself? Shouldn't we just be using the laundrymat instead
> that wakes up and goes thru the list mounts that are put to be
> unmounted and does so? Bruce previously had suggested using laundrymat
> for cleaning up copynotify states on the source server. But if the
> existing approach works for Bruce, I'm ok with it too.

I guess I'd prefer to use the laundromat, unless there's some reason to
do this one differently.

Also: on a quick skim, I don't see where this is shut down?  (If the
server is shut down, or the module unloaded?)
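
For illustration, a teardown hook along these lines might be what is
missing; nfsd4_ssc_shutdown_umount() is a hypothetical name, and where
it would be called from (nfsd shutdown and/or module exit) is left
open:

void nfsd4_ssc_shutdown_umount(void)
{
        struct nfsd4_ssc_umount_item *ni, *tmp;
        LIST_HEAD(remaining);

        /* stop the delayed work and wait for a running instance */
        cancel_delayed_work_sync(&nfsd4_ssc_umount.nsu_umount_work);

        /* detach the whole list under the lock ... */
        spin_lock(&nfsd4_ssc_umount.nsu_lock);
        list_splice_init(&nfsd4_ssc_umount.nsu_list, &remaining);
        nfsd4_ssc_umount.nsu_expire = 0;
        spin_unlock(&nfsd4_ssc_umount.nsu_lock);

        /* ... then drop the cached mounts outside it */
        list_for_each_entry_safe(ni, tmp, &remaining, nsui_list) {
                list_del(&ni->nsui_list);
                mntput(ni->nsui_vfsmount);
                kfree(ni);
        }
}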

--b.

> 
> > +       }
> > +       nfsd4_scc_update_umnt_timo();
> > +       if (do_wakeup)
> > +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
> > +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +}
> > +
> > +static DECLARE_DELAYED_WORK(nfsd4, nfsd4_ssc_expire_umount);
> > +
> > +void nfsd4_ssc_init_umount_work(void)
> > +{
> > +       if (nfsd4_ssc_umount.nsu_inited)
> > +               return;
> > +       INIT_DELAYED_WORK(&nfsd4_ssc_umount.nsu_umount_work,
> > +               nfsd4_ssc_expire_umount);
> > +       INIT_LIST_HEAD(&nfsd4_ssc_umount.nsu_list);
> > +       spin_lock_init(&nfsd4_ssc_umount.nsu_lock);
> > +       init_waitqueue_head(&nfsd4_ssc_umount.nsu_waitq);
> > +       nfsd4_ssc_umount.nsu_inited = true;
> > +}
> > +EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
> > +#endif
> > +
> >  #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
> >  #include <linux/security.h>
> >
> > @@ -1181,6 +1256,9 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
> >         char *ipaddr, *dev_name, *raw_data;
> >         int len, raw_len;
> >         __be32 status = nfserr_inval;
> > +       struct nfsd4_ssc_umount_item *ni = 0;
> > +       struct nfsd4_ssc_umount_item *work, *tmp;
> > +       DEFINE_WAIT(wait);
> >
> >         naddr = &nss->u.nl4_addr;
> >         tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
> > @@ -1229,12 +1307,68 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
> >                 goto out_free_rawdata;
> >         snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
> >
> > +       work = kzalloc(sizeof(*work), GFP_KERNEL);
> > +try_again:
> > +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> > +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
> > +               if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
> > +                       continue;
> > +               /* found a match */
> > +               if (ni->nsui_busy) {
> > +                       /*  wait - and try again */
> > +                       prepare_to_wait(&nfsd4_ssc_umount.nsu_waitq, &wait,
> > +                               TASK_INTERRUPTIBLE);
> > +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +
> > +                       /* allow 20secs for mount/unmount for now - revisit */
> > +                       if (signal_pending(current) ||
> > +                                       (schedule_timeout(20*HZ) == 0)) {
> > +                               status = nfserr_eagain;
> > +                               kfree(work);
> > +                               goto out_free_devname;
> > +                       }
> > +                       finish_wait(&nfsd4_ssc_umount.nsu_waitq, &wait);
> > +                       goto try_again;
> > +               }
> > +               ss_mnt = ni->nsui_vfsmount;
> > +               if (refcount_read(&ni->nsui_refcnt) == 0)
> > +                       refcount_set(&ni->nsui_refcnt, 1);
> > +               else
> > +                       refcount_inc(&ni->nsui_refcnt);
> > +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +               kfree(work);
> > +               goto out_done;
> > +       }
> > +       /* create new entry, set busy, insert list, clear busy after mount */
> > +       if (work) {
> > +               strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
> > +               refcount_set(&work->nsui_refcnt, 1);
> > +               work->nsui_busy = true;
> > +               list_add_tail(&work->nsui_list, &nfsd4_ssc_umount.nsu_list);
> > +       }
> > +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +
> >         /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
> >         ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
> >         module_put(type->owner);
> > -       if (IS_ERR(ss_mnt))
> > +       if (IS_ERR(ss_mnt)) {
> > +               if (work) {
> > +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> > +                       list_del(&work->nsui_list);
> > +                       wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
> > +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +                       kfree(work);
> > +               }
> >                 goto out_free_devname;
> > -
> > +       }
> > +       if (work) {
> > +               spin_lock(&nfsd4_ssc_umount.nsu_lock);
> > +               work->nsui_vfsmount = ss_mnt;
> > +               work->nsui_busy = false;
> > +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
> > +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +       }
> > +out_done:
> >         status = 0;
> >         *mount = ss_mnt;
> >
> > @@ -1301,10 +1435,46 @@ static void
> >  nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
> >                         struct nfsd_file *dst)
> >  {
> > +       bool found = false;
> > +       bool resched = false;
> > +       long timeout;
> > +       struct nfsd4_ssc_umount_item *tmp;
> > +       struct nfsd4_ssc_umount_item *ni = 0;
> > +
> >         nfs42_ssc_close(src->nf_file);
> > -       fput(src->nf_file);
> >         nfsd_file_put(dst);
> > -       mntput(ss_mnt);
> > +       fput(src->nf_file);
> > +
> > +       timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
> > +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
> > +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list,
> > +               nsui_list) {
> > +               if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
> > +                       list_del(&ni->nsui_list);
> > +                       /*
> > +                        * vfsmount can be shared by multiple exports,
> > +                        * decrement refcnt and schedule delayed task
> > +                        * if it drops to 0.
> > +                        */
> > +                       if (refcount_dec_and_test(&ni->nsui_refcnt))
> > +                               resched = true;
> > +                       ni->nsui_expire = jiffies + timeout;
> > +                       list_add_tail(&ni->nsui_list, &nfsd4_ssc_umount.nsu_list);
> > +                       found = true;
> > +                       break;
> > +               }
> > +       }
> > +       if (!found) {
> > +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> > +               mntput(ss_mnt);
> > +               return;
> > +       }
> > +       if (resched && !nfsd4_ssc_umount.nsu_expire) {
> > +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
> > +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
> > +                               timeout);
> > +       }
> > +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
> >  }
> >
> >  #else /* CONFIG_NFSD_V4_2_INTER_SSC */
> > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> > index 8bdc37aa2c2e..b3bf8a5f4472 100644
> > --- a/fs/nfsd/nfsd.h
> > +++ b/fs/nfsd/nfsd.h
> > @@ -483,6 +483,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
> >  extern int nfsd4_is_junction(struct dentry *dentry);
> >  extern int register_cld_notifier(void);
> >  extern void unregister_cld_notifier(void);
> > +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
> > +extern void nfsd4_ssc_init_umount_work(void);
> > +#endif
> > +
> >  #else /* CONFIG_NFSD_V4 */
> >  static inline int nfsd4_is_junction(struct dentry *dentry)
> >  {
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index 6de406322106..2558db55b88b 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -322,6 +322,9 @@ static int nfsd_startup_generic(int nrservs)
> >         ret = nfs4_state_start();
> >         if (ret)
> >                 goto out_file_cache;
> > +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
> > +       nfsd4_ssc_init_umount_work();
> > +#endif
> >         return 0;
> >
> >  out_file_cache:
> > diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
> > index f5ba0fbff72f..1e07be2a89fa 100644
> > --- a/include/linux/nfs_ssc.h
> > +++ b/include/linux/nfs_ssc.h
> > @@ -8,6 +8,7 @@
> >   */
> >
> >  #include <linux/nfs_fs.h>
> > +#include <linux/sunrpc/svc.h>
> >
> >  extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
> >
> > @@ -52,6 +53,25 @@ static inline void nfs42_ssc_close(struct file *filep)
> >         if (nfs_ssc_client_tbl.ssc_nfs4_ops)
> >                 (*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep);
> >  }
> > +
> > +struct nfsd4_ssc_umount_item {
> > +       struct list_head nsui_list;
> > +       bool nsui_busy;
> > +       refcount_t nsui_refcnt;
> > +       unsigned long nsui_expire;
> > +       struct vfsmount *nsui_vfsmount;
> > +       char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
> > +};
> > +
> > +struct nfsd4_ssc_umount {
> > +       struct list_head nsu_list;
> > +       struct delayed_work nsu_umount_work;
> > +       spinlock_t nsu_lock;
> > +       unsigned long nsu_expire;
> > +       wait_queue_head_t nsu_waitq;
> > +       bool nsu_inited;
> > +};
> > +
> >  #endif
> >
> >  /*
> > --
> > 2.9.5
> >


* Re: [PATCH v4 1/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-05-14 18:12   ` Olga Kornievskaia
  2021-05-14 18:42     ` J. Bruce Fields
@ 2021-05-14 20:08     ` dai.ngo
  1 sibling, 0 replies; 9+ messages in thread
From: dai.ngo @ 2021-05-14 20:08 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Trond Myklebust, J. Bruce Fields, Chuck Lever, linux-nfs


On 5/14/21 11:12 AM, Olga Kornievskaia wrote:
> On Fri, Apr 23, 2021 at 4:59 PM Dai Ngo <dai.ngo@oracle.com> wrote:
>> Currently the source's export is mounted and unmounted on every
>> inter-server copy operation. This patch is an enhancement to delay
>> the unmount of the source export for a certain period of time to
>> eliminate the mount and unmount overhead on subsequent copy operations.
>>
>> After a copy operation completes, a delayed task is scheduled to
>> unmount the export after a configurable idle time. Each time the
>> export is being used again, its expire time is extended to allow
>> the export to remain mounted.
>>
>> The unmount task and the mount operation of the copy request are
>> synced to make sure the export is not unmounted while it's being
>> used.
>>
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>>   fs/nfsd/nfs4proc.c      | 178 ++++++++++++++++++++++++++++++++++++++++++++++--
>>   fs/nfsd/nfsd.h          |   4 ++
>>   fs/nfsd/nfssvc.c        |   3 +
>>   include/linux/nfs_ssc.h |  20 ++++++
>>   4 files changed, 201 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index dd9f38d072dd..a4b110cbcab5 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -55,6 +55,81 @@ module_param(inter_copy_offload_enable, bool, 0644);
>>   MODULE_PARM_DESC(inter_copy_offload_enable,
>>                   "Enable inter server to server copy offload. Default: false");
>>
>> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
>> +static int nfsd4_ssc_umount_timeout = 900000;          /* default to 15 mins */
>> +module_param(nfsd4_ssc_umount_timeout, int, 0644);
>> +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
>> +               "idle msecs before unmount export from source server");
>> +
>> +static void nfsd4_ssc_expire_umount(struct work_struct *work);
>> +static struct nfsd4_ssc_umount nfsd4_ssc_umount;
>> +
>> +/* nfsd4_ssc_umount.nsu_lock must be held */
>> +static void nfsd4_scc_update_umnt_timo(void)
>> +{
>> +       struct nfsd4_ssc_umount_item *ni = 0;
>> +
>> +       cancel_delayed_work(&nfsd4_ssc_umount.nsu_umount_work);
>> +       if (!list_empty(&nfsd4_ssc_umount.nsu_list)) {
>> +               ni = list_first_entry(&nfsd4_ssc_umount.nsu_list,
>> +                       struct nfsd4_ssc_umount_item, nsui_list);
>> +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
>> +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
>> +                       ni->nsui_expire - jiffies);
>> +       } else
>> +               nfsd4_ssc_umount.nsu_expire = 0;
>> +}
>> +
>> +static void nfsd4_ssc_expire_umount(struct work_struct *work)
>> +{
>> +       bool do_wakeup = false;
>> +       struct nfsd4_ssc_umount_item *ni = 0;
>> +       struct nfsd4_ssc_umount_item *tmp;
>> +
>> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
>> +               if (time_after(jiffies, ni->nsui_expire)) {
>> +                       if (refcount_read(&ni->nsui_refcnt) > 0)
>> +                               continue;
>> +
>> +                       /* mark being unmount */
>> +                       ni->nsui_busy = true;
>> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +                       mntput(ni->nsui_vfsmount);
>> +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>> +
>> +                       /* waiters need to start from begin of list */
>> +                       list_del(&ni->nsui_list);
>> +                       kfree(ni);
>> +
>> +                       /* wakeup ssc_connect waiters */
>> +                       do_wakeup = true;
>> +                       continue;
>> +               }
>> +               break;
> Wouldn't you be exiting your iteration loop as soon as you find a
> delayed item that hasn't expired?

Yes, the code exits the loop as soon as it finds an item that hasn't expired.

>   What if other items on the list have
> expired?

All expired items are processed in the loop. The loop terminates when
an unexpired item is encountered. Work items are inserted at the tail
of the list so I don't think it will miss any item here.
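
The early break relies on idle entries staying in expiry order: an
in-use entry may carry a stale nsui_expire, but it is skipped by the
refcount check. Purely as an illustration, a hypothetical debug helper
to assert that ordering, not part of the patch, could look like:

/* Caller must hold nfsd4_ssc_umount.nsu_lock. */
static void nfsd4_ssc_check_expire_order(void)
{
        struct nfsd4_ssc_umount_item *ni;
        unsigned long prev = 0;

        list_for_each_entry(ni, &nfsd4_ssc_umount.nsu_list, nsui_list) {
                /* in-use entries keep a stale expiry; ignore them */
                if (refcount_read(&ni->nsui_refcnt) > 0)
                        continue;
                WARN_ON_ONCE(prev && time_after(prev, ni->nsui_expire));
                prev = ni->nsui_expire;
        }
}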

>
> It looks like the general design is that work for doing umount is
> triggered by doing some copy, right?

After a copy is done, its work item is removed from the work list, its
expiry time is set, and it is re-inserted at the tail of the list; the
delayed task is scheduled if one is not already scheduled.

> And then it just keeps
> rescheduling itself?

When the delayed task runs, it processes all expired items and only
reschedules itself if there are still items in the list. The delayed
task only exists while there is work to do, so no overhead or
resources are consumed when SSC is not used, or once all inter-server
copies have been finished for a while.

> Shouldn't we just be using the laundrymat instead
> that wakes up and goes thru the list mounts that are put to be
> unmounted and does so? Bruce previously had suggested using laundrymat
> for cleaning up copynotify states on the source server. But if the
> existing approach works for Bruce, I'm ok with it too.

I can look into the laundromat if you think there are advantages
to that approach.

-Dai

>
>> +       }
>> +       nfsd4_scc_update_umnt_timo();
>> +       if (do_wakeup)
>> +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
>> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +}
>> +
>> +static DECLARE_DELAYED_WORK(nfsd4, nfsd4_ssc_expire_umount);
>> +
>> +void nfsd4_ssc_init_umount_work(void)
>> +{
>> +       if (nfsd4_ssc_umount.nsu_inited)
>> +               return;
>> +       INIT_DELAYED_WORK(&nfsd4_ssc_umount.nsu_umount_work,
>> +               nfsd4_ssc_expire_umount);
>> +       INIT_LIST_HEAD(&nfsd4_ssc_umount.nsu_list);
>> +       spin_lock_init(&nfsd4_ssc_umount.nsu_lock);
>> +       init_waitqueue_head(&nfsd4_ssc_umount.nsu_waitq);
>> +       nfsd4_ssc_umount.nsu_inited = true;
>> +}
>> +EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
>> +#endif
>> +
>>   #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
>>   #include <linux/security.h>
>>
>> @@ -1181,6 +1256,9 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
>>          char *ipaddr, *dev_name, *raw_data;
>>          int len, raw_len;
>>          __be32 status = nfserr_inval;
>> +       struct nfsd4_ssc_umount_item *ni = 0;
>> +       struct nfsd4_ssc_umount_item *work, *tmp;
>> +       DEFINE_WAIT(wait);
>>
>>          naddr = &nss->u.nl4_addr;
>>          tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
>> @@ -1229,12 +1307,68 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
>>                  goto out_free_rawdata;
>>          snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
>>
>> +       work = kzalloc(sizeof(*work), GFP_KERNEL);
>> +try_again:
>> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
>> +               if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
>> +                       continue;
>> +               /* found a match */
>> +               if (ni->nsui_busy) {
>> +                       /*  wait - and try again */
>> +                       prepare_to_wait(&nfsd4_ssc_umount.nsu_waitq, &wait,
>> +                               TASK_INTERRUPTIBLE);
>> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +
>> +                       /* allow 20secs for mount/unmount for now - revisit */
>> +                       if (signal_pending(current) ||
>> +                                       (schedule_timeout(20*HZ) == 0)) {
>> +                               status = nfserr_eagain;
>> +                               kfree(work);
>> +                               goto out_free_devname;
>> +                       }
>> +                       finish_wait(&nfsd4_ssc_umount.nsu_waitq, &wait);
>> +                       goto try_again;
>> +               }
>> +               ss_mnt = ni->nsui_vfsmount;
>> +               if (refcount_read(&ni->nsui_refcnt) == 0)
>> +                       refcount_set(&ni->nsui_refcnt, 1);
>> +               else
>> +                       refcount_inc(&ni->nsui_refcnt);
>> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +               kfree(work);
>> +               goto out_done;
>> +       }
>> +       /* create new entry, set busy, insert list, clear busy after mount */
>> +       if (work) {
>> +               strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
>> +               refcount_set(&work->nsui_refcnt, 1);
>> +               work->nsui_busy = true;
>> +               list_add_tail(&work->nsui_list, &nfsd4_ssc_umount.nsu_list);
>> +       }
>> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +
>>          /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
>>          ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
>>          module_put(type->owner);
>> -       if (IS_ERR(ss_mnt))
>> +       if (IS_ERR(ss_mnt)) {
>> +               if (work) {
>> +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>> +                       list_del(&work->nsui_list);
>> +                       wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
>> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +                       kfree(work);
>> +               }
>>                  goto out_free_devname;
>> -
>> +       }
>> +       if (work) {
>> +               spin_lock(&nfsd4_ssc_umount.nsu_lock);
>> +               work->nsui_vfsmount = ss_mnt;
>> +               work->nsui_busy = false;
>> +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
>> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +       }
>> +out_done:
>>          status = 0;
>>          *mount = ss_mnt;
>>
>> @@ -1301,10 +1435,46 @@ static void
>>   nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
>>                          struct nfsd_file *dst)
>>   {
>> +       bool found = false;
>> +       bool resched = false;
>> +       long timeout;
>> +       struct nfsd4_ssc_umount_item *tmp;
>> +       struct nfsd4_ssc_umount_item *ni = 0;
>> +
>>          nfs42_ssc_close(src->nf_file);
>> -       fput(src->nf_file);
>>          nfsd_file_put(dst);
>> -       mntput(ss_mnt);
>> +       fput(src->nf_file);
>> +
>> +       timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
>> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list,
>> +               nsui_list) {
>> +               if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
>> +                       list_del(&ni->nsui_list);
>> +                       /*
>> +                        * vfsmount can be shared by multiple exports,
>> +                        * decrement refcnt and schedule delayed task
>> +                        * if it drops to 0.
>> +                        */
>> +                       if (refcount_dec_and_test(&ni->nsui_refcnt))
>> +                               resched = true;
>> +                       ni->nsui_expire = jiffies + timeout;
>> +                       list_add_tail(&ni->nsui_list, &nfsd4_ssc_umount.nsu_list);
>> +                       found = true;
>> +                       break;
>> +               }
>> +       }
>> +       if (!found) {
>> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>> +               mntput(ss_mnt);
>> +               return;
>> +       }
>> +       if (resched && !nfsd4_ssc_umount.nsu_expire) {
>> +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
>> +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
>> +                               timeout);
>> +       }
>> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>   }
>>
>>   #else /* CONFIG_NFSD_V4_2_INTER_SSC */
>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>> index 8bdc37aa2c2e..b3bf8a5f4472 100644
>> --- a/fs/nfsd/nfsd.h
>> +++ b/fs/nfsd/nfsd.h
>> @@ -483,6 +483,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
>>   extern int nfsd4_is_junction(struct dentry *dentry);
>>   extern int register_cld_notifier(void);
>>   extern void unregister_cld_notifier(void);
>> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
>> +extern void nfsd4_ssc_init_umount_work(void);
>> +#endif
>> +
>>   #else /* CONFIG_NFSD_V4 */
>>   static inline int nfsd4_is_junction(struct dentry *dentry)
>>   {
>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>> index 6de406322106..2558db55b88b 100644
>> --- a/fs/nfsd/nfssvc.c
>> +++ b/fs/nfsd/nfssvc.c
>> @@ -322,6 +322,9 @@ static int nfsd_startup_generic(int nrservs)
>>          ret = nfs4_state_start();
>>          if (ret)
>>                  goto out_file_cache;
>> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
>> +       nfsd4_ssc_init_umount_work();
>> +#endif
>>          return 0;
>>
>>   out_file_cache:
>> diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
>> index f5ba0fbff72f..1e07be2a89fa 100644
>> --- a/include/linux/nfs_ssc.h
>> +++ b/include/linux/nfs_ssc.h
>> @@ -8,6 +8,7 @@
>>    */
>>
>>   #include <linux/nfs_fs.h>
>> +#include <linux/sunrpc/svc.h>
>>
>>   extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
>>
>> @@ -52,6 +53,25 @@ static inline void nfs42_ssc_close(struct file *filep)
>>          if (nfs_ssc_client_tbl.ssc_nfs4_ops)
>>                  (*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep);
>>   }
>> +
>> +struct nfsd4_ssc_umount_item {
>> +       struct list_head nsui_list;
>> +       bool nsui_busy;
>> +       refcount_t nsui_refcnt;
>> +       unsigned long nsui_expire;
>> +       struct vfsmount *nsui_vfsmount;
>> +       char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
>> +};
>> +
>> +struct nfsd4_ssc_umount {
>> +       struct list_head nsu_list;
>> +       struct delayed_work nsu_umount_work;
>> +       spinlock_t nsu_lock;
>> +       unsigned long nsu_expire;
>> +       wait_queue_head_t nsu_waitq;
>> +       bool nsu_inited;
>> +};
>> +
>>   #endif
>>
>>   /*
>> --
>> 2.9.5
>>


* Re: [PATCH v4 1/2] NFSD: delay unmount source's export after inter-server copy completed.
  2021-05-14 18:42     ` J. Bruce Fields
@ 2021-05-14 20:19       ` dai.ngo
  0 siblings, 0 replies; 9+ messages in thread
From: dai.ngo @ 2021-05-14 20:19 UTC (permalink / raw)
  To: J. Bruce Fields, Olga Kornievskaia
  Cc: Trond Myklebust, Chuck Lever, linux-nfs


On 5/14/21 11:42 AM, J. Bruce Fields wrote:
> On Fri, May 14, 2021 at 02:12:38PM -0400, Olga Kornievskaia wrote:
>> On Fri, Apr 23, 2021 at 4:59 PM Dai Ngo <dai.ngo@oracle.com> wrote:
>>> Currently the source's export is mounted and unmounted on every
>>> inter-server copy operation. This patch is an enhancement to delay
>>> the unmount of the source export for a certain period of time to
>>> eliminate the mount and unmount overhead on subsequent copy operations.
>>>
>>> After a copy operation completes, a delayed task is scheduled to
>>> unmount the export after a configurable idle time. Each time the
>>> export is being used again, its expire time is extended to allow
>>> the export to remain mounted.
>>>
>>> The unmount task and the mount operation of the copy request are
>>> synced to make sure the export is not unmounted while it's being
>>> used.
>>>
>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>> ---
>>>   fs/nfsd/nfs4proc.c      | 178 ++++++++++++++++++++++++++++++++++++++++++++++--
>>>   fs/nfsd/nfsd.h          |   4 ++
>>>   fs/nfsd/nfssvc.c        |   3 +
>>>   include/linux/nfs_ssc.h |  20 ++++++
>>>   4 files changed, 201 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>>> index dd9f38d072dd..a4b110cbcab5 100644
>>> --- a/fs/nfsd/nfs4proc.c
>>> +++ b/fs/nfsd/nfs4proc.c
>>> @@ -55,6 +55,81 @@ module_param(inter_copy_offload_enable, bool, 0644);
>>>   MODULE_PARM_DESC(inter_copy_offload_enable,
>>>                   "Enable inter server to server copy offload. Default: false");
>>>
>>> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
>>> +static int nfsd4_ssc_umount_timeout = 900000;          /* default to 15 mins */
>>> +module_param(nfsd4_ssc_umount_timeout, int, 0644);
>>> +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
>>> +               "idle msecs before unmount export from source server");
>>> +
>>> +static void nfsd4_ssc_expire_umount(struct work_struct *work);
>>> +static struct nfsd4_ssc_umount nfsd4_ssc_umount;
>>> +
>>> +/* nfsd4_ssc_umount.nsu_lock must be held */
>>> +static void nfsd4_scc_update_umnt_timo(void)
>>> +{
>>> +       struct nfsd4_ssc_umount_item *ni = 0;
>>> +
>>> +       cancel_delayed_work(&nfsd4_ssc_umount.nsu_umount_work);
>>> +       if (!list_empty(&nfsd4_ssc_umount.nsu_list)) {
>>> +               ni = list_first_entry(&nfsd4_ssc_umount.nsu_list,
>>> +                       struct nfsd4_ssc_umount_item, nsui_list);
>>> +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
>>> +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
>>> +                       ni->nsui_expire - jiffies);
>>> +       } else
>>> +               nfsd4_ssc_umount.nsu_expire = 0;
>>> +}
>>> +
>>> +static void nfsd4_ssc_expire_umount(struct work_struct *work)
>>> +{
>>> +       bool do_wakeup = false;
>>> +       struct nfsd4_ssc_umount_item *ni = 0;
>>> +       struct nfsd4_ssc_umount_item *tmp;
>>> +
>>> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>>> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
>>> +               if (time_after(jiffies, ni->nsui_expire)) {
>>> +                       if (refcount_read(&ni->nsui_refcnt) > 0)
>>> +                               continue;
>>> +
>>> +                       /* mark being unmount */
>>> +                       ni->nsui_busy = true;
>>> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +                       mntput(ni->nsui_vfsmount);
>>> +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>>> +
>>> +                       /* waiters need to start from begin of list */
>>> +                       list_del(&ni->nsui_list);
>>> +                       kfree(ni);
>>> +
>>> +                       /* wakeup ssc_connect waiters */
>>> +                       do_wakeup = true;
>>> +                       continue;
>>> +               }
>>> +               break;
>> Wouldn't you be exiting your iteration loop as soon as you find a
>> delayed item that hasn't expired? What if other items on the list have
>> expired?
>>
>> It looks like the general design is that the unmount work is
>> triggered by a copy, right? And then it just keeps rescheduling
>> itself? Shouldn't we just be using the laundromat instead, which
>> wakes up, goes through the list of mounts that are due to be
>> unmounted, and unmounts them? Bruce previously suggested using the
>> laundromat for cleaning up copynotify states on the source server.
>> But if the existing approach works for Bruce, I'm ok with it too.
> I guess I'd prefer to use the laundromat, unless there's some reason to
> do this one differently.

I will look into the laundromat approach.  I was trying to avoid
destabilizing existing code by using a separate mechanism for SSC.
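
For reference, here is a rough sketch of what a laundromat-style sweep could
look like, reusing the nfsd4_ssc_umount structures from the patch above.  The
helper name and the way it would be hooked into the existing nfsd laundromat
are hypothetical; it also scans the whole list instead of stopping at the
first unexpired entry:

/*
 * Sketch only: walk the whole list, unmount every entry that has been
 * idle past its expiry, and return the delay in jiffies until the next
 * expiry (0 if nothing is pending) so the caller can reschedule.
 */
static unsigned long nfsd4_ssc_sweep_expired(void)
{
	struct nfsd4_ssc_umount_item *ni, *tmp;
	unsigned long next = 0;

	spin_lock(&nfsd4_ssc_umount.nsu_lock);
restart:
	list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
		if (refcount_read(&ni->nsui_refcnt) > 0 || ni->nsui_busy)
			continue;	/* still in use or being set up */
		if (!time_after(jiffies, ni->nsui_expire)) {
			/* not expired yet: remember the earliest expiry */
			if (!next || time_before(ni->nsui_expire, next))
				next = ni->nsui_expire;
			continue;	/* keep scanning, later entries may have expired */
		}
		/* expired and unreferenced: drop it */
		list_del(&ni->nsui_list);
		spin_unlock(&nfsd4_ssc_umount.nsu_lock);
		mntput(ni->nsui_vfsmount);	/* may sleep, so no spinlock held */
		kfree(ni);
		spin_lock(&nfsd4_ssc_umount.nsu_lock);
		goto restart;	/* the list may have changed while unlocked */
	}
	spin_unlock(&nfsd4_ssc_umount.nsu_lock);

	if (!next)
		return 0;
	return time_after(next, jiffies) ? next - jiffies : 1;
}

The main difference from nfsd4_ssc_expire_umount() in the patch is that the
scan never breaks out early, and the rescheduling decision is left to the
caller based on the returned delay.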

>
> Also: on a quick skim, I don't see where this is shut down?  (If the
> server is shut down, or the module unloaded?)

Ah, thanks! I completely missed this part. I will add it in the v5 patch.
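
A minimal sketch of what such a shutdown hook could look like, assuming it
runs after nfsd threads have stopped so no copy still holds a reference to a
cached mount (the function name and its call site are hypothetical):

void nfsd4_ssc_shutdown_umount_work(void)
{
	struct nfsd4_ssc_umount_item *ni, *tmp;
	LIST_HEAD(reaplist);

	if (!nfsd4_ssc_umount.nsu_inited)
		return;

	/* stop the delayed task before touching the list */
	cancel_delayed_work_sync(&nfsd4_ssc_umount.nsu_umount_work);

	spin_lock(&nfsd4_ssc_umount.nsu_lock);
	list_splice_init(&nfsd4_ssc_umount.nsu_list, &reaplist);
	nfsd4_ssc_umount.nsu_expire = 0;
	spin_unlock(&nfsd4_ssc_umount.nsu_lock);

	/* drop the cached mounts outside the spinlock; mntput() may sleep */
	list_for_each_entry_safe(ni, tmp, &reaplist, nsui_list) {
		list_del(&ni->nsui_list);
		mntput(ni->nsui_vfsmount);
		kfree(ni);
	}
}

Splicing onto a private list first keeps the sleeping mntput() calls outside
the spinlock, mirroring how the expire path in the patch drops the lock before
unmounting.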

-Dai

>
> --b.
>
>>> +       }
>>> +       nfsd4_scc_update_umnt_timo();
>>> +       if (do_wakeup)
>>> +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
>>> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +}
>>> +
>>> +static DECLARE_DELAYED_WORK(nfsd4, nfsd4_ssc_expire_umount);
>>> +
>>> +void nfsd4_ssc_init_umount_work(void)
>>> +{
>>> +       if (nfsd4_ssc_umount.nsu_inited)
>>> +               return;
>>> +       INIT_DELAYED_WORK(&nfsd4_ssc_umount.nsu_umount_work,
>>> +               nfsd4_ssc_expire_umount);
>>> +       INIT_LIST_HEAD(&nfsd4_ssc_umount.nsu_list);
>>> +       spin_lock_init(&nfsd4_ssc_umount.nsu_lock);
>>> +       init_waitqueue_head(&nfsd4_ssc_umount.nsu_waitq);
>>> +       nfsd4_ssc_umount.nsu_inited = true;
>>> +}
>>> +EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
>>> +#endif
>>> +
>>>   #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
>>>   #include <linux/security.h>
>>>
>>> @@ -1181,6 +1256,9 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
>>>          char *ipaddr, *dev_name, *raw_data;
>>>          int len, raw_len;
>>>          __be32 status = nfserr_inval;
>>> +       struct nfsd4_ssc_umount_item *ni = 0;
>>> +       struct nfsd4_ssc_umount_item *work, *tmp;
>>> +       DEFINE_WAIT(wait);
>>>
>>>          naddr = &nss->u.nl4_addr;
>>>          tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
>>> @@ -1229,12 +1307,68 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
>>>                  goto out_free_rawdata;
>>>          snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
>>>
>>> +       work = kzalloc(sizeof(*work), GFP_KERNEL);
>>> +try_again:
>>> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>>> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list, nsui_list) {
>>> +               if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
>>> +                       continue;
>>> +               /* found a match */
>>> +               if (ni->nsui_busy) {
>>> +                       /*  wait - and try again */
>>> +                       prepare_to_wait(&nfsd4_ssc_umount.nsu_waitq, &wait,
>>> +                               TASK_INTERRUPTIBLE);
>>> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +
>>> +                       /* allow 20secs for mount/unmount for now - revisit */
>>> +                       if (signal_pending(current) ||
>>> +                                       (schedule_timeout(20*HZ) == 0)) {
>>> +                               status = nfserr_eagain;
>>> +                               kfree(work);
>>> +                               goto out_free_devname;
>>> +                       }
>>> +                       finish_wait(&nfsd4_ssc_umount.nsu_waitq, &wait);
>>> +                       goto try_again;
>>> +               }
>>> +               ss_mnt = ni->nsui_vfsmount;
>>> +               if (refcount_read(&ni->nsui_refcnt) == 0)
>>> +                       refcount_set(&ni->nsui_refcnt, 1);
>>> +               else
>>> +                       refcount_inc(&ni->nsui_refcnt);
>>> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +               kfree(work);
>>> +               goto out_done;
>>> +       }
>>> +       /* create new entry, set busy, insert list, clear busy after mount */
>>> +       if (work) {
>>> +               strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
>>> +               refcount_set(&work->nsui_refcnt, 1);
>>> +               work->nsui_busy = true;
>>> +               list_add_tail(&work->nsui_list, &nfsd4_ssc_umount.nsu_list);
>>> +       }
>>> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +
>>>          /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
>>>          ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
>>>          module_put(type->owner);
>>> -       if (IS_ERR(ss_mnt))
>>> +       if (IS_ERR(ss_mnt)) {
>>> +               if (work) {
>>> +                       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>>> +                       list_del(&work->nsui_list);
>>> +                       wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
>>> +                       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +                       kfree(work);
>>> +               }
>>>                  goto out_free_devname;
>>> -
>>> +       }
>>> +       if (work) {
>>> +               spin_lock(&nfsd4_ssc_umount.nsu_lock);
>>> +               work->nsui_vfsmount = ss_mnt;
>>> +               work->nsui_busy = false;
>>> +               wake_up_all(&nfsd4_ssc_umount.nsu_waitq);
>>> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +       }
>>> +out_done:
>>>          status = 0;
>>>          *mount = ss_mnt;
>>>
>>> @@ -1301,10 +1435,46 @@ static void
>>>   nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
>>>                          struct nfsd_file *dst)
>>>   {
>>> +       bool found = false;
>>> +       bool resched = false;
>>> +       long timeout;
>>> +       struct nfsd4_ssc_umount_item *tmp;
>>> +       struct nfsd4_ssc_umount_item *ni = 0;
>>> +
>>>          nfs42_ssc_close(src->nf_file);
>>> -       fput(src->nf_file);
>>>          nfsd_file_put(dst);
>>> -       mntput(ss_mnt);
>>> +       fput(src->nf_file);
>>> +
>>> +       timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
>>> +       spin_lock(&nfsd4_ssc_umount.nsu_lock);
>>> +       list_for_each_entry_safe(ni, tmp, &nfsd4_ssc_umount.nsu_list,
>>> +               nsui_list) {
>>> +               if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
>>> +                       list_del(&ni->nsui_list);
>>> +                       /*
>>> +                        * vfsmount can be shared by multiple exports,
>>> +                        * decrement refcnt and schedule delayed task
>>> +                        * if it drops to 0.
>>> +                        */
>>> +                       if (refcount_dec_and_test(&ni->nsui_refcnt))
>>> +                               resched = true;
>>> +                       ni->nsui_expire = jiffies + timeout;
>>> +                       list_add_tail(&ni->nsui_list, &nfsd4_ssc_umount.nsu_list);
>>> +                       found = true;
>>> +                       break;
>>> +               }
>>> +       }
>>> +       if (!found) {
>>> +               spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>> +               mntput(ss_mnt);
>>> +               return;
>>> +       }
>>> +       if (resched && !nfsd4_ssc_umount.nsu_expire) {
>>> +               nfsd4_ssc_umount.nsu_expire = ni->nsui_expire;
>>> +               schedule_delayed_work(&nfsd4_ssc_umount.nsu_umount_work,
>>> +                               timeout);
>>> +       }
>>> +       spin_unlock(&nfsd4_ssc_umount.nsu_lock);
>>>   }
>>>
>>>   #else /* CONFIG_NFSD_V4_2_INTER_SSC */
>>> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
>>> index 8bdc37aa2c2e..b3bf8a5f4472 100644
>>> --- a/fs/nfsd/nfsd.h
>>> +++ b/fs/nfsd/nfsd.h
>>> @@ -483,6 +483,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
>>>   extern int nfsd4_is_junction(struct dentry *dentry);
>>>   extern int register_cld_notifier(void);
>>>   extern void unregister_cld_notifier(void);
>>> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
>>> +extern void nfsd4_ssc_init_umount_work(void);
>>> +#endif
>>> +
>>>   #else /* CONFIG_NFSD_V4 */
>>>   static inline int nfsd4_is_junction(struct dentry *dentry)
>>>   {
>>> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
>>> index 6de406322106..2558db55b88b 100644
>>> --- a/fs/nfsd/nfssvc.c
>>> +++ b/fs/nfsd/nfssvc.c
>>> @@ -322,6 +322,9 @@ static int nfsd_startup_generic(int nrservs)
>>>          ret = nfs4_state_start();
>>>          if (ret)
>>>                  goto out_file_cache;
>>> +#ifdef CONFIG_NFSD_V4_2_INTER_SSC
>>> +       nfsd4_ssc_init_umount_work();
>>> +#endif
>>>          return 0;
>>>
>>>   out_file_cache:
>>> diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
>>> index f5ba0fbff72f..1e07be2a89fa 100644
>>> --- a/include/linux/nfs_ssc.h
>>> +++ b/include/linux/nfs_ssc.h
>>> @@ -8,6 +8,7 @@
>>>    */
>>>
>>>   #include <linux/nfs_fs.h>
>>> +#include <linux/sunrpc/svc.h>
>>>
>>>   extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
>>>
>>> @@ -52,6 +53,25 @@ static inline void nfs42_ssc_close(struct file *filep)
>>>          if (nfs_ssc_client_tbl.ssc_nfs4_ops)
>>>                  (*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep);
>>>   }
>>> +
>>> +struct nfsd4_ssc_umount_item {
>>> +       struct list_head nsui_list;
>>> +       bool nsui_busy;
>>> +       refcount_t nsui_refcnt;
>>> +       unsigned long nsui_expire;
>>> +       struct vfsmount *nsui_vfsmount;
>>> +       char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
>>> +};
>>> +
>>> +struct nfsd4_ssc_umount {
>>> +       struct list_head nsu_list;
>>> +       struct delayed_work nsu_umount_work;
>>> +       spinlock_t nsu_lock;
>>> +       unsigned long nsu_expire;
>>> +       wait_queue_head_t nsu_waitq;
>>> +       bool nsu_inited;
>>> +};
>>> +
>>>   #endif
>>>
>>>   /*
>>> --
>>> 2.9.5
>>>


Thread overview: 9+ messages
2021-04-23 20:59 [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed Dai Ngo
2021-04-23 20:59 ` [PATCH v4 1/2] " Dai Ngo
2021-05-14 18:12   ` Olga Kornievskaia
2021-05-14 18:42     ` J. Bruce Fields
2021-05-14 20:19       ` dai.ngo
2021-05-14 20:08     ` dai.ngo
2021-04-23 20:59 ` [PATCH v4 2/2] NFSv4.2: remove restriction of copy size for inter-server copy Dai Ngo
2021-05-03 16:37 ` [PATCH v4 0/2] NFSD: delay unmount source's export after inter-server copy completed dai.ngo
2021-05-12 16:53   ` dai.ngo
