Date: Thu, 18 Feb 2016 11:11:22 +0000
From: Al Viro
To: Martin Brandenburg
Cc: Mike Marshall, Linus Torvalds, linux-fsdevel, Stephen Rothwell
Subject: Re: Orangefs ABI documentation
Message-ID: <20160218111122.GS17997@ZenIV.linux.org.uk>
References: <20160215230434.GZ17997@ZenIV.linux.org.uk>
 <20160216233609.GE17997@ZenIV.linux.org.uk>
 <20160216235441.GF17997@ZenIV.linux.org.uk>
 <20160217230900.GP17997@ZenIV.linux.org.uk>
 <20160217231524.GQ17997@ZenIV.linux.org.uk>
 <20160218000439.GR17997@ZenIV.linux.org.uk>
In-Reply-To: <20160218000439.GR17997@ZenIV.linux.org.uk>

On Thu, Feb 18, 2016 at 12:04:39AM +0000, Al Viro wrote:
> Looks like the right approach is to have orangefs_clean_... hitting the
> sucker being copied to/from daemon to wait until that's finished (and
> discarded).  That, BTW, would have an extra benefit of making life simpler
> for refcounting.
>
> So...  We need to have them marked as "being copied" for the duration,
> instead of bumping the refcount.  That setting and dropping that flag
> should happen under op->lock.  Setting it should happen only if it's not
> given up (that would be interpreted as "not found").  Cleaning, OTOH,
> would recheck the "given up" and do complete(&op->waitq) in case it's
> been given up...
>
> How about this (instead of the previous variant; includes a fix for the
> errno bogosity spotted a bit upthread; if it works, it'll need a bit of
> splitup)

Better yet, let's use list_del_init() on op->list instead of those
list_del() calls.  Then, seeing that ..._clean_interrupted_... can't be
called in case of serviced (we hadn't dropped op->lock since the time
we'd checked it), we can use list_empty(&op->list) as a test for "given
up while copying to/from daemon", so there's no need for a separate flag:

	* we never pick a given-up op from the list/hash
	* daemon read/write_iter never modifies op->list after the op
	  has been given up
	* if an op is given up while being copied to/from userland in
	  daemon read/write_iter, the daemon side will call
	  complete(&op->waitq) once it notices that, so the giveup side
	  can wait for completion if it finds the op it's about to give
	  up not on any list.

Should be equivalent to the previous variant, but IMO it's cleaner that
way...
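To spell out what the giveup side turns into: a condensed sketch of the
waitqueue.c change in the patch below (the function name here is made up;
the real code is orangefs_clean_up_interrupted_operation(), and the gossip
output and the "weird state" complaint are dropped):

	/* sketch only, not the patch itself; called with op->lock held */
	static void sketch_give_up(struct orangefs_kernel_op_s *op)
	{
		op->op_state |= OP_VFS_STATE_GIVEN_UP;
		/* from here on nobody else will move op->list */
		if (list_empty(&op->list)) {
			/* caught the daemon copying to/from userland;
			 * it will complete(&op->waitq) once it notices
			 * the op has been given up */
			spin_unlock(&op->lock);
			wait_for_completion(&op->waitq);
		} else if (op_state_waiting(op)) {
			/* upcall not read yet; unhash from request list */
			spin_unlock(&op->lock);
			spin_lock(&orangefs_request_list_lock);
			list_del_init(&op->list);
			spin_unlock(&orangefs_request_list_lock);
		} else if (op_state_in_progress(op)) {
			/* ditto for the in-progress hash chain */
			spin_unlock(&op->lock);
			spin_lock(&htable_ops_in_progress_lock);
			list_del_init(&op->list);
			spin_unlock(&htable_ops_in_progress_lock);
		} else {
			spin_unlock(&op->lock);
		}
		reinit_completion(&op->waitq);
	}

Anyway, the full patch: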
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index b27ed1c..f7914f5 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -58,9 +58,9 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
 			 next,
 			 &htable_ops_in_progress[index],
 			 list) {
-		if (op->tag == tag && !op_state_purged(op)) {
+		if (op->tag == tag && !op_state_purged(op) &&
+		    !op_state_given_up(op)) {
 			list_del_init(&op->list);
-			get_op(op); /* increase ref count. */
 			spin_unlock(&htable_ops_in_progress_lock);
 			return op;
 		}
@@ -133,7 +133,7 @@ restart:
 		__s32 fsid;
 		/* This lock is held past the end of the loop when we break.
 		 */
 		spin_lock(&op->lock);
-		if (unlikely(op_state_purged(op))) {
+		if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
 			spin_unlock(&op->lock);
 			continue;
 		}
@@ -199,13 +199,12 @@ restart:
 	 */
 	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
 		gossip_err("orangefs: ERROR: Current op already queued.\n");
-		list_del(&cur_op->list);
+		list_del_init(&cur_op->list);
 		spin_unlock(&cur_op->lock);
 		spin_unlock(&orangefs_request_list_lock);
 		return -EAGAIN;
 	}
 
 	list_del_init(&cur_op->list);
-	get_op(op);
 	spin_unlock(&orangefs_request_list_lock);
 	spin_unlock(&cur_op->lock);
@@ -230,7 +229,7 @@ restart:
 
 	if (unlikely(op_state_given_up(cur_op))) {
 		spin_unlock(&cur_op->lock);
 		spin_unlock(&htable_ops_in_progress_lock);
-		op_release(cur_op);
+		complete(&cur_op->waitq);
 		goto restart;
 	}
@@ -242,7 +241,6 @@ restart:
 	orangefs_devreq_add_op(cur_op);
 	spin_unlock(&cur_op->lock);
 	spin_unlock(&htable_ops_in_progress_lock);
-	op_release(cur_op);
 
 	/* The client only asks to read one size buffer. */
 	return MAX_DEV_REQ_UPSIZE;
@@ -258,10 +256,12 @@ error:
 
 	if (likely(!op_state_given_up(cur_op))) {
 		set_op_state_waiting(cur_op);
 		list_add(&cur_op->list, &orangefs_request_list);
+		spin_unlock(&cur_op->lock);
+	} else {
+		spin_unlock(&cur_op->lock);
+		complete(&cur_op->waitq);
 	}
-	spin_unlock(&cur_op->lock);
 	spin_unlock(&orangefs_request_list_lock);
-	op_release(cur_op);
 	return -EFAULT;
 }
@@ -333,8 +333,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 	n = copy_from_iter(&op->downcall, downcall_size, iter);
 	if (n != downcall_size) {
 		gossip_err("%s: failed to copy downcall.\n", __func__);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	if (op->downcall.status)
@@ -354,8 +353,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 			   downcall_size,
 			   op->downcall.trailer_size,
 			   total);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	/* Only READDIR operations should have trailers. */
@@ -364,8 +362,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 		gossip_err("%s: %x operation with trailer.",
 			   __func__,
 			   op->downcall.type);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	/* READDIR operations should always have trailers. */
@@ -374,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 		gossip_err("%s: %x operation with no trailer.",
 			   __func__,
 			   op->downcall.type);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 	if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
@@ -386,8 +382,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 	if (op->downcall.trailer_buf == NULL) {
 		gossip_err("%s: failed trailer vmalloc.\n",
 			   __func__);
-		ret = -ENOMEM;
-		goto Broken;
+		goto Enomem;
 	}
 	memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
 	n = copy_from_iter(op->downcall.trailer_buf,
@@ -396,8 +391,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
 	if (n != op->downcall.trailer_size) {
 		gossip_err("%s: failed to copy trailer.\n", __func__);
 		vfree(op->downcall.trailer_buf);
-		ret = -EFAULT;
-		goto Broken;
+		goto Efault;
 	}
 
 wakeup:
@@ -406,38 +400,27 @@ wakeup:
 	 * that this op is done
 	 */
 	spin_lock(&op->lock);
-	if (unlikely(op_state_given_up(op))) {
+	if (unlikely(op_is_cancel(op))) {
 		spin_unlock(&op->lock);
-		goto out;
-	}
-	set_op_state_serviced(op);
-	spin_unlock(&op->lock);
-
-	/*
-	 * If this operation is an I/O operation we need to wait
-	 * for all data to be copied before we can return to avoid
-	 * buffer corruption and races that can pull the buffers
-	 * out from under us.
-	 *
-	 * Essentially we're synchronizing with other parts of the
-	 * vfs implicitly by not allowing the user space
-	 * application reading/writing this device to return until
-	 * the buffers are done being used.
-	 */
-out:
-	if (unlikely(op_is_cancel(op)))
 		put_cancel(op);
-	op_release(op);
-	return ret;
-
-Broken:
-	spin_lock(&op->lock);
-	if (!op_state_given_up(op)) {
-		op->downcall.status = ret;
+	} else if (unlikely(op_state_given_up(op))) {
+		spin_unlock(&op->lock);
+		complete(&op->waitq);
+	} else {
 		set_op_state_serviced(op);
+		spin_unlock(&op->lock);
 	}
-	spin_unlock(&op->lock);
-	goto out;
+	return ret;
+
+Efault:
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
+	ret = -EFAULT;
+	goto wakeup;
+
+Enomem:
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
+	ret = -ENOMEM;
+	goto wakeup;
 }
 
 /* Returns whether any FS are still pending remounted */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 817092a..900a2e3 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -120,8 +120,6 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
 		spin_lock_init(&new_op->lock);
 		init_completion(&new_op->waitq);
 
-		atomic_set(&new_op->ref_count, 1);
-
 		new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
 		new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
 		new_op->downcall.status = -1;
@@ -149,7 +147,7 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
 	return new_op;
 }
 
-void __op_release(struct orangefs_kernel_op_s *orangefs_op)
+void op_release(struct orangefs_kernel_op_s *orangefs_op)
 {
 	if (orangefs_op) {
 		gossip_debug(GOSSIP_CACHE_DEBUG,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 1f8310c..e387d3c 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -205,8 +205,6 @@ struct orangefs_kernel_op_s {
 	struct completion waitq;
 	spinlock_t lock;
 
-	atomic_t ref_count;
-
 	/* VFS aio fields */
 
 	int attempts;
@@ -230,23 +228,7 @@ static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
 #define op_state_given_up(op) ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
 #define op_is_cancel(op) ((op)->downcall.type == ORANGEFS_VFS_OP_CANCEL)
 
-static inline void get_op(struct orangefs_kernel_op_s *op)
-{
-	atomic_inc(&op->ref_count);
-	gossip_debug(GOSSIP_DEV_DEBUG,
-		     "(get) Alloced OP (%p:%llu)\n", op, llu(op->tag));
-}
-
-void __op_release(struct orangefs_kernel_op_s *op);
-
-static inline void op_release(struct orangefs_kernel_op_s *op)
-{
-	if (atomic_dec_and_test(&op->ref_count)) {
-		gossip_debug(GOSSIP_DEV_DEBUG,
-			     "(put) Releasing OP (%p:%llu)\n", op, llu((op)->tag));
-		__op_release(op);
-	}
-}
+void op_release(struct orangefs_kernel_op_s *op);
 
 extern void orangefs_bufmap_put(int);
 
 static inline void put_cancel(struct orangefs_kernel_op_s *op)
@@ -259,7 +241,7 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
 {
 	spin_lock(&op->lock);
 	if (unlikely(op_is_cancel(op))) {
-		list_del(&op->list);
+		list_del_init(&op->list);
 		spin_unlock(&op->lock);
 		put_cancel(op);
 	} else {
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index d980240..3f9e430 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -208,15 +208,20 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
 	 * Called with op->lock held.
 	 */
 	op->op_state |= OP_VFS_STATE_GIVEN_UP;
-
-	if (op_state_waiting(op)) {
+	/* from that point on it can't be moved by anybody else */
+	if (list_empty(&op->list)) {
+		/* caught copying to/from daemon */
+		BUG_ON(op_state_serviced(op));
+		spin_unlock(&op->lock);
+		wait_for_completion(&op->waitq);
+	} else if (op_state_waiting(op)) {
 		/*
 		 * upcall hasn't been read; remove op from upcall request
 		 * list.
 		 */
 		spin_unlock(&op->lock);
 		spin_lock(&orangefs_request_list_lock);
-		list_del(&op->list);
+		list_del_init(&op->list);
 		spin_unlock(&orangefs_request_list_lock);
 		gossip_debug(GOSSIP_WAIT_DEBUG,
 			     "Interrupted: Removed op %p from request_list\n",
@@ -225,23 +230,16 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
 		/* op must be removed from the in progress htable */
 		spin_unlock(&op->lock);
 		spin_lock(&htable_ops_in_progress_lock);
-		list_del(&op->list);
+		list_del_init(&op->list);
 		spin_unlock(&htable_ops_in_progress_lock);
 		gossip_debug(GOSSIP_WAIT_DEBUG,
 			     "Interrupted: Removed op %p"
 			     " from htable_ops_in_progress\n",
 			     op);
-	} else if (!op_state_serviced(op)) {
+	} else {
 		spin_unlock(&op->lock);
 		gossip_err("interrupted operation is in a weird state 0x%x\n",
 			   op->op_state);
-	} else {
-		/*
-		 * It is not intended for execution to flow here,
-		 * but having this unlock here makes sparse happy.
-		 */
-		gossip_err("%s: can't get here.\n", __func__);
-		spin_unlock(&op->lock);
 	}
 	reinit_completion(&op->waitq);
 }
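
For completeness, the daemon-side half of the handshake: once write_iter
is done copying the downcall, it rechecks the op state under op->lock and
either marks it serviced or, if it lost the race with giveup, wakes the
waiter.  Again just a sketch condensed from the devorangefs-req.c wakeup
hunk above (helper name made up, error paths omitted):

	static void sketch_wakeup(struct orangefs_kernel_op_s *op)
	{
		spin_lock(&op->lock);
		if (unlikely(op_is_cancel(op))) {
			/* nobody waits for cancels; just free it */
			spin_unlock(&op->lock);
			put_cancel(op);
		} else if (unlikely(op_state_given_up(op))) {
			/* submitter is blocked in wait_for_completion()
			 * in ..._clean_interrupted_...; let it proceed */
			spin_unlock(&op->lock);
			complete(&op->waitq);
		} else {
			/* normal case: submitter will find it serviced */
			set_op_state_serviced(op);
			spin_unlock(&op->lock);
		}
	}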