All of lore.kernel.org
 help / color / mirror / Atom feed
From: Trond Myklebust <Trond.Myklebust@netapp.com>
To: Benny Halevy <bhalevy@panasas.com>
Cc: linux-nfs@vger.kernel.org
Subject: Re: [RFC 01/27] pnfs: CB_NOTIFY_DEVICEID
Date: Wed, 20 Apr 2011 15:41:25 -0400	[thread overview]
Message-ID: <1303328485.23206.13.camel@lade.trondhjem.org> (raw)
In-Reply-To: <1303320368-20990-1-git-send-email-bhalevy@panasas.com>

On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
> From: Marc Eshel <eshel@almaden.ibm.com>
> 
> Note: This functionlaity is incomplete as all layout segments referring to
> the 'to be removed device id' need to be reaped, and all in flight I/O drained.
> 
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> ---
>  fs/nfs/callback.h          |   20 +++++++++
>  fs/nfs/callback_proc.c     |   50 +++++++++++++++++++++++
>  fs/nfs/callback_xdr.c      |   96 +++++++++++++++++++++++++++++++++++++++++++-
>  fs/nfs/nfs4filelayout.c    |    1 +
>  fs/nfs/nfs4filelayout.h    |    1 +
>  fs/nfs/nfs4filelayoutdev.c |   38 +++++++++++++++++-
>  fs/nfs/pnfs.h              |    3 +
>  7 files changed, 207 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
> index 46d93ce..892128f 100644
> --- a/fs/nfs/callback.h
> +++ b/fs/nfs/callback.h
> @@ -167,6 +167,26 @@ extern unsigned nfs4_callback_layoutrecall(
>  
>  extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
>  extern void nfs4_cb_take_slot(struct nfs_client *clp);
> +
> +struct cb_devicenotifyitem {
> +	uint32_t		cbd_notify_type;
> +	uint32_t		cbd_layout_type;
> +	struct nfs4_deviceid	cbd_dev_id;
> +	uint32_t		cbd_immediate;
> +};
> +
> +/* XXX: Should be dynamic up to max compound size */
> +#define NFS4_DEV_NOTIFY_MAXENTRIES 10
> +struct cb_devicenotifyargs {
> +	struct sockaddr			*addr;

No sockaddr_size parameter?

> +	int				 ndevs;
> +	struct cb_devicenotifyitem	 devs[NFS4_DEV_NOTIFY_MAXENTRIES];
> +};

Why can't we make this dynamic at this time?

> +
> +extern __be32 nfs4_callback_devicenotify(
> +	struct cb_devicenotifyargs *args,
> +	void *dummy, struct cb_process_state *cps);
> +
>  #endif /* CONFIG_NFS_V4_1 */
>  extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
>  extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 2f41dcce..99494f6 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -241,6 +241,56 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
>  	do_callback_layoutrecall(clp, &args);
>  }
>  
> +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
> +				  void *dummy, struct cb_process_state *cps)
> +{
> +	int i;
> +	u32 res = 0;
> +	struct nfs_client *clp = cps->clp;
> +	struct nfs_server *server = NULL;
> +
> +	dprintk("%s: -->\n", __func__);
> +
> +	if (!clp) {
> +		res = NFS4ERR_OP_NOT_IN_SESSION;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < args->ndevs; i++) {
> +		struct cb_devicenotifyitem *dev = &args->devs[i];
> +
> +		if (!server ||
> +		    server->pnfs_curr_ld->id != dev->cbd_layout_type) {
> +			rcu_read_lock();
> +			list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
> +				if (server->pnfs_curr_ld &&
> +				    server->pnfs_curr_ld->id == dev->cbd_layout_type) {
> +					rcu_read_unlock();
> +					goto found;
> +				}
> +			rcu_read_unlock();
> +			dprintk("%s: layout type %u not found\n",
> +				__func__, dev->cbd_layout_type);
> +			continue;
> +		}
> +
> +	found:
> +		if (!server->pnfs_curr_ld->delete_deviceid) {
> +			res = NFS4ERR_NOTSUPP;
> +			break;
> +		}
> +		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
> +			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
> +				"deleting instead\n", __func__);
> +		server->pnfs_curr_ld->delete_deviceid(&dev->cbd_dev_id);
> +	}
> +
> +out:
> +	dprintk("%s: exit with status = %u\n",
> +		__func__, res);
> +	return cpu_to_be32(res);
> +}
> +
>  int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
>  {
>  	if (delegation == NULL)
> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
> index 00ecf62..5ec2c12 100644
> --- a/fs/nfs/callback_xdr.c
> +++ b/fs/nfs/callback_xdr.c
> @@ -25,6 +25,7 @@
>  
>  #if defined(CONFIG_NFS_V4_1)
>  #define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
> +#define CB_OP_DEVICENOTIFY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
>  #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
>  					4 + 1 + 3)
>  #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
> @@ -284,6 +285,93 @@ out:
>  	return status;
>  }
>  
> +static
> +__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
> +				struct xdr_stream *xdr,
> +				struct cb_devicenotifyargs *args)
> +{
> +	__be32 *p;
> +	__be32 status = 0;
> +	u32 tmp;
> +	int n, i;
> +	args->ndevs = 0;
> +
> +	args->addr = svc_addr(rqstp);
> +
> +	/* Num of device notifications */
> +	p = read_buf(xdr, sizeof(uint32_t));
> +	if (unlikely(p == NULL)) {
> +		status = htonl(NFS4ERR_RESOURCE);
> +		goto out;
> +	}
> +	n = ntohl(*p++);
> +	if (n <= 0)
> +		goto out;
> +
> +	/* XXX: need to possibly return error in this case */
> +	if (n > NFS4_DEV_NOTIFY_MAXENTRIES) {
> +		dprintk("%s: Processing (%d) notifications out of (%d)\n",
> +			__func__, NFS4_DEV_NOTIFY_MAXENTRIES, n);
> +		n = NFS4_DEV_NOTIFY_MAXENTRIES;
> +	}
> +
> +	/* Decode each dev notification */
> +	for (i = 0; i < n; i++) {
> +		struct cb_devicenotifyitem *dev = &args->devs[i];
> +
> +		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
> +		if (unlikely(p == NULL)) {
> +			status = htonl(NFS4ERR_RESOURCE);
> +			goto out;
> +		}
> +
> +		tmp = ntohl(*p++);	/* bitmap size */
> +		if (tmp != 1) {
> +			status = htonl(NFS4ERR_INVAL);
> +			goto out;
> +		}
> +		dev->cbd_notify_type = ntohl(*p++);
> +		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
> +		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
> +			status = htonl(NFS4ERR_INVAL);
> +			goto out;
> +		}
> +
> +		tmp = ntohl(*p++);	/* opaque size */
> +		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
> +		     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
> +		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
> +		     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
> +			status = htonl(NFS4ERR_INVAL);
> +			goto out;
> +		}
> +		dev->cbd_layout_type = ntohl(*p++);
> +		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
> +		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
> +
> +		if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
> +			p = read_buf(xdr, sizeof(uint32_t));
> +			if (unlikely(p == NULL)) {
> +				status = htonl(NFS4ERR_DELAY);
> +				goto out;
> +			}
> +			dev->cbd_immediate = ntohl(*p++);
> +		} else {
> +			dev->cbd_immediate = 0;
> +		}
> +
> +		args->ndevs++;
> +
> +		dprintk("%s: type %d layout 0x%x immediate %d\n",
> +			__func__, dev->cbd_notify_type, dev->cbd_layout_type,
> +			dev->cbd_immediate);
> +	}
> +out:
> +	dprintk("%s: status %d ndevs %d\n",
> +		__func__, ntohl(status), args->ndevs);
> +	return status;
> +}
> +
>  static __be32 decode_sessionid(struct xdr_stream *xdr,
>  				 struct nfs4_sessionid *sid)
>  {
> @@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
>  	case OP_CB_RECALL_ANY:
>  	case OP_CB_RECALL_SLOT:
>  	case OP_CB_LAYOUTRECALL:
> +	case OP_CB_NOTIFY_DEVICEID:
>  		*op = &callback_ops[op_nr];
>  		break;
>  
> -	case OP_CB_NOTIFY_DEVICEID:
>  	case OP_CB_NOTIFY:
>  	case OP_CB_PUSH_DELEG:
>  	case OP_CB_RECALLABLE_OBJ_AVAIL:
> @@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
>  			(callback_decode_arg_t)decode_layoutrecall_args,
>  		.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
>  	},
> +	[OP_CB_NOTIFY_DEVICEID] = {
> +		.process_op = (callback_process_op_t)nfs4_callback_devicenotify,
> +		.decode_args =
> +			(callback_decode_arg_t)decode_devicenotify_args,
> +		.res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
> +	},
>  	[OP_CB_SEQUENCE] = {
>  		.process_op = (callback_process_op_t)nfs4_callback_sequence,
>  		.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index e6e0c294..2feab7f 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -867,6 +867,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
>  	.commit_pagelist	= filelayout_commit_pagelist,
>  	.read_pagelist		= filelayout_read_pagelist,
>  	.write_pagelist		= filelayout_write_pagelist,
> +	.delete_deviceid	= filelayout_delete_deviceid,
>  };
>  
>  static int __init nfs4filelayout_init(void)
> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> index 7c44579..8be70ab 100644
> --- a/fs/nfs/nfs4filelayout.h
> +++ b/fs/nfs/nfs4filelayout.h
> @@ -105,5 +105,6 @@ nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
>  extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
>  struct nfs4_file_layout_dsaddr *
>  get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
> +void filelayout_delete_deviceid(struct nfs4_deviceid *);
>  
>  #endif /* FS_NFS_NFS4FILELAYOUT_H */
> diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
> index de5350f..601aaea 100644
> --- a/fs/nfs/nfs4filelayoutdev.c
> +++ b/fs/nfs/nfs4filelayoutdev.c
> @@ -601,7 +601,7 @@ void
>  nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
>  {
>  	if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
> -		hlist_del_rcu(&dsaddr->node);
> +		hlist_del_init_rcu(&dsaddr->node);
>  		spin_unlock(&filelayout_deviceid_lock);
>  
>  		synchronize_rcu();
> @@ -631,6 +631,42 @@ fail:
>  	return NULL;
>  }
>  
> +static struct nfs4_file_layout_dsaddr *
> +nfs4_fl_unhash_deviceid(struct nfs4_deviceid *id)
> +{
> +	struct nfs4_file_layout_dsaddr *d;
> +	struct hlist_node *n;
> +	long hash = nfs4_fl_deviceid_hash(id);
> +
> +	dprintk("%s: hash %ld\n", __func__, hash);
> +	rcu_read_lock();
> +	hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node)
> +		if (!memcmp(&d->deviceid, id, sizeof(*id)))
> +			goto found;
> +	rcu_read_unlock();
> +	return NULL;
> +
> +found:
> +	rcu_read_unlock();

Is there a reason why we should drop the rcu lock here...

> +	spin_lock(&filelayout_deviceid_lock);
> +	hlist_del_init_rcu(&d->node);
> +	spin_unlock(&filelayout_deviceid_lock);

...instead of here? IOW: do we need to enable preemption before we
redisable it in the spin_lock() call?

Also, how are you preventing races? There is no test under the spin lock
for whether or not the device is still hashed before you call
hlist_del_init_rcu(), so how do you know that it is safe to put the
reference to filelayout_delete_deviceid()?

> +	synchronize_rcu();
> +
> +	return d;
> +}
> +
> +void
> +filelayout_delete_deviceid(struct nfs4_deviceid *id)
> +{
> +	struct nfs4_file_layout_dsaddr *d;
> +
> +	d = nfs4_fl_unhash_deviceid(id);
> +	/* balance the initial ref taken in decode_and_add_device */
> +	if (d && atomic_dec_and_test(&d->ref))
> +		nfs4_fl_free_deviceid(d);
> +}
> +
>  /*
>   * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
>   * Then: ((res + fsi) % dsaddr->stripe_count)
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index bc48272..4cb0a0d 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -89,6 +89,9 @@ struct pnfs_layoutdriver_type {
>  	 */
>  	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
>  	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
> +
> +	/* device notification methods */
> +	void (*delete_deviceid)(struct nfs4_deviceid *);
>  };
>  
>  struct pnfs_layout_hdr {

-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
Trond.Myklebust@netapp.com
www.netapp.com


  reply	other threads:[~2011-04-20 19:41 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-04-20 16:46 [RFC 0/27] pnfs-submit for 2.6.40 Benny Halevy
2011-04-20 17:26 ` [RFC 01/27] pnfs: CB_NOTIFY_DEVICEID Benny Halevy
2011-04-20 19:41   ` Trond Myklebust [this message]
2011-04-22  6:22     ` Benny Halevy
2011-04-20 17:26 ` [RFC 02/27] pnfs: direct i/o Benny Halevy
2011-04-20 17:26 ` [RFC 03/27] pnfs: layoutreturn Benny Halevy
2011-04-20 19:53   ` Trond Myklebust
2011-04-22  6:52     ` Benny Halevy
2011-04-22  8:04       ` [PATCH 1/6] SQUASHME: call pnfs_return_layout right before pnfs_destroy_layout Benny Halevy
2011-04-22  8:04       ` [PATCH 2/6] SQUASHME: remove assert_spin_locked from pnfs_clear_lseg_list Benny Halevy
2011-04-22  8:04       ` [PATCH 3/6] SQUASHME: remove wait parameter from the layoutreturn path Benny Halevy
2011-04-22  8:31         ` Benny Halevy
2011-04-22  8:05       ` [PATCH 4/6] SQUASHME: remove return_type field from nfs4_layoutreturn_args Benny Halevy
2011-04-22  8:05       ` [PATCH 5/6] SQUASHME: remove range " Benny Halevy
2011-04-22  8:05       ` [PATCH 6/6] SQUASHME: no need to send layoutcommit from _pnfs_return_layout Benny Halevy
2011-04-20 17:26 ` [RFC 04/27] pnfs: layoutret_on_setattr Benny Halevy
2011-04-20 20:03   ` Trond Myklebust
2011-04-22  8:23     ` Benny Halevy
2011-04-20 17:26 ` [RFC 05/27] pnfs: Use byte-range layout segments Benny Halevy
2011-04-20 17:26 ` [RFC 06/27] pnfs: encode_layoutreturn Benny Halevy
2011-04-20 20:16   ` Trond Myklebust
2011-04-22  8:26     ` Benny Halevy
2011-04-20 17:27 ` [RFC 07/27] pnfs: encode_layoutcommit Benny Halevy
2011-04-20 20:18   ` Trond Myklebust
2011-04-22  8:48     ` Benny Halevy
2011-04-20 17:27 ` [RFC 08/27] pnfs: {setup,cleanup}_layoutcommit Benny Halevy
2011-04-20 20:22   ` Trond Myklebust
2011-04-20 17:27 ` [RFC 09/27] pnfs: support for non-rpc layout drivers Benny Halevy
2011-04-20 20:34   ` Trond Myklebust
2011-04-22  9:03     ` Benny Halevy
2011-04-20 17:27 ` [RFC 10/27] pnfs: {,un}set_layoutdriver methods Benny Halevy
2011-04-20 17:27 ` [RFC 11/27] pnfs: per mount layout driver private data Benny Halevy
2011-04-20 20:36   ` Trond Myklebust
2011-04-22  9:05     ` Benny Halevy
2011-04-20 17:27 ` [RFC 12/27] pnfs: alloc and free layout_hdr layoutdriver methods Benny Halevy
2011-04-20 20:43   ` Trond Myklebust
2011-04-22  9:09     ` Benny Halevy
2011-04-20 17:27 ` [RFC 13/27] pnfs: client stats Benny Halevy
2011-04-20 17:28 ` [RFC 14/27] pnfsd: introduce exp_xdr.h Benny Halevy
2011-04-20 17:28 ` [RFC 15/27] pnfs-obj: pnfs_osd XDR definitions Benny Halevy
2011-04-20 20:49   ` Trond Myklebust
2011-04-22  9:11     ` Benny Halevy
2011-04-20 17:28 ` [RFC 16/27] pnfs-obj: pnfs_osd XDR client implementations Benny Halevy
2011-04-20 17:28 ` [RFC 17/27] exofs: pnfs-tree: Remove pnfs-osd private definitions Benny Halevy
2011-04-20 17:28 ` [RFC 18/27] pnfs-obj: Define PNFS_OBJLAYOUT Kconfig option Benny Halevy
2011-04-20 17:28 ` [RFC 19/27] pnfs-obj: objlayout driver skeleton Benny Halevy
2011-04-20 17:28 ` [RFC 20/27] pnfs-obj: objio_osd device information retrieval and caching Benny Halevy
2011-04-20 17:28 ` [RFC 21/27] pnfs-obj: objio_osd real IO implementation Benny Halevy
2011-04-20 17:29 ` [RFC 22/27] sunrpc: New xdr_rewind_stream() Benny Halevy
2011-04-20 17:29 ` [RFC 23/27] pnfs-obj: objlayout_encode_layoutreturn Implementation Benny Halevy
2011-04-20 17:29 ` [RFC 24/27] pnfs-obj: objio_osd report osd_errors for layoutreturn Benny Halevy
2011-04-20 17:29 ` [RFC 25/27] pnfs-obj: objlayout_encode_layoutcommit implementation Benny Halevy
2011-04-20 17:29 ` [RFC 26/27] pnfs-obj: objio_osd: RAID0 support Benny Halevy
2011-04-20 17:29 ` [RFC 27/27] pnfs-obj: objio_osd: groups support Benny Halevy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1303328485.23206.13.camel@lade.trondhjem.org \
    --to=trond.myklebust@netapp.com \
    --cc=bhalevy@panasas.com \
    --cc=linux-nfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.