Netdev Archive on lore.kernel.org
 help / color / Atom feed
From: Yishai Hadas <yishaih@dev.mellanox.co.il>
To: Jason Gunthorpe <jgg@mellanox.com>
Cc: Leon Romanovsky <leon@kernel.org>,
	Doug Ledford <dledford@redhat.com>,
	Leon Romanovsky <leonro@mellanox.com>,
	RDMA mailing list <linux-rdma@vger.kernel.org>,
	Yishai Hadas <yishaih@mellanox.com>,
	Saeed Mahameed <saeedm@mellanox.com>,
	linux-netdev <netdev@vger.kernel.org>
Subject: Re: [PATCH rdma-next v1 11/12] IB/mlx5: Implement DEVX dispatching event
Date: Mon, 24 Jun 2019 19:55:32 +0300
Message-ID: <3a2e53f8-e7dd-3e01-c7c7-99d41f711d87@dev.mellanox.co.il> (raw)
In-Reply-To: <20190624120338.GD5479@mellanox.com>

On 6/24/2019 3:03 PM, Jason Gunthorpe wrote:
> On Tue, Jun 18, 2019 at 08:15:39PM +0300, Leon Romanovsky wrote:
>> From: Yishai Hadas <yishaih@mellanox.com>
>>
>> Implement DEVX dispatching event by looking up for the applicable
>> subscriptions for the reported event and using their target fd to
>> signal/set the event.
>>
>> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
>> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
>>   drivers/infiniband/hw/mlx5/devx.c         | 362 +++++++++++++++++++++-
>>   include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
>>   2 files changed, 357 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
>> index 304b13e7a265..49fdce95d6d9 100644
>> +++ b/drivers/infiniband/hw/mlx5/devx.c
>> @@ -34,6 +34,11 @@ struct devx_async_data {
>>   	struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
>>   };
>>   
>> +struct devx_async_event_data {
>> +	struct list_head list; /* headed in ev_queue->event_list */
>> +	struct mlx5_ib_uapi_devx_async_event_hdr hdr;
>> +};
>> +
>>   /* first level XA value data structure */
>>   struct devx_event {
>>   	struct xarray object_ids; /* second XA level, Key = object id */
>> @@ -54,7 +59,9 @@ struct devx_event_subscription {
>>   				   * devx_obj_event->obj_sub_list
>>   				   */
>>   	struct list_head obj_list; /* headed in devx_object */
>> +	struct list_head event_list; /* headed in ev_queue->event_list */
>>   
>> +	u8  is_cleaned:1;
> 
> There is a loose bool 'is_obj_related' that should be combined with
> this bool bitfield as well.
> 

OK

>>   static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
>> -				      struct devx_event_subscription *sub)
>> +				      struct devx_event_subscription *sub,
>> +				      bool file_close)
>>   {
>> -	list_del_rcu(&sub->file_list);
>> +	if (sub->is_cleaned)
>> +		goto end;
>> +
>> +	sub->is_cleaned = 1;
>>   	list_del_rcu(&sub->xa_list);
>>   
>>   	if (sub->is_obj_related) {
>> @@ -1303,10 +1355,15 @@ static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
>>   		}
>>   	}
>>   
>> -	if (sub->eventfd)
>> -		eventfd_ctx_put(sub->eventfd);
>> +end:
>> +	if (file_close) {
>> +		if (sub->eventfd)
>> +			eventfd_ctx_put(sub->eventfd);
>>   
>> -	kfree_rcu(sub, rcu);
>> +		list_del_rcu(&sub->file_list);
>> +		/* subscription may not be used by the read API any more */
>> +		kfree_rcu(sub, rcu);
>> +	}
> 
> Dis like this confusing file_close stuff, just put this in the single place
> that calls this with the true bool
> 

OK, will do.

>> +static int deliver_event(struct devx_event_subscription *event_sub,
>> +			 const void *data)
>> +{
>> +	struct ib_uobject *fd_uobj = event_sub->fd_uobj;
>> +	struct devx_async_event_file *ev_file;
>> +	struct devx_async_event_queue *ev_queue;
>> +	struct devx_async_event_data *event_data;
>> +	unsigned long flags;
>> +	bool omit_data;
>> +
>> +	ev_file = container_of(fd_uobj, struct devx_async_event_file,
>> +			       uobj);
>> +	ev_queue = &ev_file->ev_queue;
>> +	omit_data = ev_queue->flags &
>> +		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
>> +
>> +	if (omit_data) {
>> +		spin_lock_irqsave(&ev_queue->lock, flags);
>> +		if (!list_empty(&event_sub->event_list)) {
>> +			spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +			return 0;
>> +		}
>> +
>> +		list_add_tail(&event_sub->event_list, &ev_queue->event_list);
>> +		spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +		wake_up_interruptible(&ev_queue->poll_wait);
>> +		return 0;
>> +	}
>> +
>> +	event_data = kzalloc(sizeof(*event_data) +
>> +			     (omit_data ? 0 : sizeof(struct mlx5_eqe)),
>> +			     GFP_ATOMIC);
> 
> omit_data is always false here
> 

Correct, will clean it up.

>> +	if (!event_data) {
>> +		spin_lock_irqsave(&ev_queue->lock, flags);
>> +		ev_queue->is_overflow_err = 1;
>> +		spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	event_data->hdr.cookie = event_sub->cookie;
>> +	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
>> +
>> +	spin_lock_irqsave(&ev_queue->lock, flags);
>> +	list_add_tail(&event_data->list, &ev_queue->event_list);
>> +	spin_unlock_irqrestore(&ev_queue->lock, flags);
>> +	wake_up_interruptible(&ev_queue->poll_wait);
>> +
>> +	return 0;
>> +}
>> +
>> +static void dispatch_event_fd(struct list_head *fd_list,
>> +			      const void *data)
>> +{
>> +	struct devx_event_subscription *item;
>> +
>> +	list_for_each_entry_rcu(item, fd_list, xa_list) {
>> +		if (!get_file_rcu((struct file *)item->object))
>> +			continue;
>> +
>> +		if (item->eventfd) {
>> +			eventfd_signal(item->eventfd, 1);
>> +			fput(item->object);
>> +			continue;
>> +		}
>> +
>> +		deliver_event(item, data);
>> +		fput(item->object);
>> +	}
>> +}
>> +
>>   static int devx_event_notifier(struct notifier_block *nb,
>>   			       unsigned long event_type, void *data)
>>   {
>> -	return NOTIFY_DONE;
>> +	struct mlx5_devx_event_table *table;
>> +	struct mlx5_ib_dev *dev;
>> +	struct devx_event *event;
>> +	struct devx_obj_event *obj_event;
>> +	u16 obj_type = 0;
>> +	bool is_unaffiliated;
>> +	u32 obj_id;
>> +
>> +	/* Explicit filtering to kernel events which may occur frequently */
>> +	if (event_type == MLX5_EVENT_TYPE_CMD ||
>> +	    event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
>> +		return NOTIFY_OK;
>> +
>> +	table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
>> +	dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
>> +	is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
>> +
>> +	if (!is_unaffiliated)
>> +		obj_type = get_event_obj_type(event_type, data);
>> +	event = xa_load(&table->event_xa, event_type | (obj_type << 16));
>> +	if (!event)
>> +		return NOTIFY_DONE;
> 
> event should be in the rcu as well

Do we really need this ? I didn't see a flow that really requires that.
> 
>> +	if (is_unaffiliated) {
>> +		dispatch_event_fd(&event->unaffiliated_list, data);
>> +		return NOTIFY_OK;
>> +	}
>> +
>> +	obj_id = devx_get_obj_id_from_event(event_type, data);
>> +	rcu_read_lock();
>> +	obj_event = xa_load(&event->object_ids, obj_id);
>> +	if (!obj_event) {
>> +		rcu_read_unlock();
>> +		return NOTIFY_DONE;
>> +	}
>> +
>> +	dispatch_event_fd(&obj_event->obj_sub_list, data);
>> +
>> +	rcu_read_unlock();
>> +	return NOTIFY_OK;
>>   }
>>   
>>   void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
>> @@ -2221,7 +2444,7 @@ void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
>>   		event = entry;
>>   		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
>>   					 xa_list)
>> -			devx_cleanup_subscription(dev, sub);
>> +			devx_cleanup_subscription(dev, sub, false);
>>   		kfree(entry);
>>   	}
>>   	mutex_unlock(&dev->devx_event_table.event_xa_lock);
>> @@ -2329,18 +2552,126 @@ static const struct file_operations devx_async_cmd_event_fops = {
>>   static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
>>   				     size_t count, loff_t *pos)
>>   {
>> -	return -EINVAL;
>> +	struct devx_async_event_file *ev_file = filp->private_data;
>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>> +	struct devx_event_subscription *event_sub;
>> +	struct devx_async_event_data *uninitialized_var(event);
>> +	int ret = 0;
>> +	size_t eventsz;
>> +	bool omit_data;
>> +	void *event_data;
>> +
>> +	omit_data = ev_queue->flags &
>> +		MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA;
>> +
>> +	spin_lock_irq(&ev_queue->lock);
>> +
>> +	if (ev_queue->is_overflow_err) {
>> +		ev_queue->is_overflow_err = 0;
>> +		spin_unlock_irq(&ev_queue->lock);
>> +		return -EOVERFLOW;
>> +	}
>> +
>> +	while (list_empty(&ev_queue->event_list)) {
>> +		spin_unlock_irq(&ev_queue->lock);
>> +
>> +		if (filp->f_flags & O_NONBLOCK)
>> +			return -EAGAIN;
>> +
>> +		if (wait_event_interruptible(ev_queue->poll_wait,
>> +			    (!list_empty(&ev_queue->event_list) ||
>> +			     ev_queue->is_destroyed))) {
>> +			return -ERESTARTSYS;
>> +		}
>> +
>> +		if (list_empty(&ev_queue->event_list) &&
>> +		    ev_queue->is_destroyed)
>> +			return -EIO;
> 
> All these tests should be under the lock.

We can't call wait_event_interruptible() above which may sleep under the 
lock, correct ? are you referring to the list_empty() and is_destroyed ?

By the way looking in uverb code [1], similar code which is not done 
under the lock as of here..

[1] 
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L244


> 
> Why don't we return EIO as soon as is-destroyed happens? What is the
> point of flushing out the accumulated events?

It follows the above uverb code/logic that returns existing events even 
in that case, also the async command events in this file follows that 
logic, I suggest to stay consistent.
> 
>> +
>> +		spin_lock_irq(&ev_queue->lock);
>> +	}
>> +
>> +	if (omit_data) {
>> +		event_sub = list_first_entry(&ev_queue->event_list,
>> +					struct devx_event_subscription,
>> +					event_list);
>> +		eventsz = sizeof(event_sub->cookie);
>> +		event_data = &event_sub->cookie;
>> +	} else {
>> +		event = list_first_entry(&ev_queue->event_list,
>> +				      struct devx_async_event_data, list);
>> +		eventsz = sizeof(struct mlx5_eqe) +
>> +			sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
>> +		event_data = &event->hdr;
>> +	}
>> +
>> +	if (eventsz > count) {
>> +		spin_unlock_irq(&ev_queue->lock);
>> +		return -ENOSPC;
> 
> This is probably the wrong errno

OK, will change to -EINVAL as in uverbs
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L254

> 
>> +	}
>> +
>> +	if (omit_data)
>> +		list_del_init(&event_sub->event_list);
>> +	else
>> +		list_del(&event->list);
>> +
>> +	spin_unlock_irq(&ev_queue->lock);
>> +
>> +	if (copy_to_user(buf, event_data, eventsz))
>> +		ret = -EFAULT;
>> +	else
>> +		ret = eventsz;
> 
> This is really poorly ordered, EFAULT will cause the event to be lost. :(

Agree but apparently rare case .. see also the below notes.

> 
> Maybe the event should be re-added on error? Tricky.

What will happen if another copy_to_user may then fail again (loop ?) 
... not sure that we want to get into this tricky handling ...

As of above, It follows the logic from uverbs at that area.
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_main.c#L267

> 
>> +	if (!omit_data)
>> +		kfree(event);
>> +	return ret;
>>   }
>>   
>>   static __poll_t devx_async_event_poll(struct file *filp,
>>   				      struct poll_table_struct *wait)
>>   {
>> -	return 0;
>> +	struct devx_async_event_file *ev_file = filp->private_data;
>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>> +	__poll_t pollflags = 0;
>> +
>> +	poll_wait(filp, &ev_queue->poll_wait, wait);
>> +
>> +	spin_lock_irq(&ev_queue->lock);
>> +	if (ev_queue->is_destroyed)
>> +		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
>> +	else if (!list_empty(&ev_queue->event_list))
>> +		pollflags = EPOLLIN | EPOLLRDNORM;
>> +	spin_unlock_irq(&ev_queue->lock);
>> +
>> +	return pollflags;
>>   }
>>   
>>   static int devx_async_event_close(struct inode *inode, struct file *filp)
>>   {
>> +	struct ib_uobject *uobj = filp->private_data;
>> +	struct devx_async_event_file *ev_file =
>> +		container_of(uobj, struct devx_async_event_file, uobj);
>> +	struct devx_event_subscription *event_sub, *event_sub_tmp;
>> +	struct devx_async_event_data *entry, *tmp;
>> +
>> +	mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
>> +	/* delete the subscriptions which are related to this FD */
>> +	list_for_each_entry_safe(event_sub, event_sub_tmp,
>> +				 &ev_file->subscribed_events_list, file_list)
>> +		devx_cleanup_subscription(ev_file->dev, event_sub, true);
>> +	mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
>> +
>> +	/* free the pending events allocation */
>> +	if (!(ev_file->ev_queue.flags &
>> +	    MLX5_IB_UAPI_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA)) {
>> +		spin_lock_irq(&ev_file->ev_queue.lock);
>> +		list_for_each_entry_safe(entry, tmp,
>> +					 &ev_file->ev_queue.event_list, list)
>> +			kfree(entry); /* read can't come any nore */
> 
> spelling

OK

> 
>> +		spin_unlock_irq(&ev_file->ev_queue.lock);
>> +	}
>>   	uverbs_close_fd(filp);
>> +	put_device(&ev_file->dev->ib_dev.dev);
>>   	return 0;
>>   }
>>   
>> @@ -2374,6 +2705,17 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
>>   static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
>>   					    enum rdma_remove_reason why)
>>   {
>> +	struct devx_async_event_file *ev_file =
>> +		container_of(uobj, struct devx_async_event_file,
>> +			     uobj);
>> +	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
>> +
>> +	spin_lock_irq(&ev_queue->lock);
>> +	ev_queue->is_destroyed = 1;
>> +	spin_unlock_irq(&ev_queue->lock);
>> +
>> +	if (why == RDMA_REMOVE_DRIVER_REMOVE)
>> +		wake_up_interruptible(&ev_queue->poll_wait);
> 
> Why isn't this wakeup always done?

Maybe you are right and this can be always done to wake up any readers 
as the 'is_destroyed' was set.

By the way, any idea why it was done as such in uverbs [1] for similar 
flow ? also the command events follows that.


[1]
https://elixir.bootlin.com/linux/latest/source/drivers/infiniband/core/uverbs_std_types.c#L207

  reply index

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-18 17:15 [PATCH rdma-next v1 00/12] DEVX asynchronous events Leon Romanovsky
2019-06-18 17:15 ` [PATCH mlx5-next v1 01/12] net/mlx5: Fix mlx5_core_destroy_cq() error flow Leon Romanovsky
2019-06-18 17:15 ` [PATCH mlx5-next v1 02/12] net/mlx5: Use event mask based on device capabilities Leon Romanovsky
2019-06-27  0:23   ` Saeed Mahameed
2019-06-18 17:15 ` [PATCH mlx5-next v1 03/12] net/mlx5: Expose the API to register for ANY event Leon Romanovsky
2019-06-18 17:15 ` [PATCH mlx5-next v1 04/12] net/mlx5: mlx5_core_create_cq() enhancements Leon Romanovsky
2019-06-18 17:15 ` [PATCH mlx5-next v1 05/12] net/mlx5: Report a CQ error event only when a handler was set Leon Romanovsky
2019-06-18 17:15 ` [PATCH mlx5-next v1 06/12] net/mlx5: Report EQE data upon CQ completion Leon Romanovsky
2019-06-18 17:15 ` [PATCH mlx5-next v1 07/12] net/mlx5: Expose device definitions for object events Leon Romanovsky
2019-06-18 17:15 ` [PATCH rdma-next v1 08/12] IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD Leon Romanovsky
2019-06-24 11:51   ` Jason Gunthorpe
2019-06-24 13:25     ` Yishai Hadas
2019-06-24 14:30       ` Jason Gunthorpe
2019-06-18 17:15 ` [PATCH rdma-next v1 09/12] IB/mlx5: Register DEVX with mlx5_core to get async events Leon Romanovsky
2019-06-24 11:52   ` Jason Gunthorpe
2019-06-24 13:36     ` Yishai Hadas
2019-06-24 14:30       ` Jason Gunthorpe
2019-06-18 17:15 ` [PATCH rdma-next v1 10/12] IB/mlx5: Enable subscription for device events over DEVX Leon Romanovsky
2019-06-24 11:57   ` Jason Gunthorpe
2019-06-24 16:13     ` Yishai Hadas
2019-06-24 17:56       ` Jason Gunthorpe
2019-06-18 17:15 ` [PATCH rdma-next v1 11/12] IB/mlx5: Implement DEVX dispatching event Leon Romanovsky
2019-06-24 12:03   ` Jason Gunthorpe
2019-06-24 16:55     ` Yishai Hadas [this message]
2019-06-24 18:06       ` Jason Gunthorpe
2019-06-25 14:41         ` Yishai Hadas
2019-06-25 20:23           ` Jason Gunthorpe
2019-06-18 17:15 ` [PATCH rdma-next v1 12/12] IB/mlx5: Add DEVX support for CQ events Leon Romanovsky
2019-06-24 12:04   ` Jason Gunthorpe
2019-06-24 17:03     ` Yishai Hadas
2019-06-24 18:06       ` Jason Gunthorpe
2019-06-18 18:51 ` [PATCH rdma-next v1 00/12] DEVX asynchronous events Saeed Mahameed
2019-06-19  4:45   ` Leon Romanovsky
2019-06-24 21:57     ` Saeed Mahameed
2019-06-30  8:53       ` Leon Romanovsky

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3a2e53f8-e7dd-3e01-c7c7-99d41f711d87@dev.mellanox.co.il \
    --to=yishaih@dev.mellanox.co.il \
    --cc=dledford@redhat.com \
    --cc=jgg@mellanox.com \
    --cc=leon@kernel.org \
    --cc=leonro@mellanox.com \
    --cc=linux-rdma@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=saeedm@mellanox.com \
    --cc=yishaih@mellanox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Netdev Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/netdev/0 netdev/git/0.git
	git clone --mirror https://lore.kernel.org/netdev/1 netdev/git/1.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 netdev netdev/ https://lore.kernel.org/netdev \
		netdev@vger.kernel.org netdev@archiver.kernel.org
	public-inbox-index netdev


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.netdev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox