* [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
@ 2013-04-09  9:39 Asias He
  2013-04-11 10:47 ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Asias He @ 2013-04-09  9:39 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: kvm, Michael S. Tsirkin, virtualization, target-devel,
	Stefan Hajnoczi, Paolo Bonzini

This patch makes vhost_scsi_flush() wait for all the pending requests
issued before the flush operation to be finished.

Changes in v3:
- Rebase
- Drop 'tcm_vhost: Wait for pending requests in
  vhost_scsi_clear_endpoint()' in this series, we already did that in
  'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'

Changes in v2:
- Increase/Decrease inflight requests in
  vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt

Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/vhost/tcm_vhost.c | 88 ++++++++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/tcm_vhost.h |  4 +++
 2 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 1f9116c..719ce13 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -91,6 +91,15 @@ struct vhost_scsi {
 	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
 	bool vs_events_dropped; /* any missed events */
 	int vs_events_nr; /* num of pending events */
+
+	/*
+	 * vs_inflight[0]/[1] are used to track requests issued
+	 * before/during the flush operation
+	 */
+	u64 vs_inflight[2];
+	wait_queue_head_t vs_flush_wait; /* wait queue for flush operation */
+	spinlock_t vs_flush_lock; /* lock to protect vs_during_flush */
+	int vs_during_flush; /* flag to indicate if we are in flush operation */
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -108,6 +117,46 @@ static int iov_num_pages(struct iovec *iov)
 	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static int tcm_vhost_inc_inflight(struct vhost_scsi *vs)
+{
+	int during_flush;
+
+	spin_lock(&vs->vs_flush_lock);
+	during_flush = vs->vs_during_flush;
+	vs->vs_inflight[during_flush]++;
+	spin_unlock(&vs->vs_flush_lock);
+
+	return during_flush;
+}
+
+static void tcm_vhost_dec_inflight(struct vhost_scsi *vs, int during_flush)
+{
+	u64 inflight;
+
+	spin_lock(&vs->vs_flush_lock);
+	inflight = --vs->vs_inflight[during_flush];
+	/*
+	 * Wake up the waiter when all the requests issued before the flush
+	 * operation have finished and a flush operation is in progress.
+	 */
+	if (!inflight && !during_flush && vs->vs_during_flush)
+		wake_up(&vs->vs_flush_wait);
+	spin_unlock(&vs->vs_flush_lock);
+}
+
+static bool tcm_vhost_done_inflight(struct vhost_scsi *vs)
+{
+	bool ret = false;
+
+	/* Have the requests issued before the flush operation finished? */
+	spin_lock(&vs->vs_flush_lock);
+	if (!vs->vs_inflight[0])
+		ret = true;
+	spin_unlock(&vs->vs_flush_lock);
+
+	return ret;
+}
+
 static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
 {
 	bool ret = false;
@@ -402,6 +451,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
 static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 {
 	mutex_lock(&vs->vs_events_lock);
+	tcm_vhost_dec_inflight(vs, evt->during_flush);
 	vs->vs_events_nr--;
 	kfree(evt);
 	mutex_unlock(&vs->vs_events_lock);
@@ -423,6 +473,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 	if (evt) {
 		evt->event.event = event;
 		evt->event.reason = reason;
+		evt->during_flush = tcm_vhost_inc_inflight(vs);
 		vs->vs_events_nr++;
 	}
 	mutex_unlock(&vs->vs_events_lock);
@@ -433,6 +484,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 {
 	struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
+	struct vhost_scsi *vs = tv_cmd->tvc_vhost;
 
 	/* TODO locking against target/backend threads? */
 	transport_generic_free_cmd(se_cmd, 1);
@@ -445,13 +497,16 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 		kfree(tv_cmd->tvc_sgl);
 	}
 
+	tcm_vhost_dec_inflight(vs, tv_cmd->during_flush);
+
 	kfree(tv_cmd);
 }
 
 static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
-	struct virtio_scsi_event *event)
+	struct tcm_vhost_evt *evt)
 {
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
 	unsigned out, in;
 	int head, ret;
@@ -511,7 +566,7 @@ static void tcm_vhost_evt_work(struct vhost_work *work)
 	while (llnode) {
 		evt = llist_entry(llnode, struct tcm_vhost_evt, list);
 		llnode = llist_next(llnode);
-		tcm_vhost_do_evt_work(vs, &evt->event);
+		tcm_vhost_do_evt_work(vs, evt);
 		tcm_vhost_free_evt(vs, evt);
 	}
 }
@@ -529,8 +584,8 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 	struct virtio_scsi_cmd_resp v_rsp;
 	struct tcm_vhost_cmd *tv_cmd;
 	struct llist_node *llnode;
-	struct se_cmd *se_cmd;
 	int ret, vq;
+	struct se_cmd *se_cmd;
 
 	bitmap_zero(signal, VHOST_SCSI_MAX_VQ);
 	llnode = llist_del_all(&vs->vs_completion_list);
@@ -568,6 +623,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 }
 
 static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
+	struct vhost_scsi *vs,
 	struct tcm_vhost_tpg *tv_tpg,
 	struct virtio_scsi_cmd_req *v_req,
 	u32 exp_data_len,
@@ -592,6 +648,8 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_exp_data_len = exp_data_len;
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
+	tv_cmd->tvc_vhost = vs;
+	tv_cmd->during_flush = tcm_vhost_inc_inflight(vs);
 
 	return tv_cmd;
 }
@@ -842,7 +900,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		for (i = 0; i < data_num; i++)
 			exp_data_len += vq->iov[data_first + i].iov_len;
 
-		tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req,
+		tv_cmd = vhost_scsi_allocate_cmd(vs, tv_tpg, &v_req,
 					exp_data_len, data_direction);
 		if (IS_ERR(tv_cmd)) {
 			vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -852,7 +910,6 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		pr_debug("Allocated tv_cmd: %p exp_data_len: %d, data_direction"
 			": %d\n", tv_cmd, exp_data_len, data_direction);
 
-		tv_cmd->tvc_vhost = vs;
 		tv_cmd->tvc_vq = vq;
 
 		if (unlikely(vq->iov[out].iov_len !=
@@ -905,6 +962,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		 * tcm_vhost_queue_data_in() and tcm_vhost_queue_status()
 		 */
 		tv_cmd->tvc_vq_desc = head;
+
 		/*
 		 * Dispatch tv_cmd descriptor for cmwq execution in process
 		 * context provided by tcm_vhost_workqueue.  This also ensures
@@ -984,9 +1042,23 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
 	int i;
 
+	/* Flush operation is started */
+	spin_lock(&vs->vs_flush_lock);
+	vs->vs_during_flush = 1;
+	spin_unlock(&vs->vs_flush_lock);
+
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
 	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
+	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+	/* Wait until all requests issued before the flush have finished */
+	wait_event(vs->vs_flush_wait, tcm_vhost_done_inflight(vs));
+
+	/* Flush operation is finished */
+	spin_lock(&vs->vs_flush_lock);
+	vs->vs_during_flush = 0;
+	spin_unlock(&vs->vs_flush_lock);
 }
 
 /*
@@ -1094,6 +1166,7 @@ static int vhost_scsi_clear_endpoint(
 	u8 target;
 
 	mutex_lock(&vs->dev.mutex);
+
 	/* Verify that ring has been setup correctly. */
 	for (index = 0; index < vs->dev.nvqs; ++index) {
 		if (!vhost_vq_access_ok(&vs->vqs[index])) {
@@ -1195,6 +1268,11 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	s->vs_events_dropped = false;
 	mutex_init(&s->vs_events_lock);
 
+	s->vs_inflight[0] = 0;
+	s->vs_inflight[1] = 0;
+	spin_lock_init(&s->vs_flush_lock);
+	init_waitqueue_head(&s->vs_flush_wait);
+
 	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
 	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
 	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
index 94e9ee53..dd84622 100644
--- a/drivers/vhost/tcm_vhost.h
+++ b/drivers/vhost/tcm_vhost.h
@@ -37,6 +37,8 @@ struct tcm_vhost_cmd {
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
+	/* Indicates whether this command was issued during a flush operation */
+	int during_flush;
 };
 
 struct tcm_vhost_nexus {
@@ -91,6 +93,8 @@ struct tcm_vhost_evt {
 	struct virtio_scsi_event event;
 	/* virtio_scsi event list, serviced from vhost worker thread */
 	struct llist_node list;
+	/* Indicates whether this event was issued during a flush operation */
+	int during_flush;
 };
 
 /*
-- 
1.8.1.4
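
To make the two-bucket accounting in the patch concrete, here is a
minimal user-space model of it (a sketch only: a pthread mutex and
condition variable stand in for the kernel spinlock and wait queue, and
the helper names merely mirror the patch, they are not the driver's
API):

   #include <pthread.h>
   #include <stdint.h>
   #include <stdio.h>

   static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;
   static pthread_cond_t flush_wait = PTHREAD_COND_INITIALIZER;
   static uint64_t inflight[2];	/* [0]: before flush, [1]: during flush */
   static int during_flush;

   static int inc_inflight(void)
   {
   	int bucket;

   	pthread_mutex_lock(&flush_lock);
   	bucket = during_flush;	/* charge the request to the current bucket */
   	inflight[bucket]++;
   	pthread_mutex_unlock(&flush_lock);
   	return bucket;		/* the cmd/evt remembers its bucket */
   }

   static void dec_inflight(int bucket)
   {
   	pthread_mutex_lock(&flush_lock);
   	/* wake the flusher once the pre-flush bucket drains */
   	if (!--inflight[bucket] && !bucket && during_flush)
   		pthread_cond_signal(&flush_wait);
   	pthread_mutex_unlock(&flush_lock);
   }

   static void flush(void)
   {
   	pthread_mutex_lock(&flush_lock);
   	during_flush = 1;	/* new requests now charge bucket 1 */
   	while (inflight[0])	/* the wait_event() equivalent */
   		pthread_cond_wait(&flush_wait, &flush_lock);
   	during_flush = 0;	/* later requests charge bucket 0 again */
   	pthread_mutex_unlock(&flush_lock);
   }

   int main(void)
   {
   	int b = inc_inflight();	/* a request issued before any flush */

   	dec_inflight(b);	/* ...and already completed */
   	flush();		/* bucket 0 is empty, returns at once */
   	printf("flush completed\n");
   	return 0;
   }

Each request decrements whichever bucket it was charged to at
allocation time, so completions racing with a flush stay accounted
correctly; the flusher waits only for bucket 0 to drain. This is the
"array of two" scheme Michael objects to below.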

* Re: [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-09  9:39 [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
@ 2013-04-11 10:47 ` Michael S. Tsirkin
  2013-04-12  6:25   ` Asias He
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-11 10:47 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Tue, Apr 09, 2013 at 05:39:43PM +0800, Asias He wrote:
> This patch makes vhost_scsi_flush() wait for all the pending requests
> issued before the flush operation to be finished.
> 
> Changes in v3:
> - Rebase
> - Drop 'tcm_vhost: Wait for pending requests in
>   vhost_scsi_clear_endpoint()' in this series, we already did that in
>   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> 
> Changes in v2:
> - Increase/Decrease inflight requests in
>   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> 
> Signed-off-by: Asias He <asias@redhat.com>

Nack, let's not do this home-grown here.  Please use a kref.

The array of two trick is also too tricky for my taste.

Please replace during_flush in tcm_vhost_cmd and tcm_vhost_evt
by a kref pointer, allocate a new kref when you flush.

Access can be done with RCU so we won't need any locks.

> [...]

* Re: [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-11 10:47 ` Michael S. Tsirkin
@ 2013-04-12  6:25   ` Asias He
  2013-04-12 11:33     ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Asias He @ 2013-04-12  6:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Thu, Apr 11, 2013 at 01:47:21PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 09, 2013 at 05:39:43PM +0800, Asias He wrote:
> > This patch makes vhost_scsi_flush() wait for all the pending requests
> > issued before the flush operation to be finished.
> > 
> > Changes in v3:
> > - Rebase
> > - Drop 'tcm_vhost: Wait for pending requests in
> >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > 
> > Changes in v2:
> > - Increase/Decrease inflight requests in
> >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > 
> > Signed-off-by: Asias He <asias@redhat.com>
> 
> Nack, let's not do this home-grown here.  Please use a kref.
> 
> The array of two trick is also too tricky for my taste.
> 
> Please replace during_flush in tcm_vhost_cmd and tcm_vhost_evt
> by a kref pointer, allocate a new kref when you flush.
> 
> Access can be done with RCU so we won't need any locks.

I do not think a kref helps, nor is this the right place to use one.
Also, a kref pointer in tcm_vhost_cmd and tcm_vhost_evt is not enough;
you need a wait queue as well.

Do you mean something like this:

   struct vhost_scsi_inflight {
   	struct kref kref;
   	wait_queue_head_t wait;
   }
   
   vhost_scsi_allocate_cmd()
   	rcu_read_lock()
   	tv_cmd->inflight = rcu_dereference(vs->vs_inflight)
   	kref_get(&tv_cmd->inflight->kref)
   	rcu_read_unlock()
   
   vhost_scsi_free_cmd()
   	kref_put(&tv_cmd->inflight->kref, my_release)
   
   my_release()
   	wake_up(&inflight->wait)
   
   vhost_scsi_flush()
   	old_inflight = vs->vs_inflight;
   	new_inflight = kmalloc(sizeof(*new_inflight), ...)
   	rcu_assign_pointer(vs->vs_inflight, new_inflight);
   	wait_event(old_inflight->wait, atomic_read(&old_inflight->kref.refcount) == 0)
   	synchronize_rcu();
   	free(old_inflight)

1) The kref needs to be accessed in the free cmd/evt functions; you
cannot use RCU to protect it.

2) No need to use synchronize_rcu to wait for the readers of
vs->vs_inflight to finish. We need to wait on the wait queue anyway,
and at that point we are safe to free the old_inflight.

3) The kref is not used in a standard way. We are refcounting the evt
and cmd, not the vhost_scsi_inflight. A single atomic counter is
enough.

Though, I do not like the array trick either. I can change it to
allocate a vhost_scsi_inflight when we flush.

> > [...]

-- 
Asias

* Re: [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-12  6:25   ` Asias He
@ 2013-04-12 11:33     ` Michael S. Tsirkin
  2013-04-12 14:59         ` Asias He
                         ` (4 more replies)
  0 siblings, 5 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-12 11:33 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Fri, Apr 12, 2013 at 02:25:23PM +0800, Asias He wrote:
> On Thu, Apr 11, 2013 at 01:47:21PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 09, 2013 at 05:39:43PM +0800, Asias He wrote:
> > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > issued before the flush operation to be finished.
> > > 
> > > Changes in v3:
> > > - Rebase
> > > - Drop 'tcm_vhost: Wait for pending requests in
> > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > 
> > > Changes in v2:
> > > - Increase/Decrease inflight requests in
> > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > 
> > > Signed-off-by: Asias He <asias@redhat.com>
> > 
> > Nack, let's not do this home-grown here.  Please use a kref.
> > 
> > The array of two trick is also too tricky for my taste.
> > 
> > Please replace during_flush in tcm_vhost_cmd and tcm_vhost_evt
> > by a kref pointer, allocate a new kref when you flush.
> > 
> > Access can be done with RCU so we won't need any locks.
> 
> I do not think a kref helps, nor is this the right place to use one.
> Also, a kref pointer in tcm_vhost_cmd and tcm_vhost_evt is not enough;
> you need a wait queue as well.
> 
> Do you mean something like this:
> 
>    struct vhost_scsi_inflight {
>    	struct kref kref;
>    	wait_queue_head_t wait;
>    }
>    
>    vhost_scsi_allocate_cmd()
>    	rcu_read_lock()
>    	tv_cmd->inflight = rcu_dereference(vs->vs_inflight)
>    	kref_get(&tv_cmd->inflight->kref)
>    	rcu_read_unlock()
>    
>    vhost_scsi_free_cmd()
>    	kref_put(&tv_cmd->inflight->kref, my_release)
>    
>    my_release()
>    	wake_up(&inflight->wait)
>    
>    vhost_scsi_flush()
>    	old_inflight = vs->vs_inflight;
>    	new_inflight = kmalloc(sizeof(*new_inflight), ...)
>    	rcu_assign_pointer(vs->vs_inflight, new_inflight);
>    	wait_event(old_inflight->wait, atomic_read(&old_inflight->kref.refcount) == 0)
>    	synchronize_rcu();
>    	free(old_inflight)
> 
> 1) The kref needs to be accessed in the free cmd/evt functions; you
> cannot use RCU to protect it.

No, it's the vs_inflight pointer that is protected by RCU.
But if you prefer, we can make it per-vq and protect it
with the vq mutex.


> 2) No need to use synchronize_rcu to wait for the readers of
> vs->vs_inflight to finish. We need to wait on the wait queue anyway,
> and at that point we are safe to free the old_inflight.

RCU is to keep an old vhost_scsi_allocate_cmd from using
the old pointer. But we can use a vq flush instead; that's
often done in vhost.

> 3) The kref is not used in a standard way. We are refcounting the evt
> and cmd, not the vhost_scsi_inflight. A single atomic counter is
> enough.

Looks standard to me.

> Though, I do not like the array trick either. I can change it to
> allocate a vhost_scsi_inflight when we flush.

That's better but homegrown refcounting is better avoided too.
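
(Concretely, the "vq flush instead" idea amounts to something like the
sketch below; it assumes every user of the old inflight pointer runs
from vhost work, which the next message disputes for events:)

   /* hypothetical sketch: work flushes standing in for synchronize_rcu() */
   int i;

   for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
   	vhost_scsi_flush_vq(vs, i);
   vhost_work_flush(&vs->dev, &vs->vs_completion_work);
   vhost_work_flush(&vs->dev, &vs->vs_event_work);
   /* no vhost work can still hold the old inflight pointer here */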

> > > [...]

* Re: [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-12 11:33     ` Michael S. Tsirkin
@ 2013-04-12 14:59         ` Asias He
  2013-04-13  3:29       ` [PATCH v4 0/2] tcm_vhost flush Asias He
                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-12 14:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Fri, Apr 12, 2013 at 02:33:32PM +0300, Michael S. Tsirkin wrote:
> On Fri, Apr 12, 2013 at 02:25:23PM +0800, Asias He wrote:
> > On Thu, Apr 11, 2013 at 01:47:21PM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 09, 2013 at 05:39:43PM +0800, Asias He wrote:
> > > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > > issued before the flush operation to be finished.
> > > > 
> > > > Changes in v3:
> > > > - Rebase
> > > > - Drop 'tcm_vhost: Wait for pending requests in
> > > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > > 
> > > > Changes in v2:
> > > > - Increase/Decrease inflight requests in
> > > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > > 
> > > > Signed-off-by: Asias He <asias@redhat.com>
> > > 
> > > Nack, let's not do this home-grown here.  Please use a kref.
> > > 
> > > The array of two trick is also too tricky for my taste.
> > > 
> > > Please replace during_flush in tcm_vhost_cmd and tcm_vhost_evt
> > > by a kref pointer, allocate a new kref when you flush.
> > > 
> > > Access can be done with RCU so we won't need any locks.
> > 
> > I do not think a kref helps, nor is this the right place to use one.
> > Also, a kref pointer in tcm_vhost_cmd and tcm_vhost_evt is not enough;
> > you need a wait queue as well.
> > 
> > Do you mean something like this:
> > 
> >    struct vhost_scsi_inflight {
> >    	struct kref kref;
> >    	wait_queue_head_t wait;
> >    }
> >    
> >    vhost_scsi_allocate_cmd()
> >    	rcu_read_lock()
> >    	tv_cmd->inflight = rcu_dereference(vs->vs_inflight)
> >    	kref_get(&tv_cmd->inflight->kref)
> >    	rcu_read_unlock()
> >    
> >    vhost_scsi_free_cmd()
> >    	kref_put(&tv_cmd->inflight->kref, my_release)
> >    
> >    my_release()
> >    	wake_up(&inflight->wait)
> >    
> >    vhost_scsi_flush()
> >    	old_inflight = vs->vs_inflight;
> >    	new_inflight = kmalloc(sizeof(*new_inflight), ...)
> >    	rcu_assign_pointer(vs->vs_inflight, new_inflight);
> >    	wait_event(old_inflight->wait, atomic_read(&old_inflight->kref.refcount) == 0)
> >    	synchronize_rcu();
> >    	free(old_inflight)
> > 
> > 1) The kref needs to be accessed in the free cmd/evt functions; you
> > cannot use RCU to protect it.
> 
> No, it's the vs_inflight pointer that is protected by RCU.
> But if you prefer, we can make it per-vq and protect it
> with the vq mutex.

No, for events, it can be allocated outside the vhost thread. And
vs_inflight is not per-queue data, so why make it per-queue?

> 
> > 2) No need to use synchronize_rcu to wait for the readers of
> > vs->vs_inflight to finish. We need to wait on the wait queue anyway,
> > and at that point we are safe to free the old_inflight.
> 
> RCU is to keep an old vhost_scsi_allocate_cmd from using
> the old pointer. But we can use a vq flush instead; that's
> often done in vhost.

> > 3) The kref is not used in a standard way. We are refcounting the evt
> > and cmd, not the vhost_scsi_inflight. A single atomic counter is
> > enough.
> 
> Looks standard to me.

Strange ...

> > Though, I do not like the array trick either. I can change it to
> > allocate a vhost_scsi_inflight when we flush.
> 
> That's better but homegrown refcounting is better avoided too.

I had a version that dropped the array.

From e542981a69b1088c7a170bf8e9c6e9d4df897ca4 Mon Sep 17 00:00:00 2001
From: Asias He <asias@redhat.com>
Date: Mon, 11 Mar 2013 10:57:32 +0800
Subject: [PATCH] tcm_vhost: Wait for pending requests in
 vhost_scsi_flush()

This patch makes vhost_scsi_flush() wait for all the pending requests
issued before the flush operation to be finished.

Changes in v4:
- Introduce vhost_scsi_inflight
- Drop array to track flush

Changes in v3:
- Rebase
- Drop 'tcm_vhost: Wait for pending requests in
  vhost_scsi_clear_endpoint()' in this series, we already did that in
  'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'

Changes in v2:
- Increase/Decrease inflight requests in
  vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt

Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/vhost/tcm_vhost.c | 81 ++++++++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/tcm_vhost.h |  3 ++
 2 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index c425605..40e2809 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -74,6 +74,11 @@ enum {
 #define VHOST_SCSI_MAX_VQ	128
 #define VHOST_SCSI_MAX_EVENT	128
 
+struct vhost_scsi_inflight {
+	atomic_t count;
+	wait_queue_head_t wait;
+};
+
 struct vhost_scsi {
 	/* Protected by vhost_scsi->dev.mutex */
 	struct tcm_vhost_tpg **vs_tpg;
@@ -91,6 +96,7 @@ struct vhost_scsi {
 	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
 	bool vs_events_dropped; /* any missed events */
 	int vs_events_nr; /* num of pending events */
+	struct vhost_scsi_inflight *vs_inflight;
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -108,6 +114,51 @@ static int iov_num_pages(struct iovec *iov)
 	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static struct vhost_scsi_inflight *tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
+	if (!inflight) {
+		/* Otherwise, we get a double free of the previous inflight */
+		vs->vs_inflight = NULL;
+		return NULL;
+	}
+	atomic_set(&inflight->count, 0);
+	init_waitqueue_head(&inflight->wait);
+	vs->vs_inflight = inflight;
+
+	return inflight;
+}
+
+static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
+{
+	/*
+	 * Wake up the waiter when all the requests tracked by this
+	 * inflight counter have finished.
+	 */
+	if (inflight && !atomic_dec_return(&inflight->count))
+		wake_up(&inflight->wait);
+}
+
+static struct vhost_scsi_inflight *tcm_vhost_inc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight = ACCESS_ONCE(vs->vs_inflight);
+	/* FIXME: possible race window here: if inflight picks up the old
+	 * value before _flush sets the new one, and the wait_event() runs
+	 * before we call atomic_inc(), we may free old_inflight while
+	 * there is still one request in flight */
+	if (inflight)
+		atomic_inc(&inflight->count);
+
+	return inflight;
+}
+
+static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
+{
+	return atomic_read(&inflight->count) == 0;
+}
+
 static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
 {
 	bool ret = false;
@@ -402,6 +453,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
 static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 {
 	mutex_lock(&vs->vs_events_lock);
+	tcm_vhost_dec_inflight(evt->inflight);
 	vs->vs_events_nr--;
 	kfree(evt);
 	mutex_unlock(&vs->vs_events_lock);
@@ -423,6 +475,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 	if (evt) {
 		evt->event.event = event;
 		evt->event.reason = reason;
+		evt->inflight = tcm_vhost_inc_inflight(vs);
 		vs->vs_events_nr++;
 	}
 	mutex_unlock(&vs->vs_events_lock);
@@ -445,13 +498,16 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 		kfree(tv_cmd->tvc_sgl);
 	}
 
+	tcm_vhost_dec_inflight(tv_cmd->inflight);
+
 	kfree(tv_cmd);
 }
 
 static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
-	struct virtio_scsi_event *event)
+	struct tcm_vhost_evt *evt)
 {
 	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
 	unsigned out, in;
 	int head, ret;
@@ -511,7 +567,7 @@ static void tcm_vhost_evt_work(struct vhost_work *work)
 	while (llnode) {
 		evt = llist_entry(llnode, struct tcm_vhost_evt, list);
 		llnode = llist_next(llnode);
-		tcm_vhost_do_evt_work(vs, &evt->event);
+		tcm_vhost_do_evt_work(vs, evt);
 		tcm_vhost_free_evt(vs, evt);
 	}
 }
@@ -568,6 +624,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 }
 
 static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
+	struct vhost_scsi *vs,
 	struct tcm_vhost_tpg *tv_tpg,
 	struct virtio_scsi_cmd_req *v_req,
 	u32 exp_data_len,
@@ -592,6 +649,8 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_exp_data_len = exp_data_len;
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
+	tv_cmd->tvc_vhost = vs;
+	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
 
 	return tv_cmd;
 }
@@ -847,7 +906,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		for (i = 0; i < data_num; i++)
 			exp_data_len += vq->iov[data_first + i].iov_len;
 
-		tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req,
+		tv_cmd = vhost_scsi_allocate_cmd(vs, tv_tpg, &v_req,
 					exp_data_len, data_direction);
 		if (IS_ERR(tv_cmd)) {
 			vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -857,7 +916,6 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		pr_debug("Allocated tv_cmd: %p exp_data_len: %d, data_direction"
 			": %d\n", tv_cmd, exp_data_len, data_direction);
 
-		tv_cmd->tvc_vhost = vs;
 		tv_cmd->tvc_vq = vq;
 		tv_cmd->tvc_resp = vq->iov[out].iov_base;
 
@@ -981,10 +1039,21 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
 static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
 	int i;
+	struct vhost_scsi_inflight *old_inflight;
+
+	old_inflight = ACCESS_ONCE(vs->vs_inflight);
+	if (!tcm_vhost_alloc_inflight(vs))
+		return;
 
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
 	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
+	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+	/* Wait until all requests issued before the flush have finished */
+	wait_event(old_inflight->wait, tcm_vhost_done_inflight(old_inflight));
+
+	kfree(old_inflight);
 }
 
 /*
@@ -1193,6 +1262,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	s->vs_events_dropped = false;
 	mutex_init(&s->vs_events_lock);
 
+	if (!tcm_vhost_alloc_inflight(s))
+		return -ENOMEM;
+
 	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
 	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
 	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
@@ -1218,6 +1290,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
 	vhost_scsi_clear_endpoint(s, &t);
 	vhost_dev_stop(&s->dev);
 	vhost_dev_cleanup(&s->dev, false);
+	kfree(s->vs_inflight);
 	kfree(s);
 	return 0;
 }
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
index 94e9ee53..c36ef5f 100644
--- a/drivers/vhost/tcm_vhost.h
+++ b/drivers/vhost/tcm_vhost.h
@@ -2,6 +2,7 @@
 #define TCM_VHOST_NAMELEN 256
 #define TCM_VHOST_MAX_CDB_SIZE 32
 
+struct vhost_scsi_inflight;
 struct tcm_vhost_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
 	int tvc_vq_desc;
@@ -37,6 +38,7 @@ struct tcm_vhost_cmd {
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
+	struct vhost_scsi_inflight *inflight;
 };
 
 struct tcm_vhost_nexus {
@@ -91,6 +93,7 @@ struct tcm_vhost_evt {
 	struct virtio_scsi_event event;
 	/* virtio_scsi event list, serviced from vhost worker thread */
 	struct llist_node list;
+	struct vhost_scsi_inflight *inflight;
 };
 
 /*
-- 
1.8.1.4
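
(One way to close the FIXME window above is the rcu_read_lock() /
synchronize_rcu() flow already sketched earlier in the thread; a
hypothetical variant, assuming tcm_vhost_alloc_inflight() publishes the
new pointer with rcu_assign_pointer(), could look like this:)

   static struct vhost_scsi_inflight *tcm_vhost_inc_inflight(struct vhost_scsi *vs)
   {
   	struct vhost_scsi_inflight *inflight;

   	rcu_read_lock();
   	inflight = rcu_dereference(vs->vs_inflight);
   	/* the count is taken while the pointer is guaranteed live */
   	atomic_inc(&inflight->count);
   	rcu_read_unlock();

   	return inflight;
   }

   static void vhost_scsi_flush(struct vhost_scsi *vs)
   {
   	struct vhost_scsi_inflight *old_inflight = vs->vs_inflight;

   	if (!tcm_vhost_alloc_inflight(vs))
   		return;
   	/* ... the vq and work flushes above go here ... */
   	synchronize_rcu();	/* readers of old_inflight are done, so */
   				/* its count can only fall from here on */
   	wait_event(old_inflight->wait, tcm_vhost_done_inflight(old_inflight));
   	kfree(old_inflight);
   }

With the increment inside the read-side critical section, a flusher
that has returned from synchronize_rcu() can no longer free
old_inflight underneath a late atomic_inc().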

-- 
Asias

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v4 0/2] tcm_vhost flush
  2013-04-12 11:33     ` Michael S. Tsirkin
  2013-04-12 14:59         ` Asias He
@ 2013-04-13  3:29       ` Asias He
  2013-04-16  9:16         ` [PATCH v5 " Asias He
                           ` (4 more replies)
  2013-04-13  3:29       ` [PATCH v4 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd Asias He
                         ` (2 subsequent siblings)
  4 siblings, 5 replies; 28+ messages in thread
From: Asias He @ 2013-04-13  3:29 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: kvm, Michael S. Tsirkin, virtualization, target-devel,
	Stefan Hajnoczi, Paolo Bonzini

Asias He (2):
  tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd
  tcm_vhost: Wait for pending requests in vhost_scsi_flush()

 drivers/vhost/tcm_vhost.c | 78 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/tcm_vhost.h |  5 +++
 2 files changed, 81 insertions(+), 2 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v4 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd
  2013-04-12 11:33     ` Michael S. Tsirkin
  2013-04-12 14:59         ` Asias He
  2013-04-13  3:29       ` [PATCH v4 0/2] tcm_vhost flush Asias He
@ 2013-04-13  3:29       ` Asias He
  2013-04-13  3:29       ` [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
  2013-04-13  3:29       ` Asias He
  4 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-13  3:29 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: kvm, Michael S. Tsirkin, virtualization, target-devel,
	Stefan Hajnoczi, Paolo Bonzini

It is needed in the next patch.

Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/vhost/tcm_vhost.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index aa457d2..e09f0fe 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -569,6 +569,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 }
 
 static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
+	struct vhost_scsi *vs,
 	struct tcm_vhost_tpg *tv_tpg,
 	struct virtio_scsi_cmd_req *v_req,
 	u32 exp_data_len,
@@ -593,6 +594,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_exp_data_len = exp_data_len;
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
+	tv_cmd->tvc_vhost = vs;
 
 	return tv_cmd;
 }
@@ -848,7 +850,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		for (i = 0; i < data_num; i++)
 			exp_data_len += vq->iov[data_first + i].iov_len;
 
-		tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req,
+		tv_cmd = vhost_scsi_allocate_cmd(vs, tv_tpg, &v_req,
 					exp_data_len, data_direction);
 		if (IS_ERR(tv_cmd)) {
 			vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -858,7 +860,6 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		pr_debug("Allocated tv_cmd: %p exp_data_len: %d, data_direction"
 			": %d\n", tv_cmd, exp_data_len, data_direction);
 
-		tv_cmd->tvc_vhost = vs;
 		tv_cmd->tvc_vq = vq;
 		tv_cmd->tvc_resp = vq->iov[out].iov_base;
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-12 11:33     ` Michael S. Tsirkin
                         ` (2 preceding siblings ...)
  2013-04-13  3:29       ` [PATCH v4 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd Asias He
@ 2013-04-13  3:29       ` Asias He
  2013-04-14  9:58         ` Michael S. Tsirkin
  2013-04-13  3:29       ` Asias He
  4 siblings, 2 replies; 28+ messages in thread
From: Asias He @ 2013-04-13  3:29 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: Paolo Bonzini, Stefan Hajnoczi, Michael S. Tsirkin,
	Rusty Russell, kvm, virtualization, target-devel, Asias He

This patch makes vhost_scsi_flush() wait for all the pending requests
issued before the flush operation to be finished.

Changes in v4:
- Introduce vhost_scsi_inflight
- Drop array to track flush
- Use RCU to protect vs_inflight explicitly

Changes in v3:
- Rebase
- Drop 'tcm_vhost: Wait for pending requests in
  vhost_scsi_clear_endpoint()' in this series, we already did that in
  'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'

Changes in v2:
- Increase/Decrease inflight requests in
  vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt

Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/vhost/tcm_vhost.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/vhost/tcm_vhost.h |  5 ++++
 2 files changed, 78 insertions(+)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index e09f0fe..5dde525 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -74,6 +74,11 @@ enum {
 #define VHOST_SCSI_MAX_VQ	128
 #define VHOST_SCSI_MAX_EVENT	128
 
+struct vhost_scsi_inflight {
+	wait_queue_head_t wait;
+	atomic_t count;
+};
+
 struct vhost_scsi {
 	/* Protected by vhost_scsi->dev.mutex */
 	struct tcm_vhost_tpg **vs_tpg;
@@ -91,6 +96,8 @@ struct vhost_scsi {
 	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
 	bool vs_events_dropped; /* any missed events */
 	int vs_events_nr; /* num of pending events */
+
+	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight req */
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
 	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static struct vhost_scsi_inflight *
+tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
+	if (inflight) {
+		atomic_set(&inflight->count, 0);
+		init_waitqueue_head(&inflight->wait);
+	}
+	rcu_assign_pointer(vs->vs_inflight, inflight);
+	synchronize_rcu();
+
+	return inflight;
+}
+
+static struct vhost_scsi_inflight *
+tcm_vhost_inc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	rcu_read_lock();
+	inflight = rcu_dereference(vs->vs_inflight);
+	if (inflight)
+		atomic_inc(&inflight->count);
+	rcu_read_unlock();
+
+	return inflight;
+}
+
+static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
+{
+	/*
+	 * Wakeup the waiter when all the requests issued before the flush
+	 * operation are finished.
+	 */
+	if (inflight && !atomic_dec_return(&inflight->count))
+		wake_up(&inflight->wait);
+}
+
+static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
+{
+	return atomic_read(&inflight->count) == 0;
+}
+
 static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
 {
 	bool ret = false;
@@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
 static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 {
 	mutex_lock(&vs->vs_events_lock);
+	tcm_vhost_dec_inflight(evt->inflight);
 	vs->vs_events_nr--;
 	kfree(evt);
 	mutex_unlock(&vs->vs_events_lock);
@@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 	if (evt) {
 		evt->event.event = event;
 		evt->event.reason = reason;
+		evt->inflight = tcm_vhost_inc_inflight(vs);
 		vs->vs_events_nr++;
 	}
 	mutex_unlock(&vs->vs_events_lock);
@@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 		kfree(tv_cmd->tvc_sgl);
 	}
 
+	tcm_vhost_dec_inflight(tv_cmd->inflight);
+
 	kfree(tv_cmd);
 }
 
@@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
 	tv_cmd->tvc_vhost = vs;
+	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
 
 	return tv_cmd;
 }
@@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
 static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
 	int i;
+	struct vhost_scsi_inflight *inflight;
+
+	inflight = ACCESS_ONCE(vs->vs_inflight);
+	if (!tcm_vhost_alloc_inflight(vs))
+		return;
 
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
 	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
+	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+	/* Wait until all requests issued before the flush to be finished */
+	if (inflight) {
+		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
+		kfree(inflight);
+	}
 }
 
 /*
@@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	s->vs_events_dropped = false;
 	mutex_init(&s->vs_events_lock);
 
+	if (!tcm_vhost_alloc_inflight(s))
+		return -ENOMEM;
+
 	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
 	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
 	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
@@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
 	vhost_scsi_clear_endpoint(s, &t);
 	vhost_dev_stop(&s->dev);
 	vhost_dev_cleanup(&s->dev, false);
+	kfree(s->vs_inflight);
 	kfree(s);
 	return 0;
 }
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
index 94e9ee53..7567767 100644
--- a/drivers/vhost/tcm_vhost.h
+++ b/drivers/vhost/tcm_vhost.h
@@ -2,6 +2,7 @@
 #define TCM_VHOST_NAMELEN 256
 #define TCM_VHOST_MAX_CDB_SIZE 32
 
+struct vhost_scsi_inflight;
 struct tcm_vhost_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
 	int tvc_vq_desc;
@@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
+	/* Used to track inflight req */
+	struct vhost_scsi_inflight *inflight;
 };
 
 struct tcm_vhost_nexus {
@@ -91,6 +94,8 @@ struct tcm_vhost_evt {
 	struct virtio_scsi_event event;
 	/* virtio_scsi event list, serviced from vhost worker thread */
 	struct llist_node list;
+	/* Used to track inflight req */
+	struct vhost_scsi_inflight *inflight;
 };
 
 /*
-- 
1.8.1.4
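
To make the intended lifecycle easier to follow, here is one possible
timeline under this patch (illustrative only, reconstructed from the
code above, with A and B standing for two vhost_scsi_inflight objects):

	open:            vs->vs_inflight = A           (A.count == 0)
	request 1:       tv_cmd->inflight = A, A.count -> 1
	flush begins:    old = A, vs->vs_inflight = B  (freshly allocated)
	request 2:       tv_cmd->inflight = B, B.count -> 1
	request 1 done:  A.count -> 0, wake_up(&A.wait)
	flush resumes:   tcm_vhost_done_inflight(A) is true, kfree(A)

Requests issued during the flush are charged to B, so they do not delay
this flush; the next flush will wait for them.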

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-13  3:29       ` [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
@ 2013-04-14  9:58         ` Michael S. Tsirkin
  2013-04-14 12:27           ` Asias He
  2013-04-14  9:58         ` Michael S. Tsirkin
  1 sibling, 2 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-14  9:58 UTC (permalink / raw)
  To: Asias He
  Cc: Nicholas Bellinger, Paolo Bonzini, Stefan Hajnoczi,
	Rusty Russell, kvm, virtualization, target-devel

On Sat, Apr 13, 2013 at 11:29:14AM +0800, Asias He wrote:
> This patch makes vhost_scsi_flush() wait for all the pending requests
> issued before the flush operation to be finished.
> 
> Changes in v4:
> - Introduce vhost_scsi_inflight
> - Drop array to track flush
> - Use RCU to protect vs_inflight explicitly
> 
> Changes in v3:
> - Rebase
> - Drop 'tcm_vhost: Wait for pending requests in
>   vhost_scsi_clear_endpoint()' in this series, we already did that in
>   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> 
> Changes in v2:
> - Increase/Decrease inflight requests in
>   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> 
> Signed-off-by: Asias He <asias@redhat.com>
> ---
>  drivers/vhost/tcm_vhost.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/vhost/tcm_vhost.h |  5 ++++
>  2 files changed, 78 insertions(+)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index e09f0fe..5dde525 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -74,6 +74,11 @@ enum {
>  #define VHOST_SCSI_MAX_VQ	128
>  #define VHOST_SCSI_MAX_EVENT	128
>  
> +struct vhost_scsi_inflight {
> +	wait_queue_head_t wait;
> +	atomic_t count;

Okay now let's switch to kref + completion, make it more
descriptive.

> +};
> +
>  struct vhost_scsi {
>  	/* Protected by vhost_scsi->dev.mutex */
>  	struct tcm_vhost_tpg **vs_tpg;
> @@ -91,6 +96,8 @@ struct vhost_scsi {
>  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
>  	bool vs_events_dropped; /* any missed events */
>  	int vs_events_nr; /* num of pending events */
> +
> +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight req */
>  };
>  
>  /* Local pointer to allocated TCM configfs fabric module */
> @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
>  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
>  }
>  
> +static struct vhost_scsi_inflight *
> +tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
> +{
> +	struct vhost_scsi_inflight *inflight;
> +
> +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> +	if (inflight) {

This is used in set_features, so let's make it return int
and report an error to the user if allocation fails. No need to corrupt
kernel memory silently like this.



> +		atomic_set(&inflight->count, 0);


Ugh. So once all requests finish, refcount is 0
and then inflight is freed, and then the next request will
get a freed inflight value and dereference it. Looks pretty bad,
but maybe there's an increment somewhere that fixes it.

But let's not go there.  That's why I said above we should use kref +
completion. That makes it very clear how to use it correctly.
So:
	- initialize to 1
	- swap pointer with RCU
	- decrement
	- wait_for_completion
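
Spelled out, that scheme might look roughly like this (a sketch with
illustrative names, assuming the flush path runs under vs->dev.mutex;
not a tested patch):

	struct vhost_scsi_inflight {
		struct completion comp;	/* fired by the final kref_put */
		struct kref kref;	/* one ref per outstanding request,
					 * plus the initial ref held by the
					 * publisher of the pointer */
	};

	static void tcm_vhost_done_inflight(struct kref *kref)
	{
		struct vhost_scsi_inflight *inflight =
			container_of(kref, struct vhost_scsi_inflight, kref);

		complete(&inflight->comp);
	}

	/* publish: kref starts at 1, so the count cannot hit 0 early */
	kref_init(&new->kref);
	init_completion(&new->comp);
	rcu_assign_pointer(vs->vs_inflight, new);

	/* per request */
	rcu_read_lock();
	inflight = rcu_dereference(vs->vs_inflight);
	kref_get(&inflight->kref);
	rcu_read_unlock();
	...
	kref_put(&inflight->kref, tcm_vhost_done_inflight);

	/* flush: swap in a fresh inflight, drop the initial ref, wait */
	old = rcu_dereference_protected(vs->vs_inflight,
					lockdep_is_held(&vs->dev.mutex));
	rcu_assign_pointer(vs->vs_inflight, new);
	kref_put(&old->kref, tcm_vhost_done_inflight);
	wait_for_completion(&old->comp);
	kfree(old);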



> +		init_waitqueue_head(&inflight->wait);
> +	}
> +	rcu_assign_pointer(vs->vs_inflight, inflight);
> +	synchronize_rcu();
> +
> +	return inflight;
> +}
> +

This looks like it will overwrite inflight without
freeing the old one. In fact it won't, because the caller
has saved the pointer, but this interface is
just too tricky. Please just open-code this function.
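
Open-coded, vhost_scsi_flush() might then read roughly as follows
(a sketch reusing the names from the patch; the dev.mutex locking
assumption is mine):

	static void vhost_scsi_flush(struct vhost_scsi *vs)
	{
		struct vhost_scsi_inflight *new_inflight, *old_inflight;
		int i;

		new_inflight = kzalloc(sizeof(*new_inflight), GFP_KERNEL);
		if (!new_inflight)
			return;	/* better: propagate -ENOMEM to the caller */
		atomic_set(&new_inflight->count, 0);
		init_waitqueue_head(&new_inflight->wait);

		old_inflight = rcu_dereference_protected(vs->vs_inflight,
					lockdep_is_held(&vs->dev.mutex));
		rcu_assign_pointer(vs->vs_inflight, new_inflight);

		for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
			vhost_scsi_flush_vq(vs, i);
		vhost_work_flush(&vs->dev, &vs->vs_completion_work);
		vhost_work_flush(&vs->dev, &vs->vs_event_work);

		/* Wait for all requests issued before the flush to finish */
		wait_event(old_inflight->wait,
			   tcm_vhost_done_inflight(old_inflight));
		kfree(old_inflight);
	}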



> +static struct vhost_scsi_inflight *
> +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> +{
> +	struct vhost_scsi_inflight *inflight;
> +
> +	rcu_read_lock();
> +	inflight = rcu_dereference(vs->vs_inflight);
> +	if (inflight)

How can it be NULL?

> +		atomic_inc(&inflight->count);
> +	rcu_read_unlock();
> +
> +	return inflight;
> +}
> +
> +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> +{
> +	/*
> +	 * Wakeup the waiter when all the requests issued before the flush
> +	 * operation are finished.
> +	 */
> +	if (inflight && !atomic_dec_return(&inflight->count))
> +		wake_up(&inflight->wait);
> +}
> +
> +static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
> +{
> +	return atomic_read(&inflight->count) == 0;
> +}
> +
>  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
>  {
>  	bool ret = false;
> @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
>  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
>  {
>  	mutex_lock(&vs->vs_events_lock);
> +	tcm_vhost_dec_inflight(evt->inflight);
>  	vs->vs_events_nr--;
>  	kfree(evt);
>  	mutex_unlock(&vs->vs_events_lock);
> @@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
>  	if (evt) {
>  		evt->event.event = event;
>  		evt->event.reason = reason;
> +		evt->inflight = tcm_vhost_inc_inflight(vs);
>  		vs->vs_events_nr++;
>  	}
>  	mutex_unlock(&vs->vs_events_lock);
> @@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
>  		kfree(tv_cmd->tvc_sgl);
>  	}
>  
> +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> +
>  	kfree(tv_cmd);
>  }
>  
> @@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
>  	tv_cmd->tvc_data_direction = data_direction;
>  	tv_cmd->tvc_nexus = tv_nexus;
>  	tv_cmd->tvc_vhost = vs;
> +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
>  
>  	return tv_cmd;
>  }
> @@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
>  static void vhost_scsi_flush(struct vhost_scsi *vs)
>  {
>  	int i;
> +	struct vhost_scsi_inflight *inflight;
> +
> +	inflight = ACCESS_ONCE(vs->vs_inflight);

rcu_dereference_protected ? This ACCESS_ONCE looks bogus.
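
i.e., assuming the flush path holds vs->dev.mutex, something like:

	old_inflight = rcu_dereference_protected(vs->vs_inflight,
				lockdep_is_held(&vs->dev.mutex));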

> +	if (!tcm_vhost_alloc_inflight(vs))
> +		return;
>  
>  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
>  		vhost_scsi_flush_vq(vs, i);
>  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> +	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> +
> +	/* Wait until all requests issued before the flush to be finished */

s/until/for/

> +	if (inflight) {

How can this be NULL?

> +		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
> +		kfree(inflight);
> +	}
>  }
>  
>  /*
> @@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	s->vs_events_dropped = false;
>  	mutex_init(&s->vs_events_lock);
>  
> +	if (!tcm_vhost_alloc_inflight(s))
> +		return -ENOMEM;
> +
>  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
>  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
>  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> @@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
>  	vhost_scsi_clear_endpoint(s, &t);
>  	vhost_dev_stop(&s->dev);
>  	vhost_dev_cleanup(&s->dev, false);
> +	kfree(s->vs_inflight);
>  	kfree(s);
>  	return 0;
>  }
> diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> index 94e9ee53..7567767 100644
> --- a/drivers/vhost/tcm_vhost.h
> +++ b/drivers/vhost/tcm_vhost.h
> @@ -2,6 +2,7 @@
>  #define TCM_VHOST_NAMELEN 256
>  #define TCM_VHOST_MAX_CDB_SIZE 32
>  
> +struct vhost_scsi_inflight;
>  struct tcm_vhost_cmd {
>  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
>  	int tvc_vq_desc;
> @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
>  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
>  	/* Completed commands list, serviced from vhost worker thread */
>  	struct llist_node tvc_completion_list;
> +	/* Used to track inflight req */
> +	struct vhost_scsi_inflight *inflight;
>  };
>  
>  struct tcm_vhost_nexus {
> @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
>  	struct virtio_scsi_event event;
>  	/* virtio_scsi event list, serviced from vhost worker thread */
>  	struct llist_node list;
> +	/* Used to track inflight req */
> +	struct vhost_scsi_inflight *inflight;
>  };
>  
>  /*
> -- 
> 1.8.1.4

^ permalink raw reply	[flat|nested] 28+ messages in thread

>  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
>  {
>  	mutex_lock(&vs->vs_events_lock);
> +	tcm_vhost_dec_inflight(evt->inflight);
>  	vs->vs_events_nr--;
>  	kfree(evt);
>  	mutex_unlock(&vs->vs_events_lock);
> @@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
>  	if (evt) {
>  		evt->event.event = event;
>  		evt->event.reason = reason;
> +		evt->inflight = tcm_vhost_inc_inflight(vs);
>  		vs->vs_events_nr++;
>  	}
>  	mutex_unlock(&vs->vs_events_lock);
> @@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
>  		kfree(tv_cmd->tvc_sgl);
>  	}
>  
> +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> +
>  	kfree(tv_cmd);
>  }
>  
> @@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
>  	tv_cmd->tvc_data_direction = data_direction;
>  	tv_cmd->tvc_nexus = tv_nexus;
>  	tv_cmd->tvc_vhost = vs;
> +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
>  
>  	return tv_cmd;
>  }
> @@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
>  static void vhost_scsi_flush(struct vhost_scsi *vs)
>  {
>  	int i;
> +	struct vhost_scsi_inflight *inflight;
> +
> +	inflight = ACCESS_ONCE(vs->vs_inflight);

rcu_dereference_protected ? This ACCESS_ONCE looks bogus.

> +	if (!tcm_vhost_alloc_inflight(vs))
> +		return;
>  
>  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
>  		vhost_scsi_flush_vq(vs, i);
>  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> +	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> +
> +	/* Wait until all requests issued before the flush to be finished */

s/until/for/

> +	if (inflight) {

How can this be NULL?

> +		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
> +		kfree(inflight);
> +	}
>  }
>  
>  /*
> @@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	s->vs_events_dropped = false;
>  	mutex_init(&s->vs_events_lock);
>  
> +	if (!tcm_vhost_alloc_inflight(s))
> +		return -ENOMEM;
> +
>  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
>  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
>  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> @@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
>  	vhost_scsi_clear_endpoint(s, &t);
>  	vhost_dev_stop(&s->dev);
>  	vhost_dev_cleanup(&s->dev, false);
> +	kfree(s->vs_inflight);
>  	kfree(s);
>  	return 0;
>  }
> diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> index 94e9ee53..7567767 100644
> --- a/drivers/vhost/tcm_vhost.h
> +++ b/drivers/vhost/tcm_vhost.h
> @@ -2,6 +2,7 @@
>  #define TCM_VHOST_NAMELEN 256
>  #define TCM_VHOST_MAX_CDB_SIZE 32
>  
> +struct vhost_scsi_inflight;
>  struct tcm_vhost_cmd {
>  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
>  	int tvc_vq_desc;
> @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
>  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
>  	/* Completed commands list, serviced from vhost worker thread */
>  	struct llist_node tvc_completion_list;
> +	/* Used to track inflight req */
> +	struct vhost_scsi_inflight *inflight;
>  };
>  
>  struct tcm_vhost_nexus {
> @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
>  	struct virtio_scsi_event event;
>  	/* virtio_scsi event list, serviced from vhost worker thread */
>  	struct llist_node list;
> +	/* Used to track inflight req */
> +	struct vhost_scsi_inflight *inflight;
>  };
>  
>  /*
> -- 
> 1.8.1.4

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-12 14:59         ` Asias He
  (?)
@ 2013-04-14 10:07         ` Michael S. Tsirkin
  2013-04-14 12:38           ` Asias He
  -1 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-14 10:07 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Fri, Apr 12, 2013 at 10:59:51PM +0800, Asias He wrote:
> On Fri, Apr 12, 2013 at 02:33:32PM +0300, Michael S. Tsirkin wrote:
> > On Fri, Apr 12, 2013 at 02:25:23PM +0800, Asias He wrote:
> > > On Thu, Apr 11, 2013 at 01:47:21PM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Apr 09, 2013 at 05:39:43PM +0800, Asias He wrote:
> > > > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > > > issued before the flush operation to be finished.
> > > > > 
> > > > > Changes in v3:
> > > > > - Rebase
> > > > > - Drop 'tcm_vhost: Wait for pending requests in
> > > > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > > > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > > > 
> > > > > Changes in v2:
> > > > > - Increase/Decrease inflight requests in
> > > > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > > > 
> > > > > Signed-off-by: Asias He <asias@redhat.com>
> > > > 
> > > > Nack, let's not do this home-grown here.  Please use a kref.
> > > > 
> > > > The array of two trick is also too tricky for my taste.
> > > > 
> > > > Please replace during_flush in tcm_vhost_cmd and tcm_vhost_evt
> > > > by a kref pointer, allocate a new kref when you flush.
> > > > 
> > > > Access can be done with RCU so we won't need any locks.
> > > 
> > > I do not think kref helps and the right place to use here. Also, a
> > > pointer kref in tcm_vhost_cmd and tcm_vhost_evt is not enough, you need
> > > a wait queue as well.
> > > 
> > > Do you mean something as so:
> > > 
> > >    struct vhost_scsi_inflight {
> > >    	struct kref kref;
> > >    	wait_queue_head_t wait;
> > >    }
> > >    
> > >    vhost_scsi_allocate_cmd()
> > >    	rcu_read_lock()
> > >    	tv_cmd->inflight = rcu_dereference(vs->vs_inflight)
> > >    	kref_get(&tv_cmd->inflight->kref)
> > >    	rcu_read_unlock()
> > >    
> > >    vhost_scsi_free_cmd()
> > >    	kref_put(&tv_cmd->inflight.kref, my_release)
> > >    
> > >    my_release()
> > >    	wake_up(&inflight->wait)
> > >    
> > >    vhost_scsi_flush()
> > >    	old_inflight = vs->vs_inflight;
> > >    	new_inflight = kmalloc(*new_inflight, ...)
> > >    	rcu_assign_pointer(vs->vs_inflight, new_inflight);
> > >    	wait_event(old_inflight->wait, atomic_read(&old_inflight->kref->refcount) == 0)
> > >    	synchronize_rcu();
> > >    	free(old_inflight)
> > > 
> > > 1) The kref need to be accessed in the free cmd/evt function, you can not use
> > > rcu to protect it.
> > 
> > No, it's vs_inflight pointer that is protected by RCU.
> > But if you prefer, we can have it per-vq and
> > protected by vq mutex.
> 
> No, for event, it can be allocated outside the vhost thread. And vs_inflight
> is not a per queue data why make it per queue.

For multiqueue, to avoid cache-line contention when multiple threads try
to increment the same atomic value.
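
(Hypothetically -- not something either posted version does -- the
tracker could sit next to each virtqueue, so every vhost thread bumps
its own counter:

	struct vhost_scsi_virtqueue {
		struct vhost_virtqueue vq;
		struct vhost_scsi_inflight *inflight;	/* per-vq tracker */
	};

with the flush then swapping and draining each vq's tracker under its
vq mutex.)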

> > 
> > > 2) No need to use synchronize_rcu to wait for the reader of
> > > vs->vs_inflight to finish. We need to wait on the wait queue anyway. At
> > > time time, we are safe to free the old_inflight.
> > 
> > RCU is to avoid old vhost_scsi_allocate_cmd from using
> > the old pointer. But we can use vq flush instead, that's
> > often done in vhost.
> 
> > > 3) The kref is not used in a standard way. We are refcounting the evt
> > > and cmd, not the vhost_scsi_inflight. A single is atomic conter is
> > > enough.
> > 
> > Looks standard to me.
> 
> Strange ...
> 
> > > Though, I do not like the array trick too. I can change to allocate
> > > vhost_scsi_inflight when we flush.
> > 
> > That's better but homegrown refcounting is better avoided too.
> 
> I had a version which dropped the array.

Right, that's better, except it triggers a wakeup each time the queue
becomes empty, which is not really necessary as long as you don't flush.
Instead, init to 1 and decrement before the flush.
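
(Concretely, with a kref plus a completion -- the shape the later
revision in this thread ends up taking; the "done" helper name here is
hypothetical:

	kref_init(&inflight->kref);		/* count starts at 1 */
	...
	kref_get(&inflight->kref);		/* on each request */
	kref_put(&inflight->kref, done);	/* on each completion */
	...
	kref_put(&inflight->kref, done);	/* flush drops the initial ref */
	wait_for_completion(&inflight->comp);

so the count never reaches zero -- and nobody is woken -- unless a
flush is actually in progress.)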

Commented on the patch itself in a separate thread.

-- 
MST

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-14  9:58         ` Michael S. Tsirkin
  2013-04-14 12:27           ` Asias He
@ 2013-04-14 12:27           ` Asias He
  2013-04-15  7:18             ` Asias He
  2013-04-15 10:11             ` Michael S. Tsirkin
  1 sibling, 2 replies; 28+ messages in thread
From: Asias He @ 2013-04-14 12:27 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Nicholas Bellinger, Paolo Bonzini, Stefan Hajnoczi,
	Rusty Russell, kvm, virtualization, target-devel

On Sun, Apr 14, 2013 at 12:58:03PM +0300, Michael S. Tsirkin wrote:
> On Sat, Apr 13, 2013 at 11:29:14AM +0800, Asias He wrote:
> > This patch makes vhost_scsi_flush() wait for all the pending requests
> > issued before the flush operation to be finished.
> > 
> > Changes in v4:
> > - Introduce vhost_scsi_inflight
> > - Drop array to track flush
> > - Use RCU to protect vs_inflight explicitly
> > 
> > Changes in v3:
> > - Rebase
> > - Drop 'tcm_vhost: Wait for pending requests in
> >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > 
> > Changes in v2:
> > - Increase/Decrease inflight requests in
> >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > 
> > Signed-off-by: Asias He <asias@redhat.com>
> > ---
> >  drivers/vhost/tcm_vhost.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/vhost/tcm_vhost.h |  5 ++++
> >  2 files changed, 78 insertions(+)
> > 
> > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > index e09f0fe..5dde525 100644
> > --- a/drivers/vhost/tcm_vhost.c
> > +++ b/drivers/vhost/tcm_vhost.c
> > @@ -74,6 +74,11 @@ enum {
> >  #define VHOST_SCSI_MAX_VQ	128
> >  #define VHOST_SCSI_MAX_EVENT	128
> >  
> > +struct vhost_scsi_inflight {
> > +	wait_queue_head_t wait;
> > +	atomic_t count;
> 
> Okay now let's switch to kref + completion, make it more
> descriptive.

I still do not see why kref is better. Completion sounds good.

> > +};
> > +
> >  struct vhost_scsi {
> >  	/* Protected by vhost_scsi->dev.mutex */
> >  	struct tcm_vhost_tpg **vs_tpg;
> > @@ -91,6 +96,8 @@ struct vhost_scsi {
> >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> >  	bool vs_events_dropped; /* any missed events */
> >  	int vs_events_nr; /* num of pending events */
> > +
> > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight req */
> >  };
> >  
> >  /* Local pointer to allocated TCM configfs fabric module */
> > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> >  }
> >  
> > +static struct vhost_scsi_inflight *
> > +tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
> > +{
> > +	struct vhost_scsi_inflight *inflight;
> > +
> > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> > +	if (inflight) {
> 
> This is used in set_features, so let's make it int
> and return error to user if not. No need to corrupt kernel
> memory silently like this.

What do you mean by "used in set_features"? The return value of
tcm_vhost_alloc_inflight was used at some point, but now it is not.
So I will return int.

Why is it corrupted?

> 
> 
> > +		atomic_set(&inflight->count, 0);
> 
> 
> Ugh. So once all requests finish, refcount is 0
> and then inflight is freed, and then the next request will
> get a freed inflight value and dereference. Looks pretty bad,
> but maybe there's an increment somewhere that fixes it.

How can the next request get a freed inflight? It cannot happen. The
old inflight is freed only after all the requests that reference it
have finished. See the last few lines of vhost_scsi_flush.

> But let's not go there.  That's why I said above we should use kref +
> completion. That makes is very clear how to use it correctly.
> So:
> 	- initialize to 1
> 	- swap pointer with RCU
> 	- decrement
> 	- wait_for_completion

We cannot go there.

> 
> 
> > +		init_waitqueue_head(&inflight->wait);
> > +	}
> > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> > +	synchronize_rcu();
> > +
> > +	return inflight;
> > +}
> > +
> 
> This looks like it will overwrite inflight without
> freeing the old one. In fact it won't because caller
> has saved the pointer but this interface is
> just too tricky. Please just opencode this function.
> 

Did you see that the old inflight is freed in vhost_scsi_flush()?
It was coded as a helper function because it is used in two places:
one in vhost_scsi_open() and the other in vhost_scsi_flush().

> 
> > +static struct vhost_scsi_inflight *
> > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> > +{
> > +	struct vhost_scsi_inflight *inflight;
> > +
> > +	rcu_read_lock();
> > +	inflight = rcu_dereference(vs->vs_inflight);
> > +	if (inflight)
> 
> How can it be NULL?

When tcm_vhost_alloc_inflight fails to allocate the inflight.

> > +		atomic_inc(&inflight->count);
> > +	rcu_read_unlock();
> > +
> > +	return inflight;
> > +}
> > +
> > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > +{
> > +	/*
> > +	 * Wakeup the waiter when all the requests issued before the flush
> > +	 * operation are finished.
> > +	 */
> > +	if (inflight && !atomic_dec_return(&inflight->count))
> > +		wake_up(&inflight->wait);
> > +}
> > +
> > +static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
> > +{
> > +	return atomic_read(&inflight->count) == 0;
> > +}
> > +
> >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> >  {
> >  	bool ret = false;
> > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> >  {
> >  	mutex_lock(&vs->vs_events_lock);
> > +	tcm_vhost_dec_inflight(evt->inflight);
> >  	vs->vs_events_nr--;
> >  	kfree(evt);
> >  	mutex_unlock(&vs->vs_events_lock);
> > @@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> >  	if (evt) {
> >  		evt->event.event = event;
> >  		evt->event.reason = reason;
> > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> >  		vs->vs_events_nr++;
> >  	}
> >  	mutex_unlock(&vs->vs_events_lock);
> > @@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> >  		kfree(tv_cmd->tvc_sgl);
> >  	}
> >  
> > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > +
> >  	kfree(tv_cmd);
> >  }
> >  
> > @@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> >  	tv_cmd->tvc_data_direction = data_direction;
> >  	tv_cmd->tvc_nexus = tv_nexus;
> >  	tv_cmd->tvc_vhost = vs;
> > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> >  
> >  	return tv_cmd;
> >  }
> > @@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> >  {
> >  	int i;
> > +	struct vhost_scsi_inflight *inflight;
> > +
> > +	inflight = ACCESS_ONCE(vs->vs_inflight);
> 
> rcu_dereference_protected ? This ACCESS_ONCE looks bogus.

okay.

> > +	if (!tcm_vhost_alloc_inflight(vs))
> > +		return;
> >  
> >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> >  		vhost_scsi_flush_vq(vs, i);
> >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> > +	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > +
> > +	/* Wait until all requests issued before the flush to be finished */
> 
> s/until/for/

okay.

> > +	if (inflight) {
> 
> How can this be NULL?

When tcm_vhost_alloc_inflight fails to allocate the inflight.

> > +		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
> > +		kfree(inflight);
> > +	}
> >  }
> >  
> >  /*
> > @@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> >  	s->vs_events_dropped = false;
> >  	mutex_init(&s->vs_events_lock);
> >  
> > +	if (!tcm_vhost_alloc_inflight(s))
> > +		return -ENOMEM;
> > +
> >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > @@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> >  	vhost_scsi_clear_endpoint(s, &t);
> >  	vhost_dev_stop(&s->dev);
> >  	vhost_dev_cleanup(&s->dev, false);
> > +	kfree(s->vs_inflight);
> >  	kfree(s);
> >  	return 0;
> >  }
> > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > index 94e9ee53..7567767 100644
> > --- a/drivers/vhost/tcm_vhost.h
> > +++ b/drivers/vhost/tcm_vhost.h
> > @@ -2,6 +2,7 @@
> >  #define TCM_VHOST_NAMELEN 256
> >  #define TCM_VHOST_MAX_CDB_SIZE 32
> >  
> > +struct vhost_scsi_inflight;
> >  struct tcm_vhost_cmd {
> >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> >  	int tvc_vq_desc;
> > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> >  	/* Completed commands list, serviced from vhost worker thread */
> >  	struct llist_node tvc_completion_list;
> > +	/* Used to track inflight req */
> > +	struct vhost_scsi_inflight *inflight;
> >  };
> >  
> >  struct tcm_vhost_nexus {
> > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> >  	struct virtio_scsi_event event;
> >  	/* virtio_scsi event list, serviced from vhost worker thread */
> >  	struct llist_node list;
> > +	/* Used to track inflight req */
> > +	struct vhost_scsi_inflight *inflight;
> >  };
> >  
> >  /*
> > -- 
> > 1.8.1.4

-- 
Asias

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-14 10:07         ` Michael S. Tsirkin
@ 2013-04-14 12:38           ` Asias He
  0 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-14 12:38 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Sun, Apr 14, 2013 at 01:07:51PM +0300, Michael S. Tsirkin wrote:
> On Fri, Apr 12, 2013 at 10:59:51PM +0800, Asias He wrote:
> > On Fri, Apr 12, 2013 at 02:33:32PM +0300, Michael S. Tsirkin wrote:
> > > On Fri, Apr 12, 2013 at 02:25:23PM +0800, Asias He wrote:
> > > > On Thu, Apr 11, 2013 at 01:47:21PM +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Apr 09, 2013 at 05:39:43PM +0800, Asias He wrote:
> > > > > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > > > > issued before the flush operation to be finished.
> > > > > > 
> > > > > > Changes in v3:
> > > > > > - Rebase
> > > > > > - Drop 'tcm_vhost: Wait for pending requests in
> > > > > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > > > > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > > > > 
> > > > > > Changes in v2:
> > > > > > - Increase/Decrease inflight requests in
> > > > > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > > > > 
> > > > > > Signed-off-by: Asias He <asias@redhat.com>
> > > > > 
> > > > > Nack, let's not do this home-grown here.  Please use a kref.
> > > > > 
> > > > > The array of two trick is also too tricky for my taste.
> > > > > 
> > > > > Please replace during_flush in tcm_vhost_cmd and tcm_vhost_evt
> > > > > by a kref pointer, allocate a new kref when you flush.
> > > > > 
> > > > > Access can be done with RCU so we won't need any locks.
> > > > 
> > > > I do not think kref helps and the right place to use here. Also, a
> > > > pointer kref in tcm_vhost_cmd and tcm_vhost_evt is not enough, you need
> > > > a wait queue as well.
> > > > 
> > > > Do you mean something as so:
> > > > 
> > > >    struct vhost_scsi_inflight {
> > > >    	struct kref kref;
> > > >    	wait_queue_head_t wait;
> > > >    }
> > > >    
> > > >    vhost_scsi_allocate_cmd()
> > > >    	rcu_read_lock()
> > > >    	tv_cmd->inflight = rcu_dereference(vs->vs_inflight)
> > > >    	kref_get(&tv_cmd->inflight->kref)
> > > >    	rcu_read_unlock()
> > > >    
> > > >    vhost_scsi_free_cmd()
> > > >    	kref_put(&tv_cmd->inflight.kref, my_release)
> > > >    
> > > >    my_release()
> > > >    	wake_up(&inflight->wait)
> > > >    
> > > >    vhost_scsi_flush()
> > > >    	old_inflight = vs->vs_inflight;
> > > >    	new_inflight = kmalloc(*new_inflight, ...)
> > > >    	rcu_assign_pointer(vs->vs_inflight, new_inflight);
> > > >    	wait_event(old_inflight->wait, atomic_read(&old_inflight->kref->refcount) == 0)
> > > >    	synchronize_rcu();
> > > >    	free(old_inflight)
> > > > 
> > > > 1) The kref need to be accessed in the free cmd/evt function, you can not use
> > > > rcu to protect it.
> > > 
> > > No, it's vs_inflight pointer that is protected by RCU.
> > > But if you prefer, we can have it per-vq and
> > > protected by vq mutex.
> > 
> > No, for event, it can be allocated outside the vhost thread. And vs_inflight
> > is not a per queue data why make it per queue.
> 
> For multiqueue, to avoid cache-line contention when multiple threads try
> to increment the same atomic value.

In each queue, you still point to the same inflight data, unless you
put the inflight data itself per queue.

> > > 
> > > > 2) No need to use synchronize_rcu to wait for the reader of
> > > > vs->vs_inflight to finish. We need to wait on the wait queue anyway. At
> > > > time time, we are safe to free the old_inflight.
> > > 
> > > RCU is to avoid old vhost_scsi_allocate_cmd from using
> > > the old pointer. But we can use vq flush instead, that's
> > > often done in vhost.
> > 
> > > > 3) The kref is not used in a standard way. We are refcounting the evt
> > > > and cmd, not the vhost_scsi_inflight. A single is atomic conter is
> > > > enough.
> > > 
> > > Looks standard to me.
> > 
> > Strange ...
> > 
> > > > Though, I do not like the array trick too. I can change to allocate
> > > > vhost_scsi_inflight when we flush.
> > > 
> > > That's better but homegrown refcounting is better avoided too.
> > 
> > I had a version which dropped the array.
> 
> Right, that's better, except it triggers wakeups each time the queue
> becomes empty. Which is not really necessary as long as you don't flush.
> Instead init to 1, and decrement before flush.

Well, this is one way to optimize it. Another way might be to add an
explicit flag in vhost_scsi_inflight to indicate it, as sketched below.
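
(Purely illustrative -- a hypothetical flag like this and the counter
would have to be kept consistent, e.g. under a lock:

	spin_lock(&vs->vs_flush_lock);
	if (--inflight->count == 0 && inflight->flushing)
		wake_up(&inflight->wait);
	spin_unlock(&vs->vs_flush_lock);

at which point starting the kref at 1 is arguably the simpler variant.)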

> Commented on the patch itself in a separate thread.

Thanks.

> -- 
> MST

-- 
Asias

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-14 12:27           ` Asias He
@ 2013-04-15  7:18             ` Asias He
  2013-04-15 10:11             ` Michael S. Tsirkin
  1 sibling, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-15  7:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Sun, Apr 14, 2013 at 08:27:14PM +0800, Asias He wrote:
> On Sun, Apr 14, 2013 at 12:58:03PM +0300, Michael S. Tsirkin wrote:
> > On Sat, Apr 13, 2013 at 11:29:14AM +0800, Asias He wrote:
> > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > issued before the flush operation to be finished.
> > > 
> > > Changes in v4:
> > > - Introduce vhost_scsi_inflight
> > > - Drop array to track flush
> > > - Use RCU to protect vs_inflight explicitly
> > > 
> > > Changes in v3:
> > > - Rebase
> > > - Drop 'tcm_vhost: Wait for pending requests in
> > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > 
> > > Changes in v2:
> > > - Increase/Decrease inflight requests in
> > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > 
> > > Signed-off-by: Asias He <asias@redhat.com>
> > > ---
> > >  drivers/vhost/tcm_vhost.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
> > >  drivers/vhost/tcm_vhost.h |  5 ++++
> > >  2 files changed, 78 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > > index e09f0fe..5dde525 100644
> > > --- a/drivers/vhost/tcm_vhost.c
> > > +++ b/drivers/vhost/tcm_vhost.c
> > > @@ -74,6 +74,11 @@ enum {
> > >  #define VHOST_SCSI_MAX_VQ	128
> > >  #define VHOST_SCSI_MAX_EVENT	128
> > >  
> > > +struct vhost_scsi_inflight {
> > > +	wait_queue_head_t wait;
> > > +	atomic_t count;
> > 
> > Okay now let's switch to kref + completion, make it more
> > descriptive.
> 
> I still do not see why kref is better. Completion sounds good.

In the sense that we are kref'ing the vhost_scsi_inflight, it is ok.

Anyway, the current version looks like so. One remaining issue is how
we handle memory allocation failure of tcm_vhost_alloc_inflight in
vhost_scsi_flush.

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 4ae6725..86f01c3 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -74,6 +74,11 @@ enum {
 #define VHOST_SCSI_MAX_VQ	128
 #define VHOST_SCSI_MAX_EVENT	128
 
+struct vhost_scsi_inflight {
+	struct completion comp; /* Wait for the flush operation to finish */
+	struct kref kref; /* Refcount for the inflight reqs */
+};
+
 struct vhost_scsi {
 	/* Protected by vhost_scsi->dev.mutex */
 	struct tcm_vhost_tpg **vs_tpg;
@@ -91,6 +96,8 @@ struct vhost_scsi {
 	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
 	bool vs_events_dropped; /* any missed events */
 	int vs_events_nr; /* num of pending events */
+
+	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight reqs */
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
 	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static int tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+	int ret = -ENOMEM;
+
+	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
+	if (inflight) {
+		kref_init(&inflight->kref);
+		init_completion(&inflight->comp);
+		ret = 0;
+	}
+	rcu_assign_pointer(vs->vs_inflight, inflight);
+	synchronize_rcu();
+
+	return ret;
+}
+
+static struct vhost_scsi_inflight *
+tcm_vhost_inc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	rcu_read_lock();
+	inflight = rcu_dereference(vs->vs_inflight);
+	if (inflight)
+		kref_get(&inflight->kref);
+	rcu_read_unlock();
+
+	return inflight;
+}
+
+static void tcm_vhost_done_inflight(struct kref *kref)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
+	complete(&inflight->comp);
+}
+
+static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
+{
+	if (inflight)
+		kref_put(&inflight->kref, tcm_vhost_done_inflight);
+}
+
 static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
 {
 	bool ret = false;
@@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
 static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 {
 	mutex_lock(&vs->vs_events_lock);
+	tcm_vhost_dec_inflight(evt->inflight);
 	vs->vs_events_nr--;
 	kfree(evt);
 	mutex_unlock(&vs->vs_events_lock);
@@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 	if (evt) {
 		evt->event.event = event;
 		evt->event.reason = reason;
+		evt->inflight = tcm_vhost_inc_inflight(vs);
 		vs->vs_events_nr++;
 	}
 	mutex_unlock(&vs->vs_events_lock);
@@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 		kfree(tv_cmd->tvc_sgl);
 	}
 
+	tcm_vhost_dec_inflight(tv_cmd->inflight);
+
 	kfree(tv_cmd);
 }
 
@@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
 	tv_cmd->tvc_vhost = vs;
+	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
 
 	return tv_cmd;
 }
@@ -982,12 +1039,35 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
 
 static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
+	struct vhost_scsi_inflight *inflight;
 	int i;
 
+	/* inflight points to the old inflight */
+	inflight = rcu_dereference_protected(vs->vs_inflight,
+					     lockdep_is_held(&vs->dev.mutex));
+
+	/* Allocate a new inflight and make vs->vs_inflight points to it */
+	if (tcm_vhost_alloc_inflight(vs) < 0)
+		return;
+
+	/*
+	 * The inflight->kref was initialized to 1. We decrement it here to
+	 * indicate the start of the flush operation so that it will reach 0
+	 * when all the reqs are finished.
+	 */
+	kref_put(&inflight->kref, tcm_vhost_done_inflight);
+
+	/* Flush both the vhost poll and vhost work */
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
 	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
 	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+	/* Wait for all reqs issued before the flush to be finished */
+	if (inflight) {
+		wait_for_completion(&inflight->comp);
+		kfree(inflight);
+	}
 }
 
 /*
@@ -1196,6 +1276,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	s->vs_events_dropped = false;
 	mutex_init(&s->vs_events_lock);
 
+	if (tcm_vhost_alloc_inflight(s) < 0)
+		return -ENOMEM;
+
 	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
 	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
 	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
@@ -1221,6 +1304,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
 	vhost_scsi_clear_endpoint(s, &t);
 	vhost_dev_stop(&s->dev);
 	vhost_dev_cleanup(&s->dev, false);
+	kfree(s->vs_inflight);
 	kfree(s);
 	return 0;
 }
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
index 94e9ee53..7567767 100644
--- a/drivers/vhost/tcm_vhost.h
+++ b/drivers/vhost/tcm_vhost.h
@@ -2,6 +2,7 @@
 #define TCM_VHOST_NAMELEN 256
 #define TCM_VHOST_MAX_CDB_SIZE 32
 
+struct vhost_scsi_inflight;
 struct tcm_vhost_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
 	int tvc_vq_desc;
@@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
+	/* Used to track inflight req */
+	struct vhost_scsi_inflight *inflight;
 };
 
 struct tcm_vhost_nexus {
@@ -91,6 +94,8 @@ struct tcm_vhost_evt {
 	struct virtio_scsi_event event;
 	/* virtio_scsi event list, serviced from vhost worker thread */
 	struct llist_node list;
+	/* Used to track inflight req */
+	struct vhost_scsi_inflight *inflight;
 };
 
 /*


> > > +};
> > > +
> > >  struct vhost_scsi {
> > >  	/* Protected by vhost_scsi->dev.mutex */
> > >  	struct tcm_vhost_tpg **vs_tpg;
> > > @@ -91,6 +96,8 @@ struct vhost_scsi {
> > >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> > >  	bool vs_events_dropped; /* any missed events */
> > >  	int vs_events_nr; /* num of pending events */
> > > +
> > > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight req */
> > >  };
> > >  
> > >  /* Local pointer to allocated TCM configfs fabric module */
> > > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> > >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> > >  }
> > >  
> > > +static struct vhost_scsi_inflight *
> > > +tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> > > +	if (inflight) {
> > 
> > This is used in set_features, so let's make it int
> > and return error to user if not. No need to corrupt kernel
> > memory silently like this.
> 
> What do you mean by used in set_features? The return value of
> tcm_vhost_alloc_inflight was used at some point, but now it is not used.
> So I will return int.
> 
> Why is it corrupted?
> 
> > 
> > 
> > > +		atomic_set(&inflight->count, 0);
> > 
> > 
> > Ugh. So once all requests finish, refcount is 0
> > and then inflight is freed, and then the next request will
> > get a freed inflight value and dereference. Looks pretty bad,
> > but maybe there's an increment somewhere that fixes it.
> 
> How can the next request get a freed inflight? It can not happen.  The
> old inflight is freed only after all the requests which reference it are
> finished. See the last few lines in vhost_scsi_flush.
> 
> > But let's not go there.  That's why I said above we should use kref +
> > completion. That makes is very clear how to use it correctly.
> > So:
> > 	- initialize to 1
> > 	- swap pointer with RCU
> > 	- decrement
> > 	- wait_for_completion
> 
> We can not go there.
> 
> > 
> > 
> > > +		init_waitqueue_head(&inflight->wait);
> > > +	}
> > > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> > > +	synchronize_rcu();
> > > +
> > > +	return inflight;
> > > +}
> > > +
> > 
> > This looks like it will overwrite inflight without
> > freeing the old one. In fact it won't because caller
> > has saved the pointer but this interface is
> > just too tricky. Please just opencode this function.
> > 
> 
> Did you see the old inflight was freed in vhost_scsi_flush().
> It was code in a helper function because it is used in too places.
> One is in vhost_scsi_open and the other is in vhost_scsi_scsi?
> 
> > 
> > > +static struct vhost_scsi_inflight *
> > > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	rcu_read_lock();
> > > +	inflight = rcu_dereference(vs->vs_inflight);
> > > +	if (inflight)
> > 
> > How can it be NULL?
> 
> When tcm_vhost_alloc_inflight failed to allocate inflight.
> 
> > > +		atomic_inc(&inflight->count);
> > > +	rcu_read_unlock();
> > > +
> > > +	return inflight;
> > > +}
> > > +
> > > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > > +{
> > > +	/*
> > > +	 * Wakeup the waiter when all the requests issued before the flush
> > > +	 * operation are finished.
> > > +	 */
> > > +	if (inflight && !atomic_dec_return(&inflight->count))
> > > +		wake_up(&inflight->wait);
> > > +}
> > > +
> > > +static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
> > > +{
> > > +	return atomic_read(&inflight->count) == 0;
> > > +}
> > > +
> > >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> > >  {
> > >  	bool ret = false;
> > > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> > >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> > >  {
> > >  	mutex_lock(&vs->vs_events_lock);
> > > +	tcm_vhost_dec_inflight(evt->inflight);
> > >  	vs->vs_events_nr--;
> > >  	kfree(evt);
> > >  	mutex_unlock(&vs->vs_events_lock);
> > > @@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> > >  	if (evt) {
> > >  		evt->event.event = event;
> > >  		evt->event.reason = reason;
> > > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> > >  		vs->vs_events_nr++;
> > >  	}
> > >  	mutex_unlock(&vs->vs_events_lock);
> > > @@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > >  		kfree(tv_cmd->tvc_sgl);
> > >  	}
> > >  
> > > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > > +
> > >  	kfree(tv_cmd);
> > >  }
> > >  
> > > @@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> > >  	tv_cmd->tvc_data_direction = data_direction;
> > >  	tv_cmd->tvc_nexus = tv_nexus;
> > >  	tv_cmd->tvc_vhost = vs;
> > > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> > >  
> > >  	return tv_cmd;
> > >  }
> > > @@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> > >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> > >  {
> > >  	int i;
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	inflight = ACCESS_ONCE(vs->vs_inflight);
> > 
> > rcu_dereference_protected ? This ACCESS_ONCE looks bogus.
> 
> okay.
> 
> > > +	if (!tcm_vhost_alloc_inflight(vs))
> > > +		return;
> > >  
> > >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> > >  		vhost_scsi_flush_vq(vs, i);
> > >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> > > +	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > > +
> > > +	/* Wait until all requests issued before the flush to be finished */
> > 
> > s/until/for/
> 
> okay.
> 
> > > +	if (inflight) {
> > 
> > How can this be NULL?
> 
> When tcm_vhost_alloc_inflight failed to allocate inflight.
> 
> > > +		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
> > > +		kfree(inflight);
> > > +	}
> > >  }
> > >  
> > >  /*
> > > @@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> > >  	s->vs_events_dropped = false;
> > >  	mutex_init(&s->vs_events_lock);
> > >  
> > > +	if (!tcm_vhost_alloc_inflight(s))
> > > +		return -ENOMEM;
> > > +
> > >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> > >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> > >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > > @@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> > >  	vhost_scsi_clear_endpoint(s, &t);
> > >  	vhost_dev_stop(&s->dev);
> > >  	vhost_dev_cleanup(&s->dev, false);
> > > +	kfree(s->vs_inflight);
> > >  	kfree(s);
> > >  	return 0;
> > >  }
> > > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > > index 94e9ee53..7567767 100644
> > > --- a/drivers/vhost/tcm_vhost.h
> > > +++ b/drivers/vhost/tcm_vhost.h
> > > @@ -2,6 +2,7 @@
> > >  #define TCM_VHOST_NAMELEN 256
> > >  #define TCM_VHOST_MAX_CDB_SIZE 32
> > >  
> > > +struct vhost_scsi_inflight;
> > >  struct tcm_vhost_cmd {
> > >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> > >  	int tvc_vq_desc;
> > > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> > >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> > >  	/* Completed commands list, serviced from vhost worker thread */
> > >  	struct llist_node tvc_completion_list;
> > > +	/* Used to track inflight req */
> > > +	struct vhost_scsi_inflight *inflight;
> > >  };
> > >  
> > >  struct tcm_vhost_nexus {
> > > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> > >  	struct virtio_scsi_event event;
> > >  	/* virtio_scsi event list, serviced from vhost worker thread */
> > >  	struct llist_node list;
> > > +	/* Used to track inflight req */
> > > +	struct vhost_scsi_inflight *inflight;
> > >  };
> > >  
> > >  /*
> > > -- 
> > > 1.8.1.4
> 
> -- 
> Asias

-- 
Asias

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-14 12:27           ` Asias He
  2013-04-15  7:18             ` Asias He
@ 2013-04-15 10:11             ` Michael S. Tsirkin
  2013-04-16  0:35               ` Asias He
  1 sibling, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-15 10:11 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Sun, Apr 14, 2013 at 08:27:14PM +0800, Asias He wrote:
> On Sun, Apr 14, 2013 at 12:58:03PM +0300, Michael S. Tsirkin wrote:
> > On Sat, Apr 13, 2013 at 11:29:14AM +0800, Asias He wrote:
> > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > issued before the flush operation to be finished.
> > > 
> > > Changes in v4:
> > > - Introduce vhost_scsi_inflight
> > > - Drop array to track flush
> > > - Use RCU to protect vs_inflight explicitly
> > > 
> > > Changes in v3:
> > > - Rebase
> > > - Drop 'tcm_vhost: Wait for pending requests in
> > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > 
> > > Changes in v2:
> > > - Increase/Decrease inflight requests in
> > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > 
> > > Signed-off-by: Asias He <asias@redhat.com>
> > > ---
> > >  drivers/vhost/tcm_vhost.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
> > >  drivers/vhost/tcm_vhost.h |  5 ++++
> > >  2 files changed, 78 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > > index e09f0fe..5dde525 100644
> > > --- a/drivers/vhost/tcm_vhost.c
> > > +++ b/drivers/vhost/tcm_vhost.c
> > > @@ -74,6 +74,11 @@ enum {
> > >  #define VHOST_SCSI_MAX_VQ	128
> > >  #define VHOST_SCSI_MAX_EVENT	128
> > >  
> > > +struct vhost_scsi_inflight {
> > > +	wait_queue_head_t wait;
> > > +	atomic_t count;
> > 
> > Okay now let's switch to kref + completion, make it more
> > descriptive.
> 
> I still do not see why kref is better.

It makes the fact that you are doing reference counting explicit.

> Completion sounds good.
> 
> > > +};
> > > +
> > >  struct vhost_scsi {
> > >  	/* Protected by vhost_scsi->dev.mutex */
> > >  	struct tcm_vhost_tpg **vs_tpg;
> > > @@ -91,6 +96,8 @@ struct vhost_scsi {
> > >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> > >  	bool vs_events_dropped; /* any missed events */
> > >  	int vs_events_nr; /* num of pending events */
> > > +
> > > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight req */
> > >  };
> > >  
> > >  /* Local pointer to allocated TCM configfs fabric module */
> > > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> > >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> > >  }
> > >  
> > > +static struct vhost_scsi_inflight *
> > > +tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> > > +	if (inflight) {
> > 
> > This is used in set_features, so let's make it int
> > and return error to user if not. No need to corrupt kernel
> > memory silently like this.
> 
> What do you mean by used in set_features? The return value of
> tcm_vhost_alloc_inflight was used at some point, but now it is not used.
> So I will return int.
> 
> Why is it corrupted?

You skip flushes, so something can be in flight; our code
assumes a flush actually flushes things.

> > 
> > 
> > > +		atomic_set(&inflight->count, 0);
> > 
> > 
> > Ugh. So once all requests finish, refcount is 0
> > and then inflight is freed, and then the next request will
> > get a freed inflight value and dereference. Looks pretty bad,
> > but maybe there's an increment somewhere that fixes it.
> 
> How can the next request get a freed inflight? It can not happen.  The
> old inflight is freed only after all the requests which reference it are
> finished. See the last few lines in vhost_scsi_flush.
> 
> > But let's not go there.  That's why I said above we should use kref +
> > completion. That makes is very clear how to use it correctly.
> > So:
> > 	- initialize to 1
> > 	- swap pointer with RCU
> > 	- decrement
> > 	- wait_for_completion
> 
> We can not go there.

Right. But it's confusing, and it also adds overhead on the data path
(a wakeup each time the last request completes).
Let's do standard refcounting: init to 1; before the flush, decrement
and wait for completion.
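
(I.e., the flush side boils down to, roughly:

	old = rcu_dereference_protected(vs->vs_inflight,
					lockdep_is_held(&vs->dev.mutex));
	rcu_assign_pointer(vs->vs_inflight, new);
	kref_put(&old->kref, tcm_vhost_done_inflight);	/* drop initial ref */
	wait_for_completion(&old->comp);
	kfree(old);

which is essentially what the updated patch elsewhere in this thread
does.)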

> > 
> > 
> > > +		init_waitqueue_head(&inflight->wait);
> > > +	}
> > > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> > > +	synchronize_rcu();
> > > +
> > > +	return inflight;
> > > +}
> > > +
> > 
> > This looks like it will overwrite inflight without
> > freeing the old one. In fact it won't because caller
> > has saved the pointer but this interface is
> > just too tricky. Please just opencode this function.
> > 
> 
> Did you see that the old inflight was freed in vhost_scsi_flush()?
> It was coded in a helper function because it is used in two places.
> One is in vhost_scsi_open and the other is in vhost_scsi_flush.

The name is still confusing.
alloc should simply allocate and return a pointer.
Have the callers do the assignment and the flush as appropriate.
In particular, open does not need synchronize_rcu,
and it does not need to check the old inflight value.
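A sketch of that split (the helper body is taken from the v4 code
quoted above; the open() line is illustrative and error handling is
omitted):

	static struct vhost_scsi_inflight *tcm_vhost_alloc_inflight(void)
	{
		struct vhost_scsi_inflight *inflight;

		inflight = kmalloc(sizeof(*inflight), GFP_KERNEL);
		if (inflight) {
			atomic_set(&inflight->count, 0);
			init_waitqueue_head(&inflight->wait);
		}
		return inflight;
	}

	/* open(): nothing is published yet and there are no readers, so
	 * no synchronize_rcu() and no old pointer to check or free */
	RCU_INIT_POINTER(s->vs_inflight, tcm_vhost_alloc_inflight());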

> > 
> > > +static struct vhost_scsi_inflight *
> > > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	rcu_read_lock();
> > > +	inflight = rcu_dereference(vs->vs_inflight);
> > > +	if (inflight)
> > 
> > How can it be NULL?
> 
> When tcm_vhost_alloc_inflight failed to allocate inflight.

Then we won't flush, which is the wrong way to handle such
an error. Instead, fail the command.

> > > +		atomic_inc(&inflight->count);
> > > +	rcu_read_unlock();
> > > +
> > > +	return inflight;
> > > +}
> > > +
> > > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > > +{
> > > +	/*
> > > +	 * Wakeup the waiter when all the requests issued before the flush
> > > +	 * operation are finished.
> > > +	 */
> > > +	if (inflight && !atomic_dec_return(&inflight->count))
> > > +		wake_up(&inflight->wait);
> > > +}
> > > +
> > > +static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
> > > +{
> > > +	return atomic_read(&inflight->count) == 0;
> > > +}
> > > +
> > >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> > >  {
> > >  	bool ret = false;
> > > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> > >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> > >  {
> > >  	mutex_lock(&vs->vs_events_lock);
> > > +	tcm_vhost_dec_inflight(evt->inflight);
> > >  	vs->vs_events_nr--;
> > >  	kfree(evt);
> > >  	mutex_unlock(&vs->vs_events_lock);
> > > @@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> > >  	if (evt) {
> > >  		evt->event.event = event;
> > >  		evt->event.reason = reason;
> > > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> > >  		vs->vs_events_nr++;
> > >  	}
> > >  	mutex_unlock(&vs->vs_events_lock);
> > > @@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > >  		kfree(tv_cmd->tvc_sgl);
> > >  	}
> > >  
> > > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > > +
> > >  	kfree(tv_cmd);
> > >  }
> > >  
> > > @@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> > >  	tv_cmd->tvc_data_direction = data_direction;
> > >  	tv_cmd->tvc_nexus = tv_nexus;
> > >  	tv_cmd->tvc_vhost = vs;
> > > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> > >  
> > >  	return tv_cmd;
> > >  }
> > > @@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> > >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> > >  {
> > >  	int i;
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	inflight = ACCESS_ONCE(vs->vs_inflight);
> > 
> > rcu_dereference_protected ? This ACCESS_ONCE looks bogus.
> 
> okay.
> 
> > > +	if (!tcm_vhost_alloc_inflight(vs))
> > > +		return;
> > >  
> > >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> > >  		vhost_scsi_flush_vq(vs, i);
> > >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> > > +	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > > +
> > > +	/* Wait until all requests issued before the flush to be finished */
> > 
> > s/until/for/
> 
> okay.
> 
> > > +	if (inflight) {
> > 
> > How can this be NULL?
> 
> When tcm_vhost_alloc_inflight failed to allocate inflight.

Again, wrong way to handle it.

> > > +		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
> > > +		kfree(inflight);
> > > +	}
> > >  }
> > >  
> > >  /*
> > > @@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> > >  	s->vs_events_dropped = false;
> > >  	mutex_init(&s->vs_events_lock);
> > >  
> > > +	if (!tcm_vhost_alloc_inflight(s))
> > > +		return -ENOMEM;
> > > +
> > >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> > >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> > >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > > @@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> > >  	vhost_scsi_clear_endpoint(s, &t);
> > >  	vhost_dev_stop(&s->dev);
> > >  	vhost_dev_cleanup(&s->dev, false);
> > > +	kfree(s->vs_inflight);
> > >  	kfree(s);
> > >  	return 0;
> > >  }
> > > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > > index 94e9ee53..7567767 100644
> > > --- a/drivers/vhost/tcm_vhost.h
> > > +++ b/drivers/vhost/tcm_vhost.h
> > > @@ -2,6 +2,7 @@
> > >  #define TCM_VHOST_NAMELEN 256
> > >  #define TCM_VHOST_MAX_CDB_SIZE 32
> > >  
> > > +struct vhost_scsi_inflight;
> > >  struct tcm_vhost_cmd {
> > >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> > >  	int tvc_vq_desc;
> > > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> > >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> > >  	/* Completed commands list, serviced from vhost worker thread */
> > >  	struct llist_node tvc_completion_list;
> > > +	/* Used to track inflight req */
> > > +	struct vhost_scsi_inflight *inflight;
> > >  };
> > >  
> > >  struct tcm_vhost_nexus {
> > > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> > >  	struct virtio_scsi_event event;
> > >  	/* virtio_scsi event list, serviced from vhost worker thread */
> > >  	struct llist_node list;
> > > +	/* Used to track inflight req */
> > > +	struct vhost_scsi_inflight *inflight;
> > >  };
> > >  
> > >  /*
> > > -- 
> > > 1.8.1.4
> 
> -- 
> Asias

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-15 10:11             ` Michael S. Tsirkin
@ 2013-04-16  0:35               ` Asias He
  0 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-16  0:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Mon, Apr 15, 2013 at 01:11:54PM +0300, Michael S. Tsirkin wrote:
> On Sun, Apr 14, 2013 at 08:27:14PM +0800, Asias He wrote:
> > On Sun, Apr 14, 2013 at 12:58:03PM +0300, Michael S. Tsirkin wrote:
> > > On Sat, Apr 13, 2013 at 11:29:14AM +0800, Asias He wrote:
> > > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > > issued before the flush operation to be finished.
> > > > 
> > > > Changes in v4:
> > > > - Introduce vhost_scsi_inflight
> > > > - Drop array to track flush
> > > > - Use RCU to protect vs_inflight explicitly
> > > > 
> > > > Changes in v3:
> > > > - Rebase
> > > > - Drop 'tcm_vhost: Wait for pending requests in
> > > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > > 
> > > > Changes in v2:
> > > > - Increase/Decrease inflight requests in
> > > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > > 
> > > > Signed-off-by: Asias He <asias@redhat.com>
> > > > ---
> > > >  drivers/vhost/tcm_vhost.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
> > > >  drivers/vhost/tcm_vhost.h |  5 ++++
> > > >  2 files changed, 78 insertions(+)
> > > > 
> > > > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > > > index e09f0fe..5dde525 100644
> > > > --- a/drivers/vhost/tcm_vhost.c
> > > > +++ b/drivers/vhost/tcm_vhost.c
> > > > @@ -74,6 +74,11 @@ enum {
> > > >  #define VHOST_SCSI_MAX_VQ	128
> > > >  #define VHOST_SCSI_MAX_EVENT	128
> > > >  
> > > > +struct vhost_scsi_inflight {
> > > > +	wait_queue_head_t wait;
> > > > +	atomic_t count;
> > > 
> > > Okay now let's switch to kref + completion, make it more
> > > descriptive.
> > 
> > I still do not see why kref is better.
> 
> It makes the fact that you are doing reference counting explicit.

See the version I sent yesterday.

> > Completion sounds good.
> > 
> > > > +};
> > > > +
> > > >  struct vhost_scsi {
> > > >  	/* Protected by vhost_scsi->dev.mutex */
> > > >  	struct tcm_vhost_tpg **vs_tpg;
> > > > @@ -91,6 +96,8 @@ struct vhost_scsi {
> > > >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> > > >  	bool vs_events_dropped; /* any missed events */
> > > >  	int vs_events_nr; /* num of pending events */
> > > > +
> > > > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight req */
> > > >  };
> > > >  
> > > >  /* Local pointer to allocated TCM configfs fabric module */
> > > > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> > > >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> > > >  }
> > > >  
> > > > +static struct vhost_scsi_inflight *
> > > > +tcm_vhost_alloc_inflight(struct vhost_scsi *vs)
> > > > +{
> > > > +	struct vhost_scsi_inflight *inflight;
> > > > +
> > > > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> > > > +	if (inflight) {
> > > 
> > > This is used in set_features, so let's make it int
> > > and return error to user if not. No need to corrupt kernel
> > > memory silently like this.
> > 
> > What do you mean by used in set_features? The return value of
> > tcm_vhost_alloc_inflight was used at some point, but now it is not used.
> > So I will return int.
> > 
> > Why is it corrupted?
> 
> You skip flushes, so something can still be in flight; our code
> assumes a flush actually flushes things.


So what is the best we can do if we fail to allocate memory for the inflight
in vhost_scsi_flush? BTW, this is a downside of using dynamic allocation.

The best I can think of:

We set vs->vs_inflight to NULL, which means we do not track the new
requests, and we keep going: do the flush and wait for the old requests
to be done.

> > > 
> > > 
> > > > +		atomic_set(&inflight->count, 0);
> > > 
> > > 
> > > Ugh. So once all requests finish, refcount is 0
> > > and then inflight is freed, and then the next request will
> > > get a freed inflight value and dereference. Looks pretty bad,
> > > but maybe there's an increment somewhere that fixes it.
> > 
> > How can the next request get a freed inflight? It can not happen.  The
> > old inflight is freed only after all the requests which reference it are
> > finished. See the last few lines in vhost_scsi_flush.
> > 
> > > But let's not go there.  That's why I said above we should use kref +
> > > completion. That makes it very clear how to use it correctly.
> > > So:
> > > 	- initialize to 1
> > > 	- swap pointer with RCU
> > > 	- decrement
> > > 	- wait_for_completion
> > 
> > We can not go there.
> 
> Right. But it's confusing, and it also adds overhead on the data path
> (a wakeup each time the last request completes).
> Let's do standard ref counting: init to 1; before the flush, decrement
> and wait for completion.

Yes, as I mentioned in the other mail, we can optimize the 'wakeup too
much' issue. Certainly, initializing to 1 and decrementing to start the
flush is one way to do it.

> > > 
> > > 
> > > > +		init_waitqueue_head(&inflight->wait);
> > > > +	}
> > > > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> > > > +	synchronize_rcu();
> > > > +
> > > > +	return inflight;
> > > > +}
> > > > +
> > > 
> > > This looks like it will overwrite inflight without
> > > freeing the old one. In fact it won't because caller
> > > has saved the pointer but this interface is
> > > just too tricky. Please just opencode this function.
> > > 
> > 
> > Did you see that the old inflight was freed in vhost_scsi_flush()?
> > It was coded in a helper function because it is used in two places.
> > One is in vhost_scsi_open and the other is in vhost_scsi_flush.
> 
> The name is still confusing.
> alloc should simply allocate and return a pointer.
> Have the callers do the assignment and the flush as appropriate.
> In particular, open does not need synchronize_rcu,
> and it does not need to check the old inflight value.

It's just a name for the helper. We can change it to whatever we want.

> 
> > > 
> > > > +static struct vhost_scsi_inflight *
> > > > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> > > > +{
> > > > +	struct vhost_scsi_inflight *inflight;
> > > > +
> > > > +	rcu_read_lock();
> > > > +	inflight = rcu_dereference(vs->vs_inflight);
> > > > +	if (inflight)
> > > 
> > > How can it be NULL?
> > 
> > When tcm_vhost_alloc_inflight failed to allocate inflight.
> 
> Then we won't flush, which is the wrong way to handle such
> an error. Instead, fail the command.

Okay, if we can not track it, we fail it. That is a safe way to go.

> > > > +		atomic_inc(&inflight->count);
> > > > +	rcu_read_unlock();
> > > > +
> > > > +	return inflight;
> > > > +}
> > > > +
> > > > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > > > +{
> > > > +	/*
> > > > +	 * Wakeup the waiter when all the requests issued before the flush
> > > > +	 * operation are finished.
> > > > +	 */
> > > > +	if (inflight && !atomic_dec_return(&inflight->count))
> > > > +		wake_up(&inflight->wait);
> > > > +}
> > > > +
> > > > +static bool tcm_vhost_done_inflight(struct vhost_scsi_inflight *inflight)
> > > > +{
> > > > +	return atomic_read(&inflight->count) == 0;
> > > > +}
> > > > +
> > > >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> > > >  {
> > > >  	bool ret = false;
> > > > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> > > >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> > > >  {
> > > >  	mutex_lock(&vs->vs_events_lock);
> > > > +	tcm_vhost_dec_inflight(evt->inflight);
> > > >  	vs->vs_events_nr--;
> > > >  	kfree(evt);
> > > >  	mutex_unlock(&vs->vs_events_lock);
> > > > @@ -423,6 +476,7 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> > > >  	if (evt) {
> > > >  		evt->event.event = event;
> > > >  		evt->event.reason = reason;
> > > > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> > > >  		vs->vs_events_nr++;
> > > >  	}
> > > >  	mutex_unlock(&vs->vs_events_lock);
> > > > @@ -445,6 +499,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > > >  		kfree(tv_cmd->tvc_sgl);
> > > >  	}
> > > >  
> > > > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > > > +
> > > >  	kfree(tv_cmd);
> > > >  }
> > > >  
> > > > @@ -595,6 +651,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> > > >  	tv_cmd->tvc_data_direction = data_direction;
> > > >  	tv_cmd->tvc_nexus = tv_nexus;
> > > >  	tv_cmd->tvc_vhost = vs;
> > > > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> > > >  
> > > >  	return tv_cmd;
> > > >  }
> > > > @@ -983,10 +1040,22 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> > > >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> > > >  {
> > > >  	int i;
> > > > +	struct vhost_scsi_inflight *inflight;
> > > > +
> > > > +	inflight = ACCESS_ONCE(vs->vs_inflight);
> > > 
> > > rcu_dereference_protected ? This ACCESS_ONCE looks bogus.
> > 
> > okay.
> > 
> > > > +	if (!tcm_vhost_alloc_inflight(vs))
> > > > +		return;
> > > >  
> > > >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> > > >  		vhost_scsi_flush_vq(vs, i);
> > > >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> > > > +	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > > > +
> > > > +	/* Wait until all requests issued before the flush to be finished */
> > > 
> > > s/until/for/
> > 
> > okay.
> > 
> > > > +	if (inflight) {
> > > 
> > > How can this be NULL?
> > 
> > When tcm_vhost_alloc_inflight failed to allocate inflight.
> 
> Again, wrong way to handle it.
> 
> > > > +		wait_event(inflight->wait, tcm_vhost_done_inflight(inflight));
> > > > +		kfree(inflight);
> > > > +	}
> > > >  }
> > > >  
> > > >  /*
> > > > @@ -1195,6 +1264,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> > > >  	s->vs_events_dropped = false;
> > > >  	mutex_init(&s->vs_events_lock);
> > > >  
> > > > +	if (!tcm_vhost_alloc_inflight(s))
> > > > +		return -ENOMEM;
> > > > +
> > > >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> > > >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> > > >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > > > @@ -1220,6 +1292,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> > > >  	vhost_scsi_clear_endpoint(s, &t);
> > > >  	vhost_dev_stop(&s->dev);
> > > >  	vhost_dev_cleanup(&s->dev, false);
> > > > +	kfree(s->vs_inflight);
> > > >  	kfree(s);
> > > >  	return 0;
> > > >  }
> > > > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > > > index 94e9ee53..7567767 100644
> > > > --- a/drivers/vhost/tcm_vhost.h
> > > > +++ b/drivers/vhost/tcm_vhost.h
> > > > @@ -2,6 +2,7 @@
> > > >  #define TCM_VHOST_NAMELEN 256
> > > >  #define TCM_VHOST_MAX_CDB_SIZE 32
> > > >  
> > > > +struct vhost_scsi_inflight;
> > > >  struct tcm_vhost_cmd {
> > > >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> > > >  	int tvc_vq_desc;
> > > > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> > > >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> > > >  	/* Completed commands list, serviced from vhost worker thread */
> > > >  	struct llist_node tvc_completion_list;
> > > > +	/* Used to track inflight req */
> > > > +	struct vhost_scsi_inflight *inflight;
> > > >  };
> > > >  
> > > >  struct tcm_vhost_nexus {
> > > > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> > > >  	struct virtio_scsi_event event;
> > > >  	/* virtio_scsi event list, serviced from vhost worker thread */
> > > >  	struct llist_node list;
> > > > +	/* Used to track inflight req */
> > > > +	struct vhost_scsi_inflight *inflight;
> > > >  };
> > > >  
> > > >  /*
> > > > -- 
> > > > 1.8.1.4
> > 
> > -- 
> > Asias

-- 
Asias

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v5 0/2] tcm_vhost flush
  2013-04-13  3:29       ` [PATCH v4 0/2] tcm_vhost flush Asias He
@ 2013-04-16  9:16         ` Asias He
  2013-04-16  9:16         ` Asias He
                           ` (3 subsequent siblings)
  4 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-16  9:16 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: Paolo Bonzini, Stefan Hajnoczi, Michael S. Tsirkin,
	Rusty Russell, kvm, virtualization, target-devel, Asias He

Asias He (2):
  tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd
  tcm_vhost: Wait for pending requests in vhost_scsi_flush()

 drivers/vhost/tcm_vhost.c | 106 +++++++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/tcm_vhost.h |   5 +++
 2 files changed, 104 insertions(+), 7 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v5 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd
  2013-04-13  3:29       ` [PATCH v4 0/2] tcm_vhost flush Asias He
                           ` (2 preceding siblings ...)
  2013-04-16  9:16         ` [PATCH v5 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd Asias He
@ 2013-04-16  9:16         ` Asias He
  2013-04-16  9:16         ` [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
  4 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-16  9:16 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: Paolo Bonzini, Stefan Hajnoczi, Michael S. Tsirkin,
	Rusty Russell, kvm, virtualization, target-devel, Asias He

It is needed in the next patch.

Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/vhost/tcm_vhost.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index da2021b..4ae6725 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -569,6 +569,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 }
 
 static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
+	struct vhost_scsi *vs,
 	struct tcm_vhost_tpg *tv_tpg,
 	struct virtio_scsi_cmd_req *v_req,
 	u32 exp_data_len,
@@ -593,6 +594,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_exp_data_len = exp_data_len;
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
+	tv_cmd->tvc_vhost = vs;
 
 	return tv_cmd;
 }
@@ -848,7 +850,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		for (i = 0; i < data_num; i++)
 			exp_data_len += vq->iov[data_first + i].iov_len;
 
-		tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req,
+		tv_cmd = vhost_scsi_allocate_cmd(vs, tv_tpg, &v_req,
 					exp_data_len, data_direction);
 		if (IS_ERR(tv_cmd)) {
 			vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -858,7 +860,6 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		pr_debug("Allocated tv_cmd: %p exp_data_len: %d, data_direction"
 			": %d\n", tv_cmd, exp_data_len, data_direction);
 
-		tv_cmd->tvc_vhost = vs;
 		tv_cmd->tvc_vq = vq;
 		tv_cmd->tvc_resp = vq->iov[out].iov_base;
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-13  3:29       ` [PATCH v4 0/2] tcm_vhost flush Asias He
                           ` (3 preceding siblings ...)
  2013-04-16  9:16         ` Asias He
@ 2013-04-16  9:16         ` Asias He
  2013-04-16 17:58           ` Michael S. Tsirkin
  4 siblings, 1 reply; 28+ messages in thread
From: Asias He @ 2013-04-16  9:16 UTC (permalink / raw)
  To: Nicholas Bellinger
  Cc: kvm, Michael S. Tsirkin, virtualization, target-devel,
	Stefan Hajnoczi, Paolo Bonzini

This patch makes vhost_scsi_flush() wait for all the pending requests
issued before the flush operation to be finished.

Changes in v5:
- Use kref and completion
- Fail req if vs->vs_inflight is NULL
- Rename tcm_vhost_alloc_inflight to tcm_vhost_set_inflight

Changes in v4:
- Introduce vhost_scsi_inflight
- Drop array to track flush
- Use RCU to protect vs_inflight explicitly

Changes in v3:
- Rebase
- Drop 'tcm_vhost: Wait for pending requests in
  vhost_scsi_clear_endpoint()' in this series, we already did that in
  'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'

Changes in v2:
- Increase/Decrease inflight requests in
  vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt

Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/vhost/tcm_vhost.c | 101 +++++++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/tcm_vhost.h |   5 +++
 2 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
index 4ae6725..ef40a8f 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/tcm_vhost.c
@@ -74,6 +74,11 @@ enum {
 #define VHOST_SCSI_MAX_VQ	128
 #define VHOST_SCSI_MAX_EVENT	128
 
+struct vhost_scsi_inflight {
+	struct completion comp; /* Wait for the flush operation to finish */
+	struct kref kref; /* Refcount for the inflight reqs */
+};
+
 struct vhost_scsi {
 	/* Protected by vhost_scsi->dev.mutex */
 	struct tcm_vhost_tpg **vs_tpg;
@@ -91,6 +96,8 @@ struct vhost_scsi {
 	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
 	bool vs_events_dropped; /* any missed events */
 	int vs_events_nr; /* num of pending events */
+
+	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight reqs */
 };
 
 /* Local pointer to allocated TCM configfs fabric module */
@@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
 	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static int tcm_vhost_set_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+	int ret = -ENOMEM;
+
+	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
+	if (inflight) {
+		kref_init(&inflight->kref);
+		init_completion(&inflight->comp);
+		ret = 0;
+	}
+	rcu_assign_pointer(vs->vs_inflight, inflight);
+	synchronize_rcu();
+
+	return ret;
+}
+
+static struct vhost_scsi_inflight *
+tcm_vhost_inc_inflight(struct vhost_scsi *vs)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	rcu_read_lock();
+	inflight = rcu_dereference(vs->vs_inflight);
+	if (inflight)
+		kref_get(&inflight->kref);
+	rcu_read_unlock();
+
+	return inflight;
+}
+
+void tcm_vhost_done_inflight(struct kref *kref)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
+	complete(&inflight->comp);
+}
+
+static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
+{
+	if (inflight)
+		kref_put(&inflight->kref, tcm_vhost_done_inflight);
+}
+
 static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
 {
 	bool ret = false;
@@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
 static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 {
 	mutex_lock(&vs->vs_events_lock);
+	tcm_vhost_dec_inflight(evt->inflight);
 	vs->vs_events_nr--;
 	kfree(evt);
 	mutex_unlock(&vs->vs_events_lock);
@@ -413,21 +466,27 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 	struct tcm_vhost_evt *evt;
 
 	mutex_lock(&vs->vs_events_lock);
-	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
-		vs->vs_events_dropped = true;
-		mutex_unlock(&vs->vs_events_lock);
-		return NULL;
-	}
+	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT)
+		goto out;
 
 	evt = kzalloc(sizeof(*evt), GFP_KERNEL);
 	if (evt) {
 		evt->event.event = event;
 		evt->event.reason = reason;
+		evt->inflight = tcm_vhost_inc_inflight(vs);
+		if (!evt->inflight) {
+			kfree(evt);
+			goto out;
+		}
 		vs->vs_events_nr++;
 	}
 	mutex_unlock(&vs->vs_events_lock);
 
 	return evt;
+out:
+	vs->vs_events_dropped = true;
+	mutex_unlock(&vs->vs_events_lock);
+	return NULL;
 }
 
 static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
@@ -445,6 +504,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 		kfree(tv_cmd->tvc_sgl);
 	}
 
+	tcm_vhost_dec_inflight(tv_cmd->inflight);
+
 	kfree(tv_cmd);
 }
 
@@ -595,6 +656,9 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
 	tv_cmd->tvc_vhost = vs;
+	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
+	if (!tv_cmd->inflight)
+		return ERR_PTR(-ENOMEM);
 
 	return tv_cmd;
 }
@@ -982,12 +1046,35 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
 
 static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
+	struct vhost_scsi_inflight *inflight;
 	int i;
 
+	/* inflight points to the old inflight */
+	inflight = rcu_dereference_protected(vs->vs_inflight,
+					     lockdep_is_held(&vs->dev.mutex));
+
+	/* Allocate a new inflight and make vs->vs_inflight point to it */
+	if (tcm_vhost_set_inflight(vs) < 0)
+		pr_warn("vhost_scsi_flush failed to allocate inflight\n");
+
+	/*
+	 * The inflight->kref was initialized to 1. We decrement it here to
+	 * indicate the start of the flush operation so that it will reach 0
+	 * when all the reqs are finished.
+	 */
+	kref_put(&inflight->kref, tcm_vhost_done_inflight);
+
+	/* Flush both the vhost poll and vhost work */
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
 	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
 	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+	/* Wait for all reqs issued before the flush to be finished */
+	if (inflight) {
+		wait_for_completion(&inflight->comp);
+		kfree(inflight);
+	}
 }
 
 /*
@@ -1196,6 +1283,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
 	s->vs_events_dropped = false;
 	mutex_init(&s->vs_events_lock);
 
+	if (tcm_vhost_set_inflight(s) < 0)
+		return -ENOMEM;
+
 	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
 	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
 	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
@@ -1221,6 +1311,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
 	vhost_scsi_clear_endpoint(s, &t);
 	vhost_dev_stop(&s->dev);
 	vhost_dev_cleanup(&s->dev, false);
+	kfree(s->vs_inflight);
 	kfree(s);
 	return 0;
 }
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
index 94e9ee53..7567767 100644
--- a/drivers/vhost/tcm_vhost.h
+++ b/drivers/vhost/tcm_vhost.h
@@ -2,6 +2,7 @@
 #define TCM_VHOST_NAMELEN 256
 #define TCM_VHOST_MAX_CDB_SIZE 32
 
+struct vhost_scsi_inflight;
 struct tcm_vhost_cmd {
 	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
 	int tvc_vq_desc;
@@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
 	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
 	/* Completed commands list, serviced from vhost worker thread */
 	struct llist_node tvc_completion_list;
+	/* Used to track inflight req */
+	struct vhost_scsi_inflight *inflight;
 };
 
 struct tcm_vhost_nexus {
@@ -91,6 +94,8 @@ struct tcm_vhost_evt {
 	struct virtio_scsi_event event;
 	/* virtio_scsi event list, serviced from vhost worker thread */
 	struct llist_node list;
+	/* Used to track inflight req */
+	struct vhost_scsi_inflight *inflight;
 };
 
 /*
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-16  9:16         ` [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
@ 2013-04-16 17:58           ` Michael S. Tsirkin
  2013-04-17  1:29             ` Asias He
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-16 17:58 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Tue, Apr 16, 2013 at 05:16:51PM +0800, Asias He wrote:
> This patch makes vhost_scsi_flush() wait for all the pending requests
> issued before the flush operation to be finished.
> 
> Changes in v5:
> - Use kref and completion
> - Fail req if vs->vs_inflight is NULL
> - Rename tcm_vhost_alloc_inflight to tcm_vhost_set_inflight
> 
> Changes in v4:
> - Introduce vhost_scsi_inflight
> - Drop array to track flush
> - Use RCU to protect vs_inflight explicitly
> 
> Changes in v3:
> - Rebase
> - Drop 'tcm_vhost: Wait for pending requests in
>   vhost_scsi_clear_endpoint()' in this series, we already did that in
>   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> 
> Changes in v2:
> - Increase/Decrease inflight requests in
>   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> 
> Signed-off-by: Asias He <asias@redhat.com>

OK looks good, except error handling needs to be fixed.

> ---
>  drivers/vhost/tcm_vhost.c | 101 +++++++++++++++++++++++++++++++++++++++++++---
>  drivers/vhost/tcm_vhost.h |   5 +++
>  2 files changed, 101 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index 4ae6725..ef40a8f 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -74,6 +74,11 @@ enum {
>  #define VHOST_SCSI_MAX_VQ	128
>  #define VHOST_SCSI_MAX_EVENT	128
>  
> +struct vhost_scsi_inflight {
> +	struct completion comp; /* Wait for the flush operation to finish */
> +	struct kref kref; /* Refcount for the inflight reqs */
> +};
> +
>  struct vhost_scsi {
>  	/* Protected by vhost_scsi->dev.mutex */
>  	struct tcm_vhost_tpg **vs_tpg;
> @@ -91,6 +96,8 @@ struct vhost_scsi {
>  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
>  	bool vs_events_dropped; /* any missed events */
>  	int vs_events_nr; /* num of pending events */
> +
> +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight reqs */
>  };
>  
>  /* Local pointer to allocated TCM configfs fabric module */
> @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
>  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
>  }
>  
> +static int tcm_vhost_set_inflight(struct vhost_scsi *vs)
> +{
> +	struct vhost_scsi_inflight *inflight;
> +	int ret = -ENOMEM;
> +
> +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);

kzalloc is not needed, you initialize all fields.

> +	if (inflight) {
> +		kref_init(&inflight->kref);
> +		init_completion(&inflight->comp);
> +		ret = 0;
> +	}
> +	rcu_assign_pointer(vs->vs_inflight, inflight);

So if allocation fails, we stop tracking inflights?
This looks strange, and could break guests. Why not the usual
	if (!inflight)
		return -ENOMEM;

> +	synchronize_rcu();

open call is different:
	- sync is not needed
	- should use RCU_INIT_POINTER and not rcu_assign_pointer

So please move these out and make this function return the struct:
	struct vhost_scsi_inflight *
	tcm_vhost_alloc_inflight(void)


> +
> +	return ret;
> +}
> +
> +static struct vhost_scsi_inflight *
> +tcm_vhost_inc_inflight(struct vhost_scsi *vs)

And then inc will not need to return inflight pointer,
which is really unusual.

> +{
> +	struct vhost_scsi_inflight *inflight;
> +
> +	rcu_read_lock();
> +	inflight = rcu_dereference(vs->vs_inflight);
> +	if (inflight)
> +		kref_get(&inflight->kref);
> +	rcu_read_unlock();
> +
> +	return inflight;
> +}
> +
> +void tcm_vhost_done_inflight(struct kref *kref)
> +{
> +	struct vhost_scsi_inflight *inflight;
> +
> +	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
> +	complete(&inflight->comp);
> +}
> +
> +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> +{
> +	if (inflight)

Here as in other places, inflight must never be NULL.
Pls fix code so that invariant holds.

> +		kref_put(&inflight->kref, tcm_vhost_done_inflight);
> +}
> +
>  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
>  {
>  	bool ret = false;
> @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
>  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
>  {
>  	mutex_lock(&vs->vs_events_lock);
> +	tcm_vhost_dec_inflight(evt->inflight);
>  	vs->vs_events_nr--;
>  	kfree(evt);
>  	mutex_unlock(&vs->vs_events_lock);
> @@ -413,21 +466,27 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
>  	struct tcm_vhost_evt *evt;
>  
>  	mutex_lock(&vs->vs_events_lock);
> -	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
> -		vs->vs_events_dropped = true;
> -		mutex_unlock(&vs->vs_events_lock);
> -		return NULL;
> -	}
> +	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT)
> +		goto out;
>  
>  	evt = kzalloc(sizeof(*evt), GFP_KERNEL);

BTW it looks like we should replace this kzalloc with kmalloc.
Should be a separate patch ...
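
(i.e., something like

	evt = kmalloc(sizeof(*evt), GFP_KERNEL);

which assumes every field of *evt is assigned explicitly before use --
worth double-checking before making that change.)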

>  	if (evt) {
>  		evt->event.event = event;
>  		evt->event.reason = reason;
> +		evt->inflight = tcm_vhost_inc_inflight(vs);
> +		if (!evt->inflight) {

We drop an event because we ran out of memory earlier when allocating
the inflight counter. That does not make sense to me.

> +			kfree(evt);
> +			goto out;
> +		}
>  		vs->vs_events_nr++;
>  	}
>  	mutex_unlock(&vs->vs_events_lock);
>  
>  	return evt;
> +out:
> +	vs->vs_events_dropped = true;
> +	mutex_unlock(&vs->vs_events_lock);
> +	return NULL;
>  }
>  
>  static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> @@ -445,6 +504,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
>  		kfree(tv_cmd->tvc_sgl);
>  	}
>  
> +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> +
>  	kfree(tv_cmd);
>  }
>  
> @@ -595,6 +656,9 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
>  	tv_cmd->tvc_data_direction = data_direction;
>  	tv_cmd->tvc_nexus = tv_nexus;
>  	tv_cmd->tvc_vhost = vs;
> +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> +	if (!tv_cmd->inflight)
> +		return ERR_PTR(-ENOMEM);
>  
>  	return tv_cmd;
>  }
> @@ -982,12 +1046,35 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
>  
>  static void vhost_scsi_flush(struct vhost_scsi *vs)
>  {
> +	struct vhost_scsi_inflight *inflight;
>  	int i;
>  
> +	/* inflight points to the old inflight */
> +	inflight = rcu_dereference_protected(vs->vs_inflight,
> +					     lockdep_is_held(&vs->dev.mutex));
> +
> +	/* Allocate a new inflight and make vs->vs_inflight point to it */
> +	if (tcm_vhost_set_inflight(vs) < 0)
> +		pr_warn("vhost_scsi_flush failed to allocate inflight\n");

That's unlikely to reach the application. How about we stop here,
and propagate the error to the ioctl caller?

> +
> +	/*
> +	 * The inflight->kref was initialized to 1. We decrement it here to
> +	 * indicate the start of the flush operation so that it will reach 0
> +	 * when all the reqs are finished.
> +	 */
> +	kref_put(&inflight->kref, tcm_vhost_done_inflight);
> +
> +	/* Flush both the vhost poll and vhost work */
>  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
>  		vhost_scsi_flush_vq(vs, i);
>  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
>  	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> +
> +	/* Wait for all reqs issued before the flush to be finished */
> +	if (inflight) {

inflight should never be NULL, otherwise inflight
tracking is not effective. Please fix error handling so we
never reach here with inflight == NULL.

> +		wait_for_completion(&inflight->comp);
> +		kfree(inflight);
> +	}
>  }
>  
>  /*
> @@ -1196,6 +1283,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
>  	s->vs_events_dropped = false;
>  	mutex_init(&s->vs_events_lock);
>  
> +	if (tcm_vhost_set_inflight(s) < 0)
> +		return -ENOMEM;
> +

Better to propagate the return code to the user.

>  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
>  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
>  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> @@ -1221,6 +1311,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
>  	vhost_scsi_clear_endpoint(s, &t);
>  	vhost_dev_stop(&s->dev);
>  	vhost_dev_cleanup(&s->dev, false);
> +	kfree(s->vs_inflight);
>  	kfree(s);
>  	return 0;
>  }
> diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> index 94e9ee53..7567767 100644
> --- a/drivers/vhost/tcm_vhost.h
> +++ b/drivers/vhost/tcm_vhost.h
> @@ -2,6 +2,7 @@
>  #define TCM_VHOST_NAMELEN 256
>  #define TCM_VHOST_MAX_CDB_SIZE 32
>  
> +struct vhost_scsi_inflight;
>  struct tcm_vhost_cmd {
>  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
>  	int tvc_vq_desc;
> @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
>  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
>  	/* Completed commands list, serviced from vhost worker thread */
>  	struct llist_node tvc_completion_list;
> +	/* Used to track inflight req */
> +	struct vhost_scsi_inflight *inflight;
>  };
>  
>  struct tcm_vhost_nexus {
> @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
>  	struct virtio_scsi_event event;
>  	/* virtio_scsi event list, serviced from vhost worker thread */
>  	struct llist_node list;
> +	/* Used to track inflight req */
> +	struct vhost_scsi_inflight *inflight;
>  };
>  
>  /*
> -- 
> 1.8.1.4

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-16 17:58           ` Michael S. Tsirkin
@ 2013-04-17  1:29             ` Asias He
  2013-04-17 10:07               ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Asias He @ 2013-04-17  1:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Tue, Apr 16, 2013 at 08:58:27PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 16, 2013 at 05:16:51PM +0800, Asias He wrote:
> > This patch makes vhost_scsi_flush() wait for all the pending requests
> > issued before the flush operation to be finished.
> > 
> > Changes in v5:
> > - Use kref and completion
> > - Fail req if vs->vs_inflight is NULL
> > - Rename tcm_vhost_alloc_inflight to tcm_vhost_set_inflight
> > 
> > Changes in v4:
> > - Introduce vhost_scsi_inflight
> > - Drop array to track flush
> > - Use RCU to protect vs_inflight explicitly
> > 
> > Changes in v3:
> > - Rebase
> > - Drop 'tcm_vhost: Wait for pending requests in
> >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > 
> > Changes in v2:
> > - Increase/Decrease inflight requests in
> >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > 
> > Signed-off-by: Asias He <asias@redhat.com>
> 
> OK looks good, except error handling needs to be fixed.
> 
> > ---
> >  drivers/vhost/tcm_vhost.c | 101 +++++++++++++++++++++++++++++++++++++++++++---
> >  drivers/vhost/tcm_vhost.h |   5 +++
> >  2 files changed, 101 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > index 4ae6725..ef40a8f 100644
> > --- a/drivers/vhost/tcm_vhost.c
> > +++ b/drivers/vhost/tcm_vhost.c
> > @@ -74,6 +74,11 @@ enum {
> >  #define VHOST_SCSI_MAX_VQ	128
> >  #define VHOST_SCSI_MAX_EVENT	128
> >  
> > +struct vhost_scsi_inflight {
> > +	struct completion comp; /* Wait for the flush operation to finish */
> > +	struct kref kref; /* Refcount for the inflight reqs */
> > +};
> > +
> >  struct vhost_scsi {
> >  	/* Protected by vhost_scsi->dev.mutex */
> >  	struct tcm_vhost_tpg **vs_tpg;
> > @@ -91,6 +96,8 @@ struct vhost_scsi {
> >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> >  	bool vs_events_dropped; /* any missed events */
> >  	int vs_events_nr; /* num of pending events */
> > +
> > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight reqs */
> >  };
> >  
> >  /* Local pointer to allocated TCM configfs fabric module */
> > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> >  }
> >  
> > +static int tcm_vhost_set_inflight(struct vhost_scsi *vs)
> > +{
> > +	struct vhost_scsi_inflight *inflight;
> > +	int ret = -ENOMEM;
> > +
> > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> 
> kzalloc is not needed, you initialize all fields.

okay.

> > +	if (inflight) {
> > +		kref_init(&inflight->kref);
> > +		init_completion(&inflight->comp);
> > +		ret = 0;
> > +	}
> > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> 
> So if allocation fails, we stop tracking inflights?
>
> This looks strange, and could break guests. Why not the usual
> 	if (!inflight)
> 		return -ENOMEM;

If allocation fails, we abort further reqs. No need to track them.
Why would it break the guest, and how?

> > +	synchronize_rcu();
> 
> open call is different:
> 	- sync is not needed
> 	- should use RCU_INIT_POINTER and not rcu_assign_pointer
> 
> So please move these out and make this function return the struct:
> 	struct vhost_scsi_inflight *
> 	tcm_vhost_alloc_inflight(void)

synchronize_rcu is actually needed. 

   tcm_vhost_inc_inflight
   {
   
           rcu_read_lock();
           inflight = rcu_dereference(vs->vs_inflight);         
   
          /* 
   	   * Possible race window here:
           * if inflight points to old inflight and
           * wait_for_completion runs before we call kref_get here,
           * We may free the old inflight
           * however, there is still one in flight which should be 
           * tracked by the old inflight.
   	   */
   
           kref_get(&inflight->kref);
           rcu_read_unlock();
   
           return inflight;
   }
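
Spelling that out as a timeline -- the grace period is what guarantees
the kref_get on the old inflight is visible before the flush starts
waiting ('old' and 'new' name the pre-/post-swap pointers):

   /*
    *   request path                    flush path
    *   ------------                    ----------
    *   rcu_read_lock()
    *   inflight = old
    *                                   rcu_assign_pointer(vs_inflight, new)
    *                                   synchronize_rcu()  <-- blocks ...
    *   kref_get(&old->kref)
    *   rcu_read_unlock()
    *                                   ... until here; only then does the
    *                                   flush drop its reference and wait,
    *                                   so no late kref_get can race with
    *                                   freeing 'old'
    */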

> 
> > +
> > +	return ret;
> > +}
> > +
> > +static struct vhost_scsi_inflight *
> > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> 
> And then inc will not need to return inflight pointer,
> which is really unusual.

No, you still need to return inflight. You need it for each tcm_vhost_cmd or
tcm_vhost_evt.
 
> > +{
> > +	struct vhost_scsi_inflight *inflight;
> > +
> > +	rcu_read_lock();
> > +	inflight = rcu_dereference(vs->vs_inflight);
> > +	if (inflight)
> > +		kref_get(&inflight->kref);
> > +	rcu_read_unlock();
> > +
> > +	return inflight;
> > +}
> > +
> > +void tcm_vhost_done_inflight(struct kref *kref)
> > +{
> > +	struct vhost_scsi_inflight *inflight;
> > +
> > +	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
> > +	complete(&inflight->comp);
> > +}
> > +
> > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > +{
> > +	if (inflight)
> 
> Here as in other places, inflight must never be NULL.
> Pls fix code so that invariant holds.
> 
> > +		kref_put(&inflight->kref, tcm_vhost_done_inflight);
> > +}
> > +
> >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> >  {
> >  	bool ret = false;
> > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> >  {
> >  	mutex_lock(&vs->vs_events_lock);
> > +	tcm_vhost_dec_inflight(evt->inflight);
> >  	vs->vs_events_nr--;
> >  	kfree(evt);
> >  	mutex_unlock(&vs->vs_events_lock);
> > @@ -413,21 +466,27 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> >  	struct tcm_vhost_evt *evt;
> >  
> >  	mutex_lock(&vs->vs_events_lock);
> > -	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
> > -		vs->vs_events_dropped = true;
> > -		mutex_unlock(&vs->vs_events_lock);
> > -		return NULL;
> > -	}
> > +	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT)
> > +		goto out;
> >  
> >  	evt = kzalloc(sizeof(*evt), GFP_KERNEL);
> 
> BTW it looks like we should replace this kzalloc with kmalloc.
> Should be a separate patch ...

This belongs to the hotplug series. I will add it there.
 
> >  	if (evt) {
> >  		evt->event.event = event;
> >  		evt->event.reason = reason;
> > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> > +		if (!evt->inflight) {
> 
> We drop an event because we ran out of memory earlier when allocating
> the inflight counter. That does not make sense to me.

This tries to abort further reqs if we fail to allocate inflight.
 
> > +			kfree(evt);
> > +			goto out;
> > +		}
> >  		vs->vs_events_nr++;
> >  	}
> >  	mutex_unlock(&vs->vs_events_lock);
> >  
> >  	return evt;
> > +out:
> > +	vs->vs_events_dropped = true;
> > +	mutex_unlock(&vs->vs_events_lock);
> > +	return NULL;
> >  }
> >  
> >  static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > @@ -445,6 +504,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> >  		kfree(tv_cmd->tvc_sgl);
> >  	}
> >  
> > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > +
> >  	kfree(tv_cmd);
> >  }
> >  
> > @@ -595,6 +656,9 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> >  	tv_cmd->tvc_data_direction = data_direction;
> >  	tv_cmd->tvc_nexus = tv_nexus;
> >  	tv_cmd->tvc_vhost = vs;
> > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> > +	if (!tv_cmd->inflight)
> > +		return ERR_PTR(-ENOMEM);
> >  
> >  	return tv_cmd;
> >  }
> > @@ -982,12 +1046,35 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> >  
> >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> >  {
> > +	struct vhost_scsi_inflight *inflight;
> >  	int i;
> >  
> > +	/* inflight points to the old inflight */
> > +	inflight = rcu_dereference_protected(vs->vs_inflight,
> > +					     lockdep_is_held(&vs->dev.mutex));
> > +
> > +	/* Allocate a new inflight and make vs->vs_inflight point to it */
> > +	if (tcm_vhost_set_inflight(vs) < 0)
> > +		pr_warn("vhost_scsi_flush failed to allocate inflight\n");
> 
> That's unlikely to reach the application. How about we stop here,
> and propagate the error to the ioctl caller?

What can the user application do in this case? Especially since
vhost_scsi_flush fails when the user tries to shut down the guest.

What we are doing now is safer than just stopping here, because
1) we still flush all the existing reqs, and
2) further reqs will be aborted.
 
> > +
> > +	/*
> > +	 * The inflight->kref was initialized to 1. We decrement it here to
> > +	 * indicate the start of the flush operation so that it will reach 0
> > +	 * when all the reqs are finished.
> > +	 */
> > +	kref_put(&inflight->kref, tcm_vhost_done_inflight);
> > +
> > +	/* Flush both the vhost poll and vhost work */
> >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> >  		vhost_scsi_flush_vq(vs, i);
> >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> >  	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > +
> > +	/* Wait for all reqs issued before the flush to be finished */
> > +	if (inflight) {
> 
> inflight should never be NULL, otherwise inflight
> tracking is not effective. Please fix error handling so we
> never reach here with inflight == NULL.

It is effective. We abort the req if we can not track it.
 
> > +		wait_for_completion(&inflight->comp);
> > +		kfree(inflight);
> > +	}
> >  }
> >  
> >  /*
> > @@ -1196,6 +1283,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> >  	s->vs_events_dropped = false;
> >  	mutex_init(&s->vs_events_lock);
> >  
> > +	if (tcm_vhost_set_inflight(s) < 0)
> > +		return -ENOMEM;
> > +
> 
> Better to propagate the return code to the user.

We are returning -ENOMEM to the user, no?
 
> >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > @@ -1221,6 +1311,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> >  	vhost_scsi_clear_endpoint(s, &t);
> >  	vhost_dev_stop(&s->dev);
> >  	vhost_dev_cleanup(&s->dev, false);
> > +	kfree(s->vs_inflight);
> >  	kfree(s);
> >  	return 0;
> >  }
> > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > index 94e9ee53..7567767 100644
> > --- a/drivers/vhost/tcm_vhost.h
> > +++ b/drivers/vhost/tcm_vhost.h
> > @@ -2,6 +2,7 @@
> >  #define TCM_VHOST_NAMELEN 256
> >  #define TCM_VHOST_MAX_CDB_SIZE 32
> >  
> > +struct vhost_scsi_inflight;
> >  struct tcm_vhost_cmd {
> >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> >  	int tvc_vq_desc;
> > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> >  	/* Completed commands list, serviced from vhost worker thread */
> >  	struct llist_node tvc_completion_list;
> > +	/* Used to track inflight req */
> > +	struct vhost_scsi_inflight *inflight;
> >  };
> >  
> >  struct tcm_vhost_nexus {
> > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> >  	struct virtio_scsi_event event;
> >  	/* virtio_scsi event list, serviced from vhost worker thread */
> >  	struct llist_node list;
> > +	/* Used to track inflight req */
> > +	struct vhost_scsi_inflight *inflight;
> >  };
> >  
> >  /*
> > -- 
> > 1.8.1.4

-- 
Asias

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-17  1:29             ` Asias He
@ 2013-04-17 10:07               ` Michael S. Tsirkin
  2013-04-17 12:07                 ` Asias He
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2013-04-17 10:07 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Wed, Apr 17, 2013 at 09:29:53AM +0800, Asias He wrote:
> On Tue, Apr 16, 2013 at 08:58:27PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 16, 2013 at 05:16:51PM +0800, Asias He wrote:
> > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > issued before the flush operation to be finished.
> > > 
> > > Changes in v5:
> > > - Use kref and completion
> > > - Fail req if vs->vs_inflight is NULL
> > > - Rename tcm_vhost_alloc_inflight to tcm_vhost_set_inflight
> > > 
> > > Changes in v4:
> > > - Introduce vhost_scsi_inflight
> > > - Drop array to track flush
> > > - Use RCU to protect vs_inflight explicitly
> > > 
> > > Changes in v3:
> > > - Rebase
> > > - Drop 'tcm_vhost: Wait for pending requests in
> > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > 
> > > Changes in v2:
> > > - Increase/Decrease inflight requests in
> > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > 
> > > Signed-off-by: Asias He <asias@redhat.com>
> > 
> > OK looks good, except error handling needs to be fixed.

Let me be more explicit here. There are two rules that this violates:
	an ioctl should either succeed and return code >= 0,
	or fail and return code < 0. In the latter case it should not
	have any effect.
		This patch instead leaves the device in a bad state
		on an OOM error.

	There should be 1 way to test started/stopped state,
	and that is backend != NULL.


So how to fix this? As I said, split up tcm_vhost_set_inflight.
Have tcm_vhost_init_inflight instead, to simply init the counters
without the alloc and the RCU things. Then, in the set features ioctl,
do the allocations that can fail before you change state. Then set the
features and, after a barrier, set the inflight pointer.
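
Something along these lines (a sketch of the suggestion only -- the
helper split and the exact barrier placement are assumptions here, not
a tested patch):

	/* init cannot fail: no allocation, no RCU */
	static void tcm_vhost_init_inflight(struct vhost_scsi_inflight *inflight)
	{
		kref_init(&inflight->kref);
		init_completion(&inflight->comp);
	}

	static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
	{
		struct vhost_scsi_inflight *new;

		/* the allocation that can fail happens before any state
		 * changes, so the ioctl either succeeds or has no effect */
		new = kmalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return -ENOMEM;
		tcm_vhost_init_inflight(new);

		mutex_lock(&vs->dev.mutex);
		/* ... set the feature bits ... */
		smp_wmb();	/* publish features before the new counter */
		rcu_assign_pointer(vs->vs_inflight, new);
		mutex_unlock(&vs->dev.mutex);
		/* draining of the old inflight (the flush) omitted */
		return 0;
	}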


> > > ---
> > >  drivers/vhost/tcm_vhost.c | 101 +++++++++++++++++++++++++++++++++++++++++++---
> > >  drivers/vhost/tcm_vhost.h |   5 +++
> > >  2 files changed, 101 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > > index 4ae6725..ef40a8f 100644
> > > --- a/drivers/vhost/tcm_vhost.c
> > > +++ b/drivers/vhost/tcm_vhost.c
> > > @@ -74,6 +74,11 @@ enum {
> > >  #define VHOST_SCSI_MAX_VQ	128
> > >  #define VHOST_SCSI_MAX_EVENT	128
> > >  
> > > +struct vhost_scsi_inflight {
> > > +	struct completion comp; /* Wait for the flush operation to finish */
> > > +	struct kref kref; /* Refcount for the inflight reqs */
> > > +};
> > > +
> > >  struct vhost_scsi {
> > >  	/* Protected by vhost_scsi->dev.mutex */
> > >  	struct tcm_vhost_tpg **vs_tpg;
> > > @@ -91,6 +96,8 @@ struct vhost_scsi {
> > >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> > >  	bool vs_events_dropped; /* any missed events */
> > >  	int vs_events_nr; /* num of pending events */
> > > +
> > > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight reqs */
> > >  };
> > >  
> > >  /* Local pointer to allocated TCM configfs fabric module */
> > > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> > >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> > >  }
> > >  
> > > +static int tcm_vhost_set_inflight(struct vhost_scsi *vs)
> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +	int ret = -ENOMEM;
> > > +
> > > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> > 
> > kzalloc is not needed, you initialize all fields.
> 
> okay.
> 
> > > +	if (inflight) {
> > > +		kref_init(&inflight->kref);
> > > +		init_completion(&inflight->comp);
> > > +		ret = 0;
> > > +	}
> > > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> > 
> > So if allocation fails, we stop tracking inflights?
> >
> > This looks strange, and could break guests. Why not the usual
> > 	if (!inflight)
> > 		return -ENOMEM;
> 
> If allocation fails, we abort further reqs.
> No need to track.
> > Why will it break the guest, and how?

Well if no commands succeed, presumably storage does not work?
Also, you need to add a comment about this hack.

But isn't failing the ioctl cleaner?
There will be less code on the data path and fewer tricks.

> > > +	synchronize_rcu();
> > 
> > open call is different:
> > 	- sync is not needed
> > 	- should use RCU_INIT_POINTER and not rcu_assign_pointer
> > 
> > So please move these out and make this function return the struct:
> > 	struct vhost_scsi_inflight *
> > 	tcm_vhost_alloc_inflight(void)
> 
> synchronize_rcu is actually needed. 

It's not needed for open: there's no old inflight there.

>    tcm_vhost_inc_inflight
>    {
>            struct vhost_scsi_inflight *inflight;
>
>            rcu_read_lock();
>            inflight = rcu_dereference(vs->vs_inflight);
>
>            /*
>             * Possible race window here: if inflight points to the old
>             * inflight and wait_for_completion runs before we call
>             * kref_get here, we may free the old inflight even though
>             * there is still one request in flight that should be
>             * tracked by it.
>             */
>
>            kref_get(&inflight->kref);
>            rcu_read_unlock();
>
>            return inflight;
>    }


By the way, I see a bug. vhost_scsi_set_features does an smp_wmb
and that is not paired with anything. I think we need to pair
it with an rmb before get_inflight, and add a comment about the
pairing in both places.
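
Schematically, the pairing I mean looks like this (names are made up,
just to show the pattern):

	/* writer */
	shared_data = val;
	smp_wmb();	/* pairs with the smp_rmb() in the reader */
	data_ready = 1;

	/* reader */
	if (data_ready) {
		smp_rmb();	/* pairs with the smp_wmb() in the writer */
		use(shared_data);
	}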

> > 
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +static struct vhost_scsi_inflight *
> > > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> > 
> > And then inc will not need to return inflight pointer,
> > which is really unusual.
> 
> No, you still need to return inflight. You need it for each tcm_vhost_cmd or
> tcm_vhost_evt. 

That's true. But it's a strange thing to do on increment.
Please rename inc to get and dec to put to make this
clear.
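
I.e. something like this (sketch only):

	static struct vhost_scsi_inflight *
	tcm_vhost_get_inflight(struct vhost_scsi *vs)
	{
		struct vhost_scsi_inflight *inflight;

		rcu_read_lock();
		inflight = rcu_dereference(vs->vs_inflight);
		kref_get(&inflight->kref);
		rcu_read_unlock();

		return inflight;
	}

	static void tcm_vhost_put_inflight(struct vhost_scsi_inflight *inflight)
	{
		kref_put(&inflight->kref, tcm_vhost_done_inflight);
	}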

> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	rcu_read_lock();
> > > +	inflight = rcu_dereference(vs->vs_inflight);
> > > +	if (inflight)
> > > +		kref_get(&inflight->kref);
> > > +	rcu_read_unlock();
> > > +
> > > +	return inflight;
> > > +}
> > > +
> > > +void tcm_vhost_done_inflight(struct kref *kref)
> > > +{
> > > +	struct vhost_scsi_inflight *inflight;
> > > +
> > > +	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
> > > +	complete(&inflight->comp);
> > > +}
> > > +
> > > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > > +{
> > > +	if (inflight)
> > 
> > Here as in other places, inflight must never be NULL.
> > Please fix the code so that the invariant holds.
> > 
> > > +		kref_put(&inflight->kref, tcm_vhost_done_inflight);
> > > +}
> > > +
> > >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> > >  {
> > >  	bool ret = false;
> > > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> > >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> > >  {
> > >  	mutex_lock(&vs->vs_events_lock);
> > > +	tcm_vhost_dec_inflight(evt->inflight);
> > >  	vs->vs_events_nr--;
> > >  	kfree(evt);
> > >  	mutex_unlock(&vs->vs_events_lock);
> > > @@ -413,21 +466,27 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> > >  	struct tcm_vhost_evt *evt;
> > >  
> > >  	mutex_lock(&vs->vs_events_lock);
> > > -	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
> > > -		vs->vs_events_dropped = true;
> > > -		mutex_unlock(&vs->vs_events_lock);
> > > -		return NULL;
> > > -	}
> > > +	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT)
> > > +		goto out;
> > >  
> > >  	evt = kzalloc(sizeof(*evt), GFP_KERNEL);
> > 
> > BTW it looks like we should replace this kzalloc with kmalloc.
> > Should be a separate patch ...
> 
> This belongs to the hotplug series. I will add to it.
>  
> > >  	if (evt) {
> > >  		evt->event.event = event;
> > >  		evt->event.reason = reason;
> > > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> > > +		if (!evt->inflight) {
> > 
> > We drop an event because earlier
> > we ran out of memory for allocating the inflight counter.
> > That does not make sense to me.
> 
> This tries to abort further reqs if we fail to allocate inflight.

Yes, it prevents a crash, but it's better to tell the user that
something's wrong.

> > > +			kfree(evt);
> > > +			goto out;
> > > +		}
> > >  		vs->vs_events_nr++;
> > >  	}
> > >  	mutex_unlock(&vs->vs_events_lock);
> > >  
> > >  	return evt;
> > > +out:
> > > +	vs->vs_events_dropped = true;
> > > +	mutex_unlock(&vs->vs_events_lock);
> > > +	return NULL;
> > >  }
> > >  
> > >  static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > > @@ -445,6 +504,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > >  		kfree(tv_cmd->tvc_sgl);
> > >  	}
> > >  
> > > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > > +
> > >  	kfree(tv_cmd);
> > >  }
> > >  
> > > @@ -595,6 +656,9 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> > >  	tv_cmd->tvc_data_direction = data_direction;
> > >  	tv_cmd->tvc_nexus = tv_nexus;
> > >  	tv_cmd->tvc_vhost = vs;
> > > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> > > +	if (!tv_cmd->inflight)
> > > +		return ERR_PTR(-ENOMEM);
> > >  
> > >  	return tv_cmd;
> > >  }
> > > @@ -982,12 +1046,35 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> > >  
> > >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> > >  {
> > > +	struct vhost_scsi_inflight *inflight;
> > >  	int i;
> > >  
> > > +	/* inflight points to the old inflight */
> > > +	inflight = rcu_dereference_protected(vs->vs_inflight,
> > > +					     lockdep_is_held(&vs->dev.mutex));
> > > +
> > > +	/* Allocate a new inflight and make vs->vs_inflight point to it */
> > > +	if (tcm_vhost_set_inflight(vs) < 0)
> > > +		pr_warn("vhost_scsi_flush failed to allocate inflight\n");
> > 
> > That's unlikely to reach the application. How about we stop here,
> > and propagate the error to the ioctl caller?
> 
> What can a user application do in this case? Especially when vhost_scsi_flush
> fails while the user tries to shut down the guest.

I am not sure why you talk about close. The only thing calling flush
at the moment is the set features ioctl.

But let's assume you have another patch that calls flush
on close. Then this is a bug: close should not try to allocate
memory.  On close, we really should just set private data to NULL.
Then we know no new requests will be submitted.
No need to change inflight at all.

> What we are doing now is safer than just stopping here, because:
> 1) we still flush all the existing reqs;
> 2) further reqs will be aborted.


Stop really should stop processing new requests, not cancel them.  If
you cancel on close, we will not be able to stop the kernel side and then
restart without errors.

> > > +
> > > +	/*
> > > +	 * The inflight->kref was initialized to 1. We decrement it here to
> > > +	 * indicate the start of the flush operation so that it will reach 0
> > > +	 * when all the reqs are finished.
> > > +	 */
> > > +	kref_put(&inflight->kref, tcm_vhost_done_inflight);
> > > +
> > > +	/* Flush both the vhost poll and vhost work */
> > >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> > >  		vhost_scsi_flush_vq(vs, i);
> > >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> > >  	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > > +
> > > +	/* Wait for all reqs issued before the flush to be finished */
> > > +	if (inflight) {
> > 
> > inflight should never be NULL, otherwise inflight
> > tracking is not effective. Please fix the error handling so we
> > never reach here with inflight == NULL.
> 
> It is effective. We abort the req if we cannot track it.

Yes. But that's guest visible, isn't it?

> > > +		wait_for_completion(&inflight->comp);
> > > +		kfree(inflight);
> > > +	}
> > >  }
> > >  
> > >  /*
> > > @@ -1196,6 +1283,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> > >  	s->vs_events_dropped = false;
> > >  	mutex_init(&s->vs_events_lock);
> > >  
> > > +	if (tcm_vhost_set_inflight(s) < 0)
> > > +		return -ENOMEM;
> > > +
> > 
> > Better propagate the return code to user.
> 
> We are returning -ENOMEM to the user, no?

I mean if you call a function that returns 0 or an errno,
you should do:

	r = tcm_vhost_set_inflight(vs);
	if (r)
		return r;

then it's easier to add more error codes in
tcm_vhost_set_inflight if needed.

However, this was a general comment; I think tcm_vhost_set_inflight
should be split up so it won't return an int.

> > >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> > >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> > >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > > @@ -1221,6 +1311,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> > >  	vhost_scsi_clear_endpoint(s, &t);
> > >  	vhost_dev_stop(&s->dev);
> > >  	vhost_dev_cleanup(&s->dev, false);
> > > +	kfree(s->vs_inflight);
> > >  	kfree(s);
> > >  	return 0;
> > >  }
> > > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > > index 94e9ee53..7567767 100644
> > > --- a/drivers/vhost/tcm_vhost.h
> > > +++ b/drivers/vhost/tcm_vhost.h
> > > @@ -2,6 +2,7 @@
> > >  #define TCM_VHOST_NAMELEN 256
> > >  #define TCM_VHOST_MAX_CDB_SIZE 32
> > >  
> > > +struct vhost_scsi_inflight;
> > >  struct tcm_vhost_cmd {
> > >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> > >  	int tvc_vq_desc;
> > > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> > >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> > >  	/* Completed commands list, serviced from vhost worker thread */
> > >  	struct llist_node tvc_completion_list;
> > > +	/* Used to track inflight req */
> > > +	struct vhost_scsi_inflight *inflight;
> > >  };
> > >  
> > >  struct tcm_vhost_nexus {
> > > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> > >  	struct virtio_scsi_event event;
> > >  	/* virtio_scsi event list, serviced from vhost worker thread */
> > >  	struct llist_node list;
> > > +	/* Used to track inflight req */
> > > +	struct vhost_scsi_inflight *inflight;
> > >  };
> > >  
> > >  /*
> > > -- 
> > > 1.8.1.4
> 
> -- 
> Asias


* Re: [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush()
  2013-04-17 10:07               ` Michael S. Tsirkin
@ 2013-04-17 12:07                 ` Asias He
  0 siblings, 0 replies; 28+ messages in thread
From: Asias He @ 2013-04-17 12:07 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, virtualization, target-devel, Stefan Hajnoczi, Paolo Bonzini

On Wed, Apr 17, 2013 at 01:07:56PM +0300, Michael S. Tsirkin wrote:
> On Wed, Apr 17, 2013 at 09:29:53AM +0800, Asias He wrote:
> > On Tue, Apr 16, 2013 at 08:58:27PM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 16, 2013 at 05:16:51PM +0800, Asias He wrote:
> > > > This patch makes vhost_scsi_flush() wait for all the pending requests
> > > > issued before the flush operation to be finished.
> > > > 
> > > > Changes in v5:
> > > > - Use kref and completion
> > > > - Fail req if vs->vs_inflight is NULL
> > > > - Rename tcm_vhost_alloc_inflight to tcm_vhost_set_inflight
> > > > 
> > > > Changes in v4:
> > > > - Introduce vhost_scsi_inflight
> > > > - Drop array to track flush
> > > > - Use RCU to protect vs_inflight explicitly
> > > > 
> > > > Changes in v3:
> > > > - Rebase
> > > > - Drop 'tcm_vhost: Wait for pending requests in
> > > >   vhost_scsi_clear_endpoint()' in this series, we already did that in
> > > >   'tcm_vhost: Use vq->private_data to indicate if the endpoint is setup'
> > > > 
> > > > Changes in v2:
> > > > - Increase/Decrease inflight requests in
> > > >   vhost_scsi_{allocate,free}_cmd and tcm_vhost_{allocate,free}_evt
> > > > 
> > > > Signed-off-by: Asias He <asias@redhat.com>
> > > 
> > > OK looks good, except error handling needs to be fixed.
> 
> Let me be more explicit here. There are two rules that this violates:
> 	an ioctl should either succeed and return code >= 0,
> 	or fail and return code < 0. In the latter case it should not
> 	have any effect.
> 		This patch instead leaves the device in a bad state
> 		on an OOM error.
> 
> 	There should be 1 way to test started/stopped state,
> 	and that is backend != NULL.
> 
> 
> So how to fix this? As I said, split up tcm_vhost_set_inflight.
> Have tcm_vhost_init_inflight instead, to simply init the counters
> without the alloc and the RCU things. Then, in the set features ioctl,
> do the allocations that can fail before you change any state.
> Now set the features and, after a barrier, set inflight.

Do you want to do all this in the other places where vhost_scsi_flush is
called? It's a pain.

> 
> > > > ---
> > > >  drivers/vhost/tcm_vhost.c | 101 +++++++++++++++++++++++++++++++++++++++++++---
> > > >  drivers/vhost/tcm_vhost.h |   5 +++
> > > >  2 files changed, 101 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> > > > index 4ae6725..ef40a8f 100644
> > > > --- a/drivers/vhost/tcm_vhost.c
> > > > +++ b/drivers/vhost/tcm_vhost.c
> > > > @@ -74,6 +74,11 @@ enum {
> > > >  #define VHOST_SCSI_MAX_VQ	128
> > > >  #define VHOST_SCSI_MAX_EVENT	128
> > > >  
> > > > +struct vhost_scsi_inflight {
> > > > +	struct completion comp; /* Wait for the flush operation to finish */
> > > > +	struct kref kref; /* Refcount for the inflight reqs */
> > > > +};
> > > > +
> > > >  struct vhost_scsi {
> > > >  	/* Protected by vhost_scsi->dev.mutex */
> > > >  	struct tcm_vhost_tpg **vs_tpg;
> > > > @@ -91,6 +96,8 @@ struct vhost_scsi {
> > > >  	struct mutex vs_events_lock; /* protect vs_events_dropped,events_nr */
> > > >  	bool vs_events_dropped; /* any missed events */
> > > >  	int vs_events_nr; /* num of pending events */
> > > > +
> > > > +	struct vhost_scsi_inflight __rcu *vs_inflight; /* track inflight reqs */
> > > >  };
> > > >  
> > > >  /* Local pointer to allocated TCM configfs fabric module */
> > > > @@ -108,6 +115,51 @@ static int iov_num_pages(struct iovec *iov)
> > > >  	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
> > > >  }
> > > >  
> > > > +static int tcm_vhost_set_inflight(struct vhost_scsi *vs)
> > > > +{
> > > > +	struct vhost_scsi_inflight *inflight;
> > > > +	int ret = -ENOMEM;
> > > > +
> > > > +	inflight = kzalloc(sizeof(*inflight), GFP_KERNEL);
> > > 
> > > kzalloc is not needed, you initialize all fields.
> > 
> > okay.
> > 
> > > > +	if (inflight) {
> > > > +		kref_init(&inflight->kref);
> > > > +		init_completion(&inflight->comp);
> > > > +		ret = 0;
> > > > +	}
> > > > +	rcu_assign_pointer(vs->vs_inflight, inflight);
> > > 
> > > So if allocation fails, we stop tracking inflights?
> > >
> > > This looks strange, and could break guests. Why not the usual
> > > 	if (!inflight)
> > > 		return -ENOMEM;
> > 
> > If allocation fails, we abort further reqs.
> > No need to track.
> > Why will it break the guest, and how?
> 
> Well if no commands succeed, presumably storage does not work?
> Also, you need to add a comment about this hack.
> 
> But isn't failing the ioctl cleaner?
> There will be less code on the data path and fewer tricks.

The point is: what will you do when the allocation fails in flush? You
cannot do it in a cleaner way.

> > > > +	synchronize_rcu();
> > > 
> > > open call is different:
> > > 	- sync is not needed
> > > 	- should use RCU_INIT_POINTER and not rcu_assign_pointer
> > > 
> > > So please move these out and make this function return the struct:
> > > 	struct vhost_scsi_inflight *
> > > 	tcm_vhost_alloc_inflight(void)
> > 
> > synchronize_rcu is actually needed. 
> 
> It's not needed for open: there's no old inflight there.

That's true. We can move it out. But it has no effect in open.
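
So the open path would just be something like (a sketch, assuming the
split helpers you suggested):

	/*
	 * open: no concurrent readers yet, so no synchronize_rcu() and
	 * RCU_INIT_POINTER() is enough
	 */
	inflight = tcm_vhost_alloc_inflight();
	if (!inflight)
		return -ENOMEM;
	RCU_INIT_POINTER(s->vs_inflight, inflight);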

> >    tcm_vhost_inc_inflight
> >    {
> >            struct vhost_scsi_inflight *inflight;
> >
> >            rcu_read_lock();
> >            inflight = rcu_dereference(vs->vs_inflight);
> >
> >            /*
> >             * Possible race window here: if inflight points to the old
> >             * inflight and wait_for_completion runs before we call
> >             * kref_get here, we may free the old inflight even though
> >             * there is still one request in flight that should be
> >             * tracked by it.
> >             */
> >
> >            kref_get(&inflight->kref);
> >            rcu_read_unlock();
> >
> >            return inflight;
> >    }
> 
> 
> By the way, I see a bug. vhost_scsi_set_features does an smp_wmb
> and that is not paired with anything. I think we need to pair
> it with an rmb before get_inflight, and add a comment about the
> pairing in both places.

Let's fix it in other patches.

> > > 
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static struct vhost_scsi_inflight *
> > > > +tcm_vhost_inc_inflight(struct vhost_scsi *vs)
> > > 
> > > And then inc will not need to return inflight pointer,
> > > which is really unusual.
> > 
> > No, you still need to return inflight. You need it for each tcm_vhost_cmd or
> > tcm_vhost_evt. 
> 
> That's true. But it's a strange thing to do on increment.
> Please rename inc to get and dec to put to make this
> clear.

So we still need it. Okay, let's do the rename.

> > > > +{
> > > > +	struct vhost_scsi_inflight *inflight;
> > > > +
> > > > +	rcu_read_lock();
> > > > +	inflight = rcu_dereference(vs->vs_inflight);
> > > > +	if (inflight)
> > > > +		kref_get(&inflight->kref);
> > > > +	rcu_read_unlock();
> > > > +
> > > > +	return inflight;
> > > > +}
> > > > +
> > > > +void tcm_vhost_done_inflight(struct kref *kref)
> > > > +{
> > > > +	struct vhost_scsi_inflight *inflight;
> > > > +
> > > > +	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
> > > > +	complete(&inflight->comp);
> > > > +}
> > > > +
> > > > +static void tcm_vhost_dec_inflight(struct vhost_scsi_inflight *inflight)
> > > > +{
> > > > +	if (inflight)
> > > 
> > > Here as in other places, inflight must never be NULL.
> > > Please fix the code so that the invariant holds.
> > > 
> > > > +		kref_put(&inflight->kref, tcm_vhost_done_inflight);
> > > > +}
> > > > +
> > > >  static bool tcm_vhost_check_feature(struct vhost_scsi *vs, int feature)
> > > >  {
> > > >  	bool ret = false;
> > > > @@ -402,6 +454,7 @@ static int tcm_vhost_queue_tm_rsp(struct se_cmd *se_cmd)
> > > >  static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
> > > >  {
> > > >  	mutex_lock(&vs->vs_events_lock);
> > > > +	tcm_vhost_dec_inflight(evt->inflight);
> > > >  	vs->vs_events_nr--;
> > > >  	kfree(evt);
> > > >  	mutex_unlock(&vs->vs_events_lock);
> > > > @@ -413,21 +466,27 @@ static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
> > > >  	struct tcm_vhost_evt *evt;
> > > >  
> > > >  	mutex_lock(&vs->vs_events_lock);
> > > > -	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
> > > > -		vs->vs_events_dropped = true;
> > > > -		mutex_unlock(&vs->vs_events_lock);
> > > > -		return NULL;
> > > > -	}
> > > > +	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT)
> > > > +		goto out;
> > > >  
> > > >  	evt = kzalloc(sizeof(*evt), GFP_KERNEL);
> > > 
> > > BTW it looks like we should replace this kzalloc with kmalloc.
> > > Should be a separate patch ...
> > 
> > This belongs to the hotplug series. I will add to it.
> >  
> > > >  	if (evt) {
> > > >  		evt->event.event = event;
> > > >  		evt->event.reason = reason;
> > > > +		evt->inflight = tcm_vhost_inc_inflight(vs);
> > > > +		if (!evt->inflight) {
> > > 
> > > We drop an event because earlier
> > > we ran out of memory for allocating the inflight counter.
> > > That does not make sense to me.
> > 
> > This tries to abort further reqs if we fail to allocate inflight.
> 
> Yes, it prevents a crash, but it's better to tell the user that
> something's wrong.

So crash it?

> > > > +			kfree(evt);
> > > > +			goto out;
> > > > +		}
> > > >  		vs->vs_events_nr++;
> > > >  	}
> > > >  	mutex_unlock(&vs->vs_events_lock);
> > > >  
> > > >  	return evt;
> > > > +out:
> > > > +	vs->vs_events_dropped = true;
> > > > +	mutex_unlock(&vs->vs_events_lock);
> > > > +	return NULL;
> > > >  }
> > > >  
> > > >  static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > > > @@ -445,6 +504,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
> > > >  		kfree(tv_cmd->tvc_sgl);
> > > >  	}
> > > >  
> > > > +	tcm_vhost_dec_inflight(tv_cmd->inflight);
> > > > +
> > > >  	kfree(tv_cmd);
> > > >  }
> > > >  
> > > > @@ -595,6 +656,9 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
> > > >  	tv_cmd->tvc_data_direction = data_direction;
> > > >  	tv_cmd->tvc_nexus = tv_nexus;
> > > >  	tv_cmd->tvc_vhost = vs;
> > > > +	tv_cmd->inflight = tcm_vhost_inc_inflight(vs);
> > > > +	if (!tv_cmd->inflight)
> > > > +		return ERR_PTR(-ENOMEM);
> > > >  
> > > >  	return tv_cmd;
> > > >  }
> > > > @@ -982,12 +1046,35 @@ static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
> > > >  
> > > >  static void vhost_scsi_flush(struct vhost_scsi *vs)
> > > >  {
> > > > +	struct vhost_scsi_inflight *inflight;
> > > >  	int i;
> > > >  
> > > > +	/* inflight points to the old inflight */
> > > > +	inflight = rcu_dereference_protected(vs->vs_inflight,
> > > > +					     lockdep_is_held(&vs->dev.mutex));
> > > > +
> > > > +	/* Allocate a new inflight and make vs->vs_inflight point to it */
> > > > +	if (tcm_vhost_set_inflight(vs) < 0)
> > > > +		pr_warn("vhost_scsi_flush failed to allocate inflight\n");
> > > 
> > > That's unlikely to reach the application. How about we stop here,
> > > and propagate the error to the ioctl caller?
> > 
> > What can a user application do in this case? Especially when vhost_scsi_flush
> > fails while the user tries to shut down the guest.
> 
> I am not sure why you talk about close. The only thing calling flush
> at the moment is the set features ioctl.

Which code are you looking at? Try

   git show linus/master:drivers/vhost/tcm_vhost.c

Flush is called in vhost_scsi_set_endpoint and vhost_scsi_clear_endpoint
as well.

> But let's assume you have another patch that calls flush
> on close. Then this is a bug: close should not try to allocate
> memory.  On close, we really should just set private data to NULL.
> Then we know no new requests will be submitted.
> No need to change inflight at all.

Well, do you need vhost_scsi_clear_endpoint on close?
Who wanted to allocate memory in flush in the first place?

> > What we are doing now is safer than just stopping here, because:
> > 1) we still flush all the existing reqs;
> > 2) further reqs will be aborted.
> 
> 
> Stop really should stop processing new requests, not cancel them.  If
> you cancel on close, we will not be able to stop the kernel side and then
> restart without errors.

What's the difference between 'stop processing new requests' and 'cancel
them'? On close, the vq->private_data is set to NULL. No new reqs will be
queued, so we can stop.
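
The handler already bails out early in that case, roughly (sketch):

	/* request handler: stop, don't cancel, once the backend is cleared */
	vs_tpg = rcu_dereference_check(vq->private_data, 1);
	if (!vs_tpg)
		return;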

> > > > +
> > > > +	/*
> > > > +	 * The inflight->kref was initialized to 1. We decrement it here to
> > > > +	 * indicate the start of the flush operation so that it will reach 0
> > > > +	 * when all the reqs are finished.
> > > > +	 */
> > > > +	kref_put(&inflight->kref, tcm_vhost_done_inflight);
> > > > +
> > > > +	/* Flush both the vhost poll and vhost work */
> > > >  	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
> > > >  		vhost_scsi_flush_vq(vs, i);
> > > >  	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
> > > >  	vhost_work_flush(&vs->dev, &vs->vs_event_work);
> > > > +
> > > > +	/* Wait for all reqs issued before the flush to be finished */
> > > > +	if (inflight) {
> > > 
> > > inflight should never be NULL, otherwise inflight
> > > tracking is not effective. Please fix the error handling so we
> > > never reach here with inflight == NULL.
> > 
> > It is effective. We abort the req if we cannot track it.
> 
> Yes. But that's guest visible, isn't it?

What else does the host not know?

> > > > +		wait_for_completion(&inflight->comp);
> > > > +		kfree(inflight);
> > > > +	}
> > > >  }
> > > >  
> > > >  /*
> > > > @@ -1196,6 +1283,9 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
> > > >  	s->vs_events_dropped = false;
> > > >  	mutex_init(&s->vs_events_lock);
> > > >  
> > > > +	if (tcm_vhost_set_inflight(s) < 0)
> > > > +		return -ENOMEM;
> > > > +
> > > 
> > > Better propagate the return code to user.
> > 
> > We are returning -ENOMEM to the user, no?
> 
> I mean if you call a function that returns 0 or an errno,
> you should do:
>
> 	r = tcm_vhost_set_inflight(vs);
> 	if (r)
> 		return r;
> 
> then it's easier to add more error codes in
> tcm_vhost_set_inflight if needed.

This is true, but what error code do you want to add?

> However, this was a general comment; I think tcm_vhost_set_inflight
> should be split up so it won't return an int.

What do you really want?

> > > >  	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
> > > >  	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> > > >  	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
> > > > @@ -1221,6 +1311,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
> > > >  	vhost_scsi_clear_endpoint(s, &t);
> > > >  	vhost_dev_stop(&s->dev);
> > > >  	vhost_dev_cleanup(&s->dev, false);
> > > > +	kfree(s->vs_inflight);
> > > >  	kfree(s);
> > > >  	return 0;
> > > >  }
> > > > diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> > > > index 94e9ee53..7567767 100644
> > > > --- a/drivers/vhost/tcm_vhost.h
> > > > +++ b/drivers/vhost/tcm_vhost.h
> > > > @@ -2,6 +2,7 @@
> > > >  #define TCM_VHOST_NAMELEN 256
> > > >  #define TCM_VHOST_MAX_CDB_SIZE 32
> > > >  
> > > > +struct vhost_scsi_inflight;
> > > >  struct tcm_vhost_cmd {
> > > >  	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
> > > >  	int tvc_vq_desc;
> > > > @@ -37,6 +38,8 @@ struct tcm_vhost_cmd {
> > > >  	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
> > > >  	/* Completed commands list, serviced from vhost worker thread */
> > > >  	struct llist_node tvc_completion_list;
> > > > +	/* Used to track inflight req */
> > > > +	struct vhost_scsi_inflight *inflight;
> > > >  };
> > > >  
> > > >  struct tcm_vhost_nexus {
> > > > @@ -91,6 +94,8 @@ struct tcm_vhost_evt {
> > > >  	struct virtio_scsi_event event;
> > > >  	/* virtio_scsi event list, serviced from vhost worker thread */
> > > >  	struct llist_node list;
> > > > +	/* Used to track inflight req */
> > > > +	struct vhost_scsi_inflight *inflight;
> > > >  };
> > > >  
> > > >  /*
> > > > -- 
> > > > 1.8.1.4
> > 
> > -- 
> > Asias

-- 
Asias



Thread overview: 28+ messages
2013-04-09  9:39 [PATCH] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
2013-04-11 10:47 ` Michael S. Tsirkin
2013-04-12  6:25   ` Asias He
2013-04-12 11:33     ` Michael S. Tsirkin
2013-04-12 14:59       ` Asias He
2013-04-12 14:59         ` Asias He
2013-04-14 10:07         ` Michael S. Tsirkin
2013-04-14 12:38           ` Asias He
2013-04-13  3:29       ` [PATCH v4 0/2] tcm_vhost flush Asias He
2013-04-16  9:16         ` [PATCH v5 " Asias He
2013-04-16  9:16         ` Asias He
2013-04-16  9:16         ` [PATCH v5 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd Asias He
2013-04-16  9:16         ` Asias He
2013-04-16  9:16         ` [PATCH v5 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
2013-04-16 17:58           ` Michael S. Tsirkin
2013-04-17  1:29             ` Asias He
2013-04-17 10:07               ` Michael S. Tsirkin
2013-04-17 12:07                 ` Asias He
2013-04-13  3:29       ` [PATCH v4 1/2] tcm_vhost: Pass vhost_scsi to vhost_scsi_allocate_cmd Asias He
2013-04-13  3:29       ` [PATCH v4 2/2] tcm_vhost: Wait for pending requests in vhost_scsi_flush() Asias He
2013-04-14  9:58         ` Michael S. Tsirkin
2013-04-14 12:27           ` Asias He
2013-04-14 12:27           ` Asias He
2013-04-15  7:18             ` Asias He
2013-04-15 10:11             ` Michael S. Tsirkin
2013-04-16  0:35               ` Asias He
2013-04-14  9:58         ` Michael S. Tsirkin
2013-04-13  3:29       ` Asias He
