* nvme: batch completions and do them outside of the queue lock
@ 2018-05-16 20:37 Jens Axboe
  2018-05-16 21:27 ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2018-05-16 20:37 UTC (permalink / raw)


This patch splits up the reaping of completion entries, and the
block side completion. The advantage of this is two-fold:

1) We can batch completions, this patch pulls them off in units
   of 8, but that number is fairly arbitrary. I wanted it to be
   big enough to hold most use cases, but not big enough to be
   a stack burden.

2) We complete the block side of things outside of the queue lock.

Note that this kills the ->cqe_seen as well. I haven't been able
to trigger any ill effects of this. If we do race with polling
every so often, it should be rare enough NOT to trigger any issues.
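
The interrupt path then boils down to roughly this (a condensed sketch
of the patch below, with the retry loop and IRQ return value trimmed):

	struct nvme_completion cqes[8];
	int done;

	/* reap up to 8 CQEs off the hardware queue, under the lock */
	spin_lock(&nvmeq->q_lock);
	done = nvme_process_cq_start(nvmeq, cqes, ARRAY_SIZE(cqes));
	spin_unlock(&nvmeq->q_lock);

	/* run the block side completions with the lock dropped */
	if (done)
		nvme_process_cq_end(nvmeq, cqes, done, -1U);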

Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 79bbfadcb7b9..ed2bd7840939 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -160,7 +160,6 @@ struct nvme_queue {
 	u16 cq_head;
 	u16 qid;
 	u8 cq_phase;
-	u8 cqe_seen;
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -954,7 +953,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 		return;
 	}
 
-	nvmeq->cqe_seen = 1;
 	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
 	nvme_end_request(req, cqe->status, cqe->result);
 }
@@ -974,29 +972,67 @@ static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
 	return false;
 }
 
-static void nvme_process_cq(struct nvme_queue *nvmeq)
+static bool nvme_process_cq_end(struct nvme_queue *nvmeq,
+				struct nvme_completion *cqes, unsigned int nr,
+				int tag)
+{
+	bool ret = false;
+	unsigned int i;
+
+	for (i = 0; i < nr; i++) {
+		if (!ret && tag == cqes[i].command_id)
+			ret |= true;
+
+		nvme_handle_cqe(nvmeq, &cqes[i]);
+	}
+
+	return ret;
+}
+
+static int nvme_process_cq_start(struct nvme_queue *nvmeq,
+				 struct nvme_completion *cqes, unsigned int nr)
 {
-	struct nvme_completion cqe;
 	int consumed = 0;
 
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
+	while (nvme_read_cqe(nvmeq, &cqes[consumed])) {
 		consumed++;
+		if (consumed == nr)
+			break;
 	}
 
 	if (consumed)
 		nvme_ring_cq_doorbell(nvmeq);
+
+	return consumed;
+}
+
+static void nvme_process_cq(struct nvme_queue *nvmeq)
+{
+	struct nvme_completion cqe;
+
+	while (nvme_process_cq_start(nvmeq, &cqe, 1))
+		nvme_process_cq_end(nvmeq, &cqe, 1, -1U);
 }
 
 static irqreturn_t nvme_irq(int irq, void *data)
 {
-	irqreturn_t result;
 	struct nvme_queue *nvmeq = data;
-	spin_lock(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
-	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
-	nvmeq->cqe_seen = 0;
-	spin_unlock(&nvmeq->q_lock);
+	struct nvme_completion cqes[8];
+	irqreturn_t result = IRQ_NONE;
+	int done;
+
+	do {
+		spin_lock(&nvmeq->q_lock);
+		done = nvme_process_cq_start(nvmeq, cqes, ARRAY_SIZE(cqes));
+		spin_unlock(&nvmeq->q_lock);
+
+		if (!done)
+			break;
+
+		result = IRQ_HANDLED;
+		nvme_process_cq_end(nvmeq, cqes, done, -1U);
+	} while (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase));
+
 	return result;
 }
 
@@ -1010,26 +1046,19 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 
 static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
-	struct nvme_completion cqe;
-	int found = 0, consumed = 0;
-
-	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
-		return 0;
+	struct nvme_completion cqes[8];
+	int done, found = 0;
 
-	spin_lock_irq(&nvmeq->q_lock);
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
+	while (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
+		spin_lock_irq(&nvmeq->q_lock);
+		done = nvme_process_cq_start(nvmeq, cqes, ARRAY_SIZE(cqes));
+		spin_unlock_irq(&nvmeq->q_lock);
 
-		if (tag == cqe.command_id) {
-			found = 1;
+		if (!done)
 			break;
-		}
-       }
 
-	if (consumed)
-		nvme_ring_cq_doorbell(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
+		found |= nvme_process_cq_end(nvmeq, cqes, done, tag);
+	}
 
 	return found;
 }

-- 
Jens Axboe


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 20:37 nvme: batch completions and do them outside of the queue lock Jens Axboe
@ 2018-05-16 21:27 ` Keith Busch
  2018-05-16 22:35   ` Keith Busch
  0 siblings, 1 reply; 12+ messages in thread
From: Keith Busch @ 2018-05-16 21:27 UTC (permalink / raw)


On Wed, May 16, 2018 at 02:37:40PM -0600, Jens Axboe wrote:
> This patch splits up the reaping of completion entries, and the
> block side completion. The advantage of this is two-fold:
> 
> 1) We can batch completions, this patch pulls them off in units
>    of 8, but that number is fairly arbitrary. I wanted it to be
>    big enough to hold most use cases, but not big enough to be
>    a stack burden.
> 
> 2) We complete the block side of things outside of the queue lock.

Interesting idea. Since you bring this up, I think there may be more
optimizations on top of this concept. I'll stare at this a bit before
applying, or may have a follow-up proposal later.
 
> Note that this kills the ->cqe_seen as well. I haven't been able
> to trigger any ill effects of this. If we do race with polling
> every so often, it should be rare enough NOT to trigger any issues.

We'll know who to blame if spurious interrupt messages are seen. :)


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 21:27 ` Keith Busch
@ 2018-05-16 22:35   ` Keith Busch
  2018-05-16 22:57     ` Jens Axboe
  2018-05-17  7:51     ` Christoph Hellwig
  0 siblings, 2 replies; 12+ messages in thread
From: Keith Busch @ 2018-05-16 22:35 UTC (permalink / raw)


On Wed, May 16, 2018 at 03:27:57PM -0600, Keith Busch wrote:
> On Wed, May 16, 2018 at 02:37:40PM -0600, Jens Axboe wrote:
> > This patch splits up the reaping of completion entries, and the
> > block side completion. The advantage of this is two-fold:
> > 
> > 1) We can batch completions, this patch pulls them off in units
> >    of 8, but that number is fairly arbitrary. I wanted it to be
> >    big enough to hold most use cases, but not big enough to be
> >    a stack burden.
> > 
> > 2) We complete the block side of things outside of the queue lock.
> 
> Interesting idea. Since you bring this up, I think there may be more
> optimizations on top of this concept. I'll stare at this a bit before
> applying, or may have a follow-up proposal later.

While I'm not seeing a difference, I assume you are. I tried adding on
to this proposal by batching *all* completions without using the stack,
exploiting the fact we never wrap the queue so it can be accessed
lockless after moving the cq_head.

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 170642ad43fd..ac23314c94f5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -161,7 +161,6 @@ struct nvme_queue {
 	u16 cq_head;
 	u16 qid;
 	u8 cq_phase;
-	u8 cqe_seen;
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -955,17 +954,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 		return;
 	}
 
-	nvmeq->cqe_seen = 1;
 	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
 	nvme_end_request(req, cqe->status, cqe->result);
 }
 
-static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
-		struct nvme_completion *cqe)
+static inline bool nvme_read_cqe(struct nvme_queue *nvmeq)
 {
 	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
-		*cqe = nvmeq->cqes[nvmeq->cq_head];
-
 		if (++nvmeq->cq_head == nvmeq->q_depth) {
 			nvmeq->cq_head = 0;
 			nvmeq->cq_phase = !nvmeq->cq_phase;
@@ -975,30 +970,41 @@ static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
 	return false;
 }
 
-static void nvme_process_cq(struct nvme_queue *nvmeq)
+static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+				   u16 *end)
 {
-	struct nvme_completion cqe;
-	int consumed = 0;
-
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
-	}
+	*start = nvmeq->cq_head;
+	while (nvme_read_cqe(nvmeq));
+	*end = nvmeq->cq_head;
 
-	if (consumed)
+	if (*start != *end)
 		nvme_ring_cq_doorbell(nvmeq);
 }
 
+static inline irqreturn_t nvme_complete_cqes(struct nvme_queue *nvmeq,
+					     u16 start, u16 end)
+{
+	if (start == end)
+		return IRQ_NONE;
+
+	while (start != end) {
+		nvme_handle_cqe(nvmeq, &nvmeq->cqes[start]);
+		if (++start == nvmeq->q_depth)
+			start = 0;
+	}
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t nvme_irq(int irq, void *data)
 {
-	irqreturn_t result;
 	struct nvme_queue *nvmeq = data;
+	u16 start, end;
+
 	spin_lock(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
-	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
-	nvmeq->cqe_seen = 0;
+	nvme_process_cq(nvmeq, &start, &end);
 	spin_unlock(&nvmeq->q_lock);
-	return result;
+
+	return nvme_complete_cqes(nvmeq, start, end);
 }
 
 static irqreturn_t nvme_irq_check(int irq, void *data)
@@ -1011,27 +1017,26 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 
 static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
-	struct nvme_completion cqe;
-	int found = 0, consumed = 0;
+	u16 start, end;
+	int found = 0;
 
 	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
 		return 0;
 
 	spin_lock_irq(&nvmeq->q_lock);
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
-
-		if (tag == cqe.command_id) {
-			found = 1;
-			break;
-		}
-       }
+	nvme_process_cq(nvmeq, &start, &end);
+	spin_unlock(&nvmeq->q_lock);
 
-	if (consumed)
-		nvme_ring_cq_doorbell(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
+	if (start == end)
+		return 0;
 
+	while (start != end) {
+		nvme_handle_cqe(nvmeq, &nvmeq->cqes[start]);
+		if (++start == nvmeq->q_depth)
+			start = 0;
+		if (tag == cqe.command_id)
+			found = 1;
+	}
 	return found;
 }
 
@@ -1332,6 +1337,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
 	struct nvme_queue *nvmeq = &dev->queues[0];
+	u16 start, end;
 
 	if (shutdown)
 		nvme_shutdown_ctrl(&dev->ctrl);
@@ -1339,8 +1345,9 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
 
 	spin_lock_irq(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
+	nvme_process_cq(nvmeq, &start, &end);
 	spin_unlock_irq(&nvmeq->q_lock);
+	nvme_complete_cqes(nvmeq, start, end);
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1987,6 +1994,7 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
+	u16 start, end;
 
 	if (!error) {
 		unsigned long flags;
@@ -1998,8 +2006,9 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
 		 */
 		spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
 					SINGLE_DEPTH_NESTING);
-		nvme_process_cq(nvmeq);
+		nvme_process_cq(nvmeq, &start, &end);
 		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		nvme_complete_cqes(nvmeq, start, end);
 	}
 
 	nvme_del_queue_end(req, error);
--


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 22:35   ` Keith Busch
@ 2018-05-16 22:57     ` Jens Axboe
  2018-05-16 23:10       ` Jens Axboe
  2018-05-17  7:51     ` Christoph Hellwig
  1 sibling, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2018-05-16 22:57 UTC (permalink / raw)


On 5/16/18 4:35 PM, Keith Busch wrote:
> On Wed, May 16, 2018 at 03:27:57PM -0600, Keith Busch wrote:
>> On Wed, May 16, 2018 at 02:37:40PM -0600, Jens Axboe wrote:
>>> This patch splits up the reaping of completion entries, and the
>>> block side completion. The advantage of this is two-fold:
>>>
>>> 1) We can batch completions, this patch pulls them off in units
>>>    of 8, but that number is fairly arbitrary. I wanted it to be
>>>    big enough to hold most use cases, but not big enough to be
>>>    a stack burden.
>>>
>>> 2) We complete the block side of things outside of the queue lock.
>>
>> Interesting idea. Since you bring this up, I think there may be more
>> optimizations on top of this concept. I'll stare at this a bit before
>> applying, or may have a follow-up proposal later.
> 
> While I'm not seeing a difference, I assume you are. I tried adding on
> to this proposal by batching *all* completions without using the stack,
> exploiting the fact we never wrap the queue so it can be accessed
> lockless after moving the cq_head.

That looks nifty.

> +	*start = nvmeq->cq_head;
> +	while (nvme_read_cqe(nvmeq));

Probably want to make that

	*start = nvmeq->cq_head;
	while (nvme_read_cqe(nvmeq))
		;

so it doesn't look like a misplaced ;.

Apart from that, looks pretty clean to me. Haven't tested it yet.
 
-- 
Jens Axboe


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 22:57     ` Jens Axboe
@ 2018-05-16 23:10       ` Jens Axboe
  2018-05-16 23:18         ` Jens Axboe
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2018-05-16 23:10 UTC (permalink / raw)


On 5/16/18 4:57 PM, Jens Axboe wrote:
> On 5/16/18 4:35 PM, Keith Busch wrote:
>> On Wed, May 16, 2018 at 03:27:57PM -0600, Keith Busch wrote:
>>> On Wed, May 16, 2018 at 02:37:40PM -0600, Jens Axboe wrote:
>>>> This patch splits up the reaping of completion entries, and the
>>>> block side completion. The advantage of this is two-fold:
>>>>
>>>> 1) We can batch completions, this patch pulls them off in units
>>>>    of 8, but that number is fairly arbitrary. I wanted it to be
>>>>    big enough to hold most use cases, but not big enough to be
>>>>    a stack burden.
>>>>
>>>> 2) We complete the block side of things outside of the queue lock.
>>>
>>> Interesting idea. Since you bring this up, I think there may be more
>>> optimizations on top of this concept. I'll stare at this a bit before
>>> applying, or may have a follow-up proposal later.
>>
>> While I'm not seeing a difference, I assume you are. I tried adding on
>> to this proposal by batching *all* completions without using the stack,
>> exploiting the fact we never wrap the queue so it can be accessed
>> lockless after moving the cq_head.
> 
> That looks nifty.
> 
>> +	*start = nvmeq->cq_head;
>> +	while (nvme_read_cqe(nvmeq));
> 
> Probably want to make that
> 
> 	*start = nvmeq->cq_head;
> 	while (nvme_read_cqe(nvmeq))
> 		;
> 
> so it doesn't look like a misplaced ;.
> 
> Apart from that, looks pretty clean to me. Haven't tested it yet.

Below makes it actually compile for me. Ran a quick test, seems slower
than my version in polling, for some reason.


diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index de279ef8c446..ff8d88fd89d7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -929,7 +929,7 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 }
 
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
-		struct nvme_completion *cqe)
+				   volatile struct nvme_completion *cqe)
 {
 	struct request *req;
 
@@ -949,7 +949,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	if (unlikely(nvmeq->qid == 0 &&
 			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
-				cqe->status, &cqe->result);
+				cqe->status, (union nvme_result *) &cqe->result);
 		return;
 	}
 
@@ -1031,10 +1031,10 @@ static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 
 	while (start != end) {
 		nvme_handle_cqe(nvmeq, &nvmeq->cqes[start]);
+		if (tag == nvmeq->cqes[start].command_id)
+			found = 1;
 		if (++start == nvmeq->q_depth)
 			start = 0;
-		if (tag == cqe.command_id)
-			found = 1;
 	}
 	return found;
 }

-- 
Jens Axboe


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 23:10       ` Jens Axboe
@ 2018-05-16 23:18         ` Jens Axboe
  2018-05-16 23:39           ` Jens Axboe
  0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2018-05-16 23:18 UTC (permalink / raw)


On 5/16/18 5:10 PM, Jens Axboe wrote:
> On 5/16/18 4:57 PM, Jens Axboe wrote:
>> On 5/16/18 4:35 PM, Keith Busch wrote:
>>> On Wed, May 16, 2018 at 03:27:57PM -0600, Keith Busch wrote:
>>>> On Wed, May 16, 2018 at 02:37:40PM -0600, Jens Axboe wrote:
>>>>> This patch splits up the reaping of completion entries, and the
>>>>> block side completion. The advantage of this is two-fold:
>>>>>
>>>>> 1) We can batch completions, this patch pulls them off in units
>>>>>    of 8, but that number is fairly arbitrary. I wanted it to be
>>>>>    big enough to hold most use cases, but not big enough to be
>>>>>    a stack burden.
>>>>>
>>>>> 2) We complete the block side of things outside of the queue lock.
>>>>
>>>> Interesting idea. Since you bring this up, I think there may be more
>>>> optimizations on top of this concept. I'll stare at this a bit before
>>>> applying, or may have a follow-up proposal later.
>>>
>>> While I'm not seeing a difference, I assume you are. I tried adding on
>>> to this proposal by batching *all* completions without using the stack,
>>> exploiting the fact we never wrap the queue so it can be accessed
>>> lockless after moving the cq_head.
>>
>> That looks nifty.
>>
>>> +	*start = nvmeq->cq_head;
>>> +	while (nvme_read_cqe(nvmeq));
>>
>> Probably want to make that
>>
>> 	*start = nvmeq->cq_head;
>> 	while (nvme_read_cqe(nvmeq))
>> 		;
>>
>> so it doesn't look like a misplaced ;.
>>
>> Apart from that, looks pretty clean to me. Haven't tested it yet.
> 
> Below makes it actually compile for me. Ran a quick test, seems slower
> than my version in polling, for some reason.

Scratch that, looks like a one-off fluke. Perf seems fine.

-- 
Jens Axboe


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 23:18         ` Jens Axboe
@ 2018-05-16 23:39           ` Jens Axboe
  2018-05-17  2:09             ` Keith Busch
  2018-05-17  7:16             ` Christoph Hellwig
  0 siblings, 2 replies; 12+ messages in thread
From: Jens Axboe @ 2018-05-16 23:39 UTC (permalink / raw)


On 5/16/18 5:18 PM, Jens Axboe wrote:
> On 5/16/18 5:10 PM, Jens Axboe wrote:
>> On 5/16/18 4:57 PM, Jens Axboe wrote:
>>> On 5/16/18 4:35 PM, Keith Busch wrote:
>>>> On Wed, May 16, 2018 at 03:27:57PM -0600, Keith Busch wrote:
>>>>> On Wed, May 16, 2018 at 02:37:40PM -0600, Jens Axboe wrote:
>>>>>> This patch splits up the reaping of completion entries, and the
>>>>>> block side completion. The advantage of this is two-fold:
>>>>>>
>>>>>> 1) We can batch completions, this patch pulls them off in units
>>>>>>    of 8, but that number is fairly arbitrary. I wanted it to be
>>>>>>    big enough to hold most use cases, but not big enough to be
>>>>>>    a stack burden.
>>>>>>
>>>>>> 2) We complete the block side of things outside of the queue lock.
>>>>>
>>>>> Interesting idea. Since you bring this up, I think there may be more
>>>>> optimizations on top of this concept. I'll stare at this a bit before
>>>>> applying, or may have a follow-up proposal later.
>>>>
>>>> While I'm not seeing a difference, I assume you are. I tried adding on
>>>> to this proposal by batching *all* completions without using the stack,
>>>> exploiting the fact we never wrap the queue so it can be accessed
>>>> lockless after moving the cq_head.
>>>
>>> That looks nifty.
>>>
>>>> +	*start = nvmeq->cq_head;
>>>> +	while (nvme_read_cqe(nvmeq));
>>>
>>> Probably want to make that
>>>
>>> 	*start = nvmeq->cq_head;
>>> 	while (nvme_read_cqe(nvmeq))
>>> 		;
>>>
>>> so it doesn't look like a misplaced ;.
>>>
>>> Apart from that, looks pretty clean to me. Haven't tested it yet.
>>
>> Below makes it actually compile for me. Ran a quick test, seems slower
>> than my version in polling, for some reason.
> 
> Scratch that, looks like a one-off fluke. Perf seems fine.

FWIW, this is what I'm running. Narrows it down to just one completion
loop.


diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 79bbfadcb7b9..ce98b47144f0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -160,7 +160,6 @@ struct nvme_queue {
 	u16 cq_head;
 	u16 qid;
 	u8 cq_phase;
-	u8 cqe_seen;
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -930,7 +929,7 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 }
 
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
-		struct nvme_completion *cqe)
+				   volatile struct nvme_completion *cqe)
 {
 	struct request *req;
 
@@ -950,21 +949,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	if (unlikely(nvmeq->qid == 0 &&
 			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
-				cqe->status, &cqe->result);
+				cqe->status, (union nvme_result *) &cqe->result);
 		return;
 	}
 
-	nvmeq->cqe_seen = 1;
 	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
 	nvme_end_request(req, cqe->status, cqe->result);
 }
 
-static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
-		struct nvme_completion *cqe)
+static inline bool nvme_read_cqe(struct nvme_queue *nvmeq)
 {
 	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
-		*cqe = nvmeq->cqes[nvmeq->cq_head];
-
 		if (++nvmeq->cq_head == nvmeq->q_depth) {
 			nvmeq->cq_head = 0;
 			nvmeq->cq_phase = !nvmeq->cq_phase;
@@ -974,30 +969,54 @@ static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
 	return false;
 }
 
-static void nvme_process_cq(struct nvme_queue *nvmeq)
+static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+				   u16 *end)
 {
-	struct nvme_completion cqe;
-	int consumed = 0;
+	*start = nvmeq->cq_head;
+	while (nvme_read_cqe(nvmeq))
+		;
+	*end = nvmeq->cq_head;
+
+	if (*start != *end)
+		nvme_ring_cq_doorbell(nvmeq);
+}
 
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
+static bool nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end,
+			       unsigned int tag)
+{
+	bool found = false;
+
+	if (start == end)
+		return false;
+
+	while (start != end) {
+		volatile struct nvme_completion *cqe = &nvmeq->cqes[start];
+
+		if (!found && tag == cqe->command_id)
+			found = true;
+		nvme_handle_cqe(nvmeq, cqe);
+		if (++start == nvmeq->q_depth)
+			start = 0;
 	}
 
-	if (consumed)
-		nvme_ring_cq_doorbell(nvmeq);
+	return found;
 }
 
 static irqreturn_t nvme_irq(int irq, void *data)
 {
-	irqreturn_t result;
 	struct nvme_queue *nvmeq = data;
+	u16 start, end;
+
 	spin_lock(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
-	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
-	nvmeq->cqe_seen = 0;
+	nvme_process_cq(nvmeq, &start, &end);
 	spin_unlock(&nvmeq->q_lock);
-	return result;
+
+	if (start != end) {
+		nvme_complete_cqes(nvmeq, start, end, -1);
+		return IRQ_HANDLED;
+	}
+
+	return IRQ_NONE;
 }
 
 static irqreturn_t nvme_irq_check(int irq, void *data)
@@ -1010,28 +1029,16 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 
 static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
-	struct nvme_completion cqe;
-	int found = 0, consumed = 0;
+	u16 start, end;
 
 	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
 		return 0;
 
 	spin_lock_irq(&nvmeq->q_lock);
-	while (nvme_read_cqe(nvmeq, &cqe)) {
-		nvme_handle_cqe(nvmeq, &cqe);
-		consumed++;
-
-		if (tag == cqe.command_id) {
-			found = 1;
-			break;
-		}
-       }
-
-	if (consumed)
-		nvme_ring_cq_doorbell(nvmeq);
-	spin_unlock_irq(&nvmeq->q_lock);
+	nvme_process_cq(nvmeq, &start, &end);
+	spin_unlock(&nvmeq->q_lock);
 
-	return found;
+	return nvme_complete_cqes(nvmeq, start, end, tag);
 }
 
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
@@ -1340,6 +1347,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
 	struct nvme_queue *nvmeq = &dev->queues[0];
+	u16 start, end;
 
 	if (shutdown)
 		nvme_shutdown_ctrl(&dev->ctrl);
@@ -1347,8 +1355,9 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
 
 	spin_lock_irq(&nvmeq->q_lock);
-	nvme_process_cq(nvmeq);
+	nvme_process_cq(nvmeq, &start, &end);
 	spin_unlock_irq(&nvmeq->q_lock);
+	nvme_complete_cqes(nvmeq, start, end, -1U);
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1995,6 +2004,7 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
+	u16 start, end;
 
 	if (!error) {
 		unsigned long flags;
@@ -2006,8 +2016,9 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
 		 */
 		spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
 					SINGLE_DEPTH_NESTING);
-		nvme_process_cq(nvmeq);
+		nvme_process_cq(nvmeq, &start, &end);
 		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		nvme_complete_cqes(nvmeq, start, end, -1U);
 	}
 
 	nvme_del_queue_end(req, error);

-- 
Jens Axboe


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 23:39           ` Jens Axboe
@ 2018-05-17  2:09             ` Keith Busch
  2018-05-17  3:16               ` Jens Axboe
  2018-05-17  3:16               ` Jens Axboe
  2018-05-17  7:16             ` Christoph Hellwig
  1 sibling, 2 replies; 12+ messages in thread
From: Keith Busch @ 2018-05-17  2:09 UTC (permalink / raw)


On Wed, May 16, 2018 at 05:39:02PM -0600, Jens Axboe wrote:
>  static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
>  {
> -	struct nvme_completion cqe;
> -	int found = 0, consumed = 0;
> +	u16 start, end;
>  
>  	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
>  		return 0;
>  
>  	spin_lock_irq(&nvmeq->q_lock);
> -	while (nvme_read_cqe(nvmeq, &cqe)) {
> -		nvme_handle_cqe(nvmeq, &cqe);
> -		consumed++;
> -
> -		if (tag == cqe.command_id) {
> -			found = 1;
> -			break;
> -		}
> -       }
> -
> -	if (consumed)
> -		nvme_ring_cq_doorbell(nvmeq);
> -	spin_unlock_irq(&nvmeq->q_lock);
> +	nvme_process_cq(nvmeq, &start, &end);
> +	spin_unlock(&nvmeq->q_lock);

You'll need to use the spin_unlock_irq() here, otherwise looks good.


* nvme: batch completions and do them outside of the queue lock
  2018-05-17  2:09             ` Keith Busch
@ 2018-05-17  3:16               ` Jens Axboe
  2018-05-17  3:16               ` Jens Axboe
  1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2018-05-17  3:16 UTC (permalink / raw)


On 5/16/18 8:09 PM, Keith Busch wrote:
On Wed, May 16, 2018 at 05:39:02PM -0600, Jens Axboe wrote:
>>  static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
>>  {
>> -	struct nvme_completion cqe;
>> -	int found = 0, consumed = 0;
>> +	u16 start, end;
>>  
>>  	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
>>  		return 0;
>>  
>>  	spin_lock_irq(&nvmeq->q_lock);
>> -	while (nvme_read_cqe(nvmeq, &cqe)) {
>> -		nvme_handle_cqe(nvmeq, &cqe);
>> -		consumed++;
>> -
>> -		if (tag == cqe.command_id) {
>> -			found = 1;
>> -			break;
>> -		}
>> -       }
>> -
>> -	if (consumed)
>> -		nvme_ring_cq_doorbell(nvmeq);
>> -	spin_unlock_irq(&nvmeq->q_lock);
>> +	nvme_process_cq(nvmeq, &start, &end);
>> +	spin_unlock(&nvmeq->q_lock);
> 
> You'll need to use the spin_unlock_irq() here, otherwise looks good.

Ugh good catch, fixed up. How do you want to proceed?

-- 
Jens Axboe


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 23:39           ` Jens Axboe
  2018-05-17  2:09             ` Keith Busch
@ 2018-05-17  7:16             ` Christoph Hellwig
  1 sibling, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2018-05-17  7:16 UTC (permalink / raw)


>  static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
> -		struct nvme_completion *cqe)
> +				   volatile struct nvme_completion *cqe)

Skip the bogus reindentation here :)

>  {
>  	struct request *req;
>  
> @@ -950,21 +949,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
>  	if (unlikely(nvmeq->qid == 0 &&
>  			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
>  		nvme_complete_async_event(&nvmeq->dev->ctrl,
> -				cqe->status, &cqe->result);
> +				cqe->status, (union nvme_result *) &cqe->result);

Please find a way to avoid that cast.  Either always make it volatile
or pass by value if modern compilers have stopped generating shit code
for passing unions by value.
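
For illustration, the pass-by-value variant might look roughly like
this (a sketch only; the changed nvme_complete_async_event prototype
is an assumption, not what is in the tree):

	/* the copy made for the by-value argument drops the volatile
	 * qualifier, so no cast is needed at the call site */
	void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
			union nvme_result res);

	...
		nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status,
				cqe->result);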

> -static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
> -		struct nvme_completion *cqe)
> +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq)
>  {
>  	if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
> -		*cqe = nvmeq->cqes[nvmeq->cq_head];
> -

Without actually reading the CQE this function is now grossly misnamed.
What about nvme_consume_cqe or something like that instead?

> +static inline void nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
> +				   u16 *end)
>  {
> +	*start = nvmeq->cq_head;
> +	while (nvme_read_cqe(nvmeq))
> +		;
> +	*end = nvmeq->cq_head;
> +
> +	if (*start != *end)
> +		nvme_ring_cq_doorbell(nvmeq);
> +}

Or in fact just kill off nvme_read_cqe, as this appears to be its only
caller with your patch (just reading the patch, so I might be wrong).

The this could become:

	*start = nvmeq->cq_head;
	while (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
		if (++nvmeq->cq_head == nvmeq->q_depth) {
			nvmeq->cq_head = 0;
			nvmeq->cq_phase = !nvmeq->cq_phase;
		}
	}
	*end = nvmeq->cq_head;

	if (*start != *end)
		nvme_ring_cq_doorbell(nvmeq);
}

which starts to make a lot more sense.

> +static bool nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end,
> +			       unsigned int tag)
> +{
> +	bool found = false;
> +
> +	if (start == end)
> +		return false;
> +
> +	while (start != end) {
> +		volatile struct nvme_completion *cqe = &nvmeq->cqes[start];
> +
> +		if (!found && tag == cqe->command_id)
> +			found = true;
> +		nvme_handle_cqe(nvmeq, cqe);
> +		if (++start == nvmeq->q_depth)
> +			start = 0;
>  	}
>  
> +	return found;

I don't think we need the if (start == end) check; the while loop already
handles that.  I also wonder if we should move the tag matching into
nvme_handle_cqe.  It already looks at cqe->command_id, so that would keep
the access together, and would remove the need to even look at the CQE at
all in this function.  And nvme_complete_cqes would be so trivial now that
we can still keep the poll optimization.
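
For concreteness, a tag-aware nvme_handle_cqe might look roughly like
this (a sketch of the suggestion, not code posted in the thread; the
extra tag parameter and bool return are assumptions):

static inline bool nvme_handle_cqe(struct nvme_queue *nvmeq,
		volatile struct nvme_completion *cqe, unsigned int tag)
{
	bool found = tag == cqe->command_id;
	struct request *req;

	/* ... existing command_id bounds and AEN checks unchanged ... */

	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
	nvme_end_request(req, cqe->status, cqe->result);
	return found;
}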

Irq/delete queue path, including handling the irqreturn_t value:

static irqreturn_t nvme_complete_cqes(struct nvme_queue *nvmeq,
		u16 start, u16 end)
{
	irqreturn_t ret = IRQ_NONE;

	while (start != end) {
		nvme_handle_cqe(nvmeq, &nvmeq->cqes[start], -1);
		if (++start == nvmeq->q_depth)
			start = 0;
		ret = IRQ_HANDLED;
	}

	return ret;
}

poll path:

static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
	u16 start, end;

	if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase))
		return 0;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq, &start, &end);
	spin_unlock_irq(&nvmeq->q_lock);

	while (start != end) {
		if (nvme_handle_cqe(nvmeq, &nvmeq->cqes[start], tag))
			return 1;
		if (++start == nvmeq->q_depth)
			start = 0;
	}

	return 0;
}

that would also keep the early exit behavior for poll once we found
the tag.

> @@ -2006,8 +2016,9 @@ static void nvme_del_cq_end(struct request *req, blk_status_t error)
>  		 */
>  		spin_lock_irqsave_nested(&nvmeq->q_lock, flags,
>  					SINGLE_DEPTH_NESTING);
> -		nvme_process_cq(nvmeq);
> +		nvme_process_cq(nvmeq, &start, &end);
>  		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
> +		nvme_complete_cqes(nvmeq, start, end, -1U);

If we could somehow move this into a workqueue we could even move the
locking into nvme_process_cq, but that's something left for another time.


* nvme: batch completions and do them outside of the queue lock
  2018-05-16 22:35   ` Keith Busch
  2018-05-16 22:57     ` Jens Axboe
@ 2018-05-17  7:51     ` Christoph Hellwig
  1 sibling, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2018-05-17  7:51 UTC (permalink / raw)


On Wed, May 16, 2018 at 04:35:38PM -0600, Keith Busch wrote:
> While I'm not seeing a difference, I assume you are. I tried adding on
> to this proposal by batching *all* completions without using the stack,
> exploiting the fact we never wrap the queue so it can be accessed
> lockless after moving the cq_head.

But we do wrap around in nvme_read_cqe, don't we?  From a quick look
we are probably safe because we size our SQs and CQs to the number of
outstanding commands, and thus the device won't be able to reuse the
slots until we complete and reuse the command ids.  But that isn't
exactly obvious from the code, so I think we really need some good
comments there.
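
A comment along these lines would capture the invariant (suggested
wording only, assuming the sizing argument above is what we rely on):

	/*
	 * The CQ entries between 'start' and 'end' can be completed after
	 * dropping q_lock: the queues are sized to the number of possible
	 * outstanding commands, so the controller cannot post a new entry
	 * into one of these slots until the command occupying it has been
	 * completed and its command id reused by a new submission.
	 */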
