* add a proper completion queue abstraction
@ 2015-11-13 13:46 Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 1/9] move blk_iopoll to limit and make it generally available Christoph Hellwig
                   ` (8 more replies)
  0 siblings, 9 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma; +Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

This series adds a new RDMA core abstraction that insulates the
ULPs from the nitty-gritty details of CQ polling.  See the individual
patches for more details.

Note that this series should be applied on top of my
"IB: merge struct ib_device_attr into struct ib_device" patch.

A git tree is also available:

	http://git.infradead.org/users/hch/rdma.git/shortlog/refs/heads/rdma-cq
	git://git.infradead.org/users/hch/rdma.git rdma-cq


* [PATCH 1/9] move blk_iopoll to limit and make it generally available
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
       [not found]   ` <1447422410-20891-2-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  2015-11-13 13:46 ` [PATCH 2/9] IB: add a proper completion queue abstraction Christoph Hellwig
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma; +Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

The new name is irq_poll as iopoll is already taken.  Better suggestions
welcome.
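
For reference, a sketch of the driver-side usage pattern, which is
unchanged by the rename (dev->iop, my_poll and weight below are made-up
names for the example; the calls are the renamed API from this patch):

	/* once, at setup time */
	irq_poll_init(&dev->iop, weight, my_poll);
	irq_poll_enable(&dev->iop);

	/* from the hard interrupt handler */
	if (!irq_poll_sched_prep(&dev->iop))
		irq_poll_sched(&dev->iop);

	/* my_poll(iop, budget) processes up to budget completions,
	 * calls irq_poll_complete(iop) once it runs out of work and
	 * returns the amount of work done */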

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/kernel-per-CPU-kthreads.txt |   2 +-
 block/Makefile                            |   2 +-
 block/blk-iopoll.c                        | 224 ------------------------------
 drivers/scsi/Kconfig                      |   1 +
 drivers/scsi/be2iscsi/Kconfig             |   1 +
 drivers/scsi/be2iscsi/be.h                |   4 +-
 drivers/scsi/be2iscsi/be_iscsi.c          |   4 +-
 drivers/scsi/be2iscsi/be_main.c           |  24 ++--
 drivers/scsi/ipr.c                        |  28 ++--
 drivers/scsi/ipr.h                        |   4 +-
 include/linux/blk-iopoll.h                |  46 ------
 include/linux/interrupt.h                 |   2 +-
 include/linux/irq_poll.h                  |  46 ++++++
 include/trace/events/irq.h                |   2 +-
 lib/Kconfig                               |   5 +
 lib/Makefile                              |   1 +
 lib/irq_poll.c                            | 221 +++++++++++++++++++++++++++++
 tools/lib/traceevent/event-parse.c        |   2 +-
 tools/perf/util/trace-event-parse.c       |   2 +-
 19 files changed, 313 insertions(+), 308 deletions(-)
 delete mode 100644 block/blk-iopoll.c
 delete mode 100644 include/linux/blk-iopoll.h
 create mode 100644 include/linux/irq_poll.h
 create mode 100644 lib/irq_poll.c

diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index f4cbfe0..edec3a3 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -90,7 +90,7 @@ BLOCK_SOFTIRQ:  Do all of the following:
 	from being initiated from tasks that might run on the CPU to
 	be de-jittered.  (It is OK to force this CPU offline and then
 	bring it back online before you start your application.)
-BLOCK_IOPOLL_SOFTIRQ:  Do all of the following:
+IRQ_POLL_SOFTIRQ:  Do all of the following:
 1.	Force block-device interrupts onto some other CPU.
 2.	Initiate any block I/O and block-I/O polling on other CPUs.
 3.	Once your application has started, prevent CPU-hotplug operations
diff --git a/block/Makefile b/block/Makefile
index 00ecc97..e850474 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+			blk-lib.o blk-mq.o blk-mq-tag.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			partitions/
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
deleted file mode 100644
index 0736729..0000000
--- a/block/blk-iopoll.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Functions related to interrupt-poll handling in the block layer. This
- * is similar to NAPI for network devices.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/blk-iopoll.h>
-#include <linux/delay.h>
-
-#include "blk.h"
-
-static unsigned int blk_iopoll_budget __read_mostly = 256;
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
-
-/**
- * blk_iopoll_sched - Schedule a run of the iopoll handler
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     Add this blk_iopoll structure to the pending poll list and trigger the
- *     raise of the blk iopoll softirq. The driver must already have gotten a
- *     successful return from blk_iopoll_sched_prep() before calling this.
- **/
-void blk_iopoll_sched(struct blk_iopoll *iop)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
-	__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_sched);
-
-/**
- * __blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     See blk_iopoll_complete(). This function must be called with interrupts
- *     disabled.
- **/
-void __blk_iopoll_complete(struct blk_iopoll *iop)
-{
-	list_del(&iop->list);
-	smp_mb__before_atomic();
-	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(__blk_iopoll_complete);
-
-/**
- * blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     If a driver consumes less than the assigned budget in its run of the
- *     iopoll handler, it'll end the polled mode by calling this function. The
- *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()
- *     is called.
- **/
-void blk_iopoll_complete(struct blk_iopoll *iop)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__blk_iopoll_complete(iop);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_complete);
-
-static void blk_iopoll_softirq(struct softirq_action *h)
-{
-	struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
-	int rearm = 0, budget = blk_iopoll_budget;
-	unsigned long start_time = jiffies;
-
-	local_irq_disable();
-
-	while (!list_empty(list)) {
-		struct blk_iopoll *iop;
-		int work, weight;
-
-		/*
-		 * If softirq window is exhausted then punt.
-		 */
-		if (budget <= 0 || time_after(jiffies, start_time)) {
-			rearm = 1;
-			break;
-		}
-
-		local_irq_enable();
-
-		/* Even though interrupts have been re-enabled, this
-		 * access is safe because interrupts can only add new
-		 * entries to the tail of this list, and only ->poll()
-		 * calls can remove this head entry from the list.
-		 */
-		iop = list_entry(list->next, struct blk_iopoll, list);
-
-		weight = iop->weight;
-		work = 0;
-		if (test_bit(IOPOLL_F_SCHED, &iop->state))
-			work = iop->poll(iop, weight);
-
-		budget -= work;
-
-		local_irq_disable();
-
-		/*
-		 * Drivers must not modify the iopoll state, if they
-		 * consume their assigned weight (or more, some drivers can't
-		 * easily just stop processing, they have to complete an
-		 * entire mask of commands).In such cases this code
-		 * still "owns" the iopoll instance and therefore can
-		 * move the instance around on the list at-will.
-		 */
-		if (work >= weight) {
-			if (blk_iopoll_disable_pending(iop))
-				__blk_iopoll_complete(iop);
-			else
-				list_move_tail(&iop->list, list);
-		}
-	}
-
-	if (rearm)
-		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-
-	local_irq_enable();
-}
-
-/**
- * blk_iopoll_disable - Disable iopoll on this @iop
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     Disable io polling and wait for any pending callbacks to have completed.
- **/
-void blk_iopoll_disable(struct blk_iopoll *iop)
-{
-	set_bit(IOPOLL_F_DISABLE, &iop->state);
-	while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
-		msleep(1);
-	clear_bit(IOPOLL_F_DISABLE, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_disable);
-
-/**
- * blk_iopoll_enable - Enable iopoll on this @iop
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     Enable iopoll on this @iop. Note that the handler run will not be
- *     scheduled, it will only mark it as active.
- **/
-void blk_iopoll_enable(struct blk_iopoll *iop)
-{
-	BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
-	smp_mb__before_atomic();
-	clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_enable);
-
-/**
- * blk_iopoll_init - Initialize this @iop
- * @iop:      The parent iopoll structure
- * @weight:   The default weight (or command completion budget)
- * @poll_fn:  The handler to invoke
- *
- * Description:
- *     Initialize this blk_iopoll structure. Before being actively used, the
- *     driver must call blk_iopoll_enable().
- **/
-void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
-{
-	memset(iop, 0, sizeof(*iop));
-	INIT_LIST_HEAD(&iop->list);
-	iop->weight = weight;
-	iop->poll = poll_fn;
-	set_bit(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_init);
-
-static int blk_iopoll_cpu_notify(struct notifier_block *self,
-				 unsigned long action, void *hcpu)
-{
-	/*
-	 * If a CPU goes away, splice its entries to the current CPU
-	 * and trigger a run of the softirq
-	 */
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		int cpu = (unsigned long) hcpu;
-
-		local_irq_disable();
-		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
-				 this_cpu_ptr(&blk_cpu_iopoll));
-		__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-		local_irq_enable();
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block blk_iopoll_cpu_notifier = {
-	.notifier_call	= blk_iopoll_cpu_notify,
-};
-
-static __init int blk_iopoll_setup(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
-
-	open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
-	register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
-	return 0;
-}
-subsys_initcall(blk_iopoll_setup);
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index d2f480b..cea683e 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1103,6 +1103,7 @@ config SCSI_IPR
 	tristate "IBM Power Linux RAID adapter support"
 	depends on PCI && SCSI && ATA
 	select FW_LOADER
+	select IRQ_POLL
 	---help---
 	  This driver supports the IBM Power Linux family RAID adapters.
 	  This includes IBM pSeries 5712, 5703, 5709, and 570A, as well
diff --git a/drivers/scsi/be2iscsi/Kconfig b/drivers/scsi/be2iscsi/Kconfig
index 4e7cad2..bad5f32 100644
--- a/drivers/scsi/be2iscsi/Kconfig
+++ b/drivers/scsi/be2iscsi/Kconfig
@@ -3,6 +3,7 @@ config BE2ISCSI
 	depends on PCI && SCSI && NET
 	select SCSI_ISCSI_ATTRS
 	select ISCSI_BOOT_SYSFS
+	select IRQ_POLL
 
 	help
 	This driver implements the iSCSI functionality for Emulex
diff --git a/drivers/scsi/be2iscsi/be.h b/drivers/scsi/be2iscsi/be.h
index 77f992e..a41c643 100644
--- a/drivers/scsi/be2iscsi/be.h
+++ b/drivers/scsi/be2iscsi/be.h
@@ -20,7 +20,7 @@
 
 #include <linux/pci.h>
 #include <linux/if_vlan.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
 #define FW_VER_LEN	32
 #define MCC_Q_LEN	128
 #define MCC_CQ_LEN	256
@@ -101,7 +101,7 @@ struct be_eq_obj {
 	struct beiscsi_hba *phba;
 	struct be_queue_info *cq;
 	struct work_struct work_cqs; /* Work Item */
-	struct blk_iopoll	iopoll;
+	struct irq_poll	iopoll;
 };
 
 struct be_mcc_obj {
diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c
index b7087ba..022e87b 100644
--- a/drivers/scsi/be2iscsi/be_iscsi.c
+++ b/drivers/scsi/be2iscsi/be_iscsi.c
@@ -1292,9 +1292,9 @@ static void beiscsi_flush_cq(struct beiscsi_hba *phba)
 
 	for (i = 0; i < phba->num_cpus; i++) {
 		pbe_eq = &phwi_context->be_eq[i];
-		blk_iopoll_disable(&pbe_eq->iopoll);
+		irq_poll_disable(&pbe_eq->iopoll);
 		beiscsi_process_cq(pbe_eq);
-		blk_iopoll_enable(&pbe_eq->iopoll);
+		irq_poll_enable(&pbe_eq->iopoll);
 	}
 }
 
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 2e6abe7..7f0fdbe 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -910,8 +910,8 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id)
 	num_eq_processed = 0;
 	while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32]
 				& EQE_VALID_MASK) {
-		if (!blk_iopoll_sched_prep(&pbe_eq->iopoll))
-			blk_iopoll_sched(&pbe_eq->iopoll);
+		if (!irq_poll_sched_prep(&pbe_eq->iopoll))
+			irq_poll_sched(&pbe_eq->iopoll);
 
 		AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0);
 		queue_tail_inc(eq);
@@ -972,8 +972,8 @@ static irqreturn_t be_isr(int irq, void *dev_id)
 			spin_unlock_irqrestore(&phba->isr_lock, flags);
 			num_mcceq_processed++;
 		} else {
-			if (!blk_iopoll_sched_prep(&pbe_eq->iopoll))
-				blk_iopoll_sched(&pbe_eq->iopoll);
+			if (!irq_poll_sched_prep(&pbe_eq->iopoll))
+				irq_poll_sched(&pbe_eq->iopoll);
 			num_ioeq_processed++;
 		}
 		AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0);
@@ -2293,7 +2293,7 @@ void beiscsi_process_all_cqs(struct work_struct *work)
 	hwi_ring_eq_db(phba, pbe_eq->q.id, 0, 0, 1, 1);
 }
 
-static int be_iopoll(struct blk_iopoll *iop, int budget)
+static int be_iopoll(struct irq_poll *iop, int budget)
 {
 	unsigned int ret;
 	struct beiscsi_hba *phba;
@@ -2304,7 +2304,7 @@ static int be_iopoll(struct blk_iopoll *iop, int budget)
 	pbe_eq->cq_count += ret;
 	if (ret < budget) {
 		phba = pbe_eq->phba;
-		blk_iopoll_complete(iop);
+		irq_poll_complete(iop);
 		beiscsi_log(phba, KERN_INFO,
 			    BEISCSI_LOG_CONFIG | BEISCSI_LOG_IO,
 			    "BM_%d : rearm pbe_eq->q.id =%d\n",
@@ -5261,7 +5261,7 @@ static void beiscsi_quiesce(struct beiscsi_hba *phba,
 
 	for (i = 0; i < phba->num_cpus; i++) {
 		pbe_eq = &phwi_context->be_eq[i];
-		blk_iopoll_disable(&pbe_eq->iopoll);
+		irq_poll_disable(&pbe_eq->iopoll);
 	}
 
 	if (unload_state == BEISCSI_CLEAN_UNLOAD) {
@@ -5547,9 +5547,9 @@ static void beiscsi_eeh_resume(struct pci_dev *pdev)
 
 	for (i = 0; i < phba->num_cpus; i++) {
 		pbe_eq = &phwi_context->be_eq[i];
-		blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget,
+		irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget,
 				be_iopoll);
-		blk_iopoll_enable(&pbe_eq->iopoll);
+		irq_poll_enable(&pbe_eq->iopoll);
 	}
 
 	i = (phba->msix_enabled) ? i : 0;
@@ -5720,9 +5720,9 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev,
 
 	for (i = 0; i < phba->num_cpus; i++) {
 		pbe_eq = &phwi_context->be_eq[i];
-		blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget,
+		irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget,
 				be_iopoll);
-		blk_iopoll_enable(&pbe_eq->iopoll);
+		irq_poll_enable(&pbe_eq->iopoll);
 	}
 
 	i = (phba->msix_enabled) ? i : 0;
@@ -5763,7 +5763,7 @@ free_blkenbld:
 	destroy_workqueue(phba->wq);
 	for (i = 0; i < phba->num_cpus; i++) {
 		pbe_eq = &phwi_context->be_eq[i];
-		blk_iopoll_disable(&pbe_eq->iopoll);
+		irq_poll_disable(&pbe_eq->iopoll);
 	}
 free_twq:
 	beiscsi_clean_port(phba);
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index b62836d..6b98e75 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -3638,7 +3638,7 @@ static struct device_attribute ipr_ioa_reset_attr = {
 	.store = ipr_store_reset_adapter
 };
 
-static int ipr_iopoll(struct blk_iopoll *iop, int budget);
+static int ipr_iopoll(struct irq_poll *iop, int budget);
  /**
  * ipr_show_iopoll_weight - Show ipr polling mode
  * @dev:	class device struct
@@ -3681,34 +3681,34 @@ static ssize_t ipr_store_iopoll_weight(struct device *dev,
 	int i;
 
 	if (!ioa_cfg->sis64) {
-		dev_info(&ioa_cfg->pdev->dev, "blk-iopoll not supported on this adapter\n");
+		dev_info(&ioa_cfg->pdev->dev, "irq_poll not supported on this adapter\n");
 		return -EINVAL;
 	}
 	if (kstrtoul(buf, 10, &user_iopoll_weight))
 		return -EINVAL;
 
 	if (user_iopoll_weight > 256) {
-		dev_info(&ioa_cfg->pdev->dev, "Invalid blk-iopoll weight. It must be less than 256\n");
+		dev_info(&ioa_cfg->pdev->dev, "Invalid irq_poll weight. It must be less than 256\n");
 		return -EINVAL;
 	}
 
 	if (user_iopoll_weight == ioa_cfg->iopoll_weight) {
-		dev_info(&ioa_cfg->pdev->dev, "Current blk-iopoll weight has the same weight\n");
+		dev_info(&ioa_cfg->pdev->dev, "Current irq_poll weight has the same weight\n");
 		return strlen(buf);
 	}
 
 	if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
 		for (i = 1; i < ioa_cfg->hrrq_num; i++)
-			blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
+			irq_poll_disable(&ioa_cfg->hrrq[i].iopoll);
 	}
 
 	spin_lock_irqsave(shost->host_lock, lock_flags);
 	ioa_cfg->iopoll_weight = user_iopoll_weight;
 	if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
 		for (i = 1; i < ioa_cfg->hrrq_num; i++) {
-			blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll,
+			irq_poll_init(&ioa_cfg->hrrq[i].iopoll,
 					ioa_cfg->iopoll_weight, ipr_iopoll);
-			blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll);
+			irq_poll_enable(&ioa_cfg->hrrq[i].iopoll);
 		}
 	}
 	spin_unlock_irqrestore(shost->host_lock, lock_flags);
@@ -5569,7 +5569,7 @@ static int ipr_process_hrrq(struct ipr_hrr_queue *hrr_queue, int budget,
 	return num_hrrq;
 }
 
-static int ipr_iopoll(struct blk_iopoll *iop, int budget)
+static int ipr_iopoll(struct irq_poll *iop, int budget)
 {
 	struct ipr_ioa_cfg *ioa_cfg;
 	struct ipr_hrr_queue *hrrq;
@@ -5585,7 +5585,7 @@ static int ipr_iopoll(struct blk_iopoll *iop, int budget)
 	completed_ops = ipr_process_hrrq(hrrq, budget, &doneq);
 
 	if (completed_ops < budget)
-		blk_iopoll_complete(iop);
+		irq_poll_complete(iop);
 	spin_unlock_irqrestore(hrrq->lock, hrrq_flags);
 
 	list_for_each_entry_safe(ipr_cmd, temp, &doneq, queue) {
@@ -5693,8 +5693,8 @@ static irqreturn_t ipr_isr_mhrrq(int irq, void *devp)
 	if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
 		if ((be32_to_cpu(*hrrq->hrrq_curr) & IPR_HRRQ_TOGGLE_BIT) ==
 		       hrrq->toggle_bit) {
-			if (!blk_iopoll_sched_prep(&hrrq->iopoll))
-				blk_iopoll_sched(&hrrq->iopoll);
+			if (!irq_poll_sched_prep(&hrrq->iopoll))
+				irq_poll_sched(&hrrq->iopoll);
 			spin_unlock_irqrestore(hrrq->lock, hrrq_flags);
 			return IRQ_HANDLED;
 		}
@@ -10285,9 +10285,9 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id)
 
 	if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
 		for (i = 1; i < ioa_cfg->hrrq_num; i++) {
-			blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll,
+			irq_poll_init(&ioa_cfg->hrrq[i].iopoll,
 					ioa_cfg->iopoll_weight, ipr_iopoll);
-			blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll);
+			irq_poll_enable(&ioa_cfg->hrrq[i].iopoll);
 		}
 	}
 
@@ -10316,7 +10316,7 @@ static void ipr_shutdown(struct pci_dev *pdev)
 	if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
 		ioa_cfg->iopoll_weight = 0;
 		for (i = 1; i < ioa_cfg->hrrq_num; i++)
-			blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
+			irq_poll_disable(&ioa_cfg->hrrq[i].iopoll);
 	}
 
 	while (ioa_cfg->in_reset_reload) {
diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h
index e4fb17a..022fc3c 100644
--- a/drivers/scsi/ipr.h
+++ b/drivers/scsi/ipr.h
@@ -32,7 +32,7 @@
 #include <linux/libata.h>
 #include <linux/list.h>
 #include <linux/kref.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
 
@@ -510,7 +510,7 @@ struct ipr_hrr_queue {
 	u8 allow_cmds:1;
 	u8 removing_ioa:1;
 
-	struct blk_iopoll iopoll;
+	struct irq_poll iopoll;
 };
 
 /* Command packet structure */
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
deleted file mode 100644
index 77ae77c..0000000
--- a/include/linux/blk-iopoll.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef BLK_IOPOLL_H
-#define BLK_IOPOLL_H
-
-struct blk_iopoll;
-typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
-
-struct blk_iopoll {
-	struct list_head list;
-	unsigned long state;
-	unsigned long data;
-	int weight;
-	int max;
-	blk_iopoll_fn *poll;
-};
-
-enum {
-	IOPOLL_F_SCHED		= 0,
-	IOPOLL_F_DISABLE	= 1,
-};
-
-/*
- * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating
- * that we were the first to acquire this iop for scheduling. If this iop
- * is currently disabled, return "failure".
- */
-static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
-{
-	if (!test_bit(IOPOLL_F_DISABLE, &iop->state))
-		return test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
-
-	return 1;
-}
-
-static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
-{
-	return test_bit(IOPOLL_F_DISABLE, &iop->state);
-}
-
-extern void blk_iopoll_sched(struct blk_iopoll *);
-extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
-extern void blk_iopoll_complete(struct blk_iopoll *);
-extern void __blk_iopoll_complete(struct blk_iopoll *);
-extern void blk_iopoll_enable(struct blk_iopoll *);
-extern void blk_iopoll_disable(struct blk_iopoll *);
-
-#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index ad16809..7ff98c2 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -412,7 +412,7 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	BLOCK_SOFTIRQ,
-	BLOCK_IOPOLL_SOFTIRQ,
+	IRQ_POLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h
new file mode 100644
index 0000000..0cf7c26
--- /dev/null
+++ b/include/linux/irq_poll.h
@@ -0,0 +1,46 @@
+#ifndef IRQ_POLL_H
+#define IRQ_POLL_H
+
+struct irq_poll;
+typedef int (irq_poll_fn)(struct irq_poll *, int);
+
+struct irq_poll {
+	struct list_head list;
+	unsigned long state;
+	unsigned long data;
+	int weight;
+	int max;
+	irq_poll_fn *poll;
+};
+
+enum {
+	IRQ_POLL_F_SCHED		= 0,
+	IRQ_POLL_F_DISABLE	= 1,
+};
+
+/*
+ * Returns 0 if we successfully set the IRQ_POLL_F_SCHED bit, indicating
+ * that we were the first to acquire this iop for scheduling. If this iop
+ * is currently disabled, return "failure".
+ */
+static inline int irq_poll_sched_prep(struct irq_poll *iop)
+{
+	if (!test_bit(IRQ_POLL_F_DISABLE, &iop->state))
+		return test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state);
+
+	return 1;
+}
+
+static inline int irq_poll_disable_pending(struct irq_poll *iop)
+{
+	return test_bit(IRQ_POLL_F_DISABLE, &iop->state);
+}
+
+extern void irq_poll_sched(struct irq_poll *);
+extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *);
+extern void irq_poll_complete(struct irq_poll *);
+extern void __irq_poll_complete(struct irq_poll *);
+extern void irq_poll_enable(struct irq_poll *);
+extern void irq_poll_disable(struct irq_poll *);
+
+#endif
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index ff8f6c0..f95f25e 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -15,7 +15,7 @@ struct softirq_action;
 			 softirq_name(NET_TX)		\
 			 softirq_name(NET_RX)		\
 			 softirq_name(BLOCK)		\
-			 softirq_name(BLOCK_IOPOLL)	\
+			 softirq_name(IRQ_POLL)		\
 			 softirq_name(TASKLET)		\
 			 softirq_name(SCHED)		\
 			 softirq_name(HRTIMER)		\
diff --git a/lib/Kconfig b/lib/Kconfig
index f0df318..e00e196 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -475,6 +475,11 @@ config DDR
 	  information. This data is useful for drivers handling
 	  DDR SDRAM controllers.
 
+config IRQ_POLL
+	bool "IRQ polling library"
+	help
+	  Helper library to implement interrupt mitigation using polling.
+
 config MPILIB
 	tristate
 	select CLZ_TAB
diff --git a/lib/Makefile b/lib/Makefile
index 7f1de26..1478ae2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -164,6 +164,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
 
 obj-$(CONFIG_SG_SPLIT) += sg_split.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
+obj-$(CONFIG_IRQ_POLL) += irq_poll.o
 
 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
 	       fdt_empty_tree.o
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
new file mode 100644
index 0000000..e6fd1dc
--- /dev/null
+++ b/lib/irq_poll.c
@@ -0,0 +1,221 @@
+/*
+ * Functions related to generic interrupt-poll handling.  This
+ * is similar to NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq_poll.h>
+#include <linux/delay.h>
+
+static unsigned int irq_poll_budget __read_mostly = 256;
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
+
+/**
+ * irq_poll_sched - Schedule a run of the iopoll handler
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Add this irq_poll structure to the pending poll list and trigger the
+ *     raise of the blk iopoll softirq. The driver must already have gotten a
+ *     successful return from irq_poll_sched_prep() before calling this.
+ **/
+void irq_poll_sched(struct irq_poll *iop)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
+	__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(irq_poll_sched);
+
+/**
+ * __irq_poll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     See irq_poll_complete(). This function must be called with interrupts
+ *     disabled.
+ **/
+void __irq_poll_complete(struct irq_poll *iop)
+{
+	list_del(&iop->list);
+	smp_mb__before_atomic();
+	clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(__irq_poll_complete);
+
+/**
+ * irq_poll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     If a driver consumes less than the assigned budget in its run of the
+ *     iopoll handler, it'll end the polled mode by calling this function. The
+ *     iopoll handler will not be invoked again before irq_poll_sched_prep()
+ *     is called.
+ **/
+void irq_poll_complete(struct irq_poll *iop)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__irq_poll_complete(iop);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(irq_poll_complete);
+
+static void irq_poll_softirq(struct softirq_action *h)
+{
+	struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
+	int rearm = 0, budget = irq_poll_budget;
+	unsigned long start_time = jiffies;
+
+	local_irq_disable();
+
+	while (!list_empty(list)) {
+		struct irq_poll *iop;
+		int work, weight;
+
+		/*
+		 * If softirq window is exhausted then punt.
+		 */
+		if (budget <= 0 || time_after(jiffies, start_time)) {
+			rearm = 1;
+			break;
+		}
+
+		local_irq_enable();
+
+		/* Even though interrupts have been re-enabled, this
+		 * access is safe because interrupts can only add new
+		 * entries to the tail of this list, and only ->poll()
+		 * calls can remove this head entry from the list.
+		 */
+		iop = list_entry(list->next, struct irq_poll, list);
+
+		weight = iop->weight;
+		work = 0;
+		if (test_bit(IRQ_POLL_F_SCHED, &iop->state))
+			work = iop->poll(iop, weight);
+
+		budget -= work;
+
+		local_irq_disable();
+
+		/*
+		 * Drivers must not modify the iopoll state if they
+		 * consume their assigned weight (or more, some drivers can't
+		 * easily just stop processing, they have to complete an
+		 * entire mask of commands).  In such cases this code
+		 * still "owns" the iopoll instance and therefore can
+		 * move the instance around on the list at-will.
+		 */
+		if (work >= weight) {
+			if (irq_poll_disable_pending(iop))
+				__irq_poll_complete(iop);
+			else
+				list_move_tail(&iop->list, list);
+		}
+	}
+
+	if (rearm)
+		__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+
+	local_irq_enable();
+}
+
+/**
+ * irq_poll_disable - Disable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Disable io polling and wait for any pending callbacks to have completed.
+ **/
+void irq_poll_disable(struct irq_poll *iop)
+{
+	set_bit(IRQ_POLL_F_DISABLE, &iop->state);
+	while (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state))
+		msleep(1);
+	clear_bit(IRQ_POLL_F_DISABLE, &iop->state);
+}
+EXPORT_SYMBOL(irq_poll_disable);
+
+/**
+ * irq_poll_enable - Enable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Enable iopoll on this @iop. Note that the handler run will not be
+ *     scheduled, it will only mark it as active.
+ **/
+void irq_poll_enable(struct irq_poll *iop)
+{
+	BUG_ON(!test_bit(IRQ_POLL_F_SCHED, &iop->state));
+	smp_mb__before_atomic();
+	clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(irq_poll_enable);
+
+/**
+ * irq_poll_init - Initialize this @iop
+ * @iop:      The parent iopoll structure
+ * @weight:   The default weight (or command completion budget)
+ * @poll_fn:  The handler to invoke
+ *
+ * Description:
+ *     Initialize this irq_poll structure. Before being actively used, the
+ *     driver must call irq_poll_enable().
+ **/
+void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn)
+{
+	memset(iop, 0, sizeof(*iop));
+	INIT_LIST_HEAD(&iop->list);
+	iop->weight = weight;
+	iop->poll = poll_fn;
+	set_bit(IRQ_POLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(irq_poll_init);
+
+static int irq_poll_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+				 this_cpu_ptr(&blk_cpu_iopoll));
+		__raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block irq_poll_cpu_notifier = {
+	.notifier_call	= irq_poll_cpu_notify,
+};
+
+static __init int irq_poll_setup(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
+
+	open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq);
+	register_hotcpu_notifier(&irq_poll_cpu_notifier);
+	return 0;
+}
+subsys_initcall(irq_poll_setup);
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 2a912df..af5a316 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -3746,7 +3746,7 @@ static const struct flag flags[] = {
 	{ "NET_TX_SOFTIRQ", 2 },
 	{ "NET_RX_SOFTIRQ", 3 },
 	{ "BLOCK_SOFTIRQ", 4 },
-	{ "BLOCK_IOPOLL_SOFTIRQ", 5 },
+	{ "IRQ_POLL_SOFTIRQ", 5 },
 	{ "TASKLET_SOFTIRQ", 6 },
 	{ "SCHED_SOFTIRQ", 7 },
 	{ "HRTIMER_SOFTIRQ", 8 },
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 8ff7d62..33b52ea 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -209,7 +209,7 @@ static const struct flag flags[] = {
 	{ "NET_TX_SOFTIRQ", 2 },
 	{ "NET_RX_SOFTIRQ", 3 },
 	{ "BLOCK_SOFTIRQ", 4 },
-	{ "BLOCK_IOPOLL_SOFTIRQ", 5 },
+	{ "IRQ_POLL_SOFTIRQ", 5 },
 	{ "TASKLET_SOFTIRQ", 6 },
 	{ "SCHED_SOFTIRQ", 7 },
 	{ "HRTIMER_SOFTIRQ", 8 },
-- 
1.9.1


* [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 1/9] move blk_iopoll to limit and make it generally available Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
  2015-11-15  9:40   ` Sagi Grimberg
       [not found]   ` <1447422410-20891-3-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  2015-11-13 13:46 ` [PATCH 3/9] IB: add a helper to safely drain a QP Christoph Hellwig
                   ` (6 subsequent siblings)
  8 siblings, 2 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma; +Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

This adds an abstraction that allows a ULP to simply pass a completion
object and completion callback with each submitted WR and lets the RDMA
core handle the nitty-gritty details of how to handle completion
interrupts and poll the CQ.

In detail there is a new ib_cqe structure which just contains the
completion callback, and which can be used to get at the containing
object using container_of.  It is pointed to by the WR and WC as an
alternative to the wr_id field, similar to how many ULPs already use
the field to store a pointer using casts.

A driver using the new completion callbacks allocates its CQs using
the new ib_alloc_cq API, which in addition to the number of CQEs and
the completion vector also takes a mode describing how to poll for
CQEs.  Three modes are available: direct for drivers that never take CQ
interrupts and just poll for them, softirq to poll from softirq context
using the irq_poll infrastructure (the renamed blk-iopoll) which takes
care of rearming and budgeting, and workqueue for consumers who want to
be called from user context.
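
For illustration, a consumer of the new interface might look roughly
like the fragment below; struct my_request, my_send_done() and the CQ
sizing are made up for the example, everything else is the API added
by this patch:

	struct my_request {
		struct ib_cqe	cqe;
		/* ... ULP-specific state ... */
	};

	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct my_request *req =
			container_of(wc->wr_cqe, struct my_request, cqe);

		/* check wc->status and complete the request */
	}

	cq = ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_SOFTIRQ);

	req->cqe.done = my_send_done;
	wr.wr_cqe = &req->cqe;		/* instead of wr.wr_id */
	ret = ib_post_send(qp, &wr, &bad_wr);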

Thanks a lot to Sagi Grimberg, who helped review the API, wrote the
current version of the workqueue code because my two previous attempts
sucked too much, and converted the iSER initiator to the new API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/Kconfig              |   1 +
 drivers/infiniband/core/Makefile        |   2 +-
 drivers/infiniband/core/cq.c            | 208 ++++++++++++++++++++++++++++++++
 drivers/infiniband/core/device.c        |  15 ++-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c |   2 +-
 drivers/infiniband/ulp/srp/ib_srp.c     |   6 +-
 include/rdma/ib_verbs.h                 |  38 +++++-
 7 files changed, 263 insertions(+), 9 deletions(-)
 create mode 100644 drivers/infiniband/core/cq.c

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index aa26f3c..282ec0b 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
 	depends on NET
 	depends on INET
 	depends on m || IPV6 != m
+	select IRQ_POLL
 	---help---
 	  Core support for InfiniBand (IB).  Make sure to also select
 	  any protocols you wish to use as well as drivers for your
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..ae48d8740 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 					$(user_access-y)
 
-ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y :=			packer.o ud_header.o verbs.o cq.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
index 0000000..d9eb796
--- /dev/null
+++ b/drivers/infiniband/core/cq.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH			16
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ		256
+#define IB_POLL_BUDGET_WORKQUEUE	65536
+
+#define IB_POLL_FLAGS \
+	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget)
+{
+	int i, n, completed = 0;
+
+	while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
+		for (i = 0; i < n; i++) {
+			struct ib_wc *wc = &cq->wc[i];
+
+			if (wc->wr_cqe)
+				wc->wr_cqe->done(cq, wc);
+			else
+				WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+		}
+
+		completed += n;
+		if (completed >= budget)
+			break;
+	}
+
+	return completed;
+}
+
+/**
+ * ib_process_cq_direct - process a CQ in caller context
+ * @cq:		CQ to process
+ *
+ * This function is used to process all outstanding CQ entries on a
+ * %IB_POLL_DIRECT CQ.  It does not offload CQ processing to a different
+ * context and does not ask for completion interrupts from the HCA.
+ */
+void ib_process_cq_direct(struct ib_cq *cq)
+{
+	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
+
+	__ib_process_cq(cq, INT_MAX);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+	WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	int completed;
+
+	completed = __ib_process_cq(cq, budget);
+	if (completed < budget) {
+		irq_poll_complete(&cq->iop);
+		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
+			if (!irq_poll_sched_prep(&cq->iop))
+				irq_poll_sched(&cq->iop);
+		}
+			
+	}
+
+	return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+	if (!irq_poll_sched_prep(&cq->iop))
+		irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+	struct ib_cq *cq = container_of(work, struct ib_cq, work);
+	int completed;
+
+	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
+	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+		queue_work(ib_comp_wq, &cq->work);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+	queue_work(ib_comp_wq, &cq->work);
+}
+
+/**
+ * ib_alloc_cq - allocate a completion queue
+ * @dev:		device to allocate the CQ for
+ * @private:		driver private data, accessible from cq->cq_context
+ * @nr_cqe:		number of CQEs to allocate
+ * @comp_vector:	HCA completion vectors for this CQ
+ * @poll_ctx:		context to poll the CQ from.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context.  The ULP must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+		int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
+{
+	struct ib_cq_init_attr cq_attr = {
+		.cqe		= nr_cqe,
+		.comp_vector	= comp_vector,
+	};
+	struct ib_cq *cq;
+	int ret = -ENOMEM;
+
+	cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+	if (IS_ERR(cq))
+		return cq;
+
+	cq->device = dev;
+	cq->uobject = NULL;
+	cq->event_handler = NULL;
+	cq->cq_context = private;
+	cq->poll_ctx = poll_ctx;
+	atomic_set(&cq->usecnt, 0);
+
+	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
+	if (!cq->wc)
+		goto out_destroy_cq;
+
+	switch (cq->poll_ctx) {
+	case IB_POLL_DIRECT:
+		cq->comp_handler = ib_cq_completion_direct;
+		break;
+	case IB_POLL_SOFTIRQ:
+		cq->comp_handler = ib_cq_completion_softirq;
+
+		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+		irq_poll_enable(&cq->iop);
+		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+		break;
+	case IB_POLL_WORKQUEUE:
+		cq->comp_handler = ib_cq_completion_workqueue;
+		INIT_WORK(&cq->work, ib_cq_poll_work);
+		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out_free_wc;
+	}
+
+	return cq;
+
+out_free_wc:
+	kfree(cq->wc);
+out_destroy_cq:
+	cq->device->destroy_cq(cq);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_cq);
+
+/**
+ * ib_free_cq - free a completion queue
+ * @cq:		completion queue to free.
+ */
+void ib_free_cq(struct ib_cq *cq)
+{
+	int ret;
+
+	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
+		return;
+
+	switch (cq->poll_ctx) {
+	case IB_POLL_DIRECT:
+		break;
+	case IB_POLL_SOFTIRQ:
+		irq_poll_disable(&cq->iop);
+		break;
+	case IB_POLL_WORKQUEUE:
+		flush_work(&cq->work);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	kfree(cq->wc);
+	ret = cq->device->destroy_cq(cq);
+	WARN_ON_ONCE(ret);
+}
+EXPORT_SYMBOL(ib_free_cq);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 0315bd7..f0ac300 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -58,6 +58,7 @@ struct ib_client_data {
 	bool		  going_down;
 };
 
+struct workqueue_struct *ib_comp_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
@@ -934,10 +935,18 @@ static int __init ib_core_init(void)
 	if (!ib_wq)
 		return -ENOMEM;
 
+	ib_comp_wq = alloc_workqueue("ib-comp-wq",
+			WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+			WQ_UNBOUND_MAX_ACTIVE);
+	if (!ib_comp_wq) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	ret = class_register(&ib_class);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
-		goto err;
+		goto err_comp;
 	}
 
 	ret = ibnl_init();
@@ -952,7 +961,8 @@ static int __init ib_core_init(void)
 
 err_sysfs:
 	class_unregister(&ib_class);
-
+err_comp:
+	destroy_workqueue(ib_comp_wq);
 err:
 	destroy_workqueue(ib_wq);
 	return ret;
@@ -963,6 +973,7 @@ static void __exit ib_core_cleanup(void)
 	ib_cache_cleanup();
 	ibnl_cleanup();
 	class_unregister(&ib_class);
+	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 737d273..d315367 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
 
 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
-	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
 	.opcode = IB_WR_SEND,
 };
 
@@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
 	 * error" WC will be immediately generated for each WR we post.
 	 */
 	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+	ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
 	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
 		ipoib_warn(priv, "failed to post drain wr\n");
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 62b6cba..3027824 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -457,10 +457,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
 static void srp_destroy_qp(struct srp_rdma_ch *ch)
 {
 	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
-	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
+	static struct ib_recv_wr wr = { 0 };
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
+	wr.wr_id = SRP_LAST_WR_ID;
 	/* Destroying a QP and reusing ch->done is only safe if not connected */
 	WARN_ON_ONCE(ch->connected);
 
@@ -1042,13 +1043,14 @@ static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
 	struct ib_send_wr *bad_wr;
 	struct ib_send_wr wr = {
 		.opcode		    = IB_WR_LOCAL_INV,
-		.wr_id		    = LOCAL_INV_WR_ID_MASK,
 		.next		    = NULL,
 		.num_sge	    = 0,
 		.send_flags	    = 0,
 		.ex.invalidate_rkey = rkey,
 	};
 
+	wr.wr_id = LOCAL_INV_WR_ID_MASK;
+
 	return ib_post_send(ch->qp, &wr, &bad_wr);
 }
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 45ce36e..e11e038 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <linux/socket.h>
+#include <linux/irq_poll.h>
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
@@ -56,6 +57,7 @@
 #include <asm/uaccess.h>
 
 extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
 
 union ib_gid {
 	u8	raw[16];
@@ -710,7 +712,10 @@ enum ib_wc_flags {
 };
 
 struct ib_wc {
-	u64			wr_id;
+	union {
+		u64		wr_id;
+		struct ib_cqe	*wr_cqe;
+	};
 	enum ib_wc_status	status;
 	enum ib_wc_opcode	opcode;
 	u32			vendor_err;
@@ -1014,6 +1019,10 @@ struct ib_sge {
 	u32	lkey;
 };
 
+struct ib_cqe {
+	void (*done)(struct ib_cq *cq, struct ib_wc *wc);
+};
+
 /**
  * struct ib_mw_bind_info - Parameters for a memory window bind operation.
  * @mr: A memory region to bind the memory window to.
@@ -1033,7 +1042,10 @@ struct ib_mw_bind_info {
 
 struct ib_send_wr {
 	struct ib_send_wr      *next;
-	u64			wr_id;
+	union {
+		u64		wr_id;
+		struct ib_cqe	*wr_cqe;
+	};
 	struct ib_sge	       *sg_list;
 	int			num_sge;
 	enum ib_wr_opcode	opcode;
@@ -1127,7 +1139,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr)
 
 struct ib_recv_wr {
 	struct ib_recv_wr      *next;
-	u64			wr_id;
+	union {
+		u64		wr_id;
+		struct ib_cqe	*wr_cqe;
+	};
 	struct ib_sge	       *sg_list;
 	int			num_sge;
 };
@@ -1258,6 +1273,12 @@ struct ib_ah {
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
+enum ib_poll_context {
+	IB_POLL_DIRECT,		/* caller context, no hw completions */
+	IB_POLL_SOFTIRQ,	/* poll from softirq context */
+	IB_POLL_WORKQUEUE,	/* poll from workqueue */
+};
+
 struct ib_cq {
 	struct ib_device       *device;
 	struct ib_uobject      *uobject;
@@ -1266,6 +1287,12 @@ struct ib_cq {
 	void                   *cq_context;
 	int               	cqe;
 	atomic_t          	usecnt; /* count number of work queues */
+	enum ib_poll_context	poll_ctx;
+	struct ib_wc		*wc;
+	union {
+		struct irq_poll		iop;
+		struct work_struct	work;
+	};
 };
 
 struct ib_srq {
@@ -2447,6 +2474,11 @@ static inline int ib_post_recv(struct ib_qp *qp,
 	return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
 }
 
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+		int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
+void ib_free_cq(struct ib_cq *cq);
+void ib_process_cq_direct(struct ib_cq *cq);
+
 /**
  * ib_create_cq - Creates a CQ on the specified device.
  * @device: The device on which to create the CQ.
-- 
1.9.1


* [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 1/9] move blk_iopoll to limit and make it generally available Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 2/9] IB: add a proper completion queue abstraction Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
       [not found]   ` <1447422410-20891-4-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  2015-11-15  9:34   ` Sagi Grimberg
       [not found] ` <1447422410-20891-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma; +Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel
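
The new helper is meant to be called right before destroying a QP, so
that the receive completion handler can no longer run against a freed
object.  A minimal (hypothetical) caller would look like:

	ib_drain_qp(ch->qp);
	ib_destroy_qp(ch->qp);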

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/core/cq.c | 46 ++++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h      |  2 ++
 2 files changed, 48 insertions(+)

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index d9eb796..bf2a079 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -206,3 +206,49 @@ void ib_free_cq(struct ib_cq *cq)
 	WARN_ON_ONCE(ret);
 }
 EXPORT_SYMBOL(ib_free_cq);
+
+struct ib_stop_cqe {
+	struct ib_cqe	cqe;
+	struct completion done;
+};
+
+static void ib_stop_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_stop_cqe *stop =
+		container_of(wc->wr_cqe, struct ib_stop_cqe, cqe);
+
+	complete(&stop->done);
+}
+
+/*
+ * Change a queue pair into the error state and wait until all receive
+ * completions have been processed before destroying it. This avoids that
+ * the receive completion handler can access the queue pair while it is
+ * being destroyed.
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+	struct ib_stop_cqe stop = { };
+	struct ib_recv_wr wr, *bad_wr;
+	int ret;
+
+	wr.wr_cqe = &stop.cqe;
+	stop.cqe.done = ib_stop_done;
+	init_completion(&stop.done);
+
+	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+	if (ret) {
+		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
+		return;
+	}
+
+	ret = ib_post_recv(qp, &wr, &bad_wr);
+	if (ret) {
+		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
+		return;
+	}
+
+	wait_for_completion(&stop.done);
+}
+EXPORT_SYMBOL(ib_drain_qp);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e11e038..f59a8d3 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3075,4 +3075,6 @@ int ib_sg_to_pages(struct ib_mr *mr,
 		   int sg_nents,
 		   int (*set_page)(struct ib_mr *, u64));
 
+void ib_drain_qp(struct ib_qp *qp);
+
 #endif /* IB_VERBS_H */
-- 
1.9.1


* [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
@ 2015-11-13 13:46     ` Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 2/9] IB: add a proper completion queue abstraction Christoph Hellwig
                       ` (7 subsequent siblings)
  8 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb,
	bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Remove struct rdma_iu and instead allocate the struct ib_rdma_wr array
early and fill it out directly.  This allows us to chain the WRs, and
thus achieve both less lock contention on the HCA workqueue and much
simpler error handling.
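
With the ib_rdma_wr array in place the chaining itself is simple: every
WR points at the next one and a single ib_post_send covers the whole
transfer (sketch only, assuming an array wrs[] of n entries):

	for (i = 0; i < n; i++)
		wrs[i].wr.next = (i == n - 1) ? NULL : &wrs[i + 1].wr;

	ret = ib_post_send(ch->qp, &wrs[0].wr, &bad_wr);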

Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 100 +++++++++++++---------------------
 drivers/infiniband/ulp/srpt/ib_srpt.h |  14 +----
 2 files changed, 39 insertions(+), 75 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 14b361a..2b6dd71 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -1057,7 +1057,7 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 	BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius);
 
 	while (ioctx->n_rdma)
-		kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge);
+		kfree(ioctx->rdma_ius[--ioctx->n_rdma].wr.sg_list);
 
 	kfree(ioctx->rdma_ius);
 	ioctx->rdma_ius = NULL;
@@ -1084,7 +1084,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 	struct scatterlist *sg, *sg_orig;
 	int sg_cnt;
 	enum dma_data_direction dir;
-	struct rdma_iu *riu;
+	struct ib_rdma_wr *riu;
 	struct srp_direct_buf *db;
 	dma_addr_t dma_addr;
 	struct ib_sge *sge;
@@ -1117,7 +1117,8 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 		nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
 			+ ioctx->n_rbuf;
 
-		ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL);
+		ioctx->rdma_ius = kcalloc(nrdma, sizeof(*ioctx->rdma_ius),
+				GFP_KERNEL);
 		if (!ioctx->rdma_ius)
 			goto free_mem;
 
@@ -1141,9 +1142,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 	     j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
 		rsize = be32_to_cpu(db->len);
 		raddr = be64_to_cpu(db->va);
-		riu->raddr = raddr;
+		riu->remote_addr = raddr;
 		riu->rkey = be32_to_cpu(db->key);
-		riu->sge_cnt = 0;
+		riu->wr.num_sge = 0;
 
 		/* calculate how many sge required for this remote_buf */
 		while (rsize > 0 && tsize > 0) {
@@ -1167,27 +1168,29 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 				rsize = 0;
 			}
 
-			++riu->sge_cnt;
+			++riu->wr.num_sge;
 
-			if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) {
+			if (rsize > 0 &&
+			    riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
 				++ioctx->n_rdma;
-				riu->sge =
-				    kmalloc(riu->sge_cnt * sizeof *riu->sge,
-					    GFP_KERNEL);
-				if (!riu->sge)
+				riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+						sizeof(*riu->wr.sg_list),
+						GFP_KERNEL);
+				if (!riu->wr.sg_list)
 					goto free_mem;
 
 				++riu;
-				riu->sge_cnt = 0;
-				riu->raddr = raddr;
+				riu->wr.num_sge = 0;
+				riu->remote_addr = raddr;
 				riu->rkey = be32_to_cpu(db->key);
 			}
 		}
 
 		++ioctx->n_rdma;
-		riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge,
-				   GFP_KERNEL);
-		if (!riu->sge)
+		riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+					sizeof(*riu->wr.sg_list),
+					GFP_KERNEL);
+		if (!riu->wr.sg_list)
 			goto free_mem;
 	}
 
@@ -1202,7 +1205,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 	for (i = 0, j = 0;
 	     j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
 		rsize = be32_to_cpu(db->len);
-		sge = riu->sge;
+		sge = riu->wr.sg_list;
 		k = 0;
 
 		while (rsize > 0 && tsize > 0) {
@@ -1234,9 +1237,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 			}
 
 			++k;
-			if (k == riu->sge_cnt && rsize > 0 && tsize > 0) {
+			if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) {
 				++riu;
-				sge = riu->sge;
+				sge = riu->wr.sg_list;
 				k = 0;
 			} else if (rsize > 0 && tsize > 0)
 				++sge;
@@ -1457,8 +1460,6 @@ static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
 		else
 			pr_err("%s[%d]: wrong state = %d\n", __func__,
 			       __LINE__, srpt_get_cmd_state(ioctx));
-	} else if (opcode == SRPT_RDMA_ABORT) {
-		ioctx->rdma_aborted = true;
 	} else {
 		WARN(true, "unexpected opcode %d\n", opcode);
 	}
@@ -1981,8 +1982,7 @@ static void srpt_process_send_completion(struct ib_cq *cq,
 		if (opcode == SRPT_SEND)
 			srpt_handle_send_comp(ch, send_ioctx);
 		else {
-			WARN_ON(opcode != SRPT_RDMA_ABORT &&
-				wc->opcode != IB_WC_RDMA_READ);
+			WARN_ON(wc->opcode != IB_WC_RDMA_READ);
 			srpt_handle_rdma_comp(ch, send_ioctx, opcode);
 		}
 	} else {
@@ -2823,9 +2823,7 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
 			      struct srpt_send_ioctx *ioctx)
 {
-	struct ib_rdma_wr wr;
 	struct ib_send_wr *bad_wr;
-	struct rdma_iu *riu;
 	int i;
 	int ret;
 	int sq_wr_avail;
@@ -2844,59 +2842,37 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
 		}
 	}
 
-	ioctx->rdma_aborted = false;
-	ret = 0;
-	riu = ioctx->rdma_ius;
-	memset(&wr, 0, sizeof wr);
+	for (i = 0; i < n_rdma; i++) {
+		struct ib_rdma_wr *wr = &ioctx->rdma_ius[i];
 
-	for (i = 0; i < n_rdma; ++i, ++riu) {
 		if (dir == DMA_FROM_DEVICE) {
-			wr.wr.opcode = IB_WR_RDMA_WRITE;
-			wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
+			wr->wr.opcode = IB_WR_RDMA_WRITE;
+			wr->wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
 						SRPT_RDMA_WRITE_LAST :
 						SRPT_RDMA_MID,
 						ioctx->ioctx.index);
 		} else {
-			wr.wr.opcode = IB_WR_RDMA_READ;
-			wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
+			wr->wr.opcode = IB_WR_RDMA_READ;
+			wr->wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
 						SRPT_RDMA_READ_LAST :
 						SRPT_RDMA_MID,
 						ioctx->ioctx.index);
 		}
-		wr.wr.next = NULL;
-		wr.remote_addr = riu->raddr;
-		wr.rkey = riu->rkey;
-		wr.wr.num_sge = riu->sge_cnt;
-		wr.wr.sg_list = riu->sge;
-
-		/* only get completion event for the last rdma write */
-		if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE)
-			wr.wr.send_flags = IB_SEND_SIGNALED;
 
-		ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
-		if (ret)
-			break;
+		if (i == n_rdma - 1) {
+			/* only get completion event for the last rdma read */
+			if (dir == DMA_TO_DEVICE)
+				wr->wr.send_flags = IB_SEND_SIGNALED;
+			wr->wr.next = NULL;
+		} else {
+			wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
+		}
 	}
 
+	ret = ib_post_send(ch->qp, &ioctx->rdma_ius->wr, &bad_wr);
 	if (ret)
 		pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
 				 __func__, __LINE__, ret, i, n_rdma);
-	if (ret && i > 0) {
-		wr.wr.num_sge = 0;
-		wr.wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index);
-		wr.wr.send_flags = IB_SEND_SIGNALED;
-		while (ch->state == CH_LIVE &&
-			ib_post_send(ch->qp, &wr.wr, &bad_wr) != 0) {
-			pr_info("Trying to abort failed RDMA transfer [%d]\n",
-				ioctx->ioctx.index);
-			msleep(1000);
-		}
-		while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) {
-			pr_info("Waiting until RDMA abort finished [%d]\n",
-				ioctx->ioctx.index);
-			msleep(1000);
-		}
-	}
 out:
 	if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
 		atomic_add(n_rdma, &ch->sq_wr_avail);
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index 0df7d61..fd6097e 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -132,7 +132,6 @@ enum srpt_opcode {
 	SRPT_RECV,
 	SRPT_SEND,
 	SRPT_RDMA_MID,
-	SRPT_RDMA_ABORT,
 	SRPT_RDMA_READ_LAST,
 	SRPT_RDMA_WRITE_LAST,
 };
@@ -150,14 +149,6 @@ static inline u32 idx_from_wr_id(u64 wr_id)
 	return (u32)wr_id;
 }
 
-struct rdma_iu {
-	u64		raddr;
-	u32		rkey;
-	struct ib_sge	*sge;
-	u32		sge_cnt;
-	int		mem_id;
-};
-
 /**
  * enum srpt_command_state - SCSI command state managed by SRPT.
  * @SRPT_STATE_NEW:           New command arrived and is being processed.
@@ -220,22 +211,19 @@ struct srpt_recv_ioctx {
  * @tag:         Tag of the received SRP information unit.
  * @spinlock:    Protects 'state'.
  * @state:       I/O context state.
- * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether
- * 		 the already initiated transfers have finished.
  * @cmd:         Target core command data structure.
  * @sense_data:  SCSI sense data.
  */
 struct srpt_send_ioctx {
 	struct srpt_ioctx	ioctx;
 	struct srpt_rdma_ch	*ch;
-	struct rdma_iu		*rdma_ius;
+	struct ib_rdma_wr	*rdma_ius;
 	struct srp_direct_buf	*rbufs;
 	struct srp_direct_buf	single_rbuf;
 	struct scatterlist	*sg;
 	struct list_head	free_list;
 	spinlock_t		spinlock;
 	enum srpt_command_state	state;
-	bool			rdma_aborted;
 	struct se_cmd		cmd;
 	struct completion	tx_done;
 	int			sg_cnt;
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 140+ messages in thread
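
The key idea in the patch above is that the pre-allocated ib_rdma_wr
entries are linked through wr.next and handed to a single
ib_post_send() call instead of being posted one by one.  A minimal
sketch of that chaining scheme follows; the helper name and its
arguments are invented for illustration, and it signals the last WR
unconditionally, whereas the driver only signals the final RDMA READ.

#include <rdma/ib_verbs.h>

/* Illustrative only: chain n_rdma pre-filled RDMA WRs and post once. */
static int post_rdma_chain(struct ib_qp *qp, struct ib_rdma_wr *wrs,
			   int n_rdma, bool write)
{
	struct ib_send_wr *bad_wr;
	int i;

	for (i = 0; i < n_rdma; i++) {
		wrs[i].wr.opcode = write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
		if (i == n_rdma - 1) {
			/* only the last WR generates a completion */
			wrs[i].wr.send_flags = IB_SEND_SIGNALED;
			wrs[i].wr.next = NULL;
		} else {
			wrs[i].wr.next = &wrs[i + 1].wr;
		}
	}

	/* a single post submits the whole chain */
	return ib_post_send(qp, &wrs[0].wr, &bad_wr);
}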

* [PATCH 5/9] srpt: use the new CQ API
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
                   ` (3 preceding siblings ...)
       [not found] ` <1447422410-20891-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
@ 2015-11-13 13:46 ` Christoph Hellwig
       [not found]   ` <1447422410-20891-6-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  2015-11-17 19:38     ` Bart Van Assche
  2015-11-13 13:46 ` [PATCH 6/9] srp: " Christoph Hellwig
                   ` (3 subsequent siblings)
  8 siblings, 2 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma; +Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 327 +++++++++-------------------------
 drivers/infiniband/ulp/srpt/ib_srpt.h |  28 +--
 2 files changed, 88 insertions(+), 267 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 2b6dd71..d4bbad3 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -95,6 +95,8 @@ MODULE_PARM_DESC(srpt_service_guid,
 static struct ib_client srpt_client;
 static void srpt_release_channel(struct srpt_rdma_ch *ch);
 static int srpt_queue_status(struct se_cmd *cmd);
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc);
 
 /**
  * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE.
@@ -780,12 +782,12 @@ static int srpt_post_recv(struct srpt_device *sdev,
 	struct ib_recv_wr wr, *bad_wr;
 
 	BUG_ON(!sdev);
-	wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index);
-
 	list.addr = ioctx->ioctx.dma;
 	list.length = srp_max_req_size;
 	list.lkey = sdev->pd->local_dma_lkey;
 
+	ioctx->ioctx.cqe.done = srpt_recv_done;
+	wr.wr_cqe = &ioctx->ioctx.cqe;
 	wr.next = NULL;
 	wr.sg_list = &list;
 	wr.num_sge = 1;
@@ -821,8 +823,9 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
 	list.length = len;
 	list.lkey = sdev->pd->local_dma_lkey;
 
+	ioctx->ioctx.cqe.done = srpt_send_done;
 	wr.next = NULL;
-	wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
+	wr.wr_cqe = &ioctx->ioctx.cqe;
 	wr.sg_list = &list;
 	wr.num_sge = 1;
 	wr.opcode = IB_WR_SEND;
@@ -1385,116 +1388,44 @@ out:
 }
 
 /**
- * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion.
- */
-static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id)
-{
-	struct srpt_send_ioctx *ioctx;
-	enum srpt_command_state state;
-	u32 index;
-
-	atomic_inc(&ch->sq_wr_avail);
-
-	index = idx_from_wr_id(wr_id);
-	ioctx = ch->ioctx_ring[index];
-	state = srpt_get_cmd_state(ioctx);
-
-	WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
-		&& state != SRPT_STATE_MGMT_RSP_SENT
-		&& state != SRPT_STATE_NEED_DATA
-		&& state != SRPT_STATE_DONE);
-
-	/* If SRP_RSP sending failed, undo the ch->req_lim change. */
-	if (state == SRPT_STATE_CMD_RSP_SENT
-	    || state == SRPT_STATE_MGMT_RSP_SENT)
-		atomic_dec(&ch->req_lim);
-
-	srpt_abort_cmd(ioctx);
-}
-
-/**
- * srpt_handle_send_comp() - Process an IB send completion notification.
- */
-static void srpt_handle_send_comp(struct srpt_rdma_ch *ch,
-				  struct srpt_send_ioctx *ioctx)
-{
-	enum srpt_command_state state;
-
-	atomic_inc(&ch->sq_wr_avail);
-
-	state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
-
-	if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
-		    && state != SRPT_STATE_MGMT_RSP_SENT
-		    && state != SRPT_STATE_DONE))
-		pr_debug("state = %d\n", state);
-
-	if (state != SRPT_STATE_DONE) {
-		srpt_unmap_sg_to_ib_sge(ch, ioctx);
-		transport_generic_free_cmd(&ioctx->cmd, 0);
-	} else {
-		pr_err("IB completion has been received too late for"
-		       " wr_id = %u.\n", ioctx->ioctx.index);
-	}
-}
-
-/**
- * srpt_handle_rdma_comp() - Process an IB RDMA completion notification.
- *
  * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping
  * the data that has been transferred via IB RDMA had to be postponed until the
  * check_stop_free() callback.  None of this is necessary anymore and needs to
  * be cleaned up.
  */
-static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
-				  struct srpt_send_ioctx *ioctx,
-				  enum srpt_opcode opcode)
+static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_send_ioctx *ioctx =
+		container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
+
 	WARN_ON(ioctx->n_rdma <= 0);
 	atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
 
-	if (opcode == SRPT_RDMA_READ_LAST) {
-		if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
-						SRPT_STATE_DATA_IN))
-			target_execute_cmd(&ioctx->cmd);
-		else
-			pr_err("%s[%d]: wrong state = %d\n", __func__,
-			       __LINE__, srpt_get_cmd_state(ioctx));
-	} else {
-		WARN(true, "unexpected opcode %d\n", opcode);
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n",
+			ioctx, wc->status);
+		srpt_abort_cmd(ioctx);
+		return;
 	}
+
+	if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
+					SRPT_STATE_DATA_IN))
+		target_execute_cmd(&ioctx->cmd);
+	else
+		pr_err("%s[%d]: wrong state = %d\n", __func__,
+		       __LINE__, srpt_get_cmd_state(ioctx));
 }
 
-/**
- * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion.
- */
-static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch,
-				      struct srpt_send_ioctx *ioctx,
-				      enum srpt_opcode opcode)
+static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	enum srpt_command_state state;
+	struct srpt_send_ioctx *ioctx =
+		container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
 
-	state = srpt_get_cmd_state(ioctx);
-	switch (opcode) {
-	case SRPT_RDMA_READ_LAST:
-		if (ioctx->n_rdma <= 0) {
-			pr_err("Received invalid RDMA read"
-			       " error completion with idx %d\n",
-			       ioctx->ioctx.index);
-			break;
-		}
-		atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
-		if (state == SRPT_STATE_NEED_DATA)
-			srpt_abort_cmd(ioctx);
-		else
-			pr_err("%s[%d]: wrong state = %d\n",
-			       __func__, __LINE__, state);
-		break;
-	case SRPT_RDMA_WRITE_LAST:
-		break;
-	default:
-		pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode);
-		break;
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n",
+			ioctx, wc->status);
+		srpt_abort_cmd(ioctx);
 	}
 }
 
@@ -1929,32 +1860,26 @@ out:
 	return;
 }
 
-static void srpt_process_rcv_completion(struct ib_cq *cq,
-					struct srpt_rdma_ch *ch,
-					struct ib_wc *wc)
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct srpt_device *sdev = ch->sport->sdev;
-	struct srpt_recv_ioctx *ioctx;
-	u32 index;
+	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_recv_ioctx *ioctx =
+		container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe);
 
-	index = idx_from_wr_id(wc->wr_id);
 	if (wc->status == IB_WC_SUCCESS) {
 		int req_lim;
 
 		req_lim = atomic_dec_return(&ch->req_lim);
 		if (unlikely(req_lim < 0))
 			pr_err("req_lim = %d < 0\n", req_lim);
-		ioctx = sdev->ioctx_ring[index];
 		srpt_handle_new_iu(ch, ioctx, NULL);
 	} else {
-		pr_info("receiving failed for idx %u with status %d\n",
-			index, wc->status);
+		pr_info("receiving failed for ioctx %p with status %d\n",
+			ioctx, wc->status);
 	}
 }
 
 /**
- * srpt_process_send_completion() - Process an IB send completion.
- *
  * Note: Although this has not yet been observed during tests, at least in
  * theory it is possible that the srpt_get_send_ioctx() call invoked by
  * srpt_handle_new_iu() fails. This is possible because the req_lim_delta
@@ -1967,108 +1892,52 @@ static void srpt_process_rcv_completion(struct ib_cq *cq,
  * are queued on cmd_wait_list. The code below processes these delayed
  * requests one at a time.
  */
-static void srpt_process_send_completion(struct ib_cq *cq,
-					 struct srpt_rdma_ch *ch,
-					 struct ib_wc *wc)
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	struct srpt_send_ioctx *send_ioctx;
-	uint32_t index;
-	enum srpt_opcode opcode;
+	struct srpt_rdma_ch *ch = cq->cq_context;
+	struct srpt_send_ioctx *ioctx =
+		container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
+	enum srpt_command_state state;
+		
+	state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
 
-	index = idx_from_wr_id(wc->wr_id);
-	opcode = opcode_from_wr_id(wc->wr_id);
-	send_ioctx = ch->ioctx_ring[index];
-	if (wc->status == IB_WC_SUCCESS) {
-		if (opcode == SRPT_SEND)
-			srpt_handle_send_comp(ch, send_ioctx);
-		else {
-			WARN_ON(wc->opcode != IB_WC_RDMA_READ);
-			srpt_handle_rdma_comp(ch, send_ioctx, opcode);
-		}
+	WARN_ON(state != SRPT_STATE_CMD_RSP_SENT &&
+		state != SRPT_STATE_MGMT_RSP_SENT);
+
+	atomic_inc(&ch->sq_wr_avail);
+
+	if (wc->status != IB_WC_SUCCESS) {
+		pr_info("sending response for ioctx 0x%p failed"
+			" with status %d\n", ioctx, wc->status);
+
+		atomic_dec(&ch->req_lim);
+		srpt_abort_cmd(ioctx);
+		goto out;
+	}
+
+	if (state != SRPT_STATE_DONE) {
+		srpt_unmap_sg_to_ib_sge(ch, ioctx);
+		transport_generic_free_cmd(&ioctx->cmd, 0);
 	} else {
-		if (opcode == SRPT_SEND) {
-			pr_info("sending response for idx %u failed"
-				" with status %d\n", index, wc->status);
-			srpt_handle_send_err_comp(ch, wc->wr_id);
-		} else if (opcode != SRPT_RDMA_MID) {
-			pr_info("RDMA t %d for idx %u failed with"
-				" status %d\n", opcode, index, wc->status);
-			srpt_handle_rdma_err_comp(ch, send_ioctx, opcode);
-		}
+		pr_err("IB completion has been received too late for"
+		       " wr_id = %u.\n", ioctx->ioctx.index);
 	}
 
-	while (unlikely(opcode == SRPT_SEND
-			&& !list_empty(&ch->cmd_wait_list)
-			&& srpt_get_ch_state(ch) == CH_LIVE
-			&& (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) {
+out:
+	while (!list_empty(&ch->cmd_wait_list) &&
+	       srpt_get_ch_state(ch) == CH_LIVE &&
+	       (ioctx = srpt_get_send_ioctx(ch)) != NULL) {
 		struct srpt_recv_ioctx *recv_ioctx;
 
 		recv_ioctx = list_first_entry(&ch->cmd_wait_list,
 					      struct srpt_recv_ioctx,
 					      wait_list);
 		list_del(&recv_ioctx->wait_list);
-		srpt_handle_new_iu(ch, recv_ioctx, send_ioctx);
-	}
-}
-
-static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch)
-{
-	struct ib_wc *const wc = ch->wc;
-	int i, n;
-
-	WARN_ON(cq != ch->cq);
-
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-	while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) {
-		for (i = 0; i < n; i++) {
-			if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV)
-				srpt_process_rcv_completion(cq, ch, &wc[i]);
-			else
-				srpt_process_send_completion(cq, ch, &wc[i]);
-		}
+		srpt_handle_new_iu(ch, recv_ioctx, ioctx);
 	}
 }
 
 /**
- * srpt_completion() - IB completion queue callback function.
- *
- * Notes:
- * - It is guaranteed that a completion handler will never be invoked
- *   concurrently on two different CPUs for the same completion queue. See also
- *   Documentation/infiniband/core_locking.txt and the implementation of
- *   handle_edge_irq() in kernel/irq/chip.c.
- * - When threaded IRQs are enabled, completion handlers are invoked in thread
- *   context instead of interrupt context.
- */
-static void srpt_completion(struct ib_cq *cq, void *ctx)
-{
-	struct srpt_rdma_ch *ch = ctx;
-
-	wake_up_interruptible(&ch->wait_queue);
-}
-
-static int srpt_compl_thread(void *arg)
-{
-	struct srpt_rdma_ch *ch;
-
-	/* Hibernation / freezing of the SRPT kernel thread is not supported. */
-	current->flags |= PF_NOFREEZE;
-
-	ch = arg;
-	BUG_ON(!ch);
-	pr_info("Session %s: kernel thread %s (PID %d) started\n",
-		ch->sess_name, ch->thread->comm, current->pid);
-	while (!kthread_should_stop()) {
-		wait_event_interruptible(ch->wait_queue,
-			(srpt_process_completion(ch->cq, ch),
-			 kthread_should_stop()));
-	}
-	pr_info("Session %s: kernel thread %s (PID %d) stopped\n",
-		ch->sess_name, ch->thread->comm, current->pid);
-	return 0;
-}
-
-/**
  * srpt_create_ch_ib() - Create receive and send completion queues.
  */
 static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
@@ -2077,7 +1946,6 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
 	struct srpt_port *sport = ch->sport;
 	struct srpt_device *sdev = sport->sdev;
 	u32 srp_sq_size = sport->port_attrib.srp_sq_size;
-	struct ib_cq_init_attr cq_attr = {};
 	int ret;
 
 	WARN_ON(ch->rq_size < 1);
@@ -2088,9 +1956,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
 		goto out;
 
 retry:
-	cq_attr.cqe = ch->rq_size + srp_sq_size;
-	ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch,
-			      &cq_attr);
+	ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size,
+			0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
 	if (IS_ERR(ch->cq)) {
 		ret = PTR_ERR(ch->cq);
 		pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -2133,18 +2000,6 @@ retry:
 	if (ret)
 		goto err_destroy_qp;
 
-	init_waitqueue_head(&ch->wait_queue);
-
-	pr_debug("creating thread for session %s\n", ch->sess_name);
-
-	ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl");
-	if (IS_ERR(ch->thread)) {
-		pr_err("failed to create kernel thread %ld\n",
-		       PTR_ERR(ch->thread));
-		ch->thread = NULL;
-		goto err_destroy_qp;
-	}
-
 out:
 	kfree(qp_init);
 	return ret;
@@ -2152,17 +2007,14 @@ out:
 err_destroy_qp:
 	ib_destroy_qp(ch->qp);
 err_destroy_cq:
-	ib_destroy_cq(ch->cq);
+	ib_free_cq(ch->cq);
 	goto out;
 }
 
 static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
 {
-	if (ch->thread)
-		kthread_stop(ch->thread);
-
 	ib_destroy_qp(ch->qp);
-	ib_destroy_cq(ch->cq);
+	ib_free_cq(ch->cq);
 }
 
 /**
@@ -2824,9 +2676,7 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
 			      struct srpt_send_ioctx *ioctx)
 {
 	struct ib_send_wr *bad_wr;
-	int i;
-	int ret;
-	int sq_wr_avail;
+	int sq_wr_avail, ret, i;
 	enum dma_data_direction dir;
 	const int n_rdma = ioctx->n_rdma;
 
@@ -2843,29 +2693,24 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
 	}
 
 	for (i = 0; i < n_rdma; i++) {
-		struct ib_rdma_wr *wr = &ioctx->rdma_ius[i];
-
-		if (dir == DMA_FROM_DEVICE) {
-			wr->wr.opcode = IB_WR_RDMA_WRITE;
-			wr->wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
-						SRPT_RDMA_WRITE_LAST :
-						SRPT_RDMA_MID,
-						ioctx->ioctx.index);
-		} else {
-			wr->wr.opcode = IB_WR_RDMA_READ;
-			wr->wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
-						SRPT_RDMA_READ_LAST :
-						SRPT_RDMA_MID,
-						ioctx->ioctx.index);
-		}
+		struct ib_send_wr *wr = &ioctx->rdma_ius[i].wr;
+
+		wr->opcode = (dir == DMA_FROM_DEVICE) ?
+				IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 
 		if (i == n_rdma - 1) {
 			/* only get completion event for the last rdma read */
-			if (dir == DMA_TO_DEVICE)
-				wr->wr.send_flags = IB_SEND_SIGNALED;
-			wr->wr.next = NULL;
+			if (dir == DMA_TO_DEVICE) {
+				wr->send_flags = IB_SEND_SIGNALED;
+				ioctx->rdma_cqe.done = srpt_rdma_read_done;
+			} else {
+				ioctx->rdma_cqe.done = srpt_rdma_write_done;
+			}
+			wr->wr_cqe = &ioctx->rdma_cqe;
+			wr->next = NULL;
 		} else {
-			wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
+			wr->wr_cqe = NULL;
+			wr->next = &ioctx->rdma_ius[i + 1].wr;
 		}
 	}
 
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index fd6097e..f9568f5 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -128,27 +128,6 @@ enum {
 	DEFAULT_MAX_RDMA_SIZE = 65536,
 };
 
-enum srpt_opcode {
-	SRPT_RECV,
-	SRPT_SEND,
-	SRPT_RDMA_MID,
-	SRPT_RDMA_READ_LAST,
-	SRPT_RDMA_WRITE_LAST,
-};
-
-static inline u64 encode_wr_id(u8 opcode, u32 idx)
-{
-	return ((u64)opcode << 32) | idx;
-}
-static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id)
-{
-	return wr_id >> 32;
-}
-static inline u32 idx_from_wr_id(u64 wr_id)
-{
-	return (u32)wr_id;
-}
-
 /**
  * enum srpt_command_state - SCSI command state managed by SRPT.
  * @SRPT_STATE_NEW:           New command arrived and is being processed.
@@ -180,6 +159,7 @@ enum srpt_command_state {
  * @index: Index of the I/O context in its ioctx_ring array.
  */
 struct srpt_ioctx {
+	struct ib_cqe		cqe;
 	void			*buf;
 	dma_addr_t		dma;
 	uint32_t		index;
@@ -218,6 +198,7 @@ struct srpt_send_ioctx {
 	struct srpt_ioctx	ioctx;
 	struct srpt_rdma_ch	*ch;
 	struct ib_rdma_wr	*rdma_ius;
+	struct ib_cqe		rdma_cqe;
 	struct srp_direct_buf	*rbufs;
 	struct srp_direct_buf	single_rbuf;
 	struct scatterlist	*sg;
@@ -255,9 +236,6 @@ enum rdma_ch_state {
 
 /**
  * struct srpt_rdma_ch - RDMA channel.
- * @wait_queue:    Allows the kernel thread to wait for more work.
- * @thread:        Kernel thread that processes the IB queues associated with
- *                 the channel.
  * @cm_id:         IB CM ID associated with the channel.
  * @qp:            IB queue pair used for communicating over this channel.
  * @cq:            IB completion queue for this channel.
@@ -287,8 +265,6 @@ enum rdma_ch_state {
  * @release_done:  Enables waiting for srpt_release_channel() completion.
  */
 struct srpt_rdma_ch {
-	wait_queue_head_t	wait_queue;
-	struct task_struct	*thread;
 	struct ib_cm_id		*cm_id;
 	struct ib_qp		*qp;
 	struct ib_cq		*cq;
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 140+ messages in thread
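
The recipe in the conversion above is the same for every work request
type: embed a struct ib_cqe in the per-request context, set its done
callback, point wr_cqe at it, and recover the context in the handler
via container_of().  A rough, self-contained sketch follows; my_ctx,
my_send_done and my_post_send are invented names, not driver code.

#include <linux/completion.h>
#include <rdma/ib_verbs.h>

/* Sketch only: a per-request context carrying its own ib_cqe. */
struct my_ctx {
	struct ib_cqe		cqe;
	struct completion	done;
};

static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct my_ctx *ctx = container_of(wc->wr_cqe, struct my_ctx, cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_err("send failed: %s\n", ib_wc_status_msg(wc->status));
	complete(&ctx->done);
}

static int my_post_send(struct ib_qp *qp, struct my_ctx *ctx,
			struct ib_sge *sge)
{
	struct ib_send_wr wr = { }, *bad_wr;

	ctx->cqe.done = my_send_done;
	init_completion(&ctx->done);

	wr.wr_cqe = &ctx->cqe;
	wr.sg_list = sge;
	wr.num_sge = 1;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;

	return ib_post_send(qp, &wr, &bad_wr);
}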

* [PATCH 6/9] srp: use the new CQ API
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
                   ` (4 preceding siblings ...)
  2015-11-13 13:46 ` [PATCH 5/9] srpt: use the new CQ API Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
       [not found]   ` <1447422410-20891-7-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  2015-11-13 13:46 ` [PATCH 7/9] IB/iser: Use a dedicated descriptor for login Christoph Hellwig
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma; +Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

This also moves recv completion handling from hardirq context into
softirq context.
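
Concretely, the receive CQ is now allocated with IB_POLL_SOFTIRQ while
the send CQ uses IB_POLL_DIRECT and is reaped by the caller.  A rough
sketch of that allocation, with a placeholder helper and arguments:

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Sketch only: allocate one softirq-polled recv CQ and one
 * directly polled send CQ for a channel-like context "ch". */
static int example_alloc_cqs(struct ib_device *dev, void *ch,
			     int queue_size, int comp_vector,
			     struct ib_cq **recv_cq, struct ib_cq **send_cq)
{
	*recv_cq = ib_alloc_cq(dev, ch, queue_size + 1, comp_vector,
			       IB_POLL_SOFTIRQ);
	if (IS_ERR(*recv_cq))
		return PTR_ERR(*recv_cq);

	*send_cq = ib_alloc_cq(dev, ch, queue_size, comp_vector,
			       IB_POLL_DIRECT);
	if (IS_ERR(*send_cq)) {
		ib_free_cq(*recv_cq);
		return PTR_ERR(*send_cq);
	}
	return 0;
}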

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/ulp/srp/ib_srp.c | 198 ++++++++++++++----------------------
 drivers/infiniband/ulp/srp/ib_srp.h |   7 +-
 2 files changed, 76 insertions(+), 129 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 3027824..57237e1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -132,8 +132,9 @@ MODULE_PARM_DESC(ch_count,
 
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device, void *client_data);
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+		const char *opname);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
@@ -445,41 +446,6 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
 				  dev->max_pages_per_mr);
 }
 
-/**
- * srp_destroy_qp() - destroy an RDMA queue pair
- * @ch: SRP RDMA channel.
- *
- * Change a queue pair into the error state and wait until all receive
- * completions have been processed before destroying it. This avoids that
- * the receive completion handler can access the queue pair while it is
- * being destroyed.
- */
-static void srp_destroy_qp(struct srp_rdma_ch *ch)
-{
-	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
-	static struct ib_recv_wr wr = { 0 };
-	struct ib_recv_wr *bad_wr;
-	int ret;
-
-	wr.wr_id = SRP_LAST_WR_ID;
-	/* Destroying a QP and reusing ch->done is only safe if not connected */
-	WARN_ON_ONCE(ch->connected);
-
-	ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE);
-	WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret);
-	if (ret)
-		goto out;
-
-	init_completion(&ch->done);
-	ret = ib_post_recv(ch->qp, &wr, &bad_wr);
-	WARN_ONCE(ret, "ib_post_recv() returned %d\n", ret);
-	if (ret == 0)
-		wait_for_completion(&ch->done);
-
-out:
-	ib_destroy_qp(ch->qp);
-}
-
 static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
@@ -490,34 +456,27 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	struct ib_fmr_pool *fmr_pool = NULL;
 	struct srp_fr_pool *fr_pool = NULL;
 	const int m = 1 + dev->use_fast_reg;
-	struct ib_cq_init_attr cq_attr = {};
 	int ret;
 
 	init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
 	if (!init_attr)
 		return -ENOMEM;
 
-	/* + 1 for SRP_LAST_WR_ID */
-	cq_attr.cqe = target->queue_size + 1;
-	cq_attr.comp_vector = ch->comp_vector;
-	recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch,
-			       &cq_attr);
+	/* queue_size + 1 for ib_drain_qp */
+	recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, ch->comp_vector,
+				IB_POLL_SOFTIRQ);
 	if (IS_ERR(recv_cq)) {
 		ret = PTR_ERR(recv_cq);
 		goto err;
 	}
 
-	cq_attr.cqe = m * target->queue_size;
-	cq_attr.comp_vector = ch->comp_vector;
-	send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch,
-			       &cq_attr);
+	send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, ch->comp_vector,
+				IB_POLL_DIRECT);
 	if (IS_ERR(send_cq)) {
 		ret = PTR_ERR(send_cq);
 		goto err_recv_cq;
 	}
 
-	ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
-
 	init_attr->event_handler       = srp_qp_event;
 	init_attr->cap.max_send_wr     = m * target->queue_size;
 	init_attr->cap.max_recv_wr     = target->queue_size + 1;
@@ -557,11 +516,11 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 	}
 
 	if (ch->qp)
-		srp_destroy_qp(ch);
+		ib_destroy_qp(ch->qp);
 	if (ch->recv_cq)
-		ib_destroy_cq(ch->recv_cq);
+		ib_free_cq(ch->recv_cq);
 	if (ch->send_cq)
-		ib_destroy_cq(ch->send_cq);
+		ib_free_cq(ch->send_cq);
 
 	ch->qp = qp;
 	ch->recv_cq = recv_cq;
@@ -584,10 +543,10 @@ err_qp:
 	ib_destroy_qp(qp);
 
 err_send_cq:
-	ib_destroy_cq(send_cq);
+	ib_free_cq(send_cq);
 
 err_recv_cq:
-	ib_destroy_cq(recv_cq);
+	ib_free_cq(recv_cq);
 
 err:
 	kfree(init_attr);
@@ -623,9 +582,11 @@ static void srp_free_ch_ib(struct srp_target_port *target,
 		if (ch->fmr_pool)
 			ib_destroy_fmr_pool(ch->fmr_pool);
 	}
-	srp_destroy_qp(ch);
-	ib_destroy_cq(ch->send_cq);
-	ib_destroy_cq(ch->recv_cq);
+
+	ib_drain_qp(ch->qp);
+	ib_destroy_qp(ch->qp);
+	ib_free_cq(ch->send_cq);
+	ib_free_cq(ch->recv_cq);
 
 	/*
 	 * Avoid that the SCSI error handler tries to use this channel after
@@ -1038,7 +999,13 @@ static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
 	}
 }
 
-static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
+static void srp_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	srp_handle_qp_err(cq, wc, "INV RKEY");
+}
+
+static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
+		u32 rkey)
 {
 	struct ib_send_wr *bad_wr;
 	struct ib_send_wr wr = {
@@ -1049,8 +1016,8 @@ static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
 		.ex.invalidate_rkey = rkey,
 	};
 
-	wr.wr_id = LOCAL_INV_WR_ID_MASK;
-
+	wr.wr_cqe = &req->reg_cqe;
+	req->reg_cqe.done = srp_inv_rkey_done;
 	return ib_post_send(ch->qp, &wr, &bad_wr);
 }
 
@@ -1072,7 +1039,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
 		struct srp_fr_desc **pfr;
 
 		for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) {
-			res = srp_inv_rkey(ch, (*pfr)->mr->rkey);
+			res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey);
 			if (res < 0) {
 				shost_printk(KERN_ERR, target->scsi_host, PFX
 				  "Queueing INV WR for rkey %#x failed (%d)\n",
@@ -1310,7 +1277,13 @@ reset_state:
 	return 0;
 }
 
+static void srp_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	srp_handle_qp_err(cq, wc, "FAST REG");
+}
+
 static int srp_map_finish_fr(struct srp_map_state *state,
+			     struct srp_request *req,
 			     struct srp_rdma_ch *ch)
 {
 	struct srp_target_port *target = ch->target;
@@ -1348,9 +1321,11 @@ static int srp_map_finish_fr(struct srp_map_state *state,
 	if (unlikely(n < 0))
 		return n;
 
+	req->reg_cqe.done = srp_reg_mr_done;
+
 	wr.wr.next = NULL;
 	wr.wr.opcode = IB_WR_REG_MR;
-	wr.wr.wr_id = FAST_REG_WR_ID_MASK;
+	wr.wr.wr_cqe = &req->reg_cqe;
 	wr.wr.num_sge = 0;
 	wr.wr.send_flags = 0;
 	wr.mr = desc->mr;
@@ -1455,7 +1430,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
 	while (state->sg_nents) {
 		int i, n;
 
-		n = srp_map_finish_fr(state, ch);
+		n = srp_map_finish_fr(state, req, ch);
 		if (unlikely(n < 0))
 			return n;
 
@@ -1522,7 +1497,7 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
 		state.sg_nents = 1;
 		sg_set_buf(idb_sg, req->indirect_desc, idb_len);
 		idb_sg->dma_address = req->indirect_dma_addr; /* hack! */
-		ret = srp_map_finish_fr(&state, ch);
+		ret = srp_map_finish_fr(&state, req, ch);
 		if (ret < 0)
 			return ret;
 	} else if (dev->use_fmr) {
@@ -1717,7 +1692,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
 	s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE;
 	struct srp_iu *iu;
 
-	srp_send_completion(ch->send_cq, ch);
+	ib_process_cq_direct(ch->send_cq);
 
 	if (list_empty(&ch->free_tx))
 		return NULL;
@@ -1737,6 +1712,19 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
 	return iu;
 }
 
+static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
+	struct srp_rdma_ch *ch = cq->cq_context;
+
+	if (likely(wc->status != IB_WC_SUCCESS)) {
+		srp_handle_qp_err(cq, wc, "SEND");
+		return;
+	}
+
+	list_add(&iu->list, &ch->free_tx);
+}
+
 static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
 {
 	struct srp_target_port *target = ch->target;
@@ -1747,8 +1735,10 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
 	list.length = len;
 	list.lkey   = target->lkey;
 
+	iu->cqe.done = srp_send_done;
+
 	wr.next       = NULL;
-	wr.wr_id      = (uintptr_t) iu;
+	wr.wr_cqe     = &iu->cqe;
 	wr.sg_list    = &list;
 	wr.num_sge    = 1;
 	wr.opcode     = IB_WR_SEND;
@@ -1767,8 +1757,10 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
 	list.length = iu->size;
 	list.lkey   = target->lkey;
 
+	iu->cqe.done = srp_recv_done;
+
 	wr.next     = NULL;
-	wr.wr_id    = (uintptr_t) iu;
+	wr.wr_cqe   = &iu->cqe;
 	wr.sg_list  = &list;
 	wr.num_sge  = 1;
 
@@ -1900,14 +1892,20 @@ static void srp_process_aer_req(struct srp_rdma_ch *ch,
 			     "problems processing SRP_AER_REQ\n");
 }
 
-static void srp_handle_recv(struct srp_rdma_ch *ch, struct ib_wc *wc)
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
+	struct srp_rdma_ch *ch = cq->cq_context;
 	struct srp_target_port *target = ch->target;
 	struct ib_device *dev = target->srp_host->srp_dev->dev;
-	struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;
 	int res;
 	u8 opcode;
 
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		srp_handle_qp_err(cq, wc, "RECV");
+		return;
+	}
+
 	ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len,
 				   DMA_FROM_DEVICE);
 
@@ -1970,68 +1968,22 @@ static void srp_tl_err_work(struct work_struct *work)
 		srp_start_tl_fail_timers(target->rport);
 }
 
-static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status,
-			      bool send_err, struct srp_rdma_ch *ch)
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+		const char *opname)
 {
+	struct srp_rdma_ch *ch = cq->cq_context;
 	struct srp_target_port *target = ch->target;
 
-	if (wr_id == SRP_LAST_WR_ID) {
-		complete(&ch->done);
-		return;
-	}
-
 	if (ch->connected && !target->qp_in_error) {
-		if (wr_id & LOCAL_INV_WR_ID_MASK) {
-			shost_printk(KERN_ERR, target->scsi_host, PFX
-				     "LOCAL_INV failed with status %s (%d)\n",
-				     ib_wc_status_msg(wc_status), wc_status);
-		} else if (wr_id & FAST_REG_WR_ID_MASK) {
-			shost_printk(KERN_ERR, target->scsi_host, PFX
-				     "FAST_REG_MR failed status %s (%d)\n",
-				     ib_wc_status_msg(wc_status), wc_status);
-		} else {
-			shost_printk(KERN_ERR, target->scsi_host,
-				     PFX "failed %s status %s (%d) for iu %p\n",
-				     send_err ? "send" : "receive",
-				     ib_wc_status_msg(wc_status), wc_status,
-				     (void *)(uintptr_t)wr_id);
-		}
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "failed %s status %s (%d) for CQE %p\n",
+			     opname, ib_wc_status_msg(wc->status), wc->status,
+			     wc->wr_cqe);
 		queue_work(system_long_wq, &target->tl_err_work);
 	}
 	target->qp_in_error = true;
 }
 
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr)
-{
-	struct srp_rdma_ch *ch = ch_ptr;
-	struct ib_wc wc;
-
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		if (likely(wc.status == IB_WC_SUCCESS)) {
-			srp_handle_recv(ch, &wc);
-		} else {
-			srp_handle_qp_err(wc.wr_id, wc.status, false, ch);
-		}
-	}
-}
-
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr)
-{
-	struct srp_rdma_ch *ch = ch_ptr;
-	struct ib_wc wc;
-	struct srp_iu *iu;
-
-	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		if (likely(wc.status == IB_WC_SUCCESS)) {
-			iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
-			list_add(&iu->list, &ch->free_tx);
-		} else {
-			srp_handle_qp_err(wc.wr_id, wc.status, true, ch);
-		}
-	}
-}
-
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(shost);
@@ -3576,8 +3528,6 @@ static int __init srp_init_module(void)
 {
 	int ret;
 
-	BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *));
-
 	if (srp_sg_tablesize) {
 		pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
 		if (!cmd_sg_entries)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 87a2a91..7fec482 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -66,11 +66,6 @@ enum {
 	SRP_TAG_TSK_MGMT	= 1U << 31,
 
 	SRP_MAX_PAGES_PER_MR	= 512,
-
-	LOCAL_INV_WR_ID_MASK	= 1,
-	FAST_REG_WR_ID_MASK	= 2,
-
-	SRP_LAST_WR_ID		= 0xfffffffcU,
 };
 
 enum srp_target_state {
@@ -128,6 +123,7 @@ struct srp_request {
 	struct srp_direct_buf  *indirect_desc;
 	dma_addr_t		indirect_dma_addr;
 	short			nmdesc;
+	struct ib_cqe		reg_cqe;
 };
 
 /**
@@ -231,6 +227,7 @@ struct srp_iu {
 	void		       *buf;
 	size_t			size;
 	enum dma_data_direction	direction;
+	struct ib_cqe		cqe;
 };
 
 /**
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 140+ messages in thread
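
One detail worth noting in the patch above: the hand-rolled
srp_destroy_qp() drain trick is gone because teardown can rely on the
generic drain helper used there.  The resulting teardown order,
sketched with placeholder arguments:

#include <rdma/ib_verbs.h>

/* Sketch only: drain outstanding WRs, then tear down QP and CQs. */
static void example_teardown(struct ib_qp *qp, struct ib_cq *send_cq,
			     struct ib_cq *recv_cq)
{
	ib_drain_qp(qp);	/* wait for posted WRs to complete or flush */
	ib_destroy_qp(qp);
	ib_free_cq(send_cq);
	ib_free_cq(recv_cq);
}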

* [PATCH 7/9] IB/iser: Use a dedicated descriptor for login
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
                   ` (5 preceding siblings ...)
  2015-11-13 13:46 ` [PATCH 6/9] srp: " Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
       [not found]   ` <1447422410-20891-8-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  2015-11-13 13:46 ` [PATCH 8/9] IB/iser: Use helper for container_of Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 9/9] IB/iser: Convert to CQ abstraction Christoph Hellwig
  8 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel, Sagi Grimberg

From: Sagi Grimberg <sagig@mellanox.com>

A dedicated descriptor for the login request and response buffers
makes better sense, and we will need it later for the CQ abstraction.
This also switches the iSER login buffers from char to void pointers.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/ulp/iser/iscsi_iser.h     |  30 +++++--
 drivers/infiniband/ulp/iser/iser_initiator.c | 128 +++++++++++++--------------
 drivers/infiniband/ulp/iser/iser_verbs.c     |  14 +--
 3 files changed, 89 insertions(+), 83 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 502063b..5648409 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -326,6 +326,25 @@ struct iser_rx_desc {
 	char		             pad[ISER_RX_PAD_SIZE];
 } __attribute__((packed));
 
+
+/**
+ * struct iser_login_desc - iSER login descriptor
+ *
+ * @req:           pointer to login request buffer
+ * @resp:          pointer to login response buffer
+ * @req_dma:       DMA address of login request buffer
+ * @rsp_dma:      DMA address of login response buffer
+ * @sge:           IB sge for login post recv
+ */
+struct iser_login_desc {
+	void                         *req;
+	void                         *rsp;
+	u64                          req_dma;
+	u64                          rsp_dma;
+	struct ib_sge                sge;
+} __attribute__((packed));
+
+
 struct iser_conn;
 struct ib_conn;
 struct iscsi_iser_task;
@@ -512,11 +531,7 @@ struct ib_conn {
  * @up_completion:    connection establishment completed
  *                    (state is ISER_CONN_UP)
  * @conn_list:        entry in ig conn list
- * @login_buf:        login data buffer (stores login parameters)
- * @login_req_buf:    login request buffer
- * @login_req_dma:    login request buffer dma address
- * @login_resp_buf:   login response buffer
- * @login_resp_dma:   login response buffer dma address
+ * @login_desc:       login descriptor
  * @rx_desc_head:     head of rx_descs cyclic buffer
  * @rx_descs:         rx buffers array (cyclic buffer)
  * @num_rx_descs:     number of rx descriptors
@@ -539,10 +554,7 @@ struct iser_conn {
 	struct completion	     ib_completion;
 	struct completion	     up_completion;
 	struct list_head	     conn_list;
-
-	char  			     *login_buf;
-	char			     *login_req_buf, *login_resp_buf;
-	u64			     login_req_dma, login_resp_dma;
+	struct iser_login_desc       login_desc;
 	unsigned int 		     rx_desc_head;
 	struct iser_rx_desc	     *rx_descs;
 	u32                          num_rx_descs;
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index ffd00c4..21f28c8 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -174,73 +174,63 @@ static void iser_create_send_desc(struct iser_conn	*iser_conn,
 static void iser_free_login_buf(struct iser_conn *iser_conn)
 {
 	struct iser_device *device = iser_conn->ib_conn.device;
+	struct iser_login_desc *desc = &iser_conn->login_desc;
 
-	if (!iser_conn->login_buf)
+	if (!desc->req)
 		return;
 
-	if (iser_conn->login_req_dma)
-		ib_dma_unmap_single(device->ib_device,
-				    iser_conn->login_req_dma,
-				    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+	ib_dma_unmap_single(device->ib_device, desc->req_dma,
+			    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
 
-	if (iser_conn->login_resp_dma)
-		ib_dma_unmap_single(device->ib_device,
-				    iser_conn->login_resp_dma,
-				    ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+	ib_dma_unmap_single(device->ib_device, desc->rsp_dma,
+			    ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
 
-	kfree(iser_conn->login_buf);
+	kfree(desc->req);
+	kfree(desc->rsp);
 
 	/* make sure we never redo any unmapping */
-	iser_conn->login_req_dma = 0;
-	iser_conn->login_resp_dma = 0;
-	iser_conn->login_buf = NULL;
+	desc->req = NULL;
+	desc->rsp = NULL;
 }
 
 static int iser_alloc_login_buf(struct iser_conn *iser_conn)
 {
 	struct iser_device *device = iser_conn->ib_conn.device;
-	int			req_err, resp_err;
-
-	BUG_ON(device == NULL);
-
-	iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
-				     ISER_RX_LOGIN_SIZE, GFP_KERNEL);
-	if (!iser_conn->login_buf)
-		goto out_err;
-
-	iser_conn->login_req_buf  = iser_conn->login_buf;
-	iser_conn->login_resp_buf = iser_conn->login_buf +
-						ISCSI_DEF_MAX_RECV_SEG_LEN;
-
-	iser_conn->login_req_dma = ib_dma_map_single(device->ib_device,
-						     iser_conn->login_req_buf,
-						     ISCSI_DEF_MAX_RECV_SEG_LEN,
-						     DMA_TO_DEVICE);
-
-	iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device,
-						      iser_conn->login_resp_buf,
-						      ISER_RX_LOGIN_SIZE,
-						      DMA_FROM_DEVICE);
-
-	req_err  = ib_dma_mapping_error(device->ib_device,
-					iser_conn->login_req_dma);
-	resp_err = ib_dma_mapping_error(device->ib_device,
-					iser_conn->login_resp_dma);
-
-	if (req_err || resp_err) {
-		if (req_err)
-			iser_conn->login_req_dma = 0;
-		if (resp_err)
-			iser_conn->login_resp_dma = 0;
-		goto free_login_buf;
-	}
+	struct iser_login_desc *desc = &iser_conn->login_desc;
+
+	desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL);
+	if (!desc->req)
+		return -ENOMEM;
+
+	desc->req_dma = ib_dma_map_single(device->ib_device, desc->req,
+					  ISCSI_DEF_MAX_RECV_SEG_LEN,
+					  DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(device->ib_device,
+				desc->req_dma))
+		goto free_req;
+
+	desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+	if (!desc->rsp)
+		goto unmap_req;
+
+	desc->rsp_dma = ib_dma_map_single(device->ib_device, desc->rsp,
+					   ISER_RX_LOGIN_SIZE,
+					   DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(device->ib_device,
+				desc->rsp_dma))
+		goto free_rsp;
+
 	return 0;
 
-free_login_buf:
-	iser_free_login_buf(iser_conn);
+free_rsp:
+	kfree(desc->rsp);
+unmap_req:
+	ib_dma_unmap_single(device->ib_device, desc->req_dma,
+			    ISCSI_DEF_MAX_RECV_SEG_LEN,
+			    DMA_TO_DEVICE);
+free_req:
+	kfree(desc->req);
 
-out_err:
-	iser_err("unable to alloc or map login buf\n");
 	return -ENOMEM;
 }
 
@@ -520,25 +510,25 @@ int iser_send_control(struct iscsi_conn *conn,
 	data_seg_len = ntoh24(task->hdr->dlength);
 
 	if (data_seg_len > 0) {
+		struct iser_login_desc *desc = &iser_conn->login_desc;
 		struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+
 		if (task != conn->login_task) {
 			iser_err("data present on non login task!!!\n");
 			goto send_control_error;
 		}
 
-		ib_dma_sync_single_for_cpu(device->ib_device,
-			iser_conn->login_req_dma, task->data_count,
-			DMA_TO_DEVICE);
+		ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma,
+					   task->data_count, DMA_TO_DEVICE);
 
-		memcpy(iser_conn->login_req_buf, task->data, task->data_count);
+		memcpy(desc->req, task->data, task->data_count);
 
-		ib_dma_sync_single_for_device(device->ib_device,
-			iser_conn->login_req_dma, task->data_count,
-			DMA_TO_DEVICE);
+		ib_dma_sync_single_for_device(device->ib_device, desc->req_dma,
+					      task->data_count, DMA_TO_DEVICE);
 
-		tx_dsg->addr    = iser_conn->login_req_dma;
-		tx_dsg->length  = task->data_count;
-		tx_dsg->lkey    = device->pd->local_dma_lkey;
+		tx_dsg->addr = desc->req_dma;
+		tx_dsg->length = task->data_count;
+		tx_dsg->lkey = device->pd->local_dma_lkey;
 		mdesc->num_sge = 2;
 	}
 
@@ -572,27 +562,31 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
 	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
 						   ib_conn);
 	struct iscsi_hdr *hdr;
+	char *data;
 	u64 rx_dma;
 	int rx_buflen, outstanding, count, err;
 
 	/* differentiate between login to all other PDUs */
-	if ((char *)rx_desc == iser_conn->login_resp_buf) {
-		rx_dma = iser_conn->login_resp_dma;
+	if (rx_desc == (void *)&iser_conn->login_desc) {
+		rx_dma = iser_conn->login_desc.rsp_dma;
 		rx_buflen = ISER_RX_LOGIN_SIZE;
+		hdr = iser_conn->login_desc.rsp + sizeof(struct iser_hdr);
+		data = iser_conn->login_desc.rsp + ISER_HEADERS_LEN;
 	} else {
 		rx_dma = rx_desc->dma_addr;
 		rx_buflen = ISER_RX_PAYLOAD_SIZE;
+		hdr = &rx_desc->iscsi_header;
+		data = rx_desc->data;
 	}
 
 	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
 				   rx_buflen, DMA_FROM_DEVICE);
 
-	hdr = &rx_desc->iscsi_header;
 
 	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
 			hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
 
-	iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data,
+	iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data,
 			rx_xfer_len - ISER_HEADERS_LEN);
 
 	ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
@@ -604,7 +598,7 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
 	 * for the posted rx bufs refcount to become zero handles everything   */
 	ib_conn->post_recv_buf_count--;
 
-	if (rx_dma == iser_conn->login_resp_dma)
+	if (rx_desc == (void *)&iser_conn->login_desc)
 		return;
 
 	outstanding = ib_conn->post_recv_buf_count;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index bf29ddf..ee4cebc 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -1041,17 +1041,17 @@ int iser_post_recvl(struct iser_conn *iser_conn)
 {
 	struct ib_recv_wr rx_wr, *rx_wr_failed;
 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
-	struct ib_sge	  sge;
+	struct iser_login_desc *desc = &iser_conn->login_desc;
 	int ib_ret;
 
-	sge.addr   = iser_conn->login_resp_dma;
-	sge.length = ISER_RX_LOGIN_SIZE;
-	sge.lkey   = ib_conn->device->pd->local_dma_lkey;
+	desc->sge.addr = desc->rsp_dma;
+	desc->sge.length = ISER_RX_LOGIN_SIZE;
+	desc->sge.lkey = ib_conn->device->pd->local_dma_lkey;
 
-	rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
-	rx_wr.sg_list = &sge;
+	rx_wr.wr_id = (uintptr_t)desc;
+	rx_wr.sg_list = &desc->sge;
 	rx_wr.num_sge = 1;
-	rx_wr.next    = NULL;
+	rx_wr.next = NULL;
 
 	ib_conn->post_recv_buf_count++;
 	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 140+ messages in thread
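
The allocation side of the patch above is the usual map-and-unwind
pattern: allocate a buffer, map it, check the mapping, and undo the
steps in reverse order on failure.  A rough sketch, with an invented
helper name and placeholder arguments:

#include <linux/slab.h>
#include <rdma/ib_verbs.h>

/* Sketch only: allocate and DMA-map a buffer, or return NULL. */
static void *example_alloc_mapped_buf(struct ib_device *dev, size_t size,
				      enum dma_data_direction dir, u64 *dma)
{
	void *buf = kmalloc(size, GFP_KERNEL);

	if (!buf)
		return NULL;

	*dma = ib_dma_map_single(dev, buf, size, dir);
	if (ib_dma_mapping_error(dev, *dma)) {
		kfree(buf);
		return NULL;
	}
	return buf;
}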

* [PATCH 8/9] IB/iser: Use helper for container_of
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
                   ` (6 preceding siblings ...)
  2015-11-13 13:46 ` [PATCH 7/9] IB/iser: Use a dedicated descriptor for login Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
  2015-11-13 13:46 ` [PATCH 9/9] IB/iser: Convert to CQ abstraction Christoph Hellwig
  8 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel, Sagi Grimberg

From: Sagi Grimberg <sagig@mellanox.com>

Nicer this way.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/ulp/iser/iscsi_iser.h     | 6 ++++++
 drivers/infiniband/ulp/iser/iser_initiator.c | 3 +--
 drivers/infiniband/ulp/iser/iser_verbs.c     | 6 ++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 5648409..cf4c4ce 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -729,4 +729,10 @@ iser_tx_next_wr(struct iser_tx_desc *tx_desc)
 	return cur_wr;
 }
 
+static inline struct iser_conn *
+to_iser_conn(struct ib_conn *ib_conn)
+{
+	return container_of(ib_conn, struct iser_conn, ib_conn);
+}
+
 #endif
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 21f28c8..21148b6 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -559,8 +559,7 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
 			 unsigned long rx_xfer_len,
 			 struct ib_conn *ib_conn)
 {
-	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-						   ib_conn);
+	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
 	struct iscsi_hdr *hdr;
 	char *data;
 	u64 rx_dma;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index ee4cebc..f75ef0c 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -455,8 +455,7 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
  */
 static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 {
-	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-						   ib_conn);
+	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
 	struct iser_device	*device;
 	struct ib_device	*ib_dev;
 	struct ib_qp_init_attr	init_attr;
@@ -1160,9 +1159,8 @@ static void
 iser_handle_comp_error(struct ib_conn *ib_conn,
 		       struct ib_wc *wc)
 {
+	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
 	void *wr_id = (void *)(uintptr_t)wc->wr_id;
-	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-						   ib_conn);
 
 	if (wc->status != IB_WC_WR_FLUSH_ERR)
 		if (iser_conn->iscsi_conn)
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 140+ messages in thread

* [PATCH 9/9] IB/iser: Convert to CQ abstraction
  2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
                   ` (7 preceding siblings ...)
  2015-11-13 13:46 ` [PATCH 8/9] IB/iser: Use helper for container_of Christoph Hellwig
@ 2015-11-13 13:46 ` Christoph Hellwig
       [not found]   ` <1447422410-20891-10-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  8 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-13 13:46 UTC (permalink / raw)
  To: linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel, Sagi Grimberg

From: Sagi Grimberg <sagig@mellanox.com>

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/infiniband/ulp/iser/iscsi_iser.h     |  68 ++++---
 drivers/infiniband/ulp/iser/iser_initiator.c | 142 ++++++++++-----
 drivers/infiniband/ulp/iser/iser_memory.c    |  21 ++-
 drivers/infiniband/ulp/iser/iser_verbs.c     | 258 ++++++---------------------
 4 files changed, 209 insertions(+), 280 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index cf4c4ce..1799c87 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -151,16 +151,12 @@
 					 - ISER_MAX_RX_MISC_PDUS) /	\
 					 (1 + ISER_INFLIGHT_DATAOUTS))
 
-#define ISER_WC_BATCH_COUNT   16
 #define ISER_SIGNAL_CMD_COUNT 32
 
 #define ISER_VER			0x10
 #define ISER_WSV			0x08
 #define ISER_RSV			0x04
 
-#define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
-#define ISER_BEACON_WRID		0xfffffffffffffffeULL
-
 /**
  * struct iser_hdr - iSER header
  *
@@ -269,7 +265,7 @@ enum iser_desc_type {
 #define ISER_MAX_WRS 7
 
 /**
- * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ * struct iser_tx_desc - iSER TX descriptor
  *
  * @iser_header:   iser header
  * @iscsi_header:  iscsi header
@@ -293,6 +289,7 @@ struct iser_tx_desc {
 	u64		             dma_addr;
 	struct ib_sge		     tx_sg[2];
 	int                          num_sge;
+	struct ib_cqe		     cqe;
 	bool			     mapped;
 	u8                           wr_idx;
 	union iser_wr {
@@ -306,9 +303,10 @@ struct iser_tx_desc {
 };
 
 #define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
-					sizeof(u64) + sizeof(struct ib_sge)))
+				 sizeof(u64) + sizeof(struct ib_sge) + \
+				 sizeof(struct ib_cqe)))
 /**
- * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ * struct iser_rx_desc - iSER RX descriptor
  *
  * @iser_header:   iser header
  * @iscsi_header:  iscsi header
@@ -323,6 +321,7 @@ struct iser_rx_desc {
 	char		             data[ISER_RECV_DATA_SEG_LEN];
 	u64		             dma_addr;
 	struct ib_sge		     rx_sg;
+	struct ib_cqe		     cqe;
 	char		             pad[ISER_RX_PAD_SIZE];
 } __attribute__((packed));
 
@@ -335,6 +334,7 @@ struct iser_rx_desc {
  * @req_dma:       DMA address of login request buffer
  * @rsp_dma:      DMA address of login response buffer
  * @sge:           IB sge for login post recv
+ * @cqe:           completion handler
  */
 struct iser_login_desc {
 	void                         *req;
@@ -342,6 +342,7 @@ struct iser_login_desc {
 	u64                          req_dma;
 	u64                          rsp_dma;
 	struct ib_sge                sge;
+	struct ib_cqe		     cqe;
 } __attribute__((packed));
 
 
@@ -352,18 +353,12 @@ struct iscsi_iser_task;
 /**
  * struct iser_comp - iSER completion context
  *
- * @device:     pointer to device handle
  * @cq:         completion queue
- * @wcs:        work completion array
- * @tasklet:    Tasklet handle
  * @active_qps: Number of active QPs attached
  *              to completion context
  */
 struct iser_comp {
-	struct iser_device      *device;
 	struct ib_cq		*cq;
-	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
-	struct tasklet_struct	 tasklet;
 	int                      active_qps;
 };
 
@@ -492,10 +487,11 @@ struct iser_fr_pool {
  * @rx_wr:               receive work request for batch posts
  * @device:              reference to iser device
  * @comp:                iser completion context
- * @pi_support:          Indicate device T10-PI support
- * @beacon:              beacon send wr to signal all flush errors were drained
- * @flush_comp:          completes when all connection completions consumed
  * @fr_pool:             connection fast registration poool
+ * @pi_support:          Indicate device T10-PI support
+ * @last:                last send wr to signal all flush errors were drained
+ * @last_cqe:            cqe handler for last wr
+ * @last_comp:           completes when all connection completions consumed
  */
 struct ib_conn {
 	struct rdma_cm_id           *cma_id;
@@ -505,10 +501,12 @@ struct ib_conn {
 	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
 	struct iser_device          *device;
 	struct iser_comp	    *comp;
-	bool			     pi_support;
-	struct ib_send_wr	     beacon;
-	struct completion	     flush_comp;
 	struct iser_fr_pool          fr_pool;
+	bool			     pi_support;
+	struct ib_send_wr	     last;
+	struct ib_cqe		     last_cqe;
+	struct ib_cqe		     reg_cqe;
+	struct completion	     last_comp;
 };
 
 /**
@@ -643,12 +641,14 @@ int iser_conn_terminate(struct iser_conn *iser_conn);
 
 void iser_release_work(struct work_struct *work);
 
-void iser_rcv_completion(struct iser_rx_desc *desc,
-			 unsigned long dto_xfer_len,
-			 struct ib_conn *ib_conn);
-
-void iser_snd_completion(struct iser_tx_desc *desc,
-			 struct ib_conn *ib_conn);
+void iser_err_comp(struct ib_wc *wc, const char *type);
+void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc);
 
 void iser_task_rdma_init(struct iscsi_iser_task *task);
 
@@ -735,4 +735,22 @@ to_iser_conn(struct ib_conn *ib_conn)
 	return container_of(ib_conn, struct iser_conn, ib_conn);
 }
 
+static inline struct iser_rx_desc *
+iser_rx(struct ib_cqe *cqe)
+{
+	return container_of(cqe, struct iser_rx_desc, cqe);
+}
+
+static inline struct iser_tx_desc *
+iser_tx(struct ib_cqe *cqe)
+{
+	return container_of(cqe, struct iser_tx_desc, cqe);
+}
+
+static inline struct iser_login_desc *
+iser_login(struct ib_cqe *cqe)
+{
+	return container_of(cqe, struct iser_login_desc, cqe);
+}
+
 #endif
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 21148b6..44ecb89 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -270,11 +270,11 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 			goto rx_desc_dma_map_failed;
 
 		rx_desc->dma_addr = dma_addr;
-
+		rx_desc->cqe.done = iser_task_rsp;
 		rx_sg = &rx_desc->rx_sg;
-		rx_sg->addr   = rx_desc->dma_addr;
+		rx_sg->addr = rx_desc->dma_addr;
 		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
-		rx_sg->lkey   = device->pd->local_dma_lkey;
+		rx_sg->lkey = device->pd->local_dma_lkey;
 	}
 
 	iser_conn->rx_desc_head = 0;
@@ -373,6 +373,7 @@ int iser_send_command(struct iscsi_conn *conn,
 
 	/* build the tx desc regd header and add it to the tx desc dto */
 	tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+	tx_desc->cqe.done = iser_cmd_comp;
 	iser_create_send_desc(iser_conn, tx_desc);
 
 	if (hdr->flags & ISCSI_FLAG_CMD_READ) {
@@ -454,6 +455,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
 	}
 
 	tx_desc->type = ISCSI_TX_DATAOUT;
+	tx_desc->cqe.done = iser_dataout_comp;
 	tx_desc->iser_header.flags = ISER_VER;
 	memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
@@ -503,6 +505,7 @@ int iser_send_control(struct iscsi_conn *conn,
 
 	/* build the tx desc regd header and add it to the tx desc dto */
 	mdesc->type = ISCSI_TX_CONTROL;
+	mdesc->cqe.done = iser_ctrl_comp;
 	iser_create_send_desc(iser_conn, mdesc);
 
 	device = iser_conn->ib_conn.device;
@@ -552,44 +555,69 @@ send_control_error:
 	return err;
 }
 
-/**
- * iser_rcv_dto_completion - recv DTO completion
- */
-void iser_rcv_completion(struct iser_rx_desc *rx_desc,
-			 unsigned long rx_xfer_len,
-			 struct ib_conn *ib_conn)
+void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct ib_conn *ib_conn = wc->qp->qp_context;
 	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
+	struct iser_login_desc *desc = iser_login(wc->wr_cqe);
 	struct iscsi_hdr *hdr;
 	char *data;
-	u64 rx_dma;
-	int rx_buflen, outstanding, count, err;
-
-	/* differentiate between login to all other PDUs */
-	if (rx_desc == (void *)&iser_conn->login_desc) {
-		rx_dma = iser_conn->login_desc.rsp_dma;
-		rx_buflen = ISER_RX_LOGIN_SIZE;
-		hdr = iser_conn->login_desc.rsp + sizeof(struct iser_hdr);
-		data = iser_conn->login_desc.rsp + ISER_HEADERS_LEN;
-	} else {
-		rx_dma = rx_desc->dma_addr;
-		rx_buflen = ISER_RX_PAYLOAD_SIZE;
-		hdr = &rx_desc->iscsi_header;
-		data = rx_desc->data;
+	int length;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		iser_err_comp(wc, "login_rsp");
+		return;
+	}
+
+	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device,
+				   desc->rsp_dma, ISER_RX_LOGIN_SIZE,
+				   DMA_FROM_DEVICE);
+
+	hdr = desc->rsp + sizeof(struct iser_hdr);
+	data = desc->rsp + ISER_HEADERS_LEN;
+	length = wc->byte_len - ISER_HEADERS_LEN;
+
+	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+		 hdr->itt, length);
+
+	iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length);
+
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+				      desc->rsp_dma, ISER_RX_LOGIN_SIZE,
+				      DMA_FROM_DEVICE);
+
+	ib_conn->post_recv_buf_count--;
+}
+
+void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_conn *ib_conn = wc->qp->qp_context;
+	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
+	struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
+	struct iscsi_hdr *hdr;
+	int length;
+	int outstanding, count, err;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		iser_err_comp(wc, "task_rsp");
+		return;
 	}
 
-	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
-				   rx_buflen, DMA_FROM_DEVICE);
+	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device,
+				   desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
+				   DMA_FROM_DEVICE);
 
+	hdr = &desc->iscsi_header;
+	length = wc->byte_len - ISER_HEADERS_LEN;
 
 	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
-			hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
+		 hdr->itt, length);
 
-	iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data,
-			rx_xfer_len - ISER_HEADERS_LEN);
+	iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length);
 
-	ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
-				      rx_buflen, DMA_FROM_DEVICE);
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+				      desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
+				      DMA_FROM_DEVICE);
 
 	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *
 	 * task eliminates the need to worry on tasks which are completed in   *
@@ -597,9 +625,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
 	 * for the posted rx bufs refcount to become zero handles everything   */
 	ib_conn->post_recv_buf_count--;
 
-	if (rx_desc == (void *)&iser_conn->login_desc)
-		return;
-
 	outstanding = ib_conn->post_recv_buf_count;
 	if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
 		count = min(iser_conn->qp_max_recv_dtos - outstanding,
@@ -610,26 +635,47 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
 	}
 }
 
-void iser_snd_completion(struct iser_tx_desc *tx_desc,
-			struct ib_conn *ib_conn)
+void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		iser_err_comp(wc, "command");
+}
+
+void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc)
 {
+	struct iser_tx_desc *desc = iser_tx(wc->wr_cqe);
 	struct iscsi_task *task;
-	struct iser_device *device = ib_conn->device;
 
-	if (tx_desc->type == ISCSI_TX_DATAOUT) {
-		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
-					ISER_HEADERS_LEN, DMA_TO_DEVICE);
-		kmem_cache_free(ig.desc_cache, tx_desc);
-		tx_desc = NULL;
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		iser_err_comp(wc, "control");
+		return;
 	}
 
-	if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) {
-		/* this arithmetic is legal by libiscsi dd_data allocation */
-		task = (void *) ((long)(void *)tx_desc -
-				  sizeof(struct iscsi_task));
-		if (task->hdr->itt == RESERVED_ITT)
-			iscsi_put_task(task);
-	}
+	/* this arithmetic is legal by libiscsi dd_data allocation */
+	task = (void *)desc - sizeof(struct iscsi_task);
+	if (task->hdr->itt == RESERVED_ITT)
+		iscsi_put_task(task);
+}
+
+void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct iser_tx_desc *desc = iser_tx(wc->wr_cqe);
+	struct ib_conn *ib_conn = wc->qp->qp_context;
+	struct iser_device *device = ib_conn->device;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		iser_err_comp(wc, "dataout");
+
+	ib_dma_unmap_single(device->ib_device, desc->dma_addr,
+			    ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	kmem_cache_free(ig.desc_cache, desc);
+}
+
+void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_conn *ib_conn = wc->qp->qp_context;
+
+	complete(&ib_conn->last_comp);
 }
 
 void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 81ad5e9..454c8cd 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -67,6 +67,11 @@ static struct iser_reg_ops fmr_ops = {
 	.reg_desc_put	= iser_reg_desc_put_fmr,
 };
 
+void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+	iser_err_comp(wc, "memreg");
+}
+
 int iser_assign_reg_ops(struct iser_device *device)
 {
 	struct ib_device *ib_dev = device->ib_device;
@@ -413,12 +418,14 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
 }
 
 static void
-iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
+iser_inv_rkey(struct ib_send_wr *inv_wr,
+	      struct ib_mr *mr,
+	      struct ib_cqe *cqe)
 {
 	u32 rkey;
 
 	inv_wr->opcode = IB_WR_LOCAL_INV;
-	inv_wr->wr_id = ISER_FASTREG_LI_WRID;
+	inv_wr->wr_cqe = cqe;
 	inv_wr->ex.invalidate_rkey = mr->rkey;
 	inv_wr->send_flags = 0;
 	inv_wr->num_sge = 0;
@@ -436,6 +443,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 {
 	struct iser_tx_desc *tx_desc = &iser_task->desc;
 	struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
+	struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
 	struct ib_sig_handover_wr *wr;
 	int ret;
 
@@ -447,11 +455,11 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 	iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
 
 	if (!pi_ctx->sig_mr_valid)
-		iser_inv_rkey(iser_tx_next_wr(tx_desc), pi_ctx->sig_mr);
+		iser_inv_rkey(iser_tx_next_wr(tx_desc), pi_ctx->sig_mr, cqe);
 
 	wr = sig_handover_wr(iser_tx_next_wr(tx_desc));
 	wr->wr.opcode = IB_WR_REG_SIG_MR;
-	wr->wr.wr_id = ISER_FASTREG_LI_WRID;
+	wr->wr.wr_cqe = cqe;
 	wr->wr.sg_list = &data_reg->sge;
 	wr->wr.num_sge = 1;
 	wr->wr.send_flags = 0;
@@ -484,12 +492,13 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 			    struct iser_mem_reg *reg)
 {
 	struct iser_tx_desc *tx_desc = &iser_task->desc;
+	struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
 	struct ib_mr *mr = rsc->mr;
 	struct ib_reg_wr *wr;
 	int n;
 
 	if (!rsc->mr_valid)
-		iser_inv_rkey(iser_tx_next_wr(tx_desc), mr);
+		iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
 
 	n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K);
 	if (unlikely(n != mem->size)) {
@@ -500,7 +509,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 
 	wr = reg_wr(iser_tx_next_wr(tx_desc));
 	wr->wr.opcode = IB_WR_REG_MR;
-	wr->wr.wr_id = ISER_FASTREG_LI_WRID;
+	wr->wr.wr_cqe = cqe;
 	wr->wr.send_flags = 0;
 	wr->wr.num_sge = 0;
 	wr->mr = mr;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index f75ef0c..29d9046 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -44,17 +44,6 @@
 #define ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
 				 ISCSI_ISER_MAX_CONN)
 
-static int iser_cq_poll_limit = 512;
-
-static void iser_cq_tasklet_fn(unsigned long data);
-static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
-
-static void iser_cq_event_callback(struct ib_event *cause, void *context)
-{
-	iser_err("cq event %s (%d)\n",
-		 ib_event_msg(cause->event), cause->event);
-}
-
 static void iser_qp_event_callback(struct ib_event *cause, void *context)
 {
 	iser_err("qp event %s (%d)\n",
@@ -104,27 +93,14 @@ static int iser_create_device_ib_res(struct iser_device *device)
 		goto pd_err;
 
 	for (i = 0; i < device->comps_used; i++) {
-		struct ib_cq_init_attr cq_attr = {};
 		struct iser_comp *comp = &device->comps[i];
 
-		comp->device = device;
-		cq_attr.cqe = max_cqe;
-		cq_attr.comp_vector = i;
-		comp->cq = ib_create_cq(ib_dev,
-					iser_cq_callback,
-					iser_cq_event_callback,
-					(void *)comp,
-					&cq_attr);
+		comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i,
+				IB_POLL_SOFTIRQ);
 		if (IS_ERR(comp->cq)) {
 			comp->cq = NULL;
 			goto cq_err;
 		}
-
-		if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
-			goto cq_err;
-
-		tasklet_init(&comp->tasklet, iser_cq_tasklet_fn,
-			     (unsigned long)comp);
 	}
 
 	if (!iser_always_reg) {
@@ -134,7 +110,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
 
 		device->mr = ib_get_dma_mr(device->pd, access);
 		if (IS_ERR(device->mr))
-			goto dma_mr_err;
+			goto cq_err;
 	}
 
 	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
@@ -147,15 +123,12 @@ static int iser_create_device_ib_res(struct iser_device *device)
 handler_err:
 	if (device->mr)
 		ib_dereg_mr(device->mr);
-dma_mr_err:
-	for (i = 0; i < device->comps_used; i++)
-		tasklet_kill(&device->comps[i].tasklet);
 cq_err:
 	for (i = 0; i < device->comps_used; i++) {
 		struct iser_comp *comp = &device->comps[i];
 
 		if (comp->cq)
-			ib_destroy_cq(comp->cq);
+			ib_free_cq(comp->cq);
 	}
 	ib_dealloc_pd(device->pd);
 pd_err:
@@ -176,8 +149,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
 	for (i = 0; i < device->comps_used; i++) {
 		struct iser_comp *comp = &device->comps[i];
 
-		tasklet_kill(&comp->tasklet);
-		ib_destroy_cq(comp->cq);
+		ib_free_cq(comp->cq);
 		comp->cq = NULL;
 	}
 
@@ -717,13 +689,13 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
 				 iser_conn, err);
 
 		/* post an indication that all flush errors were consumed */
-		err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
+		err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr);
 		if (err) {
-			iser_err("conn %p failed to post beacon", ib_conn);
+			iser_err("conn %p failed to post last wr", ib_conn);
 			return 1;
 		}
 
-		wait_for_completion(&ib_conn->flush_comp);
+		wait_for_completion(&ib_conn->last_comp);
 	}
 
 	return 1;
@@ -960,14 +932,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
 
 void iser_conn_init(struct iser_conn *iser_conn)
 {
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+
 	iser_conn->state = ISER_CONN_INIT;
-	iser_conn->ib_conn.post_recv_buf_count = 0;
-	init_completion(&iser_conn->ib_conn.flush_comp);
 	init_completion(&iser_conn->stop_completion);
 	init_completion(&iser_conn->ib_completion);
 	init_completion(&iser_conn->up_completion);
 	INIT_LIST_HEAD(&iser_conn->conn_list);
 	mutex_init(&iser_conn->state_mutex);
+
+	ib_conn->post_recv_buf_count = 0;
+	ib_conn->reg_cqe.done = iser_reg_comp;
+	ib_conn->last_cqe.done = iser_last_comp;
+	ib_conn->last.wr_cqe = &ib_conn->last_cqe;
+	ib_conn->last.opcode = IB_WR_SEND;
+	init_completion(&ib_conn->last_comp);
 }
 
  /**
@@ -993,9 +972,6 @@ int iser_connect(struct iser_conn   *iser_conn,
 
 	iser_conn->state = ISER_CONN_PENDING;
 
-	ib_conn->beacon.wr_id = ISER_BEACON_WRID;
-	ib_conn->beacon.opcode = IB_WR_SEND;
-
 	ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler,
 					 (void *)iser_conn,
 					 RDMA_PS_TCP, IB_QPT_RC);
@@ -1038,56 +1014,60 @@ connect_failure:
 
 int iser_post_recvl(struct iser_conn *iser_conn)
 {
-	struct ib_recv_wr rx_wr, *rx_wr_failed;
 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
 	struct iser_login_desc *desc = &iser_conn->login_desc;
+	struct ib_recv_wr wr, *wr_failed;
 	int ib_ret;
 
 	desc->sge.addr = desc->rsp_dma;
 	desc->sge.length = ISER_RX_LOGIN_SIZE;
 	desc->sge.lkey = ib_conn->device->pd->local_dma_lkey;
 
-	rx_wr.wr_id = (uintptr_t)desc;
-	rx_wr.sg_list = &desc->sge;
-	rx_wr.num_sge = 1;
-	rx_wr.next = NULL;
+	desc->cqe.done = iser_login_rsp;
+	wr.wr_cqe = &desc->cqe;
+	wr.sg_list = &desc->sge;
+	wr.num_sge = 1;
+	wr.next = NULL;
 
 	ib_conn->post_recv_buf_count++;
-	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+	ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed);
 	if (ib_ret) {
 		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
 		ib_conn->post_recv_buf_count--;
 	}
+
 	return ib_ret;
 }
 
 int iser_post_recvm(struct iser_conn *iser_conn, int count)
 {
-	struct ib_recv_wr *rx_wr, *rx_wr_failed;
-	int i, ib_ret;
 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
 	unsigned int my_rx_head = iser_conn->rx_desc_head;
 	struct iser_rx_desc *rx_desc;
+	struct ib_recv_wr *wr, *wr_failed;
+	int i, ib_ret;
 
-	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
-		rx_desc		= &iser_conn->rx_descs[my_rx_head];
-		rx_wr->wr_id	= (uintptr_t)rx_desc;
-		rx_wr->sg_list	= &rx_desc->rx_sg;
-		rx_wr->num_sge	= 1;
-		rx_wr->next	= rx_wr + 1;
+	for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
+		rx_desc = &iser_conn->rx_descs[my_rx_head];
+		rx_desc->cqe.done = iser_task_rsp;
+		wr->wr_cqe = &rx_desc->cqe;
+		wr->sg_list = &rx_desc->rx_sg;
+		wr->num_sge = 1;
+		wr->next = wr + 1;
 		my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
 	}
 
-	rx_wr--;
-	rx_wr->next = NULL; /* mark end of work requests list */
+	wr--;
+	wr->next = NULL; /* mark end of work requests list */
 
 	ib_conn->post_recv_buf_count += count;
-	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+	ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed);
 	if (ib_ret) {
 		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
 		ib_conn->post_recv_buf_count -= count;
 	} else
 		iser_conn->rx_desc_head = my_rx_head;
+
 	return ib_ret;
 }
 
@@ -1108,7 +1088,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
 				      DMA_TO_DEVICE);
 
 	wr->next = NULL;
-	wr->wr_id = (uintptr_t)tx_desc;
+	wr->wr_cqe = &tx_desc->cqe;
 	wr->sg_list = tx_desc->tx_sg;
 	wr->num_sge = tx_desc->num_sge;
 	wr->opcode = IB_WR_SEND;
@@ -1122,148 +1102,6 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
 	return ib_ret;
 }
 
-/**
- * is_iser_tx_desc - Indicate if the completion wr_id
- *     is a TX descriptor or not.
- * @iser_conn: iser connection
- * @wr_id: completion WR identifier
- *
- * Since we cannot rely on wc opcode in FLUSH errors
- * we must work around it by checking if the wr_id address
- * falls in the iser connection rx_descs buffer. If so
- * it is an RX descriptor, otherwize it is a TX.
- */
-static inline bool
-is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
-{
-	void *start = iser_conn->rx_descs;
-	int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
-
-	if (wr_id >= start && wr_id < start + len)
-		return false;
-
-	return true;
-}
-
-/**
- * iser_handle_comp_error() - Handle error completion
- * @ib_conn:   connection RDMA resources
- * @wc:        work completion
- *
- * Notes: We may handle a FLUSH error completion and in this case
- *        we only cleanup in case TX type was DATAOUT. For non-FLUSH
- *        error completion we should also notify iscsi layer that
- *        connection is failed (in case we passed bind stage).
- */
-static void
-iser_handle_comp_error(struct ib_conn *ib_conn,
-		       struct ib_wc *wc)
-{
-	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
-	void *wr_id = (void *)(uintptr_t)wc->wr_id;
-
-	if (wc->status != IB_WC_WR_FLUSH_ERR)
-		if (iser_conn->iscsi_conn)
-			iscsi_conn_failure(iser_conn->iscsi_conn,
-					   ISCSI_ERR_CONN_FAILED);
-
-	if (wc->wr_id == ISER_FASTREG_LI_WRID)
-		return;
-
-	if (is_iser_tx_desc(iser_conn, wr_id)) {
-		struct iser_tx_desc *desc = wr_id;
-
-		if (desc->type == ISCSI_TX_DATAOUT)
-			kmem_cache_free(ig.desc_cache, desc);
-	} else {
-		ib_conn->post_recv_buf_count--;
-	}
-}
-
-/**
- * iser_handle_wc - handle a single work completion
- * @wc: work completion
- *
- * Soft-IRQ context, work completion can be either
- * SEND or RECV, and can turn out successful or
- * with error (or flush error).
- */
-static void iser_handle_wc(struct ib_wc *wc)
-{
-	struct ib_conn *ib_conn;
-	struct iser_tx_desc *tx_desc;
-	struct iser_rx_desc *rx_desc;
-
-	ib_conn = wc->qp->qp_context;
-	if (likely(wc->status == IB_WC_SUCCESS)) {
-		if (wc->opcode == IB_WC_RECV) {
-			rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
-			iser_rcv_completion(rx_desc, wc->byte_len,
-					    ib_conn);
-		} else
-		if (wc->opcode == IB_WC_SEND) {
-			tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
-			iser_snd_completion(tx_desc, ib_conn);
-		} else {
-			iser_err("Unknown wc opcode %d\n", wc->opcode);
-		}
-	} else {
-		if (wc->status != IB_WC_WR_FLUSH_ERR)
-			iser_err("%s (%d): wr id %llx vend_err %x\n",
-				 ib_wc_status_msg(wc->status), wc->status,
-				 wc->wr_id, wc->vendor_err);
-		else
-			iser_dbg("%s (%d): wr id %llx\n",
-				 ib_wc_status_msg(wc->status), wc->status,
-				 wc->wr_id);
-
-		if (wc->wr_id == ISER_BEACON_WRID)
-			/* all flush errors were consumed */
-			complete(&ib_conn->flush_comp);
-		else
-			iser_handle_comp_error(ib_conn, wc);
-	}
-}
-
-/**
- * iser_cq_tasklet_fn - iSER completion polling loop
- * @data: iSER completion context
- *
- * Soft-IRQ context, polling connection CQ until
- * either CQ was empty or we exausted polling budget
- */
-static void iser_cq_tasklet_fn(unsigned long data)
-{
-	struct iser_comp *comp = (struct iser_comp *)data;
-	struct ib_cq *cq = comp->cq;
-	struct ib_wc *const wcs = comp->wcs;
-	int i, n, completed = 0;
-
-	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
-		for (i = 0; i < n; i++)
-			iser_handle_wc(&wcs[i]);
-
-		completed += n;
-		if (completed >= iser_cq_poll_limit)
-			break;
-	}
-
-	/*
-	 * It is assumed here that arming CQ only once its empty
-	 * would not cause interrupts to be missed.
-	 */
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-
-	iser_dbg("got %d completions\n", completed);
-}
-
-static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
-{
-	struct iser_comp *comp = cq_context;
-
-	tasklet_schedule(&comp->tasklet);
-}
-
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
 			     enum iser_data_dir cmd_dir, sector_t *sector)
 {
@@ -1311,3 +1149,21 @@ err:
 	/* Not alot we can do here, return ambiguous guard error */
 	return 0x1;
 }
+
+void iser_err_comp(struct ib_wc *wc, const char *type)
+{
+	if (wc->status != IB_WC_WR_FLUSH_ERR) {
+		struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context);
+
+		iser_err("%s failure: %s (%d) vend_err %x\n", type,
+			 ib_wc_status_msg(wc->status), wc->status,
+			 wc->vendor_err);
+
+		if (iser_conn->iscsi_conn)
+			iscsi_conn_failure(iser_conn->iscsi_conn,
+					   ISCSI_ERR_CONN_FAILED);
+	} else {
+		iser_dbg("%s failure: %s (%d)\n", type,
+			 ib_wc_status_msg(wc->status), wc->status);
+	}
+}
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-13 15:23       ` Or Gerlitz
  0 siblings, 0 replies; 140+ messages in thread
From: Or Gerlitz @ 2015-11-13 15:23 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig
  Cc: linux-rdma, Bart Van Assche, axboe, linux-scsi, Linux Kernel

On Fri, Nov 13, 2015 at 3:46 PM, Christoph Hellwig <hch@lst.de> wrote:
> The new name is irq_poll as iopoll is already taken.  Better suggestions
> welcome.

Sagi (or Christoph if you can address that),

At some point over the last 18 months there was a port done at
Mellanox for iSER to use blk-iopoll, and AFAIR it didn't work well or
didn't work at all.  Can you tell us now what the problem was and how
you addressed it in your generalization?

Or.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-13 16:16       ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-13 16:16 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 7:46 AM, Christoph Hellwig wrote:
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/infiniband/core/cq.c | 46 ++++++++++++++++++++++++++++++++++++++++++++
>   include/rdma/ib_verbs.h      |  2 ++
>   2 files changed, 48 insertions(+)
>
> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> index d9eb796..bf2a079 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -206,3 +206,49 @@ void ib_free_cq(struct ib_cq *cq)
>   	WARN_ON_ONCE(ret);
>   }
>   EXPORT_SYMBOL(ib_free_cq);
> +
> +struct ib_stop_cqe {
> +	struct ib_cqe	cqe;
> +	struct completion done;
> +};
> +
> +static void ib_stop_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	struct ib_stop_cqe *stop =
> +		container_of(wc->wr_cqe, struct ib_stop_cqe, cqe);
> +
> +	complete(&stop->done);
> +}
> +
> +/*
> + * Change a queue pair into the error state and wait until all receive
> + * completions have been processed before destroying it. This avoids that
> + * the receive completion handler can access the queue pair while it is
> + * being destroyed.
> + */
> +void ib_drain_qp(struct ib_qp *qp)
> +{
> +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> +	struct ib_stop_cqe stop = { };
> +	struct ib_recv_wr wr, *bad_wr;
> +	int ret;
> +
> +	wr.wr_cqe = &stop.cqe;
> +	stop.cqe.done = ib_stop_done;
> +	init_completion(&stop.done);
> +
> +	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +		return;
> +	}
> +
> +	ret = ib_post_recv(qp, &wr, &bad_wr);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +		return;
> +	}
> +
> +	wait_for_completion(&stop.done);
> +}
> +EXPORT_SYMBOL(ib_drain_qp);

This won't work with iWARP QPs.  Once the QP is in the ERROR state, 
post_send/post_recv can return a synchronous error instead of an async 
completion via the CQ.  The IB spec explicitly states that posts while 
in ERROR will be completed with "flushed" via the CQ.

From http://tools.ietf.org/html/draft-hilland-rddp-verbs-00#section-6.2.4:

    *   At some point in the execution of the flushing operation, the RI
        MUST begin to return an Immediate Error for any attempt to post
        a WR to a Work Queue; prior to that point, any WQEs posted to a
        Work Queue MUST be enqueued and then flushed as described above
        (e.g. The PostSQ is done in Non-Privileged Mode and the Non-
        Privileged Mode portion of the RI has not yet been informed that
        the QP is in the Error state).


Also, pending send work requests can be completed with status "flushed", 
and I would think we need to do something similar for send WRs.  We 
definitely see this with cxgb4 in the presence of unsignaled WRs that 
aren't followed by a signaled WR at the time the QP is moved out of 
RTS.  The driver has no way to know whether these pending unsignaled 
WRs completed or not, so it completes them with "flushed" status.

So how can we do this for iWARP?  It seems like all that might be needed 
is to modify the QP state to Idle, retrying until it succeeds (see the 
rough sketch after the quote below):

    If the QP is transitioning to the Error state, or has not yet
    finished flushing the Work Queues, a Modify QP request to transition
    to the IDLE state MUST fail with an Immediate Error. If none of the
    prior conditions are true, a Modify QP to the Idle state MUST take
    the QP to the Idle state. No other state transitions out of Error
    are supported. Any attempt to transition the QP to a state other
    than Idle MUST result in an Immediate Error.
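
To make that concrete, here is a very rough sketch of the retry idea -- 
the helper name is made up, and treating IB_QPS_RESET as the verbs 
mapping of the iWARP Idle state (plus the arbitrary sleep interval) is 
an assumption, not tested code:

static void __iwarp_drain_qp(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

	/* force the flush; ignore failures here for the purpose of this
	 * sketch (e.g. the QP may already be in ERROR) */
	ib_modify_qp(qp, &attr, IB_QP_STATE);

	/*
	 * A modify to Idle fails with an immediate error until the flush
	 * has finished, so keep retrying until it sticks.
	 */
	attr.qp_state = IB_QPS_RESET;
	while (ib_modify_qp(qp, &attr, IB_QP_STATE))
		msleep(10);
}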


Steve.

> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index e11e038..f59a8d3 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -3075,4 +3075,6 @@ int ib_sg_to_pages(struct ib_mr *mr,
>   		   int sg_nents,
>   		   int (*set_page)(struct ib_mr *, u64));
>   
> +void ib_drain_qp(struct ib_qp *qp);
> +
>   #endif /* IB_VERBS_H */


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-13 18:25       ` Jason Gunthorpe
  0 siblings, 0 replies; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-13 18:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-rdma, sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On Fri, Nov 13, 2015 at 02:46:43PM +0100, Christoph Hellwig wrote:
> This adds an abstraction that allows ULP to simply pass a completion
> object and completion callback with each submitted WR and let the RDMA
> core handle the nitty gritty details of how to handle completion
> interrupts and poll the CQ.

This looks pretty nice, I'd really like to look it over carefully
after SC|15..

I know Bart and others have attempted to have switching between event
and polling driven operation, but there were problems resolving the
races. Would be nice to review that conversation.. Do you remember the
details Bart?

> +static int __ib_process_cq(struct ib_cq *cq, int budget)
> +{
> +	int i, n, completed = 0;
> +
> +	while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
> +		completed += n;
> +		if (completed >= budget)
> +			break;

For instance, like this, not fully draining the CQ and then doing:

> +	completed = __ib_process_cq(cq, budget);
> +	if (completed < budget) {
> +		irq_poll_complete(&cq->iop);
> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {

Doesn't seem entirely right? There is no point in calling
ib_req_notify_cq if the code knows there is still stuff in the CQ and
has already, independently, arranged for ib_poll_handler to be
guaranteed to be called.

> +			if (!irq_poll_sched_prep(&cq->iop))
> +				irq_poll_sched(&cq->iop);

Which, it seems, is what this is doing.

Assuming irq_poll_sched is safe to call from a hard irq context, this
looks sane, at first glance.

> +	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
> +	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
> +	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +		queue_work(ib_comp_wq, &cq->work);

Same comment here..

> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> +{
> +	queue_work(ib_comp_wq, &cq->work);

> +	switch (cq->poll_ctx) {
> +	case IB_POLL_DIRECT:
> +		cq->comp_handler = ib_cq_completion_direct;
> +		break;
> +	case IB_POLL_SOFTIRQ:
> +		cq->comp_handler = ib_cq_completion_softirq;
> +
> +		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
> +		irq_poll_enable(&cq->iop);
> +		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
> +		break;

I understand several drivers are not using a hard irq context for the
comp_handler callback.  Is there any way to exploit that in this new
API so we don't have to do so many context switches?  I.e. if the driver
is already using a softirq when calling comp_handler, can we somehow
just rig ib_poll_handler directly and avoid the overhead? (Future)

At first glance this seems so much saner than what we have..

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-13 19:19       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-13 19:19 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> The new name is irq_poll as iopoll is already taken.  Better suggestions
> welcome.

Hello Christoph,

Would it be possible to provide more background information about this? 
Which other kernel subsystem is using the name iopoll? I think the name 
blk-iopoll was chosen six years ago for this subsystem (see also 
https://lwn.net/Articles/346187/). If the conflicting subsystem is 
newer, how about renaming the other polling mechanism?

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-13 19:57           ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-13 19:57 UTC (permalink / raw)
  To: Jason Gunthorpe, Christoph Hellwig
  Cc: linux-rdma, sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 10:25 AM, Jason Gunthorpe wrote:
> On Fri, Nov 13, 2015 at 02:46:43PM +0100, Christoph Hellwig wrote:
>> This adds an abstraction that allows ULP to simply pass a completion
>> object and completion callback with each submitted WR and let the RDMA
>> core handle the nitty gritty details of how to handle completion
>> interrupts and poll the CQ.
>
> This looks pretty nice, I'd really like to look it over carefully
> after SC|15..
>
> I know Bart and others have attempted to have switching between event
> and polling driven operation, but there were problems resolving the
> races. Would be nice to review that conversation.. Do you remember the
> details Bart?

Hello Jason,

I think this is the conversation you are referring to: "About a 
shortcoming of the verbs API" 
(http://thread.gmane.org/gmane.linux.drivers.rdma/5028). That 
conversation occurred five years ago, which means that you have an 
excellent memory :-)

It doesn't seem to me like Christoph wanted to support dynamic switching 
between the IB_POLL_DIRECT, IB_POLL_SOFTIRQ and IB_POLL_WORKQUEUE 
polling modes. I think this should have been mentioned in the patch 
description.

The implementation of this patch makes it clear that it is essential 
that all polling is serialized. The WC array that is used for polling is 
embedded in the CQ and is not protected against concurrent access. This 
means that it is essential that _ib_process_cq() calls are serialized. I 
need some more time to verify whether such serialization is always 
guaranteed by this patch.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-13 19:57           ` Bart Van Assche
  (?)
@ 2015-11-13 22:06           ` Jason Gunthorpe
       [not found]             ` <20151113220636.GA32133-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  -1 siblings, 1 reply; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-13 22:06 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Fri, Nov 13, 2015 at 11:57:56AM -0800, Bart Van Assche wrote:

> I think this is the conversation you are referring to: "About a shortcoming
> of the verbs API" (http://thread.gmane.org/gmane.linux.drivers.rdma/5028).
> That conversation occurred five years ago, which means that you have an
> excellent memory :-)

Heh, the whole thread is interesting, but this message is what I was
thinking of

http://thread.gmane.org/gmane.linux.drivers.rdma/5028

And it looks like this patch is OK relative to that discussion.

> I doesn't seem to me like Christoph wanted to support dynamic switching
> between the IB_POLL_DIRECT, IB_POLL_SOFTIRQ and IB_POLL_WORKQUEUE polling
> modes. I think this should have been mentioned in the patch description.

Indeed. Which is probably OK.

> The implementation of this patch makes it clear that it is essential that
> all polling is serialized. The WC array that is used for polling is embedded
> in the CQ and is not protected against concurrent access. This means that it
> is essential that _ib_process_cq() calls are serialized. I need some more
> time to verify whether such serialization is always guaranteed by this
> patch.

Yes, the two big design/review checks:
 - ib_process_cq is fully serialized/etc
 - All re-arm cases are done properly - rearm is only called when the
   CQ is empty and all cases where it is not empty guarantee that the
   polling loop happens again.

Looking at that thread and then at the patch a bit more..

+void ib_process_cq_direct(struct ib_cq *cq)
[..]
+	__ib_process_cq(cq, INT_MAX);

INT_MAX is not enough, it needs to loop.
This is missing an ib_req_notify also.

And this structure:

+static int __ib_process_cq(struct ib_cq *cq, int budget)
+	while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {

Does an unnecessary ib_poll_cq call in common cases. I'd suggest
changing the result to bool and doing:

// true return means the caller should attempt ib_req_notify_cq
while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
 for (...)
 if (n != IB_POLL_BATCH)
   return true;
 completed += n;
 if (completed > budget)
    return false;
}
return true;

And then change call site like:

static void ib_cq_poll_work(struct work_struct *work)
{
    if (__ib_process_cq(...))
        if (ib_req_notify_cq(cq, IB_POLL_FLAGS) == 0)
	    return;
    // Else we need to loop again.
    queue_work(ib_comp_wq, &cq->work);
}

Which avoids the rearm.

void ib_process_cq_direct(struct ib_cq *cq)
{
   while (1) {
       if (__ib_process_cq(..) &&
           ib_req_notify_cq(cq, IB_POLL_FLAGS) == 0)
           return;
   }
}

Which adds the inf loop and rearm.

etc for softirq

Perhaps ib_req_notify_cq should be folded into __ib_process_cq, then
it can trivially honour the budget on additional loops from
IB_CQ_REPORT_MISSED_EVENTS.
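
Roughly what that folding could look like -- sketch only, the "caller
must poll again" return convention here is just for illustration:

static int __ib_process_cq(struct ib_cq *cq, int budget)
{
	int i, n, completed = 0;

	while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
		for (i = 0; i < n; i++) {
			struct ib_wc *wc = &cq->wc[i];

			if (wc->wr_cqe)
				wc->wr_cqe->done(cq, wc);
		}
		completed += n;

		/* a short batch means the CQ is now empty */
		if (n != IB_POLL_BATCH)
			break;
		if (completed >= budget)
			return -EAGAIN;	/* out of budget, poll again */
	}

	/* CQ looks empty: re-arm, and report if we raced with new entries */
	if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
			     IB_CQ_REPORT_MISSED_EVENTS) > 0)
		return -EAGAIN;

	return completed;
}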

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-14  7:02           ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-14  7:02 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Sagi Grimberg, linux-rdma, Bart Van Assche, axboe, linux-scsi,
	Linux Kernel

On Fri, Nov 13, 2015 at 05:23:39PM +0200, Or Gerlitz wrote:
> On Fri, Nov 13, 2015 at 3:46 PM, Christoph Hellwig <hch@lst.de> wrote:
> > The new name is irq_poll as iopoll is already taken.  Better suggestions
> > welcome.
> 
> Sagi (or Christoph if you can address that),
> 
> At some point over the last 18 months there was a port done at
> Mellanox for iSER to use blk-iopoll, and AFAIR it didn't work well or
> didn't work at all.  Can you tell us now what the problem was and how
> you addressed it in your generalization?

Hi Or,

Sagi mentioned that the last time he tried a similar approach in iSER he
saw some large latency spikes.  We've seen nothing worse than the original
approach.  The Flash Memory Summit slide set has some numbers:

http://www.flashmemorysummit.com/English/Collaterals/Proceedings/2015/20150811_FA11_Bandic.pdf

they aren't quite up to date, but the latency distribution hasn't
really changed.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
  2015-11-13 19:19       ` Bart Van Assche
  (?)
@ 2015-11-14  7:02       ` Christoph Hellwig
  2015-11-17 17:16           ` Bart Van Assche
  -1 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-14  7:02 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Fri, Nov 13, 2015 at 11:19:24AM -0800, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> The new name is irq_poll as iopoll is already taken.  Better suggestions
>> welcome.
>
> Hello Christoph,
>
> Would it be possible to provide more background information about this ? 
> Which other kernel subsystem is using the name iopoll ?

Take a look at include/linux/iopoll.h - I can't really make much sense
of it to be honest, but it's used in quite a few places.
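
(For reference, iopoll.h is a set of register-polling helpers such as
readl_poll_timeout(); a made-up example of the kind of caller it has, with a
fictional status register offset and ready bit:)

#include <linux/iopoll.h>

/* made-up example: wait up to 1ms for a ready bit, polling every 10us */
static int example_wait_ready(void __iomem *base)
{
    u32 val;

    return readl_poll_timeout(base + 0x10, val, val & 0x1, 10, 1000);
}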

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-14  7:05           ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-14  7:05 UTC (permalink / raw)
  To: Steve Wise
  Cc: Christoph Hellwig, linux-rdma, sagig, bart.vanassche, axboe,
	linux-scsi, linux-kernel

On Fri, Nov 13, 2015 at 10:16:04AM -0600, Steve Wise wrote:
> So how can we do this for iwarp?  It seems like all that might be needed is 
> to modify the QP state to idle, retrying until it succeeds:
>
>    If the QP is transitioning to the Error state, or has not yet
>    finished flushing the Work Queues, a Modify QP request to transition
>    to the IDLE state MUST fail with an Immediate Error. If none of the
>    prior conditions are true, a Modify QP to the Idle state MUST take
>    the QP to the Idle state. No other state transitions out of Error
>    are supported. Any attempt to transition the QP to a state other
>    than Idle MUST result in an Immediate Error.

Can you try to write up some code for this?  We could then wire it up
in the common helper.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-14  7:08           ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-14  7:08 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christoph Hellwig, linux-rdma, sagig, bart.vanassche, axboe,
	linux-scsi, linux-kernel

On Fri, Nov 13, 2015 at 11:25:13AM -0700, Jason Gunthorpe wrote:
> For instance, like this, not fulling draining the cq and then doing:
> 
> > +	completed = __ib_process_cq(cq, budget);
> > +	if (completed < budget) {
> > +		irq_poll_complete(&cq->iop);
> > +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> 
> Doesn't seem entirely right? There is no point in calling
> ib_req_notify_cq if the code knows there is still stuff in the CQ and
> has already, independently, arranged for ib_poll_hander to be
> guarenteed called.

The code only calls ib_req_notify_cq if it knows we finished earlier than
our budget.
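
Spelled out, the irq_poll handler around that hunk looks roughly like this
(sketch reconstructed from the quoted lines; irq_poll_sched() for the requeue
is my assumption):

static int ib_poll_handler(struct irq_poll *iop, int budget)
{
    struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
    int completed;

    completed = __ib_process_cq(cq, budget);
    if (completed < budget) {
        /* done for now: stop irq_poll and rearm the CQ */
        irq_poll_complete(&cq->iop);
        if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
            irq_poll_sched(&cq->iop);   /* missed events, poll again */
    }
    return completed;
}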

> > +	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
> > +	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
> > +	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> > +		queue_work(ib_comp_wq, &cq->work);
> 
> Same comment here..


Same here - we only requeue the work item if either we processed all of
our budget, or ib_req_notify_cq with IB_CQ_REPORT_MISSED_EVENTS told
us that we need to poll again.
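
And the workqueue variant in full, again just a sketch assembled from the
quoted lines:

static void ib_cq_poll_work(struct work_struct *work)
{
    struct ib_cq *cq = container_of(work, struct ib_cq, work);
    int completed;

    completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
    /* requeue only if the budget was exhausted or events were missed */
    if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
        ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
        queue_work(ib_comp_wq, &cq->work);
}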

> I understand several drivers are not using a hard irq context for the
> comp_handler call back. Is there any way to exploit that in this new
> API so we don't have to do so many context switches? Ie if the driver
> already is using a softirq when calling comp_handler can we somehow
> just rig ib_poll_handler directly and avoid the overhead? (Future)

Let's say this API makes it possible.  I still don't think moving the
whole budget and rearm logic into the LLD is necessarily a good idea
if we can avoid it.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-14  7:13                 ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-14  7:13 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, sagig, axboe,
	linux-scsi, linux-kernel

On Fri, Nov 13, 2015 at 03:06:36PM -0700, Jason Gunthorpe wrote:
> Looking at that thread and then at the patch a bit more..
> 
> +void ib_process_cq_direct(struct ib_cq *cq)
> [..]
> +	__ib_process_cq(cq, INT_MAX);
> 
> INT_MAX is not enough, it needs to loop.
> This is missing a ib_req_notify also.

No.  The direct case _never_ calls ib_req_notify.  It's for the case where
SRP polls the send CQ only from the same context it sends from, without
any interrupt notification at all.

> +static int __ib_process_cq(struct ib_cq *cq, int budget)
> +	while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
> 
> Does an unnecessary ib_poll_cq call in common cases. I'd suggest
> change the result to bool and do:
> 
> // true return means the caller should attempt ib_req_notify_cq
> while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
>  for (...)
>  if (n != IB_POLL_BATCH)
>    return true;
>  completed += n;
>  if (completed > budget)
>     return false;
> }
> return true;
> 
> And then change call site like:
> 
> static void ib_cq_poll_work(struct work_struct *work)
> {
>     if (__ib_process_cq(...))
>         if (ib_req_notify_cq(cq, IB_POLL_FLAGS) == 0)
> 	    return;
>     // Else we need to loop again.
>     queue_work(ib_comp_wq, &cq->work);
> }
> 
> Which avoids the rearm.
> 
> void ib_process_cq_direct(struct ib_cq *cq)
> {
>    while (1) {
>        if (__ib_process_cq(..) &&
>            ib_req_notify_cq(cq, IB_POLL_FLAGS) == 0)
>            return;
>    }
> }
> 
> Which adds the inf loop and rearm.
> 
> etc for softirq

For the workqueue and softirq cases this looks reasonable.  For the
direct case there is no rearming, though.

> Perhaps ib_req_notify_cq should be folded into __ib_process_cq, then
> it can trivially honour the budget on additional loops from
> IB_CQ_REPORT_MISSED_EVENTS.

Which also defeats this proposal.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-15  8:48               ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-15  8:48 UTC (permalink / raw)
  To: Christoph Hellwig, Or Gerlitz
  Cc: linux-rdma, Bart Van Assche, axboe, linux-scsi, Linux Kernel


>> On Fri, Nov 13, 2015 at 3:46 PM, Christoph Hellwig <hch@lst.de> wrote:
>>> The new name is irq_poll as iopoll is already taken.  Better suggestions
>>> welcome.
>>
>> Sagi (or Christoph if you can address that),
>>
>> @ some pointer over the last 18 months there was a port done at
>> mellanox for iser to use blk-iopoll and AFAIR it didn't work well or
>> didn't work at all. Can you tell now what was the problem and how did
>> you address it at your generalization?
>
> Hi Or,
>
> Sagi mentioned last time he tried a similar approach in iSER he saw
> some large latency sparks.  We've seen nothing worse than the original
> approach.  The Flash memory summit slide set has some numbers:
>
> http://www.flashmemorysummit.com/English/Collaterals/Proceedings/2015/20150811_FA11_Bandic.pdf
>
> they aren't quite up to date, but the latency distribution hasn't
> really changed.

Or is correct,

I attempted to convert iser to use blk_iopoll in the past, but I saw
inconsistent performance and latency skews (compared to the tasklets
iser uses today). This showed up in IOPS test cases where I ran multiple
threads at higher queue depth, not in sanitized pure-latency (QD=1) test
cases. Unfortunately I haven't had the time to pick it up since.

I do have every intention of testing it again with this series. If the
problem still exists we will need to find its root cause before
converting drivers to use it.

Sagi.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-15  9:04                   ` Or Gerlitz
  0 siblings, 0 replies; 140+ messages in thread
From: Or Gerlitz @ 2015-11-15  9:04 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, linux-rdma, Bart Van Assche, axboe,
	linux-scsi, Linux Kernel, Oren Duer

On Sun, Nov 15, 2015 at 10:48 AM, Sagi Grimberg
<sagig@dev.mellanox.co.il> wrote:
> Or is correct,
>
> I have attempted to convert iser to use blk_iopoll in the past, however
> I've seen inconsistent performance and latency skews (comparing to
> tasklets iser is using today). This was manifested in IOPs test cases
> where I ran multiple threads with higher queue-depth and not in
> sanitized pure latency (QD=1) test cases. Unfortunately I didn't have
> the time to pick it up since.
>
> I do have every intention of testing it again with this. If it still
> exist we will need to find the root-cause of it before converting
> drivers to use it.

Good. Either way (inconsistent performance and latency skews, or all
shines up) -- please let us know your findings, ideally by commenting on
the cover letter posts of V > 0 of this series.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 7/9] IB/iser: Use a dedicated descriptor for login
@ 2015-11-15  9:14       ` Or Gerlitz
  0 siblings, 0 replies; 140+ messages in thread
From: Or Gerlitz @ 2015-11-15  9:14 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel, Sagi Grimberg

On 11/13/2015 3:46 PM, Christoph Hellwig wrote:
> From: Sagi Grimberg<sagig@mellanox.com>
>
> Makes better sense and we'll need it later with CQ abstraction.
> iser switch login bufs to void

Sagi, a few quick comments on this patch -- please address them for the
next version.

The 2nd sentence of the change-log needs better phrasing.

There are also multiple checkpatch hits on the patch, please fix them:

CHECK: Please don't use multiple blank lines
#26: FILE: drivers/infiniband/ulp/iser/iscsi_iser.h:329:

+

WARNING: __packed is preferred over __attribute__((packed))
#42: FILE: drivers/infiniband/ulp/iser/iscsi_iser.h:345:
+} __attribute__((packed));

CHECK: Please don't use multiple blank lines
#44: FILE: drivers/infiniband/ulp/iser/iscsi_iser.h:347:
+
+

CHECK: Alignment should match open parenthesis
#161: FILE: drivers/infiniband/ulp/iser/iser_initiator.c:209:
+       if (ib_dma_mapping_error(device->ib_device,
+                               desc->req_dma))

CHECK: Alignment should match open parenthesis
#172: FILE: drivers/infiniband/ulp/iser/iser_initiator.c:220:
+       if (ib_dma_mapping_error(device->ib_device,
+                               desc->rsp_dma))
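
(The __packed one is a mechanical fix; a minimal sketch with made-up fields:)

#include <linux/compiler.h>
#include <linux/types.h>

struct example_hdr {        /* hypothetical, fields made up */
    __le32  opcode;
    __le32  length;
} __packed;                 /* instead of __attribute__((packed)) */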

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 9/9] IB/iser: Convert to CQ abstraction
@ 2015-11-15  9:21       ` Or Gerlitz
  0 siblings, 0 replies; 140+ messages in thread
From: Or Gerlitz @ 2015-11-15  9:21 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel, Sagi Grimberg

On 11/13/2015 3:46 PM, Christoph Hellwig wrote:
> From: Sagi Grimberg<sagig@mellanox.com>

Care to spare some text here to assist a reviewer and future bisections?!

I have asked multiple times to avoid empty change-logs for patches in 
this driver.

>
> Signed-off-by: Sagi Grimberg<sagig@mellanox.com>
> Signed-off-by: Christoph Hellwig<hch@lst.de>
> ---
>   drivers/infiniband/ulp/iser/iscsi_iser.h     |  68 ++++---
>   drivers/infiniband/ulp/iser/iser_initiator.c | 142 ++++++++++-----
>   drivers/infiniband/ulp/iser/iser_memory.c    |  21 ++-
>   drivers/infiniband/ulp/iser/iser_verbs.c     | 258 ++++++---------------------
>   4 files changed, 209 insertions(+), 280 deletions(-)


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-13 13:46 ` [PATCH 3/9] IB: add a helper to safely drain a QP Christoph Hellwig
       [not found]   ` <1447422410-20891-4-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
@ 2015-11-15  9:34   ` Sagi Grimberg
       [not found]     ` <564851BB.1020004-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  1 sibling, 1 reply; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-15  9:34 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel


> +
> +struct ib_stop_cqe {
> +	struct ib_cqe	cqe;
> +	struct completion done;
> +};
> +
> +static void ib_stop_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	struct ib_stop_cqe *stop =
> +		container_of(wc->wr_cqe, struct ib_stop_cqe, cqe);
> +
> +	complete(&stop->done);
> +}
> +
> +/*
> + * Change a queue pair into the error state and wait until all receive
> + * completions have been processed before destroying it. This avoids that
> + * the receive completion handler can access the queue pair while it is
> + * being destroyed.
> + */
> +void ib_drain_qp(struct ib_qp *qp)
> +{
> +	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> +	struct ib_stop_cqe stop = { };
> +	struct ib_recv_wr wr, *bad_wr;
> +	int ret;
> +
> +	wr.wr_cqe = &stop.cqe;
> +	stop.cqe.done = ib_stop_done;
> +	init_completion(&stop.done);
> +
> +	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +		return;
> +	}
> +
> +	ret = ib_post_recv(qp, &wr, &bad_wr);
> +	if (ret) {
> +		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> +		return;
> +	}
> +
> +	wait_for_completion(&stop.done);
> +}

This is taken from srp, which drains using a recv wr because of a race
causing a use-after-free condition in srp: it re-posts a recv buffer
from the recv completion handler. srp does not really care if there are
pending send flushes.

I'm not sure if there are ordering rules between the send and recv
queues in terms of flush completions; even if all recv flushes were
consumed, there may still be send flushes pending.

I think that for a general drain helper it would be useful to
make sure that both the recv _and_ send flushes were drained.

So, something like:

void ib_drain_qp(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_stop_cqe rstop, sstop;
	struct ib_recv_wr rwr = {}, *bad_rwr;
	struct ib_send_wr swr = {}, *bad_swr;
	int ret;

	rwr.wr_cqe = &rstop.cqe;
	rstop.cqe.done = ib_stop_done;
	init_completion(&rstop.done);

	swr.wr_cqe = &sstop.cqe;
	sstop.cqe.done = ib_stop_done;
	init_completion(&sstop.done);

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
		return;
	}

	ret = ib_post_recv(qp, &rwr, &bad_rwr);
	if (ret) {
		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
		return;
	}

	ret = ib_post_send(qp, &swr, &bad_swr);
	if (ret) {
		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
		return;
	}

	wait_for_completion(&rstop.done);
	wait_for_completion(&sstop.done);
}
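
For context, a rough sketch of how a ULP teardown path might use such a
helper (example_conn and the surrounding calls are hypothetical):

struct example_conn {           /* hypothetical ULP connection */
    struct rdma_cm_id   *cm_id;
    struct ib_qp        *qp;
};

static void example_conn_teardown(struct example_conn *conn)
{
    rdma_disconnect(conn->cm_id);   /* stop generating new work */
    ib_drain_qp(conn->qp);          /* wait for the flush completions */
    ib_destroy_qp(conn->qp);        /* safe: no handler touches the QP now */
}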

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-13 13:46 ` [PATCH 2/9] IB: add a proper completion queue abstraction Christoph Hellwig
@ 2015-11-15  9:40   ` Sagi Grimberg
       [not found]     ` <564852F2.5080602-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
       [not found]   ` <1447422410-20891-3-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  1 sibling, 1 reply; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-15  9:40 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel



> +/**
> + * ib_process_direct_cq - process a CQ in caller context
> + * @cq:		CQ to process
> + *
> + * This function is used to process all outstanding CQ entries on a
> + * %IB_POLL_DIRECT CQ.  It does not offload CQ processing to a different
> + * context and does not ask from completion interrupts from the HCA.
> + */
> +void ib_process_cq_direct(struct ib_cq *cq)
> +{
> +	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
> +
> +	__ib_process_cq(cq, INT_MAX);
> +}

I doubt INT_MAX is useful as a budget in any use case; it can easily
hog the CPU. If the consumer is given access to poll a CQ, it must be
able to provide some way to budget it. Why not expose a budget argument
to the consumer?
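
Something along these lines, purely as an illustration of the suggestion
(not the posted code):

/* illustrative variant: cap the work per call, tell the caller if more is left */
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
    WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);

    return __ib_process_cq(cq, budget);
}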

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-15 12:51                   ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-15 12:51 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Or Gerlitz, linux-rdma, Bart Van Assche,
	axboe, linux-scsi, Linux Kernel

On Sun, Nov 15, 2015 at 10:48:41AM +0200, Sagi Grimberg wrote:
> I have attempted to convert iser to use blk_iopoll in the past, however
> I've seen inconsistent performance and latency skews (comparing to
> tasklets iser is using today). This was manifested in IOPs test cases
> where I ran multiple threads with higher queue-depth and not in
> sanitized pure latency (QD=1) test cases. Unfortunately I didn't have
> the time to pick it up since.
>
> I do have every intention of testing it again with this. If it still
> exist we will need to find the root-cause of it before converting
> drivers to use it.

Thanks.  If you see issues like that with high IOPS and high load,
the next things to check are:

 a) increasing the budget (both in the main poll loop and the IB
    code)
 b) trying without the single-jiffy time limit in the main softirq handler

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-15 12:55         ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-15 12:55 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, linux-rdma, bart.vanassche, axboe, linux-scsi,
	linux-kernel

On Sun, Nov 15, 2015 at 11:40:02AM +0200, Sagi Grimberg wrote:
> I doubt INT_MAX is useful as a budget in any use-case. it can easily
> hog the CPU. If the consumer is given access to poll a CQ, it must be
> able to provide some way to budget it. Why not expose a budget argument
> to the consumer?

Because in theory we could have a lot of sends completing before
we finally need to reap them.  I think that's more of a theoretical
than a real issue.

My preference would be to simply kill this mode though.  Allocate an IU
to each block request in SRP and only use the free_tx list for task
management and AEN/req_limit calls.  Then we can use a single CQ
and mark the regular I/O requests as unsignalled.

AFAICS no other driver wants a polling mode similar to what the SRP
initiator does for its send queue.
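
For reference, "unsignalled" here means creating the QP with
sq_sig_type = IB_SIGNAL_REQ_WR and leaving IB_SEND_SIGNALED off the regular
I/O sends; a hedged sketch with a made-up IU structure (wr_cqe as introduced
by this series):

#include <rdma/ib_verbs.h>

struct example_iu {         /* hypothetical per-request IU */
    struct ib_cqe   cqe;
    struct ib_sge   sge;
};

/* assumes the QP was created with .sq_sig_type = IB_SIGNAL_REQ_WR */
static int example_post_unsignalled_send(struct ib_qp *qp, struct example_iu *iu)
{
    struct ib_send_wr wr = {
        .wr_cqe     = &iu->cqe,
        .sg_list    = &iu->sge,
        .num_sge    = 1,
        .opcode     = IB_WR_SEND,
        .send_flags = 0,            /* unsignalled: no send completion */
    };
    struct ib_send_wr *bad_wr;

    return ib_post_send(qp, &wr, &bad_wr);
}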

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-15 13:16                       ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-15 13:16 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Christoph Hellwig, linux-rdma, Bart Van Assche, axboe,
	linux-scsi, Linux Kernel, Oren Duer



On 15/11/2015 11:04, Or Gerlitz wrote:
> On Sun, Nov 15, 2015 at 10:48 AM, Sagi Grimberg
> <sagig@dev.mellanox.co.il> wrote:
>> Or is correct,
>>
>> I have attempted to convert iser to use blk_iopoll in the past, however
>> I've seen inconsistent performance and latency skews (comparing to
>> tasklets iser is using today). This was manifested in IOPs test cases
>> where I ran multiple threads with higher queue-depth and not in
>> sanitized pure latency (QD=1) test cases. Unfortunately I didn't have
>> the time to pick it up since.
>>
>> I do have every intention of testing it again with this. If it still
>> exist we will need to find the root-cause of it before converting
>> drivers to use it.
>
> Good, this way (inconsistent performance and latency skews) or another
> (all shines up) -- please
> let us know your findings, best through commenting within V > 0 the
> cover letter posts of this series
>

Hi Or & Co,

I ran some tests on the iser code with this patchset applied.
I can confirm that I did not see any performance degradation.
Summary (on my test servers):
1  LUN:   ~530K  (IOPs)
2  LUNs:  ~1080K (IOPs)
4  LUNs:  ~1350K (IOPs)
8  LUNs:  ~1930K (IOPs)
16 LUns:  ~2250K (IOPs)

These results hold both for the tasklet and for the iopoll code.

So I don't have anything smart to say here; the IO stack (block, scsi)
has gone through major changes since the last time I looked into this,
so it'll be pretty hard to figure out what the root cause was back then...

Sagi.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-15 13:21             ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-15 13:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-rdma, bart.vanassche, axboe, linux-scsi, linux-kernel



On 15/11/2015 14:55, Christoph Hellwig wrote:
> On Sun, Nov 15, 2015 at 11:40:02AM +0200, Sagi Grimberg wrote:
>> I doubt INT_MAX is useful as a budget in any use-case. it can easily
>> hog the CPU. If the consumer is given access to poll a CQ, it must be
>> able to provide some way to budget it. Why not expose a budget argument
>> to the consumer?
>
> Because in theory we could have a lot of sends completing before
> we finally need to reap them.  I think that's more of a theoretical
> than real issue.

Still, processing a CQ possibly forever is not something we'd want to
enable in an API; if a caller wants to do that anyway, it should loop
this call...
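
i.e. something like this on the caller side (illustrative; assumes a
budgeted, count-returning ib_process_cq_direct() as suggested earlier in
the thread):

/* illustrative only: drain in bounded chunks instead of one unbounded call */
static void example_drain_send_cq(struct ib_cq *cq)
{
    while (ib_process_cq_direct(cq, 64) > 0)
        cond_resched();
}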

>
> My preference would be to simply kill this mode though.  Allocate a IU
> to each block request in SRP and only use the free_tx list for task
> management and AEN/req_limit calls.  Then we can use a single CQ
> and mark the regular I/O requests as unsignalled.

It might be better. I'd say that we keep this API and let Bart decide
if he wants to do that in srp. If he wants to convert srp, we can
always drop it.

> AFAICS no other driver wants a similar polling mode as the SRP initiator
> does for it's send queue.

iser worked in this mode in the past. But we changed that.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-16 16:38         ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-16 16:38 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/15/2015 3:34 AM, Sagi Grimberg wrote:
>
>> +
>> +struct ib_stop_cqe {
>> +    struct ib_cqe    cqe;
>> +    struct completion done;
>> +};
>> +
>> +static void ib_stop_done(struct ib_cq *cq, struct ib_wc *wc)
>> +{
>> +    struct ib_stop_cqe *stop =
>> +        container_of(wc->wr_cqe, struct ib_stop_cqe, cqe);
>> +
>> +    complete(&stop->done);
>> +}
>> +
>> +/*
>> + * Change a queue pair into the error state and wait until all receive
>> + * completions have been processed before destroying it. This avoids 
>> that
>> + * the receive completion handler can access the queue pair while it is
>> + * being destroyed.
>> + */
>> +void ib_drain_qp(struct ib_qp *qp)
>> +{
>> +    struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>> +    struct ib_stop_cqe stop = { };
>> +    struct ib_recv_wr wr, *bad_wr;
>> +    int ret;
>> +
>> +    wr.wr_cqe = &stop.cqe;
>> +    stop.cqe.done = ib_stop_done;
>> +    init_completion(&stop.done);
>> +
>> +    ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>> +    if (ret) {
>> +        WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>> +        return;
>> +    }
>> +
>> +    ret = ib_post_recv(qp, &wr, &bad_wr);
>> +    if (ret) {
>> +        WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>> +        return;
>> +    }
>> +
>> +    wait_for_completion(&stop.done);
>> +}
>
> This is taken from srp, and srp drains using a recv wr due to a race
> causing a use-after-free condition in srp which re-posts a recv buffer
> in the recv completion handler. srp does not really care if there are
> pending send flushes.
>
> I'm not sure if there are ordering rules for send/recv queues in
> terms of flush completions, meaning that even if all recv flushes
> were consumed maybe there are send flushes still pending.
>
> I think that for a general drain helper it would be useful to
> make sure that both the recv _and_ send flushes were drained.
>
> So, something like:
>
> void ib_drain_qp(struct ib_qp *qp)
> {
>     struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
>     struct ib_stop_cqe rstop, sstop;
>     struct ib_recv_wr rwr = {}, *bad_rwr;
>     struct ib_send_wr swr = {}, *bad_swr;
>     int ret;
>
>     rwr.wr_cqe = &rstop.cqe;
>     rstop.cqe.done = ib_stop_done;
>     init_completion(&rstop.done);
>
>     swr.wr_cqe = &sstop.cqe;
>     sstop.cqe.done = ib_stop_done;
>     init_completion(&sstop.done);
>
>     ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>     if (ret) {
>         WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>         return;
>     }
>
>     ret = ib_post_recv(qp, &rwr, &bad_rwr);
>     if (ret) {
>         WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
>         return;
>     }
>
>     ret = ib_post_send(qp, &swr, &bad_swr);
>     if (ret) {
>         WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
>         return;
>     }
>
>     wait_for_completion(&rstop.done);
>     wait_for_completion(&sstop.done);
> }
>
> Thoughts?

This won't work for iWARP as per my previous email.  But I will code 
something up that will.

Steve

^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-16 18:30             ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-16 18:30 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel



> -----Original Message-----
> From: Steve Wise [mailto:swise@opengridcomputing.com]
> Sent: Monday, November 16, 2015 10:38 AM
> To: Sagi Grimberg; Christoph Hellwig; linux-rdma@vger.kernel.org
> Cc: bart.vanassche@sandisk.com; axboe@fb.com; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> On 11/15/2015 3:34 AM, Sagi Grimberg wrote:
> >
> >> +
> >> +struct ib_stop_cqe {
> >> +    struct ib_cqe    cqe;
> >> +    struct completion done;
> >> +};
> >> +
> >> +static void ib_stop_done(struct ib_cq *cq, struct ib_wc *wc)
> >> +{
> >> +    struct ib_stop_cqe *stop =
> >> +        container_of(wc->wr_cqe, struct ib_stop_cqe, cqe);
> >> +
> >> +    complete(&stop->done);
> >> +}
> >> +
> >> +/*
> >> + * Change a queue pair into the error state and wait until all receive
> >> + * completions have been processed before destroying it. This avoids
> >> that
> >> + * the receive completion handler can access the queue pair while it is
> >> + * being destroyed.
> >> + */
> >> +void ib_drain_qp(struct ib_qp *qp)
> >> +{
> >> +    struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> >> +    struct ib_stop_cqe stop = { };
> >> +    struct ib_recv_wr wr, *bad_wr;
> >> +    int ret;
> >> +
> >> +    wr.wr_cqe = &stop.cqe;
> >> +    stop.cqe.done = ib_stop_done;
> >> +    init_completion(&stop.done);
> >> +
> >> +    ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> >> +    if (ret) {
> >> +        WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> >> +        return;
> >> +    }
> >> +
> >> +    ret = ib_post_recv(qp, &wr, &bad_wr);
> >> +    if (ret) {
> >> +        WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> >> +        return;
> >> +    }
> >> +
> >> +    wait_for_completion(&stop.done);
> >> +}
> >
> > This is taken from srp, and srp drains using a recv wr due to a race
> > causing a use-after-free condition in srp which re-posts a recv buffer
> > in the recv completion handler. srp does not really care if there are
> > pending send flushes.
> >
> > I'm not sure if there are ordering rules for send/recv queues in
> > terms of flush completions, meaning that even if all recv flushes
> > were consumed maybe there are send flushes still pending.
> >
> > I think that for a general drain helper it would be useful to
> > make sure that both the recv _and_ send flushes were drained.
> >
> > So, something like:
> >
> > void ib_drain_qp(struct ib_qp *qp)
> > {
> >     struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> >     struct ib_stop_cqe rstop, sstop;
> >     struct ib_recv_wr rwr = {}, *bad_rwr;
> >     struct ib_send_wr swr = {}, *bad_swr;
> >     int ret;
> >
> >     rwr.wr_cqe = &rstop.cqe;
> >     rstop.cqe.done = ib_stop_done;
> >     init_completion(&rstop.done);
> >
> >     swr.wr_cqe = &sstop.cqe;
> >     sstop.cqe.done = ib_stop_done;
> >     init_completion(&sstop.done);
> >
> >     ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
> >     if (ret) {
> >         WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
> >         return;
> >     }
> >
> >     ret = ib_post_recv(qp, &rwr, &bad_rwr);
> >     if (ret) {
> >         WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
> >         return;
> >     }
> >
> >     ret = ib_post_send(qp, &swr, &bad_swr);
> >     if (ret) {
> >         WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
> >         return;
> >     }
> >
> >     wait_for_completion(&rstop.done);
> >     wait_for_completion(&sstop.done);
> > }
> >
> > Thoughts?
> 
> This won't work for iWARP as per my previous email.  But I will code
> something up that will.
> 
> Steve

After looking at the nes driver, I don't see any common way to support drain w/o some serious driver mods.  Since SRP is the only
user, perhaps we can ignore iWARP for this function...


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-16 18:30             ` Steve Wise
  (?)
@ 2015-11-16 18:37             ` Sagi Grimberg
       [not found]               ` <564A2270.1040004-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  -1 siblings, 1 reply; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-16 18:37 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel


> After looking at the nes driver, I don't see any common way to support drain w/o some serious driver mods.  Since SRP is the only
> user, perhaps we can ignore iWARP for this function...

But iser/isert essentially does it too (and I think xprtrdma will have
it soon)...

the modify_qp is invoked from rdma_disconnect() and we do post
an 'empty' wr to wait for all the flushes to drain (see
iser_conn_terminate).

^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-16 18:37             ` Sagi Grimberg
@ 2015-11-16 19:03                   ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-16 19:03 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA



> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Sagi Grimberg
> Sent: Monday, November 16, 2015 12:38 PM
> To: Steve Wise; 'Christoph Hellwig'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org; axboe-b10kYP2dOMg@public.gmane.org; linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> 
> > After looking at the nes driver, I don't see any common way to support drain w/o some serious driver mods.  Since SRP is the
only
> > user, perhaps we can ignore iWARP for this function...
> 
> But iser/isert essentially does it too (and I think xprtrdma will have
> it soon)...
> 
> the modify_qp is invoked from rdma_disconnect() and we do post
> an 'empty' wr to wait for all the flushes to drain (see
> iser_conn_terminate).

That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
cxgb4 with the old iwarp support patches.   

Perhaps we need another way to do this?  Like a completion object in the QP that gets triggered when the SQ and RQ become empty
after a transition to ERROR (and CLOSING for iwarp).  Then a core service that just waits until the QP is empty.  Implementation of
this design would hit the providers though since only they know when the flush is completed.

Alternatively, I could enable post-while-in-error support in cxgb4 and ignore the spec in this regard.  But I'd rather not do that.
:)

Steve.


^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-16 19:03                   ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-16 19:03 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel



> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-owner@vger.kernel.org] On Behalf Of Sagi Grimberg
> Sent: Monday, November 16, 2015 12:38 PM
> To: Steve Wise; 'Christoph Hellwig'; linux-rdma@vger.kernel.org
> Cc: bart.vanassche@sandisk.com; axboe@fb.com; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> 
> > After looking at the nes driver, I don't see any common way to support drain w/o some serious driver mods.  Since SRP is the
only
> > user, perhaps we can ignore iWARP for this function...
> 
> But iser/isert essentially does it too (and I think xprtrdma will have
> it soon)...
> 
> the modify_qp is invoked from rdma_disconnect() and we do post
> an 'empty' wr to wait for all the flushes to drain (see
> iser_conn_terminate).

That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
cxgb4 with the old iwarp support patches.   

Perhaps we need another way to do this?  Like a completion object in the QP that gets triggered when the SQ and RQ become empty
after a transition to ERROR (and CLOSING for iwarp).  Then a core service that just waits until the QP is empty.  Implementation of
this design would hit the providers though since only they know when the flush is completed.
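A rough sketch of that shape (all names here are made up, nothing like this exists in the core today):

	/*
	 * Rough sketch only: 'flush_done' would be a new field in struct ib_qp,
	 * completed by the provider once both SQ and RQ flushing has finished.
	 */
	void ib_wait_qp_empty(struct ib_qp *qp)
	{
		struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

		ib_modify_qp(qp, &attr, IB_QP_STATE);
		wait_for_completion(&qp->flush_done);	/* signalled by the provider */
	}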

Alternatively, I could enable post-while-in-error support in cxgb4 and ignore the spec in this regard.  But I'd rather not do that.
:)

Steve.


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-16 19:03                   ` Steve Wise
@ 2015-11-17  8:54                     ` Sagi Grimberg
  -1 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-17  8:54 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA


> That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
> cxgb4 with the old iwarp support patches.

It's been there since ~3.17, I think...

>
> Perhaps we need another way to do this?  Like a completion object in the QP that gets triggered when the SQ and RQ become empty
> after a transition to ERROR (and CLOSING for iwarp).  Then a core service that just waits until the QP is empty.  Implementation of
> this design would hit the providers though since only they know when the flush is completed.

ULPs need a drain functionality, so ib_drain_qp() is the way to go...

How about we add a drain_qp() callout and have:

	if (qp->device->drain_qp) {
		qp->device->drain_qp(qp);
		return;
	}

	IB drain qp logic...

This way iWARP devices can have their own magic on how to implement this
functionality.

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-17  8:54                     ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-17  8:54 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel


> That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
> cxgb4 with the old iwarp support patches.

It's been there since ~3.17, I think...

>
> Perhaps we need another way to do this?  Like a completion object in the QP that gets triggered when the SQ and RQ become empty
> after a transition to ERROR (and CLOSING for iwarp).  Then a core service that just waits until the QP is empty.  Implementation of
> this design would hit the providers though since only they know when the flush is completed.

ULPs need a drain functionality, so ib_drain_qp() is the way to go...

How about we add a drain_qp() callout and have:

	if (qp->device->drain_qp) {
		qp->device->drain_qp(qp);
		return;
	}

	IB drain qp logic...

This way iWARP devices can have their own magic on how to implement this
functionality.
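
In other words, something like this (sketch only; the drain_qp callout and the
__ib_drain_qp helper are hypothetical names, not existing API):

	void ib_drain_qp(struct ib_qp *qp)
	{
		/* provider-specific drain (e.g. for iWARP devices) */
		if (qp->device->drain_qp) {
			qp->device->drain_qp(qp);
			return;
		}

		/*
		 * Generic IB path: move the QP to the error state, post marker
		 * WRs on the send and recv queues and wait for their flush
		 * completions (roughly the code earlier in this thread).
		 */
		__ib_drain_qp(qp);
	}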

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-15  9:34   ` Sagi Grimberg
@ 2015-11-17 17:06         ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:06 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 11/15/2015 01:34 AM, Sagi Grimberg wrote:
> This is taken from srp, and srp drains using a recv wr due to a race
> causing a use-after-free condition in srp which re-posts a recv buffer
> in the recv completion handler.

Hello Sagi,

Would it be possible to clarify this ? Does this refer to an existing 
race or a race that would only occur if the code would be modified ?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-17 17:06         ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:06 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/15/2015 01:34 AM, Sagi Grimberg wrote:
> This is taken from srp, and srp drains using a recv wr due to a race
> causing a use-after-free condition in srp which re-posts a recv buffer
> in the recv completion handler.

Hello Sagi,

Would it be possible to clarify this ? Does this refer to an existing 
race or a race that would only occur if the code would be modified ?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
  2015-11-14  7:02       ` Christoph Hellwig
@ 2015-11-17 17:16           ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:16 UTC (permalink / raw)
  To: Christoph Hellwig, Bart Van Assche
  Cc: linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On 11/13/2015 11:02 PM, Christoph Hellwig wrote:
> On Fri, Nov 13, 2015 at 11:19:24AM -0800, Bart Van Assche wrote:
>> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>>> The new name is irq_poll as iopoll is already taken.  Better suggestions
>>> welcome.
>>
>> Would it be possible to provide more background information about this ?
>> Which other kernel subsystem is using the name iopoll ?
>
> Take a look at include/linux/iopoll.h  - I can't really make much sense
> of it to be honest, but it's used in quite a few places.

How about renaming blk_iopoll into blk_poll ? That way the name still 
refers to the block layer. And although the current implementation 
performs polling from IRQ context future implementations maybe will 
allow polling from thread context.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
@ 2015-11-17 17:16           ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:16 UTC (permalink / raw)
  To: Christoph Hellwig, Bart Van Assche
  Cc: linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On 11/13/2015 11:02 PM, Christoph Hellwig wrote:
> On Fri, Nov 13, 2015 at 11:19:24AM -0800, Bart Van Assche wrote:
>> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>>> The new name is irq_poll as iopoll is already taken.  Better suggestions
>>> welcome.
>>
>> Would it be possible to provide more background information about this ?
>> Which other kernel subsystem is using the name iopoll ?
>
> Take a look at include/linux/iopoll.h  - I can't really make much sense
> of it to be honest, but it's used in quite a few places.

How about renaming blk_iopoll into blk_poll ? That way the name still 
refers to the block layer. And although the current implementation 
performs polling from IRQ context future implementations maybe will 
allow polling from thread context.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
  2015-11-17 17:16           ` Bart Van Assche
  (?)
@ 2015-11-17 17:27           ` Bart Van Assche
  -1 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On 11/17/2015 09:16 AM, Bart Van Assche wrote:
> On 11/13/2015 11:02 PM, Christoph Hellwig wrote:
>> On Fri, Nov 13, 2015 at 11:19:24AM -0800, Bart Van Assche wrote:
>>> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>>>> The new name is irq_poll as iopoll is already taken.  Better suggestions
>>>> welcome.
>>>
>>> Would it be possible to provide more background information about this ?
>>> Which other kernel subsystem is using the name iopoll ?
>>
>> Take a look at include/linux/iopoll.h  - I can't really make much sense
>> of it to be honest, but it's used in quite a few places.
>
> How about renaming blk_iopoll into blk_poll ? That way the name still
> refers to the block layer. And although the current implementation
> performs polling from IRQ context future implementations maybe will
> allow polling from thread context.

(replying to my own e-mail)

Please ignore the previous comment - I just noticed that this mechanism 
is not limited to block devices.

Bart.


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-13 13:46 ` [PATCH 2/9] IB: add a proper completion queue abstraction Christoph Hellwig
@ 2015-11-17 17:52       ` Bart Van Assche
       [not found]   ` <1447422410-20891-3-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
  1 sibling, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:52 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb,
	bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> + * context and does not ask from completion interrupts from the HCA.
                                ^^^^
Should this perhaps be changed into "for" ?

> + */
> +void ib_process_cq_direct(struct ib_cq *cq)
> +{
> +	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
> +
> +	__ib_process_cq(cq, INT_MAX);
> +}
> +EXPORT_SYMBOL(ib_process_cq_direct);

My proposal is to drop this function and to export __ib_process_cq() 
instead (with or without renaming). That will allow callers of this 
function to compare the poll budget with the number of completions that 
have been processed and use that information to decide whether or not to 
call this function again.

> +static void ib_cq_poll_work(struct work_struct *work)
> +{
> +	struct ib_cq *cq = container_of(work, struct ib_cq, work);
> +	int completed;
> +
> +	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
> +	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
> +	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +		queue_work(ib_comp_wq, &cq->work);
> +}
> +
> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> +{
> +	queue_work(ib_comp_wq, &cq->work);
> +}

The above code will cause all polling to occur on the context of the CPU 
that received the completion interrupt. This approach is not powerful 
enough. For certain workloads throughput is higher if work completions 
are processed by another CPU core on the same CPU socket. Has it been 
considered to make the CPU core on which work completions are processed 
configurable ?

> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
> index 62b6cba..3027824 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.c
> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
> @@ -457,10 +457,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
>   static void srp_destroy_qp(struct srp_rdma_ch *ch)
>   {
>   	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> -	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
> +	static struct ib_recv_wr wr = { 0 };
>   	struct ib_recv_wr *bad_wr;
>   	int ret;

Since the 'wr' structure is static I don't think it needs to be 
zero-initialized explicitly.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-17 17:52       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 17:52 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> + * context and does not ask from completion interrupts from the HCA.
                                ^^^^
Should this perhaps be changed into "for" ?

> + */
> +void ib_process_cq_direct(struct ib_cq *cq)
> +{
> +	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
> +
> +	__ib_process_cq(cq, INT_MAX);
> +}
> +EXPORT_SYMBOL(ib_process_cq_direct);

My proposal is to drop this function and to export __ib_process_cq() 
instead (with or without renaming). That will allow callers of this 
function to compare the poll budget with the number of completions that 
have been processed and use that information to decide whether or not to 
call this function again.
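
For instance, a consumer could then poll until the CQ runs dry (sketch only;
the name and the budget/return-value contract are as proposed above, not an
existing export):

	static void ulp_poll_until_empty(struct ib_cq *cq, int budget)
	{
		int completed;

		do {
			completed = __ib_process_cq(cq, budget);
			/* per-batch bookkeeping could go here */
		} while (completed >= budget);
	}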

> +static void ib_cq_poll_work(struct work_struct *work)
> +{
> +	struct ib_cq *cq = container_of(work, struct ib_cq, work);
> +	int completed;
> +
> +	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
> +	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
> +	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +		queue_work(ib_comp_wq, &cq->work);
> +}
> +
> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> +{
> +	queue_work(ib_comp_wq, &cq->work);
> +}

The above code will cause all polling to occur on the context of the CPU 
that received the completion interrupt. This approach is not powerful 
enough. For certain workloads throughput is higher if work completions 
are processed by another CPU core on the same CPU socket. Has it been 
considered to make the CPU core on which work completions are processed 
configurable ?
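
One possible shape for such a knob (purely illustrative; comp_cpu would be a
new, configurable per-CQ field that does not exist in this patch):

	static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
	{
		/* Illustrative only: comp_cpu is a hypothetical setting. */
		if (cq->comp_cpu >= 0)
			queue_work_on(cq->comp_cpu, ib_comp_wq, &cq->work);
		else
			queue_work(ib_comp_wq, &cq->work);
	}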

> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
> index 62b6cba..3027824 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.c
> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
> @@ -457,10 +457,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
>   static void srp_destroy_qp(struct srp_rdma_ch *ch)
>   {
>   	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
> -	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
> +	static struct ib_recv_wr wr = { 0 };
>   	struct ib_recv_wr *bad_wr;
>   	int ret;

Since the 'wr' structure is static I don't think it needs to be 
zero-initialized explicitly.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 5/9] srpt: use the new CQ API
  2015-11-13 13:46 ` [PATCH 5/9] srpt: use the new CQ API Christoph Hellwig
@ 2015-11-17 18:22       ` Bart Van Assche
  2015-11-17 19:38     ` Bart Van Assche
  1 sibling, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 18:22 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb,
	bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
 > [ ... ]

The previous patch and this patch look like great work to me. However, 
this patch not only reworks the SRP target driver but also prevents 
users from moving the SRP completion thread to a CPU core other than the 
one that processes the completion interrupts (with the help of e.g. the 
taskset command). Hence my request to make that CPU core configurable in 
the second patch of this patch series.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 5/9] srpt: use the new CQ API
@ 2015-11-17 18:22       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 18:22 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
 > [ ... ]

The previous patch and this patch look like great work to me. However, 
this patch not only reworks the SRP target driver but also prevents 
users from moving the SRP completion thread to a CPU core other than the 
one that processes the completion interrupts (with the help of e.g. the 
taskset command). Hence my request to make that CPU core configurable in 
the second patch of this patch series.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 5/9] srpt: use the new CQ API
  2015-11-13 13:46 ` [PATCH 5/9] srpt: use the new CQ API Christoph Hellwig
@ 2015-11-17 19:38     ` Bart Van Assche
  2015-11-17 19:38     ` Bart Van Assche
  1 sibling, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 19:38 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> [ ... ]

This patch contains two logical changes:
- Conversion to the new CQ API.
- Removal of the ib_srpt_compl thread.

Had it been considered to implement these changes as two separate patches ?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 5/9] srpt: use the new CQ API
@ 2015-11-17 19:38     ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 19:38 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> [ ... ]

This patch contains two logical changes:
- Conversion to the new CQ API.
- Removal of the ib_srpt_compl thread.

Had it been considered to implement these changes as two separate patches ?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 6/9] srp: use the new CQ API
  2015-11-13 13:46 ` [PATCH 6/9] srp: " Christoph Hellwig
@ 2015-11-17 19:56       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 19:56 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb,
	bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> +static void srp_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	srp_handle_qp_err(cq, wc, "INV RKEY");
> +}
 >
> [ ... ]
 >
> +static void srp_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	srp_handle_qp_err(cq, wc, "FAST REG");
> +}

How about using names like srp_inv_rkey_err() and srp_reg_mr_err() to 
make clear that these completion functions are only called if an error 
occurred ?

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 6/9] srp: use the new CQ API
@ 2015-11-17 19:56       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-17 19:56 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> +static void srp_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	srp_handle_qp_err(cq, wc, "INV RKEY");
> +}
 >
> [ ... ]
 >
> +static void srp_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
> +{
> +	srp_handle_qp_err(cq, wc, "FAST REG");
> +}

How about using names like srp_inv_rkey_err() and srp_reg_mr_err() to 
make clear that these completion functions are only called if an error 
occurred ?

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
  2015-11-13 13:46     ` Christoph Hellwig
@ 2015-11-18  1:17       ` Bart Van Assche
  -1 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-18  1:17 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> -		ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
> -		if (ret)
> -			break;
> +		if (i == n_rdma - 1) {
> +			/* only get completion event for the last rdma read */
> +			if (dir == DMA_TO_DEVICE)
> +				wr->wr.send_flags = IB_SEND_SIGNALED;
> +			wr->wr.next = NULL;
> +		} else {
> +			wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
> +		}
>   	}
>
> +	ret = ib_post_send(ch->qp, &ioctx->rdma_ius->wr, &bad_wr);
>   	if (ret)
>   		pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
>   				 __func__, __LINE__, ret, i, n_rdma);

Hello Christoph,

Chaining RDMA requests is a great idea. But it seems to me that this 
patch is based on the assumption that posting multiple RDMA requests 
either succeeds as a whole or fails as a whole. Sorry but I'm not sure 
that the verbs API guarantees this. In the ib_srpt driver a QP can be 
changed at any time into the error state and there might be drivers that 
report an immediate failure in that case. I think that even when chaining 
RDMA requests we still need a mechanism to wait until ongoing RDMA 
transfers have finished if some but not all RDMA requests have been posted.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
@ 2015-11-18  1:17       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-18  1:17 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
> -		ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
> -		if (ret)
> -			break;
> +		if (i == n_rdma - 1) {
> +			/* only get completion event for the last rdma read */
> +			if (dir == DMA_TO_DEVICE)
> +				wr->wr.send_flags = IB_SEND_SIGNALED;
> +			wr->wr.next = NULL;
> +		} else {
> +			wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
> +		}
>   	}
>
> +	ret = ib_post_send(ch->qp, &ioctx->rdma_ius->wr, &bad_wr);
>   	if (ret)
>   		pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
>   				 __func__, __LINE__, ret, i, n_rdma);

Hello Christoph,

Chaining RDMA requests is a great idea. But it seems to me that this 
patch is based on the assumption that posting multiple RDMA requests 
either succeeds as a whole or fails as a whole. Sorry but I'm not sure 
that the verbs API guarantees this. In the ib_srpt driver a QP can be 
changed at any time into the error state and there might be drivers that 
report an immediate failure in that case. I think that even when chaining 
RDMA requests we still need a mechanism to wait until ongoing RDMA 
transfers have finished if some but not all RDMA requests have been posted.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-17 17:52       ` Bart Van Assche
@ 2015-11-18  7:55           ` Sagi Grimberg
  -1 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18  7:55 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: axboe-b10kYP2dOMg, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Hi Bart,

>> + */
>> +void ib_process_cq_direct(struct ib_cq *cq)
>> +{
>> +    WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
>> +
>> +    __ib_process_cq(cq, INT_MAX);
>> +}
>> +EXPORT_SYMBOL(ib_process_cq_direct);
>
> My proposal is to drop this function and to export __ib_process_cq()
> instead (with or without renaming). That will allow callers of this
> function to compare the poll budget with the number of completions that
> have been processed and use that information to decide whether or not to
> call this function again.

I agree with that.

>
>> +static void ib_cq_poll_work(struct work_struct *work)
>> +{
>> +    struct ib_cq *cq = container_of(work, struct ib_cq, work);
>> +    int completed;
>> +
>> +    completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
>> +    if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>> +        ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>> +        queue_work(ib_comp_wq, &cq->work);
>> +}
>> +
>> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>> +{
>> +    queue_work(ib_comp_wq, &cq->work);
>> +}
>
> The above code will cause all polling to occur on the context of the CPU
> that received the completion interrupt. This approach is not powerful
> enough. For certain workloads throughput is higher if work completions
> are processed by another CPU core on the same CPU socket. Has it been
> considered to make the CPU core on which work completions are processed
> configurable ?

The workqueue is unbound. This means that the functionality you are
asking for already exists.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-18  7:55           ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18  7:55 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig, linux-rdma
  Cc: axboe, linux-scsi, linux-kernel

Hi Bart,

>> + */
>> +void ib_process_cq_direct(struct ib_cq *cq)
>> +{
>> +    WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
>> +
>> +    __ib_process_cq(cq, INT_MAX);
>> +}
>> +EXPORT_SYMBOL(ib_process_cq_direct);
>
> My proposal is to drop this function and to export __ib_process_cq()
> instead (with or without renaming). That will allow callers of this
> function to compare the poll budget with the number of completions that
> have been processed and use that information to decide whether or not to
> call this function again.

I agree with that.

>
>> +static void ib_cq_poll_work(struct work_struct *work)
>> +{
>> +    struct ib_cq *cq = container_of(work, struct ib_cq, work);
>> +    int completed;
>> +
>> +    completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
>> +    if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>> +        ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>> +        queue_work(ib_comp_wq, &cq->work);
>> +}
>> +
>> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>> +{
>> +    queue_work(ib_comp_wq, &cq->work);
>> +}
>
> The above code will cause all polling to occur on the context of the CPU
> that received the completion interrupt. This approach is not powerful
> enough. For certain workloads throughput is higher if work completions
> are processed by another CPU core on the same CPU socket. Has it been
> considered to make the CPU core on which work completions are processed
> configurable ?

The workqueue is unbound. This means that the functionality you are
asking for already exists.
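
(For reference, an unbound workqueue is allocated roughly along these lines;
the exact flags shown here are illustrative and may differ from the patch:)

	ib_comp_wq = alloc_workqueue("ib-comp-wq",
			WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
	if (!ib_comp_wq)
		return -ENOMEM;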

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-17 17:06         ` Bart Van Assche
@ 2015-11-18  7:59             ` Sagi Grimberg
  -1 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18  7:59 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: axboe-b10kYP2dOMg, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA



On 17/11/2015 19:06, Bart Van Assche wrote:
> On 11/15/2015 01:34 AM, Sagi Grimberg wrote:
>> This is taken from srp, and srp drains using a recv wr due to a race
>> causing a use-after-free condition in srp which re-posts a recv buffer
>> in the recv completion handler.
>
> Hello Sagi,
>
> Would it be possible to clarify this ? Does this refer to an existing
> race or a race that would only occur if the code would be modified ?

I was referring to a bug that srp_destroy_qp() was designed to
address:

commit 7dad6b2e440d810273946b0e7092a8fe043c3b8a
Author: Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>
Date:   Tue Oct 21 18:00:35 2014 +0200

     IB/srp: Fix a race condition triggered by destroying a queue pair

     At least LID reassignment can trigger a race condition in the SRP
     initiator driver, namely the receive completion handler trying to
     post a request on a QP during or after QP destruction and before
     the CQ's have been destroyed. Avoid this race by modifying a QP
     into the error state and by waiting until all receive completions
     have been processed before destroying a QP.

     Reported-by: Max Gurtuvoy <maxg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
     Signed-off-by: Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>
     Reviewed-by: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
     Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-18  7:59             ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18  7:59 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig, linux-rdma
  Cc: axboe, linux-scsi, linux-kernel



On 17/11/2015 19:06, Bart Van Assche wrote:
> On 11/15/2015 01:34 AM, Sagi Grimberg wrote:
>> This is taken from srp, and srp drains using a recv wr due to a race
>> causing a use-after-free condition in srp which re-posts a recv buffer
>> in the recv completion handler.
>
> Hello Sagi,
>
> Would it be possible to clarify this ? Does this refer to an existing
> race or a race that would only occur if the code would be modified ?

I was referring to a bug that srp_destroy_qp() was designed to
address:

commit 7dad6b2e440d810273946b0e7092a8fe043c3b8a
Author: Bart Van Assche <bvanassche@acm.org>
Date:   Tue Oct 21 18:00:35 2014 +0200

     IB/srp: Fix a race condition triggered by destroying a queue pair

     At least LID reassignment can trigger a race condition in the SRP
     initiator driver, namely the receive completion handler trying to
     post a request on a QP during or after QP destruction and before
     the CQ's have been destroyed. Avoid this race by modifying a QP
     into the error state and by waiting until all receive completions
     have been processed before destroying a QP.

     Reported-by: Max Gurtuvoy <maxg@mellanox.com>
     Signed-off-by: Bart Van Assche <bvanassche@acm.org>
     Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
     Signed-off-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
  2015-11-18  1:17       ` Bart Van Assche
  (?)
@ 2015-11-18  9:15       ` Sagi Grimberg
  2015-11-18 16:32           ` Bart Van Assche
  -1 siblings, 1 reply; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18  9:15 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig, linux-rdma
  Cc: axboe, linux-scsi, linux-kernel



On 18/11/2015 03:17, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> -        ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
>> -        if (ret)
>> -            break;
>> +        if (i == n_rdma - 1) {
>> +            /* only get completion event for the last rdma read */
>> +            if (dir == DMA_TO_DEVICE)
>> +                wr->wr.send_flags = IB_SEND_SIGNALED;
>> +            wr->wr.next = NULL;
>> +        } else {
>> +            wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
>> +        }
>>       }
>>
>> +    ret = ib_post_send(ch->qp, &ioctx->rdma_ius->wr, &bad_wr);
>>       if (ret)
>>           pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
>>                    __func__, __LINE__, ret, i, n_rdma);
>
> Hello Christoph,

Hi Bart,

>
> Chaining RDMA requests is a great idea. But it seems to me that this
> patch is based on the assumption that posting multiple RDMA requests
> either succeeds as a whole or fails as a whole. Sorry but I'm not sure
> that the verbs API guarantees this. In the ib_srpt driver a QP can be
> changed at any time into the error state and there might be drivers that
> report an immediate failure in that case.

I'm not so sure it actually matters if some WRs succeeded. In the normal
flow, when srpt has enough available work requests (sq_wr_avail), they
should all succeed; otherwise we're in trouble. If the QP transitioned
to the ERROR state, then some failed, but those that succeeded will
generate flush completions, and srpt should handle that correctly,
shouldn't it?
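
I.e. the completion handler for the last (signalled) WR still runs in that
case, just with an error status. A sketch of what handling that might look
like (the handler name is illustrative):

	static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		/*
		 * A failed or partially posted chain still surfaces here,
		 * either with the real error status or with IB_WC_WR_FLUSH_ERR
		 * once the QP has been moved to the error state.
		 */
		if (unlikely(wc->status != IB_WC_SUCCESS)) {
			/* abort the command, release credits, etc. */
			return;
		}

		/* all chained RDMA READs completed, continue processing */
	}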

> I think that even when chaining
> RDMA requests we still need a mechanism to wait until ongoing RDMA
> transfers have finished if some but not all RDMA requests have been posted.

I'm not an expert on srpt, can you explain how this mechanism will help?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-13 13:46 ` [PATCH 3/9] IB: add a helper to safely drain a QP Christoph Hellwig
@ 2015-11-18 11:32       ` Sagi Grimberg
  2015-11-15  9:34   ` Sagi Grimberg
  1 sibling, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18 11:32 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Christoph,

Given the discussion around this patch I think it would
be a good idea remove it from the patchset since it's not
mandatory for the CQ abstraction. I think that we should
take it with Steve to come up with a complete solution for
this bit.

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-18 11:32       ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-18 11:32 UTC (permalink / raw)
  To: Christoph Hellwig, linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel

Christoph,

Given the discussion around this patch I think it would
be a good idea remove it from the patchset since it's not
mandatory for the CQ abstraction. I think that we should
take it with Steve to come up with a complete solution for
this bit.

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 1/9] move blk_iopoll to limit and make it generally available
  2015-11-17 17:16           ` Bart Van Assche
  (?)
  (?)
@ 2015-11-18 13:58           ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 13:58 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Tue, Nov 17, 2015 at 09:16:28AM -0800, Bart Van Assche wrote:
> How about renaming blk_iopoll into blk_poll ? That way the name still 
> refers to the block layer. And although the current implementation performs 
> polling from IRQ context future implementations maybe will allow polling 
> from thread context.

Well, the point is that there is no block layer specific code in there,
and the RDMA stack isn't really part of the block layer either.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-17 17:52       ` Bart Van Assche
@ 2015-11-18 14:00           ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:00 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Tue, Nov 17, 2015 at 09:52:58AM -0800, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> + * context and does not ask from completion interrupts from the HCA.
>                                ^^^^
> Should this perhaps be changed into "for" ?

Yes.

>
>> + */
>> +void ib_process_cq_direct(struct ib_cq *cq)
>> +{
>> +	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
>> +
>> +	__ib_process_cq(cq, INT_MAX);
>> +}
>> +EXPORT_SYMBOL(ib_process_cq_direct);
>
> My proposal is to drop this function and to export __ib_process_cq() 
> instead (with or without renaming). That will allow callers of this 
> function to compare the poll budget with the number of completions that 
> have been processed and use that information to decide whether or not to 
> call this function again.

I'd like to keep the WARN_ON, but we can export the same signature.

Then again my preference would be to remove the direct mode entirely.

>> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>> +{
>> +	queue_work(ib_comp_wq, &cq->work);
>> +}
>
> The above code will cause all polling to occur on the context of the CPU 
> that received the completion interrupt. This approach is not powerful 
> enough. For certain workloads throughput is higher if work completions are 
> processed by another CPU core on the same CPU socket. Has it been 
> considered to make the CPU core on which work completions are processed 
> configurable ?

It's an unbound workqueue, so it's not tied to a specific CPU.  However
we'll only run the work_struct once, so it's still tied to a single CPU
at a time, but that's not different from the kthread use previously.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-18 14:00           ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:00 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Tue, Nov 17, 2015 at 09:52:58AM -0800, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> + * context and does not ask from completion interrupts from the HCA.
>                                ^^^^
> Should this perhaps be changed into "for" ?

Yes.

>
>> + */
>> +void ib_process_cq_direct(struct ib_cq *cq)
>> +{
>> +	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
>> +
>> +	__ib_process_cq(cq, INT_MAX);
>> +}
>> +EXPORT_SYMBOL(ib_process_cq_direct);
>
> My proposal is to drop this function and to export __ib_process_cq() 
> instead (with or without renaming). That will allow callers of this 
> function to compare the poll budget with the number of completions that 
> have been processed and use that information to decide whether or not to 
> call this function again.

I'd like to keep the WARN_ON, but we can export the same signature.

Then again my preference would be to remove the direct mode entirely.

>> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>> +{
>> +	queue_work(ib_comp_wq, &cq->work);
>> +}
>
> The above code will cause all polling to occur on the context of the CPU 
> that received the completion interrupt. This approach is not powerful 
> enough. For certain workloads throughput is higher if work completions are 
> processed by another CPU core on the same CPU socket. Has it been 
> considered to make the CPU core on which work completions are processed 
> configurable ?

It's an unbound workqueue, so it's not tied to a specific CPU.  However
we'll only run the work_struct once, so it's still tied to a single CPU
at a time, but that's not different from the kthread use previously.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 5/9] srpt: use the new CQ API
  2015-11-17 19:38     ` Bart Van Assche
@ 2015-11-18 14:03         ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:03 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Tue, Nov 17, 2015 at 11:38:48AM -0800, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> [ ... ]
>
> This patch contains two logical changes:
> - Conversion to the new CQ API.
> - Removal of the ib_srpt_compl thread.
>
> Had it been considered to implement these changes as two separate patches ?

It's intentional.  I want the new style completions to happen from
a controlled environment, so I don't want to allow calling them from
random context.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 5/9] srpt: use the new CQ API
@ 2015-11-18 14:03         ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:03 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Tue, Nov 17, 2015 at 11:38:48AM -0800, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> [ ... ]
>
> This patch contains two logical changes:
> - Conversion to the new CQ API.
> - Removal of the ib_srpt_compl thread.
>
> Had it been considered to implement these changes as two separate patches ?

It's intentional.  I want the new style completions to happen from
a controlled environment, so I don't want to allow calling them from
random context.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 6/9] srp: use the new CQ API
  2015-11-17 19:56       ` Bart Van Assche
  (?)
@ 2015-11-18 14:03       ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:03 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Tue, Nov 17, 2015 at 11:56:39AM -0800, Bart Van Assche wrote:
> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>> +static void srp_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
>> +{
>> +	srp_handle_qp_err(cq, wc, "INV RKEY");
>> +}
> >
>> [ ... ]
> >
>> +static void srp_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc)
>> +{
>> +	srp_handle_qp_err(cq, wc, "FAST REG");
>> +}
>
> How about using names like srp_inv_rkey_err() and srp_reg_mr_err() to make 
> clear that these completion functions are only called if an error occurred 
> ?

I can do that if you like.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
  2015-11-18  1:17       ` Bart Van Assche
@ 2015-11-18 14:06           ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:06 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Tue, Nov 17, 2015 at 05:17:35PM -0800, Bart Van Assche wrote:
> Chaining RDMA requests is a great idea. But it seems to me that this patch 
> is based on the assumption that posting multiple RDMA requests either 
> succeeds as a whole or fails as a whole. Sorry but I'm not sure that the 
> verbs API guarantees this. In the ib_srpt driver a QP can be changed at any 
> time into the error state and there might be drivers that report an 
> immediate failure in that case. I think that even when chaining RDMA requests 
> we still need a mechanism to wait until ongoing RDMA transfers have 
> finished if some but not all RDMA requests have been posted.

I'd have to look at where it's guaranteed that we get flushed errors,
but if there were drivers that broke this assumption the iSER driver
would already be badly broken by this.  So if we don't have the formal
guarantee yet we should add it and fix up the drivers.

Once all drivers use the new-style completions we could in fact just
remove the return value from ->post_send_wr and require that all errors
are reported through ->done.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
@ 2015-11-18 14:06           ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:06 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Tue, Nov 17, 2015 at 05:17:35PM -0800, Bart Van Assche wrote:
> Chaining RDMA requests is a great idea. But it seems to me that this patch 
> is based on the assumption that posting multiple RDMA requests either 
> succeeds as a whole or fails as a whole. Sorry but I'm not sure that the 
> verbs API guarantees this. In the ib_srpt driver a QP can be changed at any 
> time into the error state and there might be drivers that report an 
> immediate failure in that case. I think that even when chaining RDMA requests 
> we still need a mechanism to wait until ongoing RDMA transfers have 
> finished if some but not all RDMA requests have been posted.

I'd have to look at where it's guaranteed that we get flushed errors,
but if there were drivers that broke this assumption the iSER driver
would already be badly broken by this.  So if we don't have the formal
guarantee yet we should add it and fix up the drivers.

Once all drivers use the new-style completions we could in fact just
remove the return value from ->post_send_wr and require that all errors
are reported through ->done.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-18 11:32       ` Sagi Grimberg
@ 2015-11-18 14:06           ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:06 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Wed, Nov 18, 2015 at 01:32:19PM +0200, Sagi Grimberg wrote:
> Christoph,
>
> Given the discussion around this patch I think it would
> be a good idea remove it from the patchset since it's not
> mandatory for the CQ abstraction. I think that we should
> take it with Steve to come up with a complete solution for
> this bit.
>
> Thoughts?

Yes, let's drop it for now.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-18 14:06           ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-18 14:06 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-rdma, bart.vanassche, axboe, linux-scsi, linux-kernel

On Wed, Nov 18, 2015 at 01:32:19PM +0200, Sagi Grimberg wrote:
> Christoph,
>
> Given the discussion around this patch I think it would
> be a good idea to remove it from the patchset since it's not
> mandatory for the CQ abstraction. I think that we should
> take it with Steve to come up with a complete solution for
> this bit.
>
> Thoughts?

Yes, let's drop it for now.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-18 14:06           ` Christoph Hellwig
@ 2015-11-18 15:21               ` Steve Wise
  -1 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-18 15:21 UTC (permalink / raw)
  To: Christoph Hellwig, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 11/18/2015 8:06 AM, Christoph Hellwig wrote:
> On Wed, Nov 18, 2015 at 01:32:19PM +0200, Sagi Grimberg wrote:
>> Christoph,
>>
>> Given the discussion around this patch I think it would
>> be a good idea to remove it from the patchset since it's not
>> mandatory for the CQ abstraction. I think that we should
>> take it with Steve to come up with a complete solution for
>> this bit.
>>
>> Thoughts?
> Yes, let's drop it for now.
>

Fine with me.


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-18 15:21               ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-18 15:21 UTC (permalink / raw)
  To: Christoph Hellwig, Sagi Grimberg
  Cc: linux-rdma, bart.vanassche, axboe, linux-scsi, linux-kernel

On 11/18/2015 8:06 AM, Christoph Hellwig wrote:
> On Wed, Nov 18, 2015 at 01:32:19PM +0200, Sagi Grimberg wrote:
>> Christoph,
>>
>> Given the discussion around this patch I think it would
>> be a good idea to remove it from the patchset since it's not
>> mandatory for the CQ abstraction. I think that we should
>> take it with Steve to come up with a complete solution for
>> this bit.
>>
>> Thoughts?
> Yes, let's drop it for now.
>

Fine with me.


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
  2015-11-18  9:15       ` Sagi Grimberg
@ 2015-11-18 16:32           ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-18 16:32 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma
  Cc: axboe, linux-scsi, linux-kernel

On 11/18/2015 01:15 AM, Sagi Grimberg wrote:
> On 18/11/2015 03:17, Bart Van Assche wrote:
>> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>>> -        ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
>>> -        if (ret)
>>> -            break;
>>> +        if (i == n_rdma - 1) {
>>> +            /* only get completion event for the last rdma read */
>>> +            if (dir == DMA_TO_DEVICE)
>>> +                wr->wr.send_flags = IB_SEND_SIGNALED;
>>> +            wr->wr.next = NULL;
>>> +        } else {
>>> +            wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
>>> +        }
>>>       }
>>>
>>> +    ret = ib_post_send(ch->qp, &ioctx->rdma_ius->wr, &bad_wr);
>>>       if (ret)
>>>           pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
>>>                    __func__, __LINE__, ret, i, n_rdma);
>>
>> Hello Christoph,
>
> Hi Bart,
>
>>
>> Chaining RDMA requests is a great idea. But it seems to me that this
>> patch is based on the assumption that posting multiple RDMA requests
>> either succeeds as a whole or fails as a whole. Sorry but I'm not sure
>> that the verbs API guarantees this. In the ib_srpt driver a QP can be
>> changed at any time into the error state and there might be drivers that
>> report an immediate failure in that case.
>
> I'm not so sure it actually matters if some WRs succeeded. In the normal
> flow when srpt has enough available work requests (sq_wr_avail) they
> should all succeed otherwise we're in trouble. If the QP transitioned
> to ERROR state, then some failed, but those that succeeded will
> generate flush completions, and srpt should handle it correctly
> shouldn't it?
>
>> I think even when chaining
>> RDMA requests that we still need a mechanism to wait until ongoing RDMA
>> transfers have finished if some but not all RDMA requests have been
>> posted.
>
> I'm not an expert on srpt, can you explain how this mechanism will help?

Hello Sagi,

As you know events like a cable pull can cause some of the RDMA work 
requests to succeed and others to fail. It is essential that all RDMA 
work requests related to the same SCSI command have finished before the 
buffers these requests operate upon are reused. The purpose of the 
SRPT_RDMA_ABORT request is to wait for the RDMA requests that were 
posted without IB_SEND_SIGNALED and for which no error completion will 
be received. BTW, I think this consideration applies to all SCSI target 
drivers and not only to SRP target drivers.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
@ 2015-11-18 16:32           ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-18 16:32 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma
  Cc: axboe, linux-scsi, linux-kernel

On 11/18/2015 01:15 AM, Sagi Grimberg wrote:
> On 18/11/2015 03:17, Bart Van Assche wrote:
>> On 11/13/2015 05:46 AM, Christoph Hellwig wrote:
>>> -        ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
>>> -        if (ret)
>>> -            break;
>>> +        if (i == n_rdma - 1) {
>>> +            /* only get completion event for the last rdma read */
>>> +            if (dir == DMA_TO_DEVICE)
>>> +                wr->wr.send_flags = IB_SEND_SIGNALED;
>>> +            wr->wr.next = NULL;
>>> +        } else {
>>> +            wr->wr.next = &ioctx->rdma_ius[i + 1].wr;
>>> +        }
>>>       }
>>>
>>> +    ret = ib_post_send(ch->qp, &ioctx->rdma_ius->wr, &bad_wr);
>>>       if (ret)
>>>           pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
>>>                    __func__, __LINE__, ret, i, n_rdma);
>>
>> Hello Christoph,
>
> Hi Bart,
>
>>
>> Chaining RDMA requests is a great idea. But it seems to me that this
>> patch is based on the assumption that posting multiple RDMA requests
>> either succeeds as a whole or fails as a whole. Sorry but I'm not sure
>> that the verbs API guarantees this. In the ib_srpt driver a QP can be
>> changed at any time into the error state and there might be drivers that
>> report an immediate failure in that case.
>
> I'm not so sure it actually matters if some WRs succeeded. In the normal
> flow when srpt has enough available work requests (sq_wr_avail) they
> should all succeed otherwise we're in trouble. If the QP transitioned
> to ERROR state, then some failed, but those that succeeded will
> generate flush completions, and srpt should handle it correctly
> shouldn't it?
>
>> I think even when chaining
>> RDMA requests that we still need a mechanism to wait until ongoing RDMA
>> transfers have finished if some but not all RDMA requests have been
>> posted.
>
> I'm not an expert on srpt, can you explain how this mechanism will help?

Hello Sagi,

As you know events like a cable pull can cause some of the RDMA work 
requests to succeed and others to fail. It is essential that all RDMA 
work requests related to the same SCSI command have finished before the 
buffers these requests operate upon are reused. The purpose of the 
SRPT_RDMA_ABORT request is to wait for the RDMA requests that were 
posted without IB_SEND_SIGNALED and for which no error completion will 
be received. BTW, I think this consideration applies to all SCSI target 
drivers and not only to SRP target drivers.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-18  7:55           ` Sagi Grimberg
@ 2015-11-18 18:20               ` Bart Van Assche
  -1 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-18 18:20 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: axboe-b10kYP2dOMg, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On 11/17/2015 11:55 PM, Sagi Grimberg wrote:
>>> +static void ib_cq_poll_work(struct work_struct *work)
>>> +{
>>> +    struct ib_cq *cq = container_of(work, struct ib_cq, work);
>>> +    int completed;
>>> +
>>> +    completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
>>> +    if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>>> +        ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>>> +        queue_work(ib_comp_wq, &cq->work);
>>> +}
>>> +
>>> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>>> +{
>>> +    queue_work(ib_comp_wq, &cq->work);
>>> +}
>>
>> The above code will cause all polling to occur on the context of the CPU
>> that received the completion interrupt. This approach is not powerful
>> enough. For certain workloads throughput is higher if work completions
>> are processed by another CPU core on the same CPU socket. Has it been
>> considered to make the CPU core on which work completions are processed
>> configurable ?
>
> The workqueue is unbound. This means that the functionality you are
> asking for exists.

Hello Sagi,

Are you perhaps referring to the sysfs CPU mask that allows controlling
workqueue affinity? I expect that setting the CPU mask for an entire
pool through sysfs will lead to suboptimal results. What I have learned
by tuning target systems is that there is a significant performance
difference (> 30% IOPS) between a configuration where each completion
thread is pinned to exactly one CPU and one where the scheduler is
allowed to choose a CPU.

Controlling the CPU affinity of worker threads with the taskset command 
is not possible since the function create_worker() in kernel/workqueue.c 
calls kthread_bind_mask(). That function sets PF_NO_SETAFFINITY. From 
sched.h:

#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to 
meddle with cpus_allowed */

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-18 18:20               ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-18 18:20 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig, linux-rdma
  Cc: axboe, linux-scsi, linux-kernel

On 11/17/2015 11:55 PM, Sagi Grimberg wrote:
>>> +static void ib_cq_poll_work(struct work_struct *work)
>>> +{
>>> +    struct ib_cq *cq = container_of(work, struct ib_cq, work);
>>> +    int completed;
>>> +
>>> +    completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
>>> +    if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>>> +        ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>>> +        queue_work(ib_comp_wq, &cq->work);
>>> +}
>>> +
>>> +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>>> +{
>>> +    queue_work(ib_comp_wq, &cq->work);
>>> +}
>>
>> The above code will cause all polling to occur on the context of the CPU
>> that received the completion interrupt. This approach is not powerful
>> enough. For certain workloads throughput is higher if work completions
>> are processed by another CPU core on the same CPU socket. Has it been
>> considered to make the CPU core on which work completions are processed
>> configurable ?
>
> The workqueue is unbound. This means that the functionality you are
> asking for exists.

Hello Sagi,

Are you perhaps referring to the sysfs CPU mask that allows controlling
workqueue affinity? I expect that setting the CPU mask for an entire
pool through sysfs will lead to suboptimal results. What I have learned
by tuning target systems is that there is a significant performance
difference (> 30% IOPS) between a configuration where each completion
thread is pinned to exactly one CPU and one where the scheduler is
allowed to choose a CPU.

Controlling the CPU affinity of worker threads with the taskset command 
is not possible since the function create_worker() in kernel/workqueue.c 
calls kthread_bind_mask(). That function sets PF_NO_SETAFFINITY. From 
sched.h:

#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to 
meddle with cpus_allowed */

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-18 18:20               ` Bart Van Assche
  (?)
@ 2015-11-20 10:16               ` Christoph Hellwig
  2015-11-20 16:50                 ` Bart Van Assche
  -1 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-20 10:16 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Sagi Grimberg, Christoph Hellwig, linux-rdma, axboe, linux-scsi,
	linux-kernel

On Wed, Nov 18, 2015 at 10:20:14AM -0800, Bart Van Assche wrote:
> Are you perhaps referring to the sysfs CPU mask that allows to control 
> workqueue affinity ?

I think he is referring to the definition of WQ_UNBOUND:

  WQ_UNBOUND

	Work items queued to an unbound wq are served by the special
	worker-pools which host workers which are not bound to any
	specific CPU.  This makes the wq behave as a simple execution
	context provider without concurrency management.  The unbound
	worker-pools try to start execution of work items as soon as
	possible.  Unbound wq sacrifices locality but is useful for
	the following cases.

	* Wide fluctuation in the concurrency level requirement is
	  expected and using bound wq may end up creating large number
	  of mostly unused workers across different CPUs as the issuer
	  hops through different CPUs.

	* Long running CPU intensive workloads which can be better
	  managed by the system scheduler.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
  2015-11-18 16:32           ` Bart Van Assche
@ 2015-11-20 10:20               ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-20 10:20 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Sagi Grimberg, Christoph Hellwig,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Wed, Nov 18, 2015 at 08:32:59AM -0800, Bart Van Assche wrote:
> As you know events like a cable pull can cause some of the RDMA work 
> requests to succeed and others to fail. It is essential that all RDMA work 
> requests related to the same SCSI command have finished before the buffers 
> these requests operate upon are reused. The purpose of the SRPT_RDMA_ABORT 
> request is to wait for the RDMA requests that were posted without 
> IB_SEND_SIGNALED and for which no error completion will be received. BTW, I 
> think this consideration applies to all SCSI target drivers and not only to 
> SRP target drivers.

I think everyone understands the theoretical issue, but we'd like to
see a practical case that the implementation in isert and my proposed
srpt one don't handle.

Given that chained WRs must not be reordered, the HCA must also give
us the completions in the order we submitted them.  Because of that
the previous WRs must have been completed by the time we get the
notification for the last one, which usually does the cleanup.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 4/9] srpt: chain RDMA READ/WRITE requests
@ 2015-11-20 10:20               ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-20 10:20 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Sagi Grimberg, Christoph Hellwig, linux-rdma, axboe, linux-scsi,
	linux-kernel

On Wed, Nov 18, 2015 at 08:32:59AM -0800, Bart Van Assche wrote:
> As you know events like a cable pull can cause some of the RDMA work 
> requests to succeed and others to fail. It is essential that all RDMA work 
> requests related to the same SCSI command have finished before the buffers 
> these requests operate upon are reused. The purpose of the SRPT_RDMA_ABORT 
> request is to wait for the RDMA requests that were posted without 
> IB_SEND_SIGNALED and for which no error completion will be received. BTW, I 
> think this consideration applies to all SCSI target drivers and not only to 
> SRP target drivers.

I think everyone understands the theoretical issue, but we'd like to
see a practical case that the implementation in isert and my proposed
srpt one don't handle.

Given that chained WRs must not be reordered, the HCA must also give
us the completions in the order we submitted them.  Because of that
the previous WRs must have been completed by the time we get the
notification for the last one, which usually does the cleanup.
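
For reference, a minimal sketch of that pattern (a hypothetical helper,
not code from the srpt patch): chain the work requests, signal only the
tail, and do all cleanup from the tail's completion handler.

static int post_rdma_chain(struct ib_qp *qp, struct ib_send_wr *wrs, int n)
{
	struct ib_send_wr *bad_wr;
	int i;

	for (i = 0; i < n - 1; i++) {
		wrs[i].send_flags = 0;
		wrs[i].next = &wrs[i + 1];
	}
	/*
	 * Only the tail is signaled; since the HCA completes chained WRs
	 * in posting order, its completion implies the whole chain is done.
	 */
	wrs[n - 1].send_flags = IB_SEND_SIGNALED;
	wrs[n - 1].next = NULL;

	return ib_post_send(qp, &wrs[0], &bad_wr);
}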

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-20 10:16               ` Christoph Hellwig
@ 2015-11-20 16:50                 ` Bart Van Assche
  2015-11-22  9:51                   ` Sagi Grimberg
  0 siblings, 1 reply; 140+ messages in thread
From: Bart Van Assche @ 2015-11-20 16:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, linux-rdma, axboe, linux-scsi, linux-kernel

On 11/20/2015 02:16 AM, Christoph Hellwig wrote:
> On Wed, Nov 18, 2015 at 10:20:14AM -0800, Bart Van Assche wrote:
>> Are you perhaps referring to the sysfs CPU mask that allows to control
>> workqueue affinity ?
> 
> I think he is referring to the definition of WQ_UNBOUND:
> 
>    WQ_UNBOUND
> 
> 	Work items queued to an unbound wq are served by the special
> 	worker-pools which host workers which are not bound to any
> 	specific CPU.  This makes the wq behave as a simple execution
> 	context provider without concurrency management.  The unbound
> 	worker-pools try to start execution of work items as soon as
> 	possible.  Unbound wq sacrifices locality but is useful for
> 	the following cases.
> 
> 	* Wide fluctuation in the concurrency level requirement is
> 	  expected and using bound wq may end up creating large number
> 	  of mostly unused workers across different CPUs as the issuer
> 	  hops through different CPUs.
> 
> 	* Long running CPU intensive workloads which can be better
> 	  managed by the system scheduler.
 
Hello Christoph,

The comment about locality in the above quote is interesting. How about
modifying patch 2/9 as indicated below ? The modification below does not
change the behavior of this patch if ib_cq.w.cpu is not modified. And it
allows users who care about locality and who want to skip the scheduler
overhead by setting ib_cq.w.cpu to the index of the CPU they want the
work to be processed on.

Thanks,

Bart.

---
 drivers/infiniband/core/cq.c | 11 ++++++-----
 include/rdma/ib_verbs.h      |  5 ++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index bf2a079..4d80d8c 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -94,18 +94,18 @@ static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 
 static void ib_cq_poll_work(struct work_struct *work)
 {
-	struct ib_cq *cq = container_of(work, struct ib_cq, work);
+	struct ib_cq *cq = container_of(work, struct ib_cq, w.work);
 	int completed;
 
 	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
-		queue_work(ib_comp_wq, &cq->work);
+		queue_work_on(cq->w.cpu, ib_comp_wq, &cq->w.work);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 {
-	queue_work(ib_comp_wq, &cq->work);
+	queue_work_on(cq->w.cpu, ib_comp_wq, &cq->w.work);
 }
 
 /**
@@ -159,7 +159,8 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 		break;
 	case IB_POLL_WORKQUEUE:
 		cq->comp_handler = ib_cq_completion_workqueue;
-		INIT_WORK(&cq->work, ib_cq_poll_work);
+		INIT_WORK(&cq->w.work, ib_cq_poll_work);
+		cq->w.cpu = WORK_CPU_UNBOUND;
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		break;
 	default:
@@ -195,7 +196,7 @@ void ib_free_cq(struct ib_cq *cq)
 		irq_poll_disable(&cq->iop);
 		break;
 	case IB_POLL_WORKQUEUE:
-		flush_work(&cq->work);
+		flush_work(&cq->w.work);
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f59a8d3..b1344f8 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1291,7 +1291,10 @@ struct ib_cq {
 	struct ib_wc		*wc;
 	union {
 		struct irq_poll		iop;
-		struct work_struct	work;
+		struct {
+			struct work_struct	work;
+			int			cpu;
+		} w;
 	};
 };
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-20 16:50                 ` Bart Van Assche
@ 2015-11-22  9:51                   ` Sagi Grimberg
  2015-11-22 10:13                     ` Christoph Hellwig
  0 siblings, 1 reply; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-22  9:51 UTC (permalink / raw)
  To: Bart Van Assche, Christoph Hellwig
  Cc: linux-rdma, axboe, linux-scsi, linux-kernel


> Hello Christoph,
>
> The comment about locality in the above quote is interesting. How about
> modifying patch 2/9 as indicated below ? The modification below does not
> change the behavior of this patch if ib_cq.w.cpu is not modified. And it
> allows users who care about locality and who want to skip the scheduler
> overhead by setting ib_cq.w.cpu to the index of the CPU they want the
> work to be processed on.

That sounds acceptable...

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-22  9:51                   ` Sagi Grimberg
@ 2015-11-22 10:13                     ` Christoph Hellwig
       [not found]                       ` <20151122101308.GA12189-jcswGhMUV9g@public.gmane.org>
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-22 10:13 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, axboe,
	linux-scsi, linux-kernel

On Sun, Nov 22, 2015 at 11:51:13AM +0200, Sagi Grimberg wrote:
>
>> Hello Christoph,
>>
>> The comment about locality in the above quote is interesting. How about
>> modifying patch 2/9 as indicated below ? The modification below does not
>> change the behavior of this patch if ib_cq.w.cpu is not modified. And it
>> allows users who care about locality and who want to skip the scheduler
>> overhead by setting ib_cq.w.cpu to the index of the CPU they want the
>> work to be processed on.
>
> That sounds acceptable...

Wouldn't it be a better idea to set the WQ_SYSFS interface and use
the standard sysfs interface for specifying cpumasks or node affinity?
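
For what it's worth, a sketch of the allocation side of that suggestion
(the flags and name below are illustrative, not necessarily what the
posted patch uses):

	ib_comp_wq = alloc_workqueue("ib-comp-wq",
				     WQ_UNBOUND | WQ_HIGHPRI | WQ_SYSFS, 0);
	if (!ib_comp_wq)
		return -ENOMEM;

With WQ_SYSFS set, the workqueue should show up under
/sys/devices/virtual/workqueue/<name>/ with a writable cpumask attribute,
so affinity could be tuned without inventing any new interface.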

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-22 10:13                     ` Christoph Hellwig
@ 2015-11-22 10:36                           ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-22 10:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	axboe-b10kYP2dOMg, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA


> Wouldn't it be a better idea to set the WQ_SYSFS interface and use
> the standard sysfs interface for specifying cpumasks or node affinity?

I think that Bart wants to allow the caller to select CPU affinity
per CQ. In this case ib_alloc_cq in workqueue mode would need to
accept an affinity_hint from the caller (defaulting to the wild-card
WORK_CPU_UNBOUND).

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-22 10:36                           ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-22 10:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma, axboe, linux-scsi, linux-kernel


> Wouldn't it be a better idea to set the WQ_SYSFS interface and use
> the standard sysfs interface for specifying cpumasks or node affinity?

I think that Bart wants to allow the caller to select CPU affinity
per CQ. In this case ib_alloc_cq in workqueue mode would need to
accept an affinity_hint from the caller (defaulting to the wild-card
WORK_CPU_UNBOUND).
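
Something along these lines, purely as a sketch (the extra parameter and
the wrapper are hypothetical, it assumes the ib_alloc_cq() prototype from
patch 2/9, and it relies on the w.cpu field from Bart's diff earlier in
the thread):

struct ib_cq *ib_alloc_cq_affine(struct ib_device *dev, void *private,
				 int nr_cqe, int comp_vector,
				 enum ib_poll_context poll_ctx, int poll_cpu)
{
	struct ib_cq *cq;

	cq = ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx);
	if (IS_ERR_OR_NULL(cq))
		return cq;
	if (poll_ctx == IB_POLL_WORKQUEUE)
		cq->w.cpu = poll_cpu;	/* WORK_CPU_UNBOUND keeps today's behavior */
	return cq;
}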

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-22 10:36                           ` Sagi Grimberg
@ 2015-11-22 13:23                               ` Christoph Hellwig
  -1 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-22 13:23 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Bart Van Assche,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Sun, Nov 22, 2015 at 12:36:00PM +0200, Sagi Grimberg wrote:
>
>> Wouldn't it be a better idea to set the WQ_SYSFS interface and use
>> the standard sysfs interface for specifying cpumasks or node affinity?
>
> I think that bart wants to allow the caller to select cpu affinity
> per CQ. In this case ib_alloc_cq in workqueue mode would need to
> accept a affinity_hint from the caller (default to wild-card 
> WORK_CPU_UNBOUND).

Hmm, true.  How would we set that hint from userspace?  I'd really prefer
to see a practical justification for it first.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-22 13:23                               ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-22 13:23 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Bart Van Assche, linux-rdma, axboe,
	linux-scsi, linux-kernel

On Sun, Nov 22, 2015 at 12:36:00PM +0200, Sagi Grimberg wrote:
>
>> Wouldn't it be a better idea to set the WQ_SYSFS interface and use
>> the standard sysfs interface for specifying cpumasks or node affinity?
>
> I think that bart wants to allow the caller to select cpu affinity
> per CQ. In this case ib_alloc_cq in workqueue mode would need to
> accept a affinity_hint from the caller (default to wild-card 
> WORK_CPU_UNBOUND).

Hmm, true.  How would we set that hint from userspace?  I'd really prefer
to see a practical justification for it first.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-22 13:23                               ` Christoph Hellwig
@ 2015-11-22 14:57                                   ` Sagi Grimberg
  -1 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-22 14:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	axboe-b10kYP2dOMg, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA


>>
>> I think that bart wants to allow the caller to select cpu affinity
>> per CQ. In this case ib_alloc_cq in workqueue mode would need to
>> accept a affinity_hint from the caller (default to wild-card
>> WORK_CPU_UNBOUND).
>
> Hmm, true.  How would be set that hint from userspace?  I'd really prefer
> to see a practical justification for it first.

In order to assign CPUs from user-space we'd need an ethtool-like
interface for isert/srpt/<xxxt>. Given that this is something we don't
want to get into right now, I assumed that Bart meant that srpt
would take a "least used" approach in the srpt driver (which isn't better
than taking the wild-card option, I'd say), so I'll let Bart answer...

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-22 14:57                                   ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-22 14:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma, axboe, linux-scsi, linux-kernel


>>
>> I think that bart wants to allow the caller to select cpu affinity
>> per CQ. In this case ib_alloc_cq in workqueue mode would need to
>> accept a affinity_hint from the caller (default to wild-card
>> WORK_CPU_UNBOUND).
>
> Hmm, true.  How would be set that hint from userspace?  I'd really prefer
> to see a practical justification for it first.

In order to assign CPUs from user-space we'd need an ethtool-like
interface for isert/srpt/<xxxt>. Given that this is something we don't
want to get into right now, I assumed that Bart meant that srpt
would take a "least used" approach in the srpt driver (which isn't better
than taking the wild-card option, I'd say), so I'll let Bart answer...

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-22 14:57                                   ` Sagi Grimberg
  (?)
@ 2015-11-22 16:55                                   ` Bart Van Assche
  -1 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-22 16:55 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig
  Cc: linux-rdma, axboe, linux-scsi, linux-kernel

On 11/22/15 06:57, Sagi Grimberg wrote:
>>> I think that bart wants to allow the caller to select cpu affinity
>>> per CQ. In this case ib_alloc_cq in workqueue mode would need to
>>> accept a affinity_hint from the caller (default to wild-card
>>> WORK_CPU_UNBOUND).
>>
>> Hmm, true.  How would be set that hint from userspace?  I'd really prefer
>> to see a practical justification for it first.
>
> In order to assign CPUs from user-space we'd need an ethtool like
> interface for isert/srpt/<xxxt>. Given that this is something we don't
> want to get into right now, I assumed that Bart meant that srpt
> would take a "least used" approach from srpt driver (which isn't better
> taking the wild-card option I'd say), So I'll let Bart answer...

Hello Christoph and Sagi,

My intention is indeed to allow control of CPU affinity per CQ. One use
case is to implement a least-used policy in RDMA drivers that use
multiple completion queues. Another use case is to make CPU affinity
configurable from user space, through something similar to ethtool or via
sysfs.
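
A rough sketch of what such a least-used pick could look like (entirely
hypothetical; it assumes the driver keeps a per-CPU count of the CQs it
has bound so far):

static int pick_least_used_cpu(atomic_t *cq_count)	/* nr_cpu_ids entries */
{
	int cpu, best = WORK_CPU_UNBOUND, best_cnt = INT_MAX;

	for_each_online_cpu(cpu) {
		int cnt = atomic_read(&cq_count[cpu]);

		if (cnt < best_cnt) {
			best_cnt = cnt;
			best = cpu;
		}
	}
	if (best != WORK_CPU_UNBOUND)
		atomic_inc(&cq_count[best]);
	return best;
}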

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-16 19:03                   ` Steve Wise
@ 2015-11-23 10:28                     ` Sagi Grimberg
  -1 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-23 10:28 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA


> That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
> cxgb4 with the old iwarp support patches.

Steve,

I think I figured out why this works with iWARP.

For iWARP, rdma_disconnect() calls iw_cm_disconnect() with abrupt=0,
which makes iw_cm_disconnect() move the QP into the SQ_DRAIN state:

int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
{
	...

         if (qp) {
                 if (abrupt)
                         ret = iwcm_modify_qp_err(qp);
                 else
                         ret = iwcm_modify_qp_sqd(qp);

                 /*
                  * If both sides are disconnecting the QP could
                  * already be in ERR or SQD states
                  */
                 ret = 0;
	}
}

AFAIK, the SQD state allows the ULP to post work requests on the send
queue and expect these work requests to FLUSH.

So Maybe we should have:
void ib_drain_qp(struct ib_qp *qp)
{
     struct ib_qp_attr attr = { };
     struct ib_stop_cqe rstop, sstop;
     struct ib_recv_wr rwr = {}, *bad_rwr;
     struct ib_send_wr swr = {}, *bad_swr;
     enum ib_qp_state state;
     int ret;

     if (rdma_cap_ib_cm(id->device, id->port_num))
         state = IB_QPS_ERR;
     else if (rdma_cap_iw_cm(id->device, id->port_num))
         state = IB_QPS_SQD;
     else
         return;

     rwr.wr_cqe = &rstop.cqe;
     rstop.cqe.done = ib_stop_done;
     init_completion(&rstop.done);

     swr.wr_cqe = &sstop.cqe;
     sstop.cqe.done = ib_stop_done;
     swr.send_flags = IB_SEND_SIGNALED;
     init_completion(&sstop.done);

     attr.qp_state = state;
     ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
     if (ret) {
         WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
         return;
     }

     ret = ib_post_recv(qp, &rwr, &bad_rwr);
     if (ret) {
         WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
         return;
     }

     ret = ib_post_send(qp, &swr, &bad_swr);
     if (ret) {
         WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
         return;
     }

     wait_for_completion(&rstop.done);
     wait_for_completion(&sstop.done);
}

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-23 10:28                     ` Sagi Grimberg
  0 siblings, 0 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-23 10:28 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel


> That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
> cxgb4 with the old iwarp support patches.

Steve,

I think I figured out why this works with iWARP.

For iWARP, rdma_disconnect() calls iw_cm_disconnect() with abrupt=0,
which makes iw_cm_disconnect() move the QP into the SQ_DRAIN state:

int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
{
	...

         if (qp) {
                 if (abrupt)
                         ret = iwcm_modify_qp_err(qp);
                 else
                         ret = iwcm_modify_qp_sqd(qp);

                 /*
                  * If both sides are disconnecting the QP could
                  * already be in ERR or SQD states
                  */
                 ret = 0;
	}
}

AFAIK, the SQD state allows the ULP to post work requests on the send
queue and expect these work requests to FLUSH.

So Maybe we should have:
void ib_drain_qp(struct ib_qp *qp)
{
     struct ib_qp_attr attr = { };
     struct ib_stop_cqe rstop, sstop;
     struct ib_recv_wr rwr = {}, *bad_rwr;
     struct ib_send_wr swr = {}, *bad_swr;
     enum ib_qp_state state;
     int ret;

     if (rdma_cap_ib_cm(id->device, id->port_num))
         state = IB_QPS_ERR;
     else if (rdma_cap_iw_cm(id->device, id->port_num))
         state = IB_QPS_SQD;
     else
         return;

     rwr.wr_cqe = &rstop.cqe;
     rstop.cqe.done = ib_stop_done;
     init_completion(&rstop.done);

     swr.wr_cqe = &sstop.cqe;
     sstop.cqe.done = ib_stop_done;
     swr.send_flags = IB_SEND_SIGNALED;
     init_completion(&sstop.done);

     attr.qp_state = state;
     ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
     if (ret) {
         WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
         return;
     }

     ret = ib_post_recv(qp, &rwr, &bad_rwr);
     if (ret) {
         WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
         return;
     }

     ret = ib_post_send(qp, &swr, &bad_swr);
     if (ret) {
         WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
         return;
     }

     wait_for_completion(&rstop.done);
     wait_for_completion(&sstop.done);
}

Thoughts?

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-23 10:28                     ` Sagi Grimberg
  (?)
@ 2015-11-23 10:35                     ` Sagi Grimberg
  2015-11-23 14:33                       ` 'Christoph Hellwig'
       [not found]                       ` <5652EC00.8010705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  -1 siblings, 2 replies; 140+ messages in thread
From: Sagi Grimberg @ 2015-11-23 10:35 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel


> So Maybe we should have:
> void ib_drain_qp(struct ib_qp *qp)

Christoph suggested that this flushing would be taken care
of by rdma_disconnect which sounds even better I think..

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-23 10:35                     ` Sagi Grimberg
@ 2015-11-23 14:33                       ` 'Christoph Hellwig'
       [not found]                       ` <5652EC00.8010705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  1 sibling, 0 replies; 140+ messages in thread
From: 'Christoph Hellwig' @ 2015-11-23 14:33 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Steve Wise, 'Christoph Hellwig',
	linux-rdma, bart.vanassche, axboe, linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 12:35:44PM +0200, Sagi Grimberg wrote:
>
>> So Maybe we should have:
>> void ib_drain_qp(struct ib_qp *qp)
>
> Christoph suggested that this flushing would be taken care
> of by rdma_disconnect which sounds even better I think..

Note that will only work once we've converted all drivers to
the new CQ API under development.  Without that every completion
handler needs a special case for the drain WC.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-23 10:28                     ` Sagi Grimberg
@ 2015-11-23 14:44                       ` Steve Wise
  -1 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-23 14:44 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel



> -----Original Message-----
> From: Sagi Grimberg [mailto:sagig@dev.mellanox.co.il]
> Sent: Monday, November 23, 2015 4:29 AM
> To: Steve Wise; 'Christoph Hellwig'; linux-rdma@vger.kernel.org
> Cc: bart.vanassche@sandisk.com; axboe@fb.com; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> 
> > That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
> > cxgb4 with the old iwarp support patches.
> 
> Steve,
> 
> I think I figured out why this works with iWARP.
> 
> For iWARP, rdma_disconnect() calls iw_cm_disconnect() with abrupt=0
> which would make iw_cm_disconnect() move the QP into SQ_DRAIN state"
>

Yes.  Note:  SQ_DRAIN == CLOSING state for iWARP QPs.   CLOSING state means the transport will try and do an orderly shutdown.
More on this below.
 
> int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
> {
> 	...
> 
>          if (qp) {
>                  if (abrupt)
>                          ret = iwcm_modify_qp_err(qp);
>                  else
>                          ret = iwcm_modify_qp_sqd(qp);
> 
>                  /*
>                   * If both sides are disconnecting the QP could
>                   * already be in ERR or SQD states
>                   */
>                  ret = 0;
> 	}
> }
> 
> IFAIK, SQD state allows the ULP to post work requests on the send
> queue and expect these work requests to FLUSH.
> 

The iWARP QP states are different from IB unfortunately.  And the way iWARP was plugged into the original IB-centric RDMA subsystem,
this difference is not very visible.  Moving an iWARP QP to CLOSING/SQD begins an "orderly close" of the TCP connection.  IE TCP FIN,
FIN/ACK, ACK.

> So Maybe we should have:
> void ib_drain_qp(struct ib_qp *qp)
> {
>      struct ib_qp_attr attr = { };
>      struct ib_stop_cqe rstop, sstop;
>      struct ib_recv_wr rwr = {}, *bad_rwr;
>      struct ib_send_wr swr = {}, *bad_swr;
>      enum ib_qp_state state;
>      int ret;
> 
>      if (rdma_cap_ib_cm(id->device, id->port_num))
>          state = IB_QPS_ERR;
>      else if (rdma_cap_iw_cm(id->device, id->port_num))
>          state = IB_QPS_SQD;
>      else
>          return;
> 
>      rwr.wr_cqe = &rstop.cqe;
>      rstop.cqe.done = ib_stop_done;
>      init_completion(&rstop.done);
> 
>      swr.wr_cqe = &sstop.cqe;
>      sstop.cqe.done = ib_stop_done;
>      swr.send_flags = IB_SEND_SIGNALED;
>      init_completion(&sstop.done);
> 
>      attr.qp_state = state;
>      ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>      if (ret) {
>          WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>          return;
>      }
> 
>      ret = ib_post_recv(qp, &rwr, &bad_rwr);
>      if (ret) {
>          WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
>          return;
>      }
> 
>      ret = ib_post_send(qp, &swr, &bad_swr);
>      if (ret) {
>          WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
>          return;
>      }
> 
>      wait_for_completion(&rstop.done);
>      wait_for_completion(&sstop.done);
> }
> 
> Thoughts?

The problem with moving the QP -> CLOSING (aka SQD) is this:  as per the iWARP Verbs spec, ULPs _must_ quiesce the SQ before moving
it to CLOSING.  IE make sure there are no outstanding SQ WRs.  So the drain operation really has to be done _before_ the move to
CLOSING/SQD. :(  If there _are_ outstanding SQ WRs when an attempt is made to move the QP to CLOSING, or an ingress RDMA operation arrives
while the QP is in CLOSING (and doing a TCP fin/fin-ack exchange), the QP is immediately moved to ERROR.   Also, no WR posts are
allowed while the QP is in CLOSING, unlike the IB SQD state.

The valid drain logic that I think needs to be implemented to support iWARP is one of two methods:

1) as I said before, enhance the ib_qp struct to have a "flush complete" completion object, and change the providers to all complete
that object when a) they are in ERROR and b) the SQ and RQ become empty (or are already empty).  Then ib_drain_qp() just waits for
this completion.

2) change the iwarp providers to allow posting WRs while in ERROR.  One way to do this and still support the requirement that "at
some point while in error, the provider must synchronously fail posts" is to allow the posts if the SQ or RQ still has pending WRs,
but fail immediately if the SQ or RQ is already empty.  Thus the "drain" WRs issued by iw_drain_qp() would work if they were needed,
and fail immediately if they are not needed.  In either case, the flush operation is complete.
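
A rough sketch of option 1 (all names are made up; the point is only the
shape of the interface):

/*
 * Hypothetical: struct ib_qp grows a "struct completion flush_done" that
 * is initialized at QP creation and completed by the provider once the QP
 * is in ERROR and both work queues have been flushed empty.
 */
void ib_drain_qp(struct ib_qp *qp)
{
	wait_for_completion(&qp->flush_done);
}

/* provider side, called when the last outstanding SQ/RQ WR has flushed */
void ib_qp_flush_complete(struct ib_qp *qp)
{
	complete(&qp->flush_done);
}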

I really wish the iWARP spec architects had avoided these sorts of diversions from the IB spec....

Steve.




^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-23 14:44                       ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-23 14:44 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel



> -----Original Message-----
> From: Sagi Grimberg [mailto:sagig@dev.mellanox.co.il]
> Sent: Monday, November 23, 2015 4:29 AM
> To: Steve Wise; 'Christoph Hellwig'; linux-rdma@vger.kernel.org
> Cc: bart.vanassche@sandisk.com; axboe@fb.com; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> 
> > That won't work for iWARP.  Is this code new?  I didn't see any errors that would result from this code when I tested iSER over
> > cxgb4 with the old iwarp support patches.
> 
> Steve,
> 
> I think I figured out why this works with iWARP.
> 
> For iWARP, rdma_disconnect() calls iw_cm_disconnect() with abrupt=0
> which would make iw_cm_disconnect() move the QP into SQ_DRAIN state"
>

Yes.  Note:  SQ_DRAIN == CLOSING state for iWARP QPs.   CLOSING state means the transport will try and do an orderly shutdown.
More on this below.
 
> int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
> {
> 	...
> 
>          if (qp) {
>                  if (abrupt)
>                          ret = iwcm_modify_qp_err(qp);
>                  else
>                          ret = iwcm_modify_qp_sqd(qp);
> 
>                  /*
>                   * If both sides are disconnecting the QP could
>                   * already be in ERR or SQD states
>                   */
>                  ret = 0;
> 	}
> }
> 
> IFAIK, SQD state allows the ULP to post work requests on the send
> queue and expect these work requests to FLUSH.
> 

The iWARP QP states are different from IB unfortunately.  And the way iWARP was plugged into the original IB-centric RDMA subsystem,
this difference is not very visible.  Moving an iWARP QP to CLOSING/SQD begins an "orderly close" of the TCP connection.  IE TCP FIN,
FIN/ACK, ACK.

> So Maybe we should have:
> void ib_drain_qp(struct ib_qp *qp)
> {
>      struct ib_qp_attr attr = { };
>      struct ib_stop_cqe rstop, sstop;
>      struct ib_recv_wr rwr = {}, *bad_rwr;
>      struct ib_send_wr swr = {}, *bad_swr;
>      enum ib_qp_state state;
>      int ret;
> 
>      if (rdma_cap_ib_cm(id->device, id->port_num))
>          state = IB_QPS_ERR;
>      else if (rdma_cap_iw_cm(id->device, id->port_num))
>          state = IB_QPS_SQD;
>      else
>          return;
> 
>      rwr.wr_cqe = &rstop.cqe;
>      rstop.cqe.done = ib_stop_done;
>      init_completion(&rstop.done);
> 
>      swr.wr_cqe = &sstop.cqe;
>      sstop.cqe.done = ib_stop_done;
>      swr.send_flags = IB_SEND_SIGNALED;
>      init_completion(&sstop.done);
> 
>      attr.qp_state = state;
>      ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
>      if (ret) {
>          WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
>          return;
>      }
> 
>      ret = ib_post_recv(qp, &rwr, &bad_rwr);
>      if (ret) {
>          WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
>          return;
>      }
> 
>      ret = ib_post_send(qp, &swr, &bad_swr);
>      if (ret) {
>          WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
>          return;
>      }
> 
>      wait_for_completion(&rstop.done);
>      wait_for_completion(&sstop.done);
> }
> 
> Thoughts?

The problem with moving the QP -> CLOSING (aka SQD) is this:  as per the iWARP Verbs spec, ULPs _must_ quiesce the SQ before moving
it to CLOSING.  IE make sure there are no outstanding SQ WRs.  So the drain operation really has to be done _before_ the move to
CLOSING/SQD. :(  If there _are_ outstanding SQ WRs when an attempt is made to move the QP to CLOSING, or an ingress RDMA operation arrives
while the QP is in CLOSING (and doing a TCP fin/fin-ack exchange), the QP is immediately moved to ERROR.   Also, no WR posts are
allowed while the QP is in CLOSING, unlike the IB SQD state.

The valid drain logic that I think needs to be implemented to support iWARP is one of two methods:

1) as I said before, enhance the ib_qp struct to have a "flush complete" completion object, and change the providers to all complete
that object when a) they are in ERROR and b) the SQ and RQ become empty (or are already empty).  Then ib_drain_qp() just waits for
this completion.

2) change the iwarp providers to allow posting WRs while in ERROR.  One way to do this and still support the requirement that "at
some point while in error, the provider must synchronously fail posts" is to allow the posts if the SQ or RQ still has pending WRs,
but fail immediately if the SQ or RQ is already empty.  Thus the "drain" WRs issued by iw_drain_qp() would work if they were needed,
and fail immediately if they are not needed.  In either case, the flush operation is complete.

I really wish the iWARP spec architects had avoided these sorts of diversions from the IB spec....

Steve.




^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
  2015-11-23 10:35                     ` Sagi Grimberg
@ 2015-11-23 14:48                           ` Steve Wise
       [not found]                       ` <5652EC00.8010705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  1 sibling, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-23 14:48 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA



> -----Original Message-----
> From: linux-kernel-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-kernel-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Sagi Grimberg
> Sent: Monday, November 23, 2015 4:36 AM
> To: Steve Wise; 'Christoph Hellwig'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Cc: bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org; axboe-b10kYP2dOMg@public.gmane.org; linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> 
> > So Maybe we should have:
> > void ib_drain_qp(struct ib_qp *qp)
> 
> Christoph suggested that this flushing would be taken care
> of by rdma_disconnect which sounds even better I think..
> --

Agreed. 


^ permalink raw reply	[flat|nested] 140+ messages in thread

* RE: [PATCH 3/9] IB: add a helper to safely drain a QP
@ 2015-11-23 14:48                           ` Steve Wise
  0 siblings, 0 replies; 140+ messages in thread
From: Steve Wise @ 2015-11-23 14:48 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig', linux-rdma
  Cc: bart.vanassche, axboe, linux-scsi, linux-kernel



> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Sagi Grimberg
> Sent: Monday, November 23, 2015 4:36 AM
> To: Steve Wise; 'Christoph Hellwig'; linux-rdma@vger.kernel.org
> Cc: bart.vanassche@sandisk.com; axboe@fb.com; linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 3/9] IB: add a helper to safely drain a QP
> 
> 
> > So Maybe we should have:
> > void ib_drain_qp(struct ib_qp *qp)
> 
> Christoph suggested that this flushing would be taken care
> of by rdma_disconnect which sounds even better I think..
> --

Agreed. 


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-14  7:08           ` Christoph Hellwig
  (?)
@ 2015-11-23 20:01           ` Jason Gunthorpe
       [not found]             ` <20151123200136.GA5640-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  -1 siblings, 1 reply; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-23 20:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-rdma, sagig, bart.vanassche, axboe, linux-scsi, linux-kernel

On Sat, Nov 14, 2015 at 08:08:49AM +0100, Christoph Hellwig wrote:
> On Fri, Nov 13, 2015 at 11:25:13AM -0700, Jason Gunthorpe wrote:
> > For instance, like this, not fully draining the cq and then doing:
> > 
> > > +	completed = __ib_process_cq(cq, budget);
> > > +	if (completed < budget) {
> > > +		irq_poll_complete(&cq->iop);
> > > +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> > 
> > Doesn't seem entirely right? There is no point in calling
> > ib_req_notify_cq if the code knows there is still stuff in the CQ and
> > has already, independently, arranged for ib_poll_handler to be
> > guaranteed to be called.
> 
> The code only calls ib_req_notify_cq if it knows we finished earlier than
> our budget.

Okay, having now read the whole thing, I think I see the flow now. I don't
see any holes in the above, other than it is doing a bit more work
than it needs to in some edge cases because it doesn't know if the CQ is
actually empty or not.

> > > +	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
> > > +	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
> > > +	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> > > +		queue_work(ib_comp_wq, &cq->work);
> > 
> > Same comment here..
> 
> Same here - we only requeue the work item if either we processed all of
> our budget, or ib_req_notify_cq with IB_CQ_REPORT_MISSED_EVENTS told
> us that we need to poll again.

I find the if construction hard to read, but yes, it looks OK.

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-14  7:13                 ` Christoph Hellwig
@ 2015-11-23 20:37                     ` Jason Gunthorpe
  -1 siblings, 0 replies; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-23 20:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	sagig-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb, axboe-b10kYP2dOMg,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Sat, Nov 14, 2015 at 08:13:44AM +0100, Christoph Hellwig wrote:
> On Fri, Nov 13, 2015 at 03:06:36PM -0700, Jason Gunthorpe wrote:
> > Looking at that thread and then at the patch a bit more..
> > 
> > +void ib_process_cq_direct(struct ib_cq *cq)
> > [..]
> > +	__ib_process_cq(cq, INT_MAX);
> > 
> > INT_MAX is not enough, it needs to loop.
> > This is missing a ib_req_notify also.
> 
> No.  Direct cases _never_ call ib_req_notify.  It's for the case where
> SRP polls the send CQ only from the same context it sends from,
> without any interrupt notification at all.

Hurm, okay, that is not at all what I was thinking this was for..

So the only use of this function is to drain a send cq, in a state
where it is guaranteed no new entries can be added, and only if the cq
is not already event driven. I'd stick those notes in the comment..

Hum. I wonder if this is even a reasonable way to run a ULP. It is
important that rx completions are not used to drive reaping of
resources that are still committed to the send queue, i.e. do not
trigger send buffer reuse based on an rx completion.

So, if a ULP uses this API, how does it handle the sendq becoming
full? As above, a ULP cannot use recvs to infer available sendq
space. It must directly reap the sendq. So a correct ULP would have to
spin calling ib_process_direct_cq until it makes enough progress to
add more things to the sendq. I don't obviously see that in SRP - so
I'm guessing it has buggered up sendq flow control?
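
For illustration, the kind of loop a ULP using the direct-poll mode would
need on its submission path (a sketch only; ulp_queue, sq_avail and the
->done accounting are invented names, not SRP code):

struct ulp_queue {
	struct ib_cq	*send_cq;	/* CQ created in direct-poll mode */
	atomic_t	sq_avail;	/* free send queue slots */
};

static int ulp_get_sq_slot(struct ulp_queue *q)
{
	int tries = 2;

	while (tries--) {
		if (atomic_dec_if_positive(&q->sq_avail) >= 0)
			return 0;
		/*
		 * Reap send completions directly; their ->done handlers are
		 * expected to give sq_avail back.  Never rely on recv
		 * completions for this.
		 */
		ib_process_cq_direct(q->send_cq);
	}
	return -EAGAIN;		/* caller has to back off and retry */
}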

NFS had similar problems lately too, I wrote a long explanation to
Chuck on this subject.

That said, the demand poll almost seems like a reasonable way for a
ULP to run the sendq: do the polls on send occasionally, or when more
space is needed, to better amortize the reaping overhead at the cost of
send latency. But API-wise it needs to be able to switch over to a
sleep if enough progress hasn't been made.

So.. maybe also add to the comment that ib_process_cq_direct is
deprecated and should not be used in new code until SRP gets sorted?

> > Perhaps ib_req_notify_cq should be folded into __ib_process_cq, then
> > it can trivially honour the budget on additional loops from
> > IB_CQ_REPORT_MISSED_EVENTS.
> 
> Which also defeats this proposal.

Just ignore my remarks here, someone should do a benchmark to see if
we are hitting the edge cases of extra spins around the various loops
before reworking this. Can't trivially hoist ib_req_notify_cq into
__ib_process_cq because of how it needs to be ordered with
irq_poll_complete

Reviewed-by: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-23 20:37                     ` Jason Gunthorpe
  0 siblings, 0 replies; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-23 20:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Sat, Nov 14, 2015 at 08:13:44AM +0100, Christoph Hellwig wrote:
> On Fri, Nov 13, 2015 at 03:06:36PM -0700, Jason Gunthorpe wrote:
> > Looking at that thread and then at the patch a bit more..
> > 
> > +void ib_process_cq_direct(struct ib_cq *cq)
> > [..]
> > +	__ib_process_cq(cq, INT_MAX);
> > 
> > INT_MAX is not enough, it needs to loop.
> > This is missing a ib_req_notify also.
> 
> No.  Direct cases _never_ call ib_req_notify.  It's for the case where
> the SRP case polls the send CQ only from the same context it sends for
> without any interrupt notification at all.

Hurm, okay, that is not at all what I was thinking this was for..

So the only use of this function is to drain a send cq, in a state
where it is guaranteed no new entries can be added, and only if the cq
is not already event driven. I'd stick those notes in the comment..
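
As a purely illustrative sketch (the wording is mine, not from the patch),
those notes could be captured in a kernel-doc style comment on
ib_process_cq_direct():

    /*
     * ib_process_cq_direct - poll a CQ synchronously from the caller
     *
     * Note: only use this to drain a send CQ in a state where it is
     * guaranteed that no new work requests can be posted, and only if
     * the CQ is not already driven by interrupt or workqueue
     * notification.  Never call ib_req_notify_cq() on a CQ used this
     * way.
     */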

Hum. I wonder if this is even a reasonable way to run a ULP. It is
important that rx completions are not used to drive reaping of
resources that are still committed to the send queue. ie do not
trigger send buffer reuse based on a rx completion.

So, if a ULP uses this API, how does it handle the sendq becoming
full? As above, a ULP cannot use recvs to infer available sendq
space. It must directly reap the sendq. So a correct ULP would have to
spin calling ib_process_cq_direct until it makes enough progress to
add more things to the sendq. I don't obviously see that in SRP - so
I'm guessing it has buggered up sendq flow control?

NFS had similar problems lately too, I wrote a long explanation to
Chuck on this subject.

That said, the demand poll almost seems like a reasonable way for a
ULP to run the sendq, do the polls on send occasionally or when more
space is needed to better amortize the reaping overhead at the cost of
send latency. But API wise it needs to be able to switch over to a
sleep if enough progress hasn't been made.

So.. maybe also add to the comment that ib_process_cq_direct is
deprecated and should not be used in new code until SRP gets sorted?

> > Perhaps ib_req_notify_cq should be folded into __ib_process_cq, then
> > it can trivially honour the budget on additional loops from
> > IB_CQ_REPORT_MISSED_EVENTS.
> 
> Which also defeats this proposal.

Just ignore my remarks here; someone should do a benchmark to see if
we are hitting the edge cases of extra spins around the various loops
before reworking this. Can't trivially hoist ib_req_notify_cq into
__ib_process_cq because of how it needs to be ordered with
irq_poll_complete.

Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-23 20:57                 ` Christoph Hellwig
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Hellwig @ 2015-11-23 20:57 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christoph Hellwig, linux-rdma, sagig, bart.vanassche, axboe,
	linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 01:01:36PM -0700, Jason Gunthorpe wrote:
> Okay, having now read the whole thing, I think I see the flow now. I don't
> see any holes in the above, other than it is doing a bit more work
> than it needs in some edge cases because it doesn't know if the CQ is
> actually empty or not.
> 
> > > > +	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
> > > > +	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
> > > > +	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> > > > +		queue_work(ib_comp_wq, &cq->work);
> > > 
> > > Same comment here..
> > 
> > Same here - we only requeue the work item if either we processed all of
> > our budget, or ib_req_notify_cq with IB_CQ_REPORT_MISSED_EVENTS told
> > us that we need to poll again.
> 
> I find the if construction hard to read, but yes, it looks OK.

If you've got suggestions to improve the flow, please send them my way.
I'm not entirely happy with it, but couldn't come up with a better idea.
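
One possible restructuring, offered purely as a sketch with a made-up
helper name, would be to give the workqueue path's re-arm decision a name
of its own (the softirq path is left alone here because it has to keep
irq_poll_complete ordered before the re-arm):

    /* Sketch only: return true if the CQ must be polled again. */
    static bool ib_cq_needs_repoll(struct ib_cq *cq, int completed, int budget)
    {
        if (completed >= budget)
            return true;    /* budget exhausted, CQEs may remain */
        /* re-arm; a positive return means we raced with new completions */
        return ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0;
    }

    /* the workqueue handler would then read: */
    if (ib_cq_needs_repoll(cq, completed, IB_POLL_BUDGET_WORKQUEUE))
        queue_work(ib_comp_wq, &cq->work);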

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-23 21:04                       ` Bart Van Assche
  0 siblings, 0 replies; 140+ messages in thread
From: Bart Van Assche @ 2015-11-23 21:04 UTC (permalink / raw)
  To: Jason Gunthorpe, Christoph Hellwig
  Cc: Bart Van Assche, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On 11/23/2015 12:37 PM, Jason Gunthorpe wrote:
> On Sat, Nov 14, 2015 at 08:13:44AM +0100, Christoph Hellwig wrote:
>> On Fri, Nov 13, 2015 at 03:06:36PM -0700, Jason Gunthorpe wrote:
>>> Looking at that thread and then at the patch a bit more..
>>>
>>> +void ib_process_cq_direct(struct ib_cq *cq)
>>> [..]
>>> +	__ib_process_cq(cq, INT_MAX);
>>>
>>> INT_MAX is not enough, it needs to loop.
>>> This is missing a ib_req_notify also.
>>
>> No.  Direct cases _never_ call ib_req_notify.  It's for the case where
>> the SRP case polls the send CQ only from the same context it sends for
>> without any interrupt notification at all.
>
> Hurm, okay, that is not at all what I was thinking this was for..
>
> So the only use of this function is to drain a send cq, in a state
> where it is guaranteed no new entries can be added, and only if the cq
> is not already event driven. I'd stick those notes in the comment..
>
> Hum. I wonder if this is even a reasonable way to run a ULP. It is
> important that rx completions are not used to drive reaping of
> resources that are still committed to the send queue. ie do not
> trigger send buffer reuse based on a rx completion.
>
> So, if a ULP uses this API, how does it handle the sendq becoming
> full? As above, a ULP cannot use recvs to infer available sendq
> space. It must directly reap the sendq. So a correct ULP would have to
> spin calling ib_process_cq_direct until it makes enough progress to
> add more things to the sendq. I don't obviously see that in SRP - so
> I'm guessing it has buggered up sendq flow control?
>
> NFS had similar problems lately too, I wrote a long explanation to
> Chuck on this subject.
>
> That said, the demand poll almost seems like a reasonable way for a
> ULP to run the sendq, do the polls on send occasionally or when more
> space is needed to better amortize the reaping overhead at the cost of
> send latency. But API wise it needs to be able to switch over to a
> sleep if enough progress hasn't been made.
>
> So.. maybe also add to the comment that ib_process_cq_direct is
> deprecated and should not be used in new code until SRP gets sorted?

Hello Jason,

A considerable time ago the send queue in the SRP initiator driver was
modified from signaled to non-signaled to reduce the number of
interrupts triggered by the SRP initiator driver. The SRP initiator
driver polls the send queue every time before a SCSI command is sent to
the target. I think this is a pattern that is also useful for other
ULPs, so I'm not convinced that ib_process_cq_direct() should be
deprecated :-)
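
As a rough illustration of that pattern against the proposed API (the
helper name and the channel field names below are approximations, not the
actual SRP code):

    /* Sketch: reap send completions before allocating a new tx IU. */
    static struct srp_iu *example_get_tx_iu(struct srp_rdma_ch *ch)
    {
        /* completed sends return their IUs to ch->free_tx */
        ib_process_cq_direct(ch->send_cq);

        if (list_empty(&ch->free_tx))
            return NULL;    /* caller has to back off and retry */
        return list_first_entry(&ch->free_tx, struct srp_iu, list);
    }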

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-23 21:04                       ` Bart Van Assche
  (?)
@ 2015-11-23 21:28                       ` Jason Gunthorpe
  2015-11-23 21:54                         ` Bart Van Assche
  -1 siblings, 1 reply; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-23 21:28 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 01:04:25PM -0800, Bart Van Assche wrote:

> Considerable time ago the send queue in the SRP initiator driver was
> modified from signaled to non-signaled to reduce the number of interrupts
> triggered by the SRP initiator driver. The SRP initiator driver polls the
> send queue every time before a SCSI command is sent to the target. I think
> this is a pattern that is also useful for other ULP's so I'm not convinced
> that ib_process_cq_direct() should be deprecated :-)

As I explained, that is a fine idea, but I can't see how SRP is able
to correctly do sendq flow control without spinning on the poll, which
it does not do.

I'm guessing SRP is trying to drive sendq flow control from the recv
side, like NFS was. This is wrong and should not be part of the common
API.

Does that make sense?

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-23 21:28                       ` Jason Gunthorpe
@ 2015-11-23 21:54                         ` Bart Van Assche
  2015-11-23 22:18                           ` Jason Gunthorpe
  0 siblings, 1 reply; 140+ messages in thread
From: Bart Van Assche @ 2015-11-23 21:54 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On 11/23/2015 01:28 PM, Jason Gunthorpe wrote:
> On Mon, Nov 23, 2015 at 01:04:25PM -0800, Bart Van Assche wrote:
>
>> Considerable time ago the send queue in the SRP initiator driver was
>> modified from signaled to non-signaled to reduce the number of interrupts
>> triggered by the SRP initiator driver. The SRP initiator driver polls the
>> send queue every time before a SCSI command is sent to the target. I think
>> this is a pattern that is also useful for other ULP's so I'm not convinced
>> that ib_process_cq_direct() should be deprecated :-)
>
> As I explained, that is a fine idea, but I can't see how SRP is able
> to correctly do sendq flow control without spinning on the poll, which
> it does not do.
>
> I'm guessing SRP is trying to drive sendq flow control from the recv
> side, like NFS was. This is wrong and should not be part of the common
> API.
>
> Does that make sense?

Not really ... Please have a look at the SRP initiator source code. What
the SRP initiator does is to poll the send queue before sending a new
SCSI command to the target system. I think this approach could
also be used in other ULP drivers if the send queue poll frequency is
such that no send queue overflow occurs.

Bart.


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-23 21:54                         ` Bart Van Assche
@ 2015-11-23 22:18                           ` Jason Gunthorpe
  2015-11-23 22:33                             ` Bart Van Assche
  0 siblings, 1 reply; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-23 22:18 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 01:54:05PM -0800, Bart Van Assche wrote:

> Not really ... Please have a look at the SRP initiator source code. What the
> SRP initiator does is to poll the send queue before sending a new
> SCSI

I see that. What I don't see is how SRP handles things when the
sendq fills up, ie the case where __srp_get_tx_iu() == NULL. It looks
like the driver starts to panic and generates printks. I can't tell if
it can survive that, but it doesn't look very good..

It would be a lot better if this wasn't allowed to happen: the polling
loop could run until sendq space becomes available, and never return NULL
from __srp_get_tx_iu().

I.e., __srp_get_tx_iu() should look more like

   ib_poll_cq_until(..., !list_empty(&ch->free_tx));

Which would be a fairly sane core API for this direct usage. Ideally
the core code would sleep if possible and not just spin. Maybe also
protect it with a timer.
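
No such helper exists in this series; a rough sketch of the shape being
described here, busy-waiting only and without the suggested sleep or
timer, might look like:

    /*
     * Hypothetical helper, not part of the patch set: poll @cq until
     * @done(@arg) returns true.  A real version would sleep on the CQ
     * event when possible and give up after a timeout.
     */
    static void ib_poll_cq_until(struct ib_cq *cq,
                                 bool (*done)(void *arg), void *arg)
    {
        while (!done(arg)) {
            ib_process_cq_direct(cq);
            cpu_relax();
        }
    }

    static bool tx_iu_available(void *arg)
    {
        struct srp_rdma_ch *ch = arg;

        return !list_empty(&ch->free_tx);
    }

    /* usage matching the predicate above */
    ib_poll_cq_until(ch->send_cq, tx_iu_available, ch);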

> command to the target system starts. I think this approach could also be
> used in other ULP drivers if the send queue poll frequency is such that no
> send queue overflow occurs.

Yes, I agree, but it has to be done properly :)

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-23 22:18                           ` Jason Gunthorpe
@ 2015-11-23 22:33                             ` Bart Van Assche
  2015-11-23 23:06                               ` Jason Gunthorpe
  0 siblings, 1 reply; 140+ messages in thread
From: Bart Van Assche @ 2015-11-23 22:33 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On 11/23/2015 02:18 PM, Jason Gunthorpe wrote:
> On Mon, Nov 23, 2015 at 01:54:05PM -0800, Bart Van Assche wrote:
> What I don't see is how SRP handles things when the
> sendq fills up, ie the case where __srp_get_tx_iu() == NULL. It looks
> like the driver starts to panic and generates printks. I can't tell if
> it can survive that, but it doesn't look very good..

Hello Jason,

From srp_cm_rep_handler():

		target->scsi_host->can_queue
			= min(ch->req_lim - SRP_TSK_MGMT_SQ_SIZE,
			      target->scsi_host->can_queue);

In other words, the SCSI core is told to ensure that the number of 
outstanding SCSI commands is one less than the number of elements in the 
ch->free_tx list. And since the SRP initiator serializes task management 
requests it is guaranteed that __srp_get_tx_iu() won't fail due to 
ch->free_tx being empty.

Bart.

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
  2015-11-23 22:33                             ` Bart Van Assche
@ 2015-11-23 23:06                               ` Jason Gunthorpe
       [not found]                                 ` <B24F4DDE-709A-4D2D-8B26-4E83325DBB1A@asomi.com>
  0 siblings, 1 reply; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-23 23:06 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, linux-rdma, sagig, axboe, linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 02:33:05PM -0800, Bart Van Assche wrote:
> On 11/23/2015 02:18 PM, Jason Gunthorpe wrote:
> >On Mon, Nov 23, 2015 at 01:54:05PM -0800, Bart Van Assche wrote:
> >What I don't see is how SRP handles things when the
> >sendq fills up, ie the case where __srp_get_tx_iu() == NULL. It looks
> >like the driver starts to panic and generates printks. I can't tell if
> >it can survive that, but it doesn't look very good..
> 
> Hello Jason,
> 
> From srp_cm_rep_handler():
> 
> 		target->scsi_host->can_queue
> 			= min(ch->req_lim - SRP_TSK_MGMT_SQ_SIZE,
> 			      target->scsi_host->can_queue);
> 
> In other words, the SCSI core is told to ensure that the number of
> outstanding SCSI commands is one less than the number of elements in the
> ch->free_tx list. And since the SRP initiator serializes task management
> requests it is guaranteed that __srp_get_tx_iu() won't fail due to
> ch->free_tx being empty.

I realize that, and as I already explained, SRP cannot drive the sendq
flow control from the recv side.

The SCSI core considers the command complete and will issue a new
command as soon as the recv completion associated with the command is
returned. (ie when the remote responds)

This *DOES NOT* say anything about the state of the sendq: it does not
guarantee there is a send CQ entry available for the associated send, and it
does not guarantee there is available space in the sendq. Verbs DOES
NOT make ordering guarantees between queues, even if the queues are
causally related.

This is an important point in verbs and it is commonly done wrong.

So, yes, __srp_get_tx_iu absolutely can fail due to ch->free_tx being
empty, even though by observing the recv side SRP has inferred that
the sendq should have space.

Every ULP has to cope with this, and a direct poll API that doesn't
account for the need to block on a predicate is broken by design.
This is why I object.

'ib_poll_cq_until' would correct all of this.

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
       [not found]                                 ` <B24F4DDE-709A-4D2D-8B26-4E83325DBB1A@asomi.com>
@ 2015-11-24  0:00                                   ` Jason Gunthorpe
       [not found]                                     ` <20151124000011.GA9301-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-24  0:00 UTC (permalink / raw)
  To: Caitlin Bestler
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, sagig, axboe,
	linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 03:30:42PM -0800, Caitlin Bestler wrote:
>    The receive completion can be safely assumed to indicate transmit
>    completion over a reliable connection unless your peer has gone
>    completely bonkers and is replying to a command that it did not
>    receive.

Perhaps iWarp is different and does specify this ordering but IB does
not.

The issue with IB is how the ACK protocol is designed. There is not
strong ordering between ACKs and data transfers. A HCA can send
ACK,DATA and the network could drop the ACK. The receiver side does
not know the ACK was lost and goes ahead to process DATA.

Since only ACK advances the sendq and DATA advances the recvq it is
trivial to get a case where the recvq is advanced with a reply while
the sendq continues to wait for the ACK to be resent.

Further IB allows ACK coalescing and has no rules for how an ACK is
placed. It is entirely valid for a HCA to RECV,REPLY,ACK - for
instance.

>    I actually had a bug in an early iWARP emulation where the simulated
>    peer, because it was simulated, responded
>    instantly. The result was a TCP segment that both acked the
>    transmission *and* contained the reply. The bug was
>    that the code processed the reception before the transmission ack,
>    causing the receive completion to be placed
>    on the completion queue before transmit completion.

I don't know if iWARP has the same lax ordering as IB, but certainly,
what you describe is legal for IB verbs to do, and our kernel ULPs
have to cope with it.

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-24  0:34                                         ` Tom Talpey
  0 siblings, 0 replies; 140+ messages in thread
From: Tom Talpey @ 2015-11-24  0:34 UTC (permalink / raw)
  To: Jason Gunthorpe, Caitlin Bestler
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, sagig, axboe,
	linux-scsi, linux-kernel

On 11/23/2015 7:00 PM, Jason Gunthorpe wrote:
> On Mon, Nov 23, 2015 at 03:30:42PM -0800, Caitlin Bestler wrote:
>>     The receive completion can be safely assumed to indicate transmit
>>     completion over a reliable connection unless your peer has gone
>>     completely bonkers and is replying to a command that it did not
>>     receive.
>
> Perhaps iWarp is different and does specify this ordering but IB does
> not.

iWARP is not different. The situation you (Jason) describe has
nothing to do with the transport. It has everything to do with,
as you point out, the lack of causality between send and receive
completions.

It is entirely possible for the reply to be received before the
send is fully processed. For example, the send might be issued
on one core, and that core scheduled away before the completion
for the send is ready. In the meantime, the request goes on
the wire, the target processes it and replies, and the reply
is processed. Boom, the send queue completion is still pending.

Been there, seen that. Bluescreened on it, mysteriously.

A really good way to see this is with software providers, btw.
Try it with soft{roce,iwarp}, under heavy load.

Tom.

>
> The issue with IB is how the ACK protocol is designed. There is not
> strong ordering between ACKs and data transfers. A HCA can send
> ACK,DATA and the network could drop the ACK. The recevier side does
> not know the ACK was lost and goes ahead to process DATA.
>
> Since only ACK advances the sendq and DATA advances the recvq it is
> trivial to get a case where the recvq is advanced with a reply while
> the sendq continues to wait for the ACK to be resent.
>
> Further IB allows ACK coalescing and has no rules for how an ACK is
> placed. It is entirely valid for a HCA to RECV,REPLY,ACK - for
> instance.
>
>>     I actually had a bug in an early iWARP emulation where the simulated
>>     peer, because it was simulated, responded
>>     instantly. The result was a TCP segment that both acked the
>>     transmission *and* contained the reply. The bug was
>>     that the code processed the reception before the transmission ack,
>>     causing the receive completion to be placed
>>     on the completion queue before transmit completion.
>
> I don't know if iWARP has the same lax ordering as IB, but certainly,
> what you describe is legal for IB verbs to do, and our kernel ULPs
> have to cope with it.
>
> Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-24  0:40                                             ` Jason Gunthorpe
  0 siblings, 0 replies; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-24  0:40 UTC (permalink / raw)
  To: Tom Talpey
  Cc: Caitlin Bestler, Bart Van Assche, Christoph Hellwig, linux-rdma,
	sagig, axboe, linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 07:34:53PM -0500, Tom Talpey wrote:

> Been there, seen that. Bluescreened on it, mysteriously.

Yes, me too :(

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-24  2:35                                         ` Caitlin Bestler
  0 siblings, 0 replies; 140+ messages in thread
From: Caitlin Bestler @ 2015-11-24  2:35 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, sagig, axboe,
	linux-scsi, linux-kernel



On 11/23/2015 4:00 PM, Jason Gunthorpe wrote:
> On Mon, Nov 23, 2015 at 03:30:42PM -0800, Caitlin Bestler wrote:
>>     The receive completion can be safely assumed to indicate transmit
>>     completion over a reliable connection unless your peer has gone
>>     completely bonkers and is replying to a command that it did not
>>     receive.
> Perhaps iWarp is different and does specify this ordering but IB does
> not.
>
> The issue with IB is how the ACK protocol is designed. There is not
> strong ordering between ACKs and data transfers. A HCA can send
> ACK,DATA and the network could drop the ACK. The receiver side does
> not know the ACK was lost and goes ahead to process DATA.
>
> Since only ACK advances the sendq and DATA advances the recvq it is
> trivial to get a case where the recvq is advanced with a reply while
> the sendq continues to wait for the ACK to be resent.
>
> Further IB allows ACK coalescing and has no rules for how an ACK is
> placed. It is entirely valid for a HCA to RECV,REPLY,ACK - for
> instance.
>
>
Is it possible for an IB HCA to transmit a response on a QP without, in
that packet or a previous packet, acknowledging something that it has
already delivered to the user?

My recollection of the IB verbs is that they were unlikely to have
overlooked something like that. If it did slip through then there
should be an erratum.

But regardless of specification lawyering, is this an implementation
issue? Are there actual HCAs that make this mistake?


^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-24  7:03                                             ` Jason Gunthorpe
  0 siblings, 0 replies; 140+ messages in thread
From: Jason Gunthorpe @ 2015-11-24  7:03 UTC (permalink / raw)
  To: Caitlin Bestler
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, sagig, axboe,
	linux-scsi, linux-kernel

On Mon, Nov 23, 2015 at 06:35:28PM -0800, Caitlin Bestler wrote:

> Is it possible for an IB HCA to transmit a response on a QP without,
> in that packet or a previous packet, acknowledging something that it
> has already delivered to the user?

AFAIK, the rules of ack coalescing do not interact with the send
side. Even if they did, that is the wrong place to look at.

We must look at the receiver. Ordered ack,data on the wire may suffer
a packet loss and the ack may not reach the receiver. In this case, can
the receiver detect the lost ack and not progress the data? For IB, it
cannot. The ack sequencing is part of the transmitter's recv FSM, and
does not interact with the send FSM.

I feel this is a deliberate IB design choice to be simple and efficient
in hardware.

> My recollection of the IB verbs is that they were unlikely to have
> overlooked something like that. If it did slip through then there
> should be an erratum.

verbs reflects the wire protocol and the wire protocol has no way to
create a linkage between the send and recv sides of an RC QP. It is not
a spec bug, and there is no erratum.

> But regardless of specification lawyering, is this an implementation
> issue.

All IB implementations have no choice but to act this way - the wire
protocol provides no way to guarantee ack vs data sequencing at the
receiver, so there is simply no way to guarantee the sendq advances
strictly causally with the recvq.

> Are there actual HCAs that make this mistake?

All IB HCAs have this behavior and require apps to see a send CQ
completion before making any statements about the state of the send Q
or buffers handed over to the HCA. Tom and I have seen this in real
systems under proper stress conditions. [Which is why I am so certain
about this, because when I first hit it years ago I dug into the spec
and figured out it was not a HW bug I was looking at]

This is a direct consequence of how IB runs the ACK protocol.

Jason

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH 2/9] IB: add a proper completion queue abstraction
@ 2015-11-24 12:52                                                 ` Tom Talpey
  0 siblings, 0 replies; 140+ messages in thread
From: Tom Talpey @ 2015-11-24 12:52 UTC (permalink / raw)
  To: Jason Gunthorpe, Caitlin Bestler
  Cc: Bart Van Assche, Christoph Hellwig, linux-rdma, sagig, axboe,
	linux-scsi, linux-kernel

On 11/24/2015 2:03 AM, Jason Gunthorpe wrote:
> On Mon, Nov 23, 2015 at 06:35:28PM -0800, Caitlin Bestler wrote:
>> Are there actual HCAs that make this mistake?
>
> All IB HCAs have this behavior and require apps to see a send CQ
> completion before making any statements about the state of the send Q
> or buffers handed over to the HCA. Tom and I have seen this in real
> systems under proper stress conditions. [Which is why I am so certain
> about this, because when I first hit it years ago I dug into the spec
> and figured out it was not a HW bug I was looking at]

To be clear, I saw the reply-completion-before-request-completion on
Windows, not Linux, but the principle is identical. It's simply a
fact of life on a multiprocessor, unless you want to throw in locks
and synchronization rules that drivers have to follow to enforce
ordered completions across queues. Which trust me, you don't.

In Windows SMB Direct, we added reference counts around pretty much
every verb interaction associated with each upper layer operation,
and did not retire them until all refcounts went to zero. It is
excruciatingly correct yet performs incredibly well.
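
In Linux terms the same idea could be sketched with a per-request kref:
take one reference for the send completion and one for the expected reply,
and free the request only when both have been dropped. The names below are
illustrative, not taken from any driver:

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct example_req {
        struct kref ref;
        /* buffers, DMA mappings, ... */
    };

    static void example_req_release(struct kref *ref)
    {
        struct example_req *req = container_of(ref, struct example_req, ref);

        kfree(req);    /* safe: both send and reply completions are done */
    }

    static struct example_req *example_req_alloc(void)
    {
        struct example_req *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (!req)
            return NULL;
        /* one reference for the send completion, one for the reply */
        kref_init(&req->ref);
        kref_get(&req->ref);
        return req;
    }

    /* called from both the send completion and the reply completion */
    static void example_req_put(struct example_req *req)
    {
        kref_put(&req->ref, example_req_release);
    }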

Tom.

^ permalink raw reply	[flat|nested] 140+ messages in thread

end of thread, other threads:[~2015-11-24 12:59 UTC | newest]

Thread overview: 140+ messages
2015-11-13 13:46 add a proper completion queue abstraction Christoph Hellwig
2015-11-13 13:46 ` [PATCH 1/9] move blk_iopoll to limit and make it generally available Christoph Hellwig
     [not found]   ` <1447422410-20891-2-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-13 15:23     ` Or Gerlitz
2015-11-13 15:23       ` Or Gerlitz
     [not found]       ` <CAJ3xEMgj2ycv61K38ZOowTRbrri_UhQgBcaKT0ZnnMHiBrmL5A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-11-14  7:02         ` Christoph Hellwig
2015-11-14  7:02           ` Christoph Hellwig
     [not found]           ` <20151114070200.GA27738-jcswGhMUV9g@public.gmane.org>
2015-11-15  8:48             ` Sagi Grimberg
2015-11-15  8:48               ` Sagi Grimberg
     [not found]               ` <564846E9.9070301-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-15  9:04                 ` Or Gerlitz
2015-11-15  9:04                   ` Or Gerlitz
     [not found]                   ` <CAJ3xEMgvttM1D3bePz0CWhZAZ3gCSQsf_qgmq9Ny4gzK5d0bXw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-11-15 13:16                     ` Sagi Grimberg
2015-11-15 13:16                       ` Sagi Grimberg
2015-11-15 12:51                 ` Christoph Hellwig
2015-11-15 12:51                   ` Christoph Hellwig
2015-11-13 19:19     ` Bart Van Assche
2015-11-13 19:19       ` Bart Van Assche
2015-11-14  7:02       ` Christoph Hellwig
2015-11-17 17:16         ` Bart Van Assche
2015-11-17 17:16           ` Bart Van Assche
2015-11-17 17:27           ` Bart Van Assche
2015-11-18 13:58           ` Christoph Hellwig
2015-11-13 13:46 ` [PATCH 2/9] IB: add a proper completion queue abstraction Christoph Hellwig
2015-11-15  9:40   ` Sagi Grimberg
     [not found]     ` <564852F2.5080602-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-15 12:55       ` Christoph Hellwig
2015-11-15 12:55         ` Christoph Hellwig
     [not found]         ` <20151115125501.GB2218-jcswGhMUV9g@public.gmane.org>
2015-11-15 13:21           ` Sagi Grimberg
2015-11-15 13:21             ` Sagi Grimberg
     [not found]   ` <1447422410-20891-3-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-13 18:25     ` Jason Gunthorpe
2015-11-13 18:25       ` Jason Gunthorpe
     [not found]       ` <20151113182513.GB21808-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-11-13 19:57         ` Bart Van Assche
2015-11-13 19:57           ` Bart Van Assche
2015-11-13 22:06           ` Jason Gunthorpe
     [not found]             ` <20151113220636.GA32133-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-11-14  7:13               ` Christoph Hellwig
2015-11-14  7:13                 ` Christoph Hellwig
     [not found]                 ` <20151114071344.GE27738-jcswGhMUV9g@public.gmane.org>
2015-11-23 20:37                   ` Jason Gunthorpe
2015-11-23 20:37                     ` Jason Gunthorpe
2015-11-23 21:04                     ` Bart Van Assche
2015-11-23 21:04                       ` Bart Van Assche
2015-11-23 21:28                       ` Jason Gunthorpe
2015-11-23 21:54                         ` Bart Van Assche
2015-11-23 22:18                           ` Jason Gunthorpe
2015-11-23 22:33                             ` Bart Van Assche
2015-11-23 23:06                               ` Jason Gunthorpe
     [not found]                                 ` <B24F4DDE-709A-4D2D-8B26-4E83325DBB1A@asomi.com>
2015-11-24  0:00                                   ` Jason Gunthorpe
     [not found]                                     ` <20151124000011.GA9301-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-11-24  0:34                                       ` Tom Talpey
2015-11-24  0:34                                         ` Tom Talpey
     [not found]                                         ` <5653B0AD.7090402-CLs1Zie5N5HQT0dZR+AlfA@public.gmane.org>
2015-11-24  0:40                                           ` Jason Gunthorpe
2015-11-24  0:40                                             ` Jason Gunthorpe
2015-11-24  2:35                                       ` Caitlin Bestler
2015-11-24  2:35                                         ` Caitlin Bestler
     [not found]                                         ` <5653CCF0.7050501-DpaxOq6QOWMAvxtiuMwx3w@public.gmane.org>
2015-11-24  7:03                                           ` Jason Gunthorpe
2015-11-24  7:03                                             ` Jason Gunthorpe
     [not found]                                             ` <20151124070301.GA23597-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-11-24 12:52                                               ` Tom Talpey
2015-11-24 12:52                                                 ` Tom Talpey
2015-11-14  7:08         ` Christoph Hellwig
2015-11-14  7:08           ` Christoph Hellwig
2015-11-23 20:01           ` Jason Gunthorpe
     [not found]             ` <20151123200136.GA5640-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-11-23 20:57               ` Christoph Hellwig
2015-11-23 20:57                 ` Christoph Hellwig
2015-11-17 17:52     ` Bart Van Assche
2015-11-17 17:52       ` Bart Van Assche
     [not found]       ` <564B697A.2020601-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-11-18  7:55         ` Sagi Grimberg
2015-11-18  7:55           ` Sagi Grimberg
     [not found]           ` <564C2F01.6020407-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-18 18:20             ` Bart Van Assche
2015-11-18 18:20               ` Bart Van Assche
2015-11-20 10:16               ` Christoph Hellwig
2015-11-20 16:50                 ` Bart Van Assche
2015-11-22  9:51                   ` Sagi Grimberg
2015-11-22 10:13                     ` Christoph Hellwig
     [not found]                       ` <20151122101308.GA12189-jcswGhMUV9g@public.gmane.org>
2015-11-22 10:36                         ` Sagi Grimberg
2015-11-22 10:36                           ` Sagi Grimberg
     [not found]                           ` <56519A90.5010502-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-22 13:23                             ` Christoph Hellwig
2015-11-22 13:23                               ` Christoph Hellwig
     [not found]                               ` <20151122132352.GA14154-jcswGhMUV9g@public.gmane.org>
2015-11-22 14:57                                 ` Sagi Grimberg
2015-11-22 14:57                                   ` Sagi Grimberg
2015-11-22 16:55                                   ` Bart Van Assche
2015-11-18 14:00         ` Christoph Hellwig
2015-11-18 14:00           ` Christoph Hellwig
2015-11-13 13:46 ` [PATCH 3/9] IB: add a helper to safely drain a QP Christoph Hellwig
     [not found]   ` <1447422410-20891-4-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-13 16:16     ` Steve Wise
2015-11-13 16:16       ` Steve Wise
     [not found]       ` <56460CC4.3030001-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2015-11-14  7:05         ` Christoph Hellwig
2015-11-14  7:05           ` Christoph Hellwig
2015-11-18 11:32     ` Sagi Grimberg
2015-11-18 11:32       ` Sagi Grimberg
     [not found]       ` <564C61C3.3050307-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-18 14:06         ` Christoph Hellwig
2015-11-18 14:06           ` Christoph Hellwig
     [not found]           ` <20151118140645.GI18820-jcswGhMUV9g@public.gmane.org>
2015-11-18 15:21             ` Steve Wise
2015-11-18 15:21               ` Steve Wise
2015-11-15  9:34   ` Sagi Grimberg
     [not found]     ` <564851BB.1020004-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-16 16:38       ` Steve Wise
2015-11-16 16:38         ` Steve Wise
     [not found]         ` <564A067B.8030504-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2015-11-16 18:30           ` Steve Wise
2015-11-16 18:30             ` Steve Wise
2015-11-16 18:37             ` Sagi Grimberg
     [not found]               ` <564A2270.1040004-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-16 19:03                 ` Steve Wise
2015-11-16 19:03                   ` Steve Wise
2015-11-17  8:54                   ` Sagi Grimberg
2015-11-17  8:54                     ` Sagi Grimberg
2015-11-23 10:28                   ` Sagi Grimberg
2015-11-23 10:28                     ` Sagi Grimberg
2015-11-23 10:35                     ` Sagi Grimberg
2015-11-23 14:33                       ` 'Christoph Hellwig'
     [not found]                       ` <5652EC00.8010705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-11-23 14:48                         ` Steve Wise
2015-11-23 14:48                           ` Steve Wise
2015-11-23 14:44                     ` Steve Wise
2015-11-23 14:44                       ` Steve Wise
2015-11-17 17:06       ` Bart Van Assche
2015-11-17 17:06         ` Bart Van Assche
     [not found]         ` <564B5E7D.9030309-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-11-18  7:59           ` Sagi Grimberg
2015-11-18  7:59             ` Sagi Grimberg
     [not found] ` <1447422410-20891-1-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-13 13:46   ` [PATCH 4/9] srpt: chain RDMA READ/WRITE requests Christoph Hellwig
2015-11-13 13:46     ` Christoph Hellwig
2015-11-18  1:17     ` Bart Van Assche
2015-11-18  1:17       ` Bart Van Assche
2015-11-18  9:15       ` Sagi Grimberg
2015-11-18 16:32         ` Bart Van Assche
2015-11-18 16:32           ` Bart Van Assche
     [not found]           ` <564CA83B.4060403-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-11-20 10:20             ` Christoph Hellwig
2015-11-20 10:20               ` Christoph Hellwig
     [not found]       ` <564BD1AF.60200-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-11-18 14:06         ` Christoph Hellwig
2015-11-18 14:06           ` Christoph Hellwig
2015-11-13 13:46 ` [PATCH 5/9] srpt: use the new CQ API Christoph Hellwig
     [not found]   ` <1447422410-20891-6-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-17 18:22     ` Bart Van Assche
2015-11-17 18:22       ` Bart Van Assche
2015-11-17 19:38   ` Bart Van Assche
2015-11-17 19:38     ` Bart Van Assche
     [not found]     ` <564B8248.7050407-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-11-18 14:03       ` Christoph Hellwig
2015-11-18 14:03         ` Christoph Hellwig
2015-11-13 13:46 ` [PATCH 6/9] srp: " Christoph Hellwig
     [not found]   ` <1447422410-20891-7-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-17 19:56     ` Bart Van Assche
2015-11-17 19:56       ` Bart Van Assche
2015-11-18 14:03       ` Christoph Hellwig
2015-11-13 13:46 ` [PATCH 7/9] IB/iser: Use a dedicated descriptor for login Christoph Hellwig
     [not found]   ` <1447422410-20891-8-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-15  9:14     ` Or Gerlitz
2015-11-15  9:14       ` Or Gerlitz
2015-11-13 13:46 ` [PATCH 8/9] IB/iser: Use helper for container_of Christoph Hellwig
2015-11-13 13:46 ` [PATCH 9/9] IB/iser: Convert to CQ abstraction Christoph Hellwig
     [not found]   ` <1447422410-20891-10-git-send-email-hch-jcswGhMUV9g@public.gmane.org>
2015-11-15  9:21     ` Or Gerlitz
2015-11-15  9:21       ` Or Gerlitz
