All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH] nvme: utilize two queue maps, one for reads and one for writes
@ 2018-11-14  0:41 Guenter Roeck
  2018-11-14  0:51   ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Guenter Roeck @ 2018-11-14  0:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, linux-kernel

Hi,

On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote:
> NVMe does round-robin between queues by default, which means that
> sharing a queue map for both reads and writes can be problematic
> in terms of read servicing. It's much easier to flood the queue
> with writes and reduce the read servicing.
> 
> Implement two queue maps, one for reads and one for writes. The
> write queue count is configurable through the 'write_queues'
> parameter.
> 
> By default, we retain the previous behavior of having a single
> queue set, shared between reads and writes. Setting 'write_queues'
> to a non-zero value will create two queue sets, one for reads and
> one for writes, the latter using the configurable number of
> queues (hardware queue counts permitting).
> 
> Reviewed-by: Hannes Reinecke <hare@suse.com>
> Reviewed-by: Keith Busch <keith.busch@intel.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

This patch causes hangs when running recent versions of
-next with several architectures; see the -next column at
kerneltests.org/builders for details.  Bisect log below; this
was run with qemu on alpha. Reverting this patch as well as
"nvme: add separate poll queue map" fixes the problem.

Guenter

---
# bad: [220dcf1c6fc97f8873b6d9fe121b80decd4b71a8] Add linux-next specific files for 20181113
# good: [ccda4af0f4b92f7b4c308d3acc262f4a7e3affad] Linux 4.20-rc2
git bisect start 'HEAD' 'v4.20-rc2'
# good: [b5ae1d7e1bd7cf5dfdef977774da5cb69c60c911] Merge remote-tracking branch 'crypto/master'
git bisect good b5ae1d7e1bd7cf5dfdef977774da5cb69c60c911
# bad: [be877b847ffe96fb18e4925f05d5c2eb12b6b9f1] Merge remote-tracking branch 'block/for-next'
git bisect bad be877b847ffe96fb18e4925f05d5c2eb12b6b9f1
# good: [088122c5028636643d566c89cfdc621beed79974] Merge remote-tracking branch 'drm-intel/for-linux-next'
git bisect good 088122c5028636643d566c89cfdc621beed79974
# good: [3577f74d5ae898c34252c5cdc096a681910a919f] Merge remote-tracking branch 'drm-misc/for-linux-next'
git bisect good 3577f74d5ae898c34252c5cdc096a681910a919f
# good: [72008e28c7bf500a03f09929176bd025de94861b] Merge remote-tracking branch 'sound-asoc/for-next'
git bisect good 72008e28c7bf500a03f09929176bd025de94861b
# bad: [d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48] block: add REQ_HIPRI and inherit it from IOCB_HIPRI
git bisect bad d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48
# good: [4316b79e4321d4140164e42f228778e5bc66c84f] block: kill legacy parts of timeout handling
git bisect good 4316b79e4321d4140164e42f228778e5bc66c84f
# good: [a0fedc857dff457e123aeaa2039d28ac90e543df] Merge branch 'irq/for-block' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-4.21/block
git bisect good a0fedc857dff457e123aeaa2039d28ac90e543df
# good: [b3c661b15d5ab11d982e58bee23e05c1780528a1] blk-mq: support multiple hctx maps
git bisect good b3c661b15d5ab11d982e58bee23e05c1780528a1
# good: [67cae4c948a5311121905a2a8740c50daf7f6478] blk-mq: cleanup and improve list insertion
git bisect good 67cae4c948a5311121905a2a8740c50daf7f6478
# good: [843477d4cc5c4bb4e346c561ecd3b9d0bd67e8c8] blk-mq: initial support for multiple queue maps
git bisect good 843477d4cc5c4bb4e346c561ecd3b9d0bd67e8c8
# bad: [3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992] nvme: utilize two queue maps, one for reads and one for writes
git bisect bad 3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992
# first bad commit: [3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992] nvme: utilize two queue maps, one for reads and one for writes

> ---
>  drivers/nvme/host/pci.c | 200 +++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 181 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 49ad854d1b91..1987df13b73e 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -74,11 +74,29 @@ static int io_queue_depth = 1024;
>  module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
>  MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
>  
> +static int queue_count_set(const char *val, const struct kernel_param *kp);
> +static const struct kernel_param_ops queue_count_ops = {
> +	.set = queue_count_set,
> +	.get = param_get_int,
> +};
> +
> +static int write_queues;
> +module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
> +MODULE_PARM_DESC(write_queues,
> +	"Number of queues to use for writes. If not set, reads and writes "
> +	"will share a queue set.");
> +
>  struct nvme_dev;
>  struct nvme_queue;
>  
>  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
>  
> +enum {
> +	NVMEQ_TYPE_READ,
> +	NVMEQ_TYPE_WRITE,
> +	NVMEQ_TYPE_NR,
> +};
> +
>  /*
>   * Represents an NVM Express device.  Each nvme_dev is a PCI function.
>   */
> @@ -92,6 +110,7 @@ struct nvme_dev {
>  	struct dma_pool *prp_small_pool;
>  	unsigned online_queues;
>  	unsigned max_qid;
> +	unsigned io_queues[NVMEQ_TYPE_NR];
>  	unsigned int num_vecs;
>  	int q_depth;
>  	u32 db_stride;
> @@ -134,6 +153,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
>  	return param_set_int(val, kp);
>  }
>  
> +static int queue_count_set(const char *val, const struct kernel_param *kp)
> +{
> +	int n = 0, ret;
> +
> +	ret = kstrtoint(val, 10, &n);
> +	if (n > num_possible_cpus())
> +		n = num_possible_cpus();
> +
> +	return param_set_int(val, kp);
> +}
> +
>  static inline unsigned int sq_idx(unsigned int qid, u32 stride)
>  {
>  	return qid * 2 * stride;
> @@ -218,9 +248,20 @@ static inline void _nvme_check_size(void)
>  	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
>  }
>  
> +static unsigned int max_io_queues(void)
> +{
> +	return num_possible_cpus() + write_queues;
> +}
> +
> +static unsigned int max_queue_count(void)
> +{
> +	/* IO queues + admin queue */
> +	return 1 + max_io_queues();
> +}
> +
>  static inline unsigned int nvme_dbbuf_size(u32 stride)
>  {
> -	return ((num_possible_cpus() + 1) * 8 * stride);
> +	return (max_queue_count() * 8 * stride);
>  }
>  
>  static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
> @@ -431,12 +472,41 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
>  	return 0;
>  }
>  
> +static int queue_irq_offset(struct nvme_dev *dev)
> +{
> +	/* if we have more than 1 vec, admin queue offsets us by 1 */
> +	if (dev->num_vecs > 1)
> +		return 1;
> +
> +	return 0;
> +}
> +
>  static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
>  {
>  	struct nvme_dev *dev = set->driver_data;
> +	int i, qoff, offset;
> +
> +	offset = queue_irq_offset(dev);
> +	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
> +		struct blk_mq_queue_map *map = &set->map[i];
> +
> +		map->nr_queues = dev->io_queues[i];
> +		if (!map->nr_queues) {
> +			BUG_ON(i == NVMEQ_TYPE_READ);
>  
> -	return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev),
> -			dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
> +			/* shared set, resuse read set parameters */
> +			map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ];
> +			qoff = 0;
> +			offset = queue_irq_offset(dev);
> +		}
> +
> +		map->queue_offset = qoff;
> +		blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
> +		qoff += map->nr_queues;
> +		offset += map->nr_queues;
> +	}
> +
> +	return 0;
>  }
>  
>  /**
> @@ -849,6 +919,14 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>  
> +static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags)
> +{
> +	if ((flags & REQ_OP_MASK) == REQ_OP_READ)
> +		return NVMEQ_TYPE_READ;
> +
> +	return NVMEQ_TYPE_WRITE;
> +}
> +
>  static void nvme_pci_complete_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -1475,13 +1553,14 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
>  };
>  
>  static const struct blk_mq_ops nvme_mq_ops = {
> -	.queue_rq	= nvme_queue_rq,
> -	.complete	= nvme_pci_complete_rq,
> -	.init_hctx	= nvme_init_hctx,
> -	.init_request	= nvme_init_request,
> -	.map_queues	= nvme_pci_map_queues,
> -	.timeout	= nvme_timeout,
> -	.poll		= nvme_poll,
> +	.queue_rq		= nvme_queue_rq,
> +	.rq_flags_to_type	= nvme_rq_flags_to_type,
> +	.complete		= nvme_pci_complete_rq,
> +	.init_hctx		= nvme_init_hctx,
> +	.init_request		= nvme_init_request,
> +	.map_queues		= nvme_pci_map_queues,
> +	.timeout		= nvme_timeout,
> +	.poll			= nvme_poll,
>  };
>  
>  static void nvme_dev_remove_admin(struct nvme_dev *dev)
> @@ -1891,6 +1970,87 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
>  	return ret;
>  }
>  
> +static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
> +{
> +	unsigned int this_w_queues = write_queues;
> +
> +	/*
> +	 * Setup read/write queue split
> +	 */
> +	if (nr_io_queues == 1) {
> +		dev->io_queues[NVMEQ_TYPE_READ] = 1;
> +		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
> +		return;
> +	}
> +
> +	/*
> +	 * If 'write_queues' is set, ensure it leaves room for at least
> +	 * one read queue
> +	 */
> +	if (this_w_queues >= nr_io_queues)
> +		this_w_queues = nr_io_queues - 1;
> +
> +	/*
> +	 * If 'write_queues' is set to zero, reads and writes will share
> +	 * a queue set.
> +	 */
> +	if (!this_w_queues) {
> +		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
> +		dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues;
> +	} else {
> +		dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues;
> +		dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues;
> +	}
> +}
> +
> +static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev->dev);
> +	int irq_sets[2];
> +	struct irq_affinity affd = {
> +		.pre_vectors = 1,
> +		.nr_sets = ARRAY_SIZE(irq_sets),
> +		.sets = irq_sets,
> +	};
> +	int result;
> +
> +	/*
> +	 * For irq sets, we have to ask for minvec == maxvec. This passes
> +	 * any reduction back to us, so we can adjust our queue counts and
> +	 * IRQ vector needs.
> +	 */
> +	do {
> +		nvme_calc_io_queues(dev, nr_io_queues);
> +		irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ];
> +		irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE];
> +		if (!irq_sets[1])
> +			affd.nr_sets = 1;
> +
> +		/*
> +		 * Need IRQs for read+write queues, and one for the admin queue
> +		 */
> +		nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
> +
> +		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
> +				nr_io_queues,
> +				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
> +
> +		/*
> +		 * Need to reduce our vec counts
> +		 */
> +		if (result == -ENOSPC) {
> +			nr_io_queues--;
> +			if (!nr_io_queues)
> +				return result;
> +			continue;
> +		} else if (result <= 0)
> +			return -EIO;
> +		break;
> +	} while (1);
> +
> +	return result;
> +}
> +
>  static int nvme_setup_io_queues(struct nvme_dev *dev)
>  {
>  	struct nvme_queue *adminq = &dev->queues[0];
> @@ -1898,11 +2058,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>  	int result, nr_io_queues;
>  	unsigned long size;
>  
> -	struct irq_affinity affd = {
> -		.pre_vectors = 1
> -	};
> -
> -	nr_io_queues = num_possible_cpus();
> +	nr_io_queues = max_io_queues();
>  	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
>  	if (result < 0)
>  		return result;
> @@ -1937,13 +2093,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>  	 * setting up the full range we need.
>  	 */
>  	pci_free_irq_vectors(pdev);
> -	result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
> -			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
> +
> +	result = nvme_setup_irqs(dev, nr_io_queues);
>  	if (result <= 0)
>  		return -EIO;
> +
>  	dev->num_vecs = result;
>  	dev->max_qid = max(result - 1, 1);
>  
> +	dev_info(dev->ctrl.device, "%d/%d read/write queues\n",
> +					dev->io_queues[NVMEQ_TYPE_READ],
> +					dev->io_queues[NVMEQ_TYPE_WRITE]);
> +
>  	/*
>  	 * Should investigate if there's a performance win from allocating
>  	 * more queues than interrupt vectors; it might allow the submission
> @@ -2045,6 +2206,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
>  	if (!dev->ctrl.tagset) {
>  		dev->tagset.ops = &nvme_mq_ops;
>  		dev->tagset.nr_hw_queues = dev->online_queues - 1;
> +		dev->tagset.nr_maps = NVMEQ_TYPE_NR;
>  		dev->tagset.timeout = NVME_IO_TIMEOUT;
>  		dev->tagset.numa_node = dev_to_node(dev->dev);
>  		dev->tagset.queue_depth =
> @@ -2491,8 +2653,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (!dev)
>  		return -ENOMEM;
>  
> -	dev->queues = kcalloc_node(num_possible_cpus() + 1,
> -			sizeof(struct nvme_queue), GFP_KERNEL, node);
> +	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
> +					GFP_KERNEL, node);
>  	if (!dev->queues)
>  		goto free;
>  
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] nvme: utilize two queue maps, one for reads and one for writes
  2018-11-14  0:41 [PATCH] nvme: utilize two queue maps, one for reads and one for writes Guenter Roeck
@ 2018-11-14  0:51   ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2018-11-14  0:51 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, linux-kernel

On 11/13/18 5:41 PM, Guenter Roeck wrote:
> Hi,
> 
> On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote:
>> NVMe does round-robin between queues by default, which means that
>> sharing a queue map for both reads and writes can be problematic
>> in terms of read servicing. It's much easier to flood the queue
>> with writes and reduce the read servicing.
>>
>> Implement two queue maps, one for reads and one for writes. The
>> write queue count is configurable through the 'write_queues'
>> parameter.
>>
>> By default, we retain the previous behavior of having a single
>> queue set, shared between reads and writes. Setting 'write_queues'
>> to a non-zero value will create two queue sets, one for reads and
>> one for writes, the latter using the configurable number of
>> queues (hardware queue counts permitting).
>>
>> Reviewed-by: Hannes Reinecke <hare@suse.com>
>> Reviewed-by: Keith Busch <keith.busch@intel.com>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> This patch causes hangs when running recent versions of
> -next with several architectures; see the -next column at
> kerneltests.org/builders for details.  Bisect log below; this
> was run with qemu on alpha. Reverting this patch as well as
> "nvme: add separate poll queue map" fixes the problem.

I don't see anything related to what hung, the trace, and so on.
Can you clue me in? Where are the test results with dmesg?

How to reproduce?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] nvme: utilize two queue maps, one for reads and one for writes
@ 2018-11-14  0:51   ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2018-11-14  0:51 UTC (permalink / raw)


On 11/13/18 5:41 PM, Guenter Roeck wrote:
> Hi,
> 
> On Wed, Oct 31, 2018@08:36:31AM -0600, Jens Axboe wrote:
>> NVMe does round-robin between queues by default, which means that
>> sharing a queue map for both reads and writes can be problematic
>> in terms of read servicing. It's much easier to flood the queue
>> with writes and reduce the read servicing.
>>
>> Implement two queue maps, one for reads and one for writes. The
>> write queue count is configurable through the 'write_queues'
>> parameter.
>>
>> By default, we retain the previous behavior of having a single
>> queue set, shared between reads and writes. Setting 'write_queues'
>> to a non-zero value will create two queue sets, one for reads and
>> one for writes, the latter using the configurable number of
>> queues (hardware queue counts permitting).
>>
>> Reviewed-by: Hannes Reinecke <hare at suse.com>
>> Reviewed-by: Keith Busch <keith.busch at intel.com>
>> Signed-off-by: Jens Axboe <axboe at kernel.dk>
> 
> This patch causes hangs when running recent versions of
> -next with several architectures; see the -next column at
> kerneltests.org/builders for details.  Bisect log below; this
> was run with qemu on alpha. Reverting this patch as well as
> "nvme: add separate poll queue map" fixes the problem.

I don't see anything related to what hung, the trace, and so on.
Can you clue me in? Where are the test results with dmesg?

How to reproduce?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: nvme: utilize two queue maps, one for reads and one for writes
  2018-11-14  0:51   ` Jens Axboe
@ 2018-11-14  1:28     ` Mike Snitzer
  -1 siblings, 0 replies; 11+ messages in thread
From: Mike Snitzer @ 2018-11-14  1:28 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Guenter Roeck, Keith Busch, Sagi Grimberg, linux-nvme, linux-kernel

On Tue, Nov 13 2018 at  7:51pm -0500,
Jens Axboe <axboe@kernel.dk> wrote:

> On 11/13/18 5:41 PM, Guenter Roeck wrote:
> > Hi,
> > 
> > On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote:
> >> NVMe does round-robin between queues by default, which means that
> >> sharing a queue map for both reads and writes can be problematic
> >> in terms of read servicing. It's much easier to flood the queue
> >> with writes and reduce the read servicing.
> >>
> >> Implement two queue maps, one for reads and one for writes. The
> >> write queue count is configurable through the 'write_queues'
> >> parameter.
> >>
> >> By default, we retain the previous behavior of having a single
> >> queue set, shared between reads and writes. Setting 'write_queues'
> >> to a non-zero value will create two queue sets, one for reads and
> >> one for writes, the latter using the configurable number of
> >> queues (hardware queue counts permitting).
> >>
> >> Reviewed-by: Hannes Reinecke <hare@suse.com>
> >> Reviewed-by: Keith Busch <keith.busch@intel.com>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > 
> > This patch causes hangs when running recent versions of
> > -next with several architectures; see the -next column at
> > kerneltests.org/builders for details.  Bisect log below; this
> > was run with qemu on alpha. Reverting this patch as well as
> > "nvme: add separate poll queue map" fixes the problem.
> 
> I don't see anything related to what hung, the trace, and so on.
> Can you clue me in? Where are the test results with dmesg?
> 
> How to reproduce?

Think Guenter should've provided a full kerneltests.org url, but I had a
look and found this for powerpc with -next:
https://kerneltests.org/builders/next-powerpc-next/builds/998/steps/buildcommand/logs/stdio

Has useful logs of the build failure due to block.

(not seeing any -next failure for alpha but Guenter said he was using
qemu so the build failure could've been any arch qemu supports)

Mike

^ permalink raw reply	[flat|nested] 11+ messages in thread

* nvme: utilize two queue maps, one for reads and one for writes
@ 2018-11-14  1:28     ` Mike Snitzer
  0 siblings, 0 replies; 11+ messages in thread
From: Mike Snitzer @ 2018-11-14  1:28 UTC (permalink / raw)


On Tue, Nov 13 2018 at  7:51pm -0500,
Jens Axboe <axboe@kernel.dk> wrote:

> On 11/13/18 5:41 PM, Guenter Roeck wrote:
> > Hi,
> > 
> > On Wed, Oct 31, 2018@08:36:31AM -0600, Jens Axboe wrote:
> >> NVMe does round-robin between queues by default, which means that
> >> sharing a queue map for both reads and writes can be problematic
> >> in terms of read servicing. It's much easier to flood the queue
> >> with writes and reduce the read servicing.
> >>
> >> Implement two queue maps, one for reads and one for writes. The
> >> write queue count is configurable through the 'write_queues'
> >> parameter.
> >>
> >> By default, we retain the previous behavior of having a single
> >> queue set, shared between reads and writes. Setting 'write_queues'
> >> to a non-zero value will create two queue sets, one for reads and
> >> one for writes, the latter using the configurable number of
> >> queues (hardware queue counts permitting).
> >>
> >> Reviewed-by: Hannes Reinecke <hare at suse.com>
> >> Reviewed-by: Keith Busch <keith.busch at intel.com>
> >> Signed-off-by: Jens Axboe <axboe at kernel.dk>
> > 
> > This patch causes hangs when running recent versions of
> > -next with several architectures; see the -next column at
> > kerneltests.org/builders for details.  Bisect log below; this
> > was run with qemu on alpha. Reverting this patch as well as
> > "nvme: add separate poll queue map" fixes the problem.
> 
> I don't see anything related to what hung, the trace, and so on.
> Can you clue me in? Where are the test results with dmesg?
> 
> How to reproduce?

Think Guenter should've provided a full kerneltests.org url, but I had a
look and found this for powerpc with -next:
https://kerneltests.org/builders/next-powerpc-next/builds/998/steps/buildcommand/logs/stdio

Has useful logs of the build failure due to block.

(not seeing any -next failure for alpha but Guenter said he was using
qemu so the build failure could've been any arch qemu supports)

Mike

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: nvme: utilize two queue maps, one for reads and one for writes
  2018-11-14  1:28     ` Mike Snitzer
@ 2018-11-14  1:36       ` Mike Snitzer
  -1 siblings, 0 replies; 11+ messages in thread
From: Mike Snitzer @ 2018-11-14  1:36 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Guenter Roeck, Keith Busch, Sagi Grimberg, linux-nvme, linux-kernel

On Tue, Nov 13 2018 at  8:28pm -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Tue, Nov 13 2018 at  7:51pm -0500,
> Jens Axboe <axboe@kernel.dk> wrote:
> 
> > On 11/13/18 5:41 PM, Guenter Roeck wrote:
> > > Hi,
> > > 
> > > On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote:
> > >> NVMe does round-robin between queues by default, which means that
> > >> sharing a queue map for both reads and writes can be problematic
> > >> in terms of read servicing. It's much easier to flood the queue
> > >> with writes and reduce the read servicing.
> > >>
> > >> Implement two queue maps, one for reads and one for writes. The
> > >> write queue count is configurable through the 'write_queues'
> > >> parameter.
> > >>
> > >> By default, we retain the previous behavior of having a single
> > >> queue set, shared between reads and writes. Setting 'write_queues'
> > >> to a non-zero value will create two queue sets, one for reads and
> > >> one for writes, the latter using the configurable number of
> > >> queues (hardware queue counts permitting).
> > >>
> > >> Reviewed-by: Hannes Reinecke <hare@suse.com>
> > >> Reviewed-by: Keith Busch <keith.busch@intel.com>
> > >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > > 
> > > This patch causes hangs when running recent versions of
> > > -next with several architectures; see the -next column at
> > > kerneltests.org/builders for details.  Bisect log below; this
> > > was run with qemu on alpha. Reverting this patch as well as
> > > "nvme: add separate poll queue map" fixes the problem.
> > 
> > I don't see anything related to what hung, the trace, and so on.
> > Can you clue me in? Where are the test results with dmesg?
> > 
> > How to reproduce?
> 
> Think Guenter should've provided a full kerneltests.org url, but I had a
> look and found this for powerpc with -next:
> https://kerneltests.org/builders/next-powerpc-next/builds/998/steps/buildcommand/logs/stdio
> 
> Has useful logs of the build failure due to block.

Take that back, of course I only had a quick look and first scrolled to
this fragment and thought "yeap shows block build failure" (not
_really_):

opt/buildbot/slave/next-next/build/kernel/sched/psi.c: In function 'cgroup_move_task':
/opt/buildbot/slave/next-next/build/include/linux/spinlock.h:273:32: warning: 'rq' may be used uninitialized in this function [-Wmaybe-uninitialized]
 #define raw_spin_unlock(lock)  _raw_spin_unlock(lock)
                                ^~~~~~~~~~~~~~~~
/opt/buildbot/slave/next-next/build/kernel/sched/psi.c:639:13: note: 'rq' was declared here
  struct rq *rq;
             ^~

^ permalink raw reply	[flat|nested] 11+ messages in thread

* nvme: utilize two queue maps, one for reads and one for writes
@ 2018-11-14  1:36       ` Mike Snitzer
  0 siblings, 0 replies; 11+ messages in thread
From: Mike Snitzer @ 2018-11-14  1:36 UTC (permalink / raw)


On Tue, Nov 13 2018 at  8:28pm -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Tue, Nov 13 2018 at  7:51pm -0500,
> Jens Axboe <axboe@kernel.dk> wrote:
> 
> > On 11/13/18 5:41 PM, Guenter Roeck wrote:
> > > Hi,
> > > 
> > > On Wed, Oct 31, 2018@08:36:31AM -0600, Jens Axboe wrote:
> > >> NVMe does round-robin between queues by default, which means that
> > >> sharing a queue map for both reads and writes can be problematic
> > >> in terms of read servicing. It's much easier to flood the queue
> > >> with writes and reduce the read servicing.
> > >>
> > >> Implement two queue maps, one for reads and one for writes. The
> > >> write queue count is configurable through the 'write_queues'
> > >> parameter.
> > >>
> > >> By default, we retain the previous behavior of having a single
> > >> queue set, shared between reads and writes. Setting 'write_queues'
> > >> to a non-zero value will create two queue sets, one for reads and
> > >> one for writes, the latter using the configurable number of
> > >> queues (hardware queue counts permitting).
> > >>
> > >> Reviewed-by: Hannes Reinecke <hare at suse.com>
> > >> Reviewed-by: Keith Busch <keith.busch at intel.com>
> > >> Signed-off-by: Jens Axboe <axboe at kernel.dk>
> > > 
> > > This patch causes hangs when running recent versions of
> > > -next with several architectures; see the -next column at
> > > kerneltests.org/builders for details.  Bisect log below; this
> > > was run with qemu on alpha. Reverting this patch as well as
> > > "nvme: add separate poll queue map" fixes the problem.
> > 
> > I don't see anything related to what hung, the trace, and so on.
> > Can you clue me in? Where are the test results with dmesg?
> > 
> > How to reproduce?
> 
> Think Guenter should've provided a full kerneltests.org url, but I had a
> look and found this for powerpc with -next:
> https://kerneltests.org/builders/next-powerpc-next/builds/998/steps/buildcommand/logs/stdio
> 
> Has useful logs of the build failure due to block.

Take that back, of course I only had a quick look and first scrolled to
this fragment and thought "yeap shows block build failure" (not
_really_):

opt/buildbot/slave/next-next/build/kernel/sched/psi.c: In function 'cgroup_move_task':
/opt/buildbot/slave/next-next/build/include/linux/spinlock.h:273:32: warning: 'rq' may be used uninitialized in this function [-Wmaybe-uninitialized]
 #define raw_spin_unlock(lock)  _raw_spin_unlock(lock)
                                ^~~~~~~~~~~~~~~~
/opt/buildbot/slave/next-next/build/kernel/sched/psi.c:639:13: note: 'rq' was declared here
  struct rq *rq;
             ^~

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] nvme: utilize two queue maps, one for reads and one for writes
  2018-11-14  0:51   ` Jens Axboe
@ 2018-11-14  4:52     ` Guenter Roeck
  -1 siblings, 0 replies; 11+ messages in thread
From: Guenter Roeck @ 2018-11-14  4:52 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, linux-kernel

On Tue, Nov 13, 2018 at 05:51:08PM -0700, Jens Axboe wrote:
> On 11/13/18 5:41 PM, Guenter Roeck wrote:
> > Hi,
> > 
> > On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote:
> >> NVMe does round-robin between queues by default, which means that
> >> sharing a queue map for both reads and writes can be problematic
> >> in terms of read servicing. It's much easier to flood the queue
> >> with writes and reduce the read servicing.
> >>
> >> Implement two queue maps, one for reads and one for writes. The
> >> write queue count is configurable through the 'write_queues'
> >> parameter.
> >>
> >> By default, we retain the previous behavior of having a single
> >> queue set, shared between reads and writes. Setting 'write_queues'
> >> to a non-zero value will create two queue sets, one for reads and
> >> one for writes, the latter using the configurable number of
> >> queues (hardware queue counts permitting).
> >>
> >> Reviewed-by: Hannes Reinecke <hare@suse.com>
> >> Reviewed-by: Keith Busch <keith.busch@intel.com>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > 
> > This patch causes hangs when running recent versions of
> > -next with several architectures; see the -next column at
> > kerneltests.org/builders for details.  Bisect log below; this
> > was run with qemu on alpha. Reverting this patch as well as
> > "nvme: add separate poll queue map" fixes the problem.
> 
> I don't see anything related to what hung, the trace, and so on.
> Can you clue me in? Where are the test results with dmesg?
> 
alpha just stalls during boot. parisc reports a hung task
in nvme_reset_work. sparc64 reports EIO when instantiating
the nvme driver, called from nvme_reset_work, and then stalls.
In all three cases, reverting the two mentioned patches fixes
the problem.

https://kerneltests.org/builders/qemu-parisc-next/builds/173/steps/qemubuildcommand_1/logs/stdio

is an example log for parisc.

I didn't check if the other boot failures (ppc looks bad)
have the same root cause.

> How to reproduce?
> 
parisc:

qemu-system-hppa -kernel vmlinux -no-reboot \
	-snapshot -device nvme,serial=foo,drive=d0 \
	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0,115200 ' \
	-nographic -monitor null

alpha:

qemu-system-alpha -M clipper -kernel arch/alpha/boot/vmlinux -no-reboot \
	-snapshot -device nvme,serial=foo,drive=d0 \
	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
	-m 128M -nographic -monitor null -serial stdio

sparc64:

qemu-system-sparc64 -M sun4u -cpu 'TI UltraSparc IIi' -m 512 \
	-snapshot -device nvme,serial=foo,drive=d0,bus=pciB \
	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
	-kernel arch/sparc/boot/image -no-reboot \
	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
	-nographic -monitor none

The root file systems are available from the respective subdirectories
of:

https://github.com/groeck/linux-build-test/tree/master/rootfs

Guenter

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] nvme: utilize two queue maps, one for reads and one for writes
@ 2018-11-14  4:52     ` Guenter Roeck
  0 siblings, 0 replies; 11+ messages in thread
From: Guenter Roeck @ 2018-11-14  4:52 UTC (permalink / raw)


On Tue, Nov 13, 2018@05:51:08PM -0700, Jens Axboe wrote:
> On 11/13/18 5:41 PM, Guenter Roeck wrote:
> > Hi,
> > 
> > On Wed, Oct 31, 2018@08:36:31AM -0600, Jens Axboe wrote:
> >> NVMe does round-robin between queues by default, which means that
> >> sharing a queue map for both reads and writes can be problematic
> >> in terms of read servicing. It's much easier to flood the queue
> >> with writes and reduce the read servicing.
> >>
> >> Implement two queue maps, one for reads and one for writes. The
> >> write queue count is configurable through the 'write_queues'
> >> parameter.
> >>
> >> By default, we retain the previous behavior of having a single
> >> queue set, shared between reads and writes. Setting 'write_queues'
> >> to a non-zero value will create two queue sets, one for reads and
> >> one for writes, the latter using the configurable number of
> >> queues (hardware queue counts permitting).
> >>
> >> Reviewed-by: Hannes Reinecke <hare at suse.com>
> >> Reviewed-by: Keith Busch <keith.busch at intel.com>
> >> Signed-off-by: Jens Axboe <axboe at kernel.dk>
> > 
> > This patch causes hangs when running recent versions of
> > -next with several architectures; see the -next column at
> > kerneltests.org/builders for details.  Bisect log below; this
> > was run with qemu on alpha. Reverting this patch as well as
> > "nvme: add separate poll queue map" fixes the problem.
> 
> I don't see anything related to what hung, the trace, and so on.
> Can you clue me in? Where are the test results with dmesg?
> 
alpha just stalls during boot. parisc reports a hung task
in nvme_reset_work. sparc64 reports EIO when instantiating
the nvme driver, called from nvme_reset_work, and then stalls.
In all three cases, reverting the two mentioned patches fixes
the problem.

https://kerneltests.org/builders/qemu-parisc-next/builds/173/steps/qemubuildcommand_1/logs/stdio

is an example log for parisc.

I didn't check if the other boot failures (ppc looks bad)
have the same root cause.

> How to reproduce?
> 
parisc:

qemu-system-hppa -kernel vmlinux -no-reboot \
	-snapshot -device nvme,serial=foo,drive=d0 \
	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0,115200 ' \
	-nographic -monitor null

alpha:

qemu-system-alpha -M clipper -kernel arch/alpha/boot/vmlinux -no-reboot \
	-snapshot -device nvme,serial=foo,drive=d0 \
	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
	-m 128M -nographic -monitor null -serial stdio

sparc64:

qemu-system-sparc64 -M sun4u -cpu 'TI UltraSparc IIi' -m 512 \
	-snapshot -device nvme,serial=foo,drive=d0,bus=pciB \
	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
	-kernel arch/sparc/boot/image -no-reboot \
	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
	-nographic -monitor none

The root file systems are available from the respective subdirectories
of:

https://github.com/groeck/linux-build-test/tree/master/rootfs

Guenter

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] nvme: utilize two queue maps, one for reads and one for writes
  2018-11-14  4:52     ` Guenter Roeck
@ 2018-11-14 17:12       ` Jens Axboe
  -1 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2018-11-14 17:12 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, linux-kernel

On 11/13/18 9:52 PM, Guenter Roeck wrote:
> On Tue, Nov 13, 2018 at 05:51:08PM -0700, Jens Axboe wrote:
>> On 11/13/18 5:41 PM, Guenter Roeck wrote:
>>> Hi,
>>>
>>> On Wed, Oct 31, 2018 at 08:36:31AM -0600, Jens Axboe wrote:
>>>> NVMe does round-robin between queues by default, which means that
>>>> sharing a queue map for both reads and writes can be problematic
>>>> in terms of read servicing. It's much easier to flood the queue
>>>> with writes and reduce the read servicing.
>>>>
>>>> Implement two queue maps, one for reads and one for writes. The
>>>> write queue count is configurable through the 'write_queues'
>>>> parameter.
>>>>
>>>> By default, we retain the previous behavior of having a single
>>>> queue set, shared between reads and writes. Setting 'write_queues'
>>>> to a non-zero value will create two queue sets, one for reads and
>>>> one for writes, the latter using the configurable number of
>>>> queues (hardware queue counts permitting).
>>>>
>>>> Reviewed-by: Hannes Reinecke <hare@suse.com>
>>>> Reviewed-by: Keith Busch <keith.busch@intel.com>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>
>>> This patch causes hangs when running recent versions of
>>> -next with several architectures; see the -next column at
>>> kerneltests.org/builders for details.  Bisect log below; this
>>> was run with qemu on alpha. Reverting this patch as well as
>>> "nvme: add separate poll queue map" fixes the problem.
>>
>> I don't see anything related to what hung, the trace, and so on.
>> Can you clue me in? Where are the test results with dmesg?
>>
> alpha just stalls during boot. parisc reports a hung task
> in nvme_reset_work. sparc64 reports EIO when instantiating
> the nvme driver, called from nvme_reset_work, and then stalls.
> In all three cases, reverting the two mentioned patches fixes
> the problem.

I think the below patch should fix it.

> https://kerneltests.org/builders/qemu-parisc-next/builds/173/steps/qemubuildcommand_1/logs/stdio
> 
> is an example log for parisc.
> 
> I didn't check if the other boot failures (ppc looks bad)
> have the same root cause.
> 
>> How to reproduce?
>>
> parisc:
> 
> qemu-system-hppa -kernel vmlinux -no-reboot \
> 	-snapshot -device nvme,serial=foo,drive=d0 \
> 	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
> 	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0,115200 ' \
> 	-nographic -monitor null
> 
> alpha:
> 
> qemu-system-alpha -M clipper -kernel arch/alpha/boot/vmlinux -no-reboot \
> 	-snapshot -device nvme,serial=foo,drive=d0 \
> 	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
> 	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
> 	-m 128M -nographic -monitor null -serial stdio
> 
> sparc64:
> 
> qemu-system-sparc64 -M sun4u -cpu 'TI UltraSparc IIi' -m 512 \
> 	-snapshot -device nvme,serial=foo,drive=d0,bus=pciB \
> 	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
> 	-kernel arch/sparc/boot/image -no-reboot \
> 	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
> 	-nographic -monitor none
> 
> The root file systems are available from the respective subdirectories
> of:
> 
> https://github.com/groeck/linux-build-test/tree/master/rootfs

This is useful, thanks! I haven't tried it yet, but I was able to
reproduce on x86 with MSI turned off.


diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8df868afa363..6c03461ad988 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2098,7 +2098,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 		.nr_sets = ARRAY_SIZE(irq_sets),
 		.sets = irq_sets,
 	};
-	int result;
+	int result = 0;
 
 	/*
 	 * For irq sets, we have to ask for minvec == maxvec. This passes
@@ -2113,9 +2113,16 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 			affd.nr_sets = 1;
 
 		/*
-		 * Need IRQs for read+write queues, and one for the admin queue
+		 * Need IRQs for read+write queues, and one for the admin queue.
+		 * If we can't get more than one vector, we have to share the
+		 * admin queue and IO queue vector. For that case, don't add
+		 * an extra vector for the admin queue, or we'll continue
+		 * asking for 2 and get -ENOSPC in return.
 		 */
-		nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
+		if (result == -ENOSPC && nr_io_queues == 1)
+			nr_io_queues = 1;
+		else
+			nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
 
 		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
 				nr_io_queues,

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH] nvme: utilize two queue maps, one for reads and one for writes
@ 2018-11-14 17:12       ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2018-11-14 17:12 UTC (permalink / raw)


On 11/13/18 9:52 PM, Guenter Roeck wrote:
> On Tue, Nov 13, 2018@05:51:08PM -0700, Jens Axboe wrote:
>> On 11/13/18 5:41 PM, Guenter Roeck wrote:
>>> Hi,
>>>
>>> On Wed, Oct 31, 2018@08:36:31AM -0600, Jens Axboe wrote:
>>>> NVMe does round-robin between queues by default, which means that
>>>> sharing a queue map for both reads and writes can be problematic
>>>> in terms of read servicing. It's much easier to flood the queue
>>>> with writes and reduce the read servicing.
>>>>
>>>> Implement two queue maps, one for reads and one for writes. The
>>>> write queue count is configurable through the 'write_queues'
>>>> parameter.
>>>>
>>>> By default, we retain the previous behavior of having a single
>>>> queue set, shared between reads and writes. Setting 'write_queues'
>>>> to a non-zero value will create two queue sets, one for reads and
>>>> one for writes, the latter using the configurable number of
>>>> queues (hardware queue counts permitting).
>>>>
>>>> Reviewed-by: Hannes Reinecke <hare at suse.com>
>>>> Reviewed-by: Keith Busch <keith.busch at intel.com>
>>>> Signed-off-by: Jens Axboe <axboe at kernel.dk>
>>>
>>> This patch causes hangs when running recent versions of
>>> -next with several architectures; see the -next column at
>>> kerneltests.org/builders for details.  Bisect log below; this
>>> was run with qemu on alpha. Reverting this patch as well as
>>> "nvme: add separate poll queue map" fixes the problem.
>>
>> I don't see anything related to what hung, the trace, and so on.
>> Can you clue me in? Where are the test results with dmesg?
>>
> alpha just stalls during boot. parisc reports a hung task
> in nvme_reset_work. sparc64 reports EIO when instantiating
> the nvme driver, called from nvme_reset_work, and then stalls.
> In all three cases, reverting the two mentioned patches fixes
> the problem.

I think the below patch should fix it.

> https://kerneltests.org/builders/qemu-parisc-next/builds/173/steps/qemubuildcommand_1/logs/stdio
> 
> is an example log for parisc.
> 
> I didn't check if the other boot failures (ppc looks bad)
> have the same root cause.
> 
>> How to reproduce?
>>
> parisc:
> 
> qemu-system-hppa -kernel vmlinux -no-reboot \
> 	-snapshot -device nvme,serial=foo,drive=d0 \
> 	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
> 	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0,115200 ' \
> 	-nographic -monitor null
> 
> alpha:
> 
> qemu-system-alpha -M clipper -kernel arch/alpha/boot/vmlinux -no-reboot \
> 	-snapshot -device nvme,serial=foo,drive=d0 \
> 	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
> 	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
> 	-m 128M -nographic -monitor null -serial stdio
> 
> sparc64:
> 
> qemu-system-sparc64 -M sun4u -cpu 'TI UltraSparc IIi' -m 512 \
> 	-snapshot -device nvme,serial=foo,drive=d0,bus=pciB \
> 	-drive file=rootfs.ext2,if=none,format=raw,id=d0 \
> 	-kernel arch/sparc/boot/image -no-reboot \
> 	-append 'root=/dev/nvme0n1 rw rootwait panic=-1 console=ttyS0' \
> 	-nographic -monitor none
> 
> The root file systems are available from the respective subdirectories
> of:
> 
> https://github.com/groeck/linux-build-test/tree/master/rootfs

This is useful, thanks! I haven't tried it yet, but I was able to
reproduce on x86 with MSI turned off.


diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8df868afa363..6c03461ad988 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2098,7 +2098,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 		.nr_sets = ARRAY_SIZE(irq_sets),
 		.sets = irq_sets,
 	};
-	int result;
+	int result = 0;
 
 	/*
 	 * For irq sets, we have to ask for minvec == maxvec. This passes
@@ -2113,9 +2113,16 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 			affd.nr_sets = 1;
 
 		/*
-		 * Need IRQs for read+write queues, and one for the admin queue
+		 * Need IRQs for read+write queues, and one for the admin queue.
+		 * If we can't get more than one vector, we have to share the
+		 * admin queue and IO queue vector. For that case, don't add
+		 * an extra vector for the admin queue, or we'll continue
+		 * asking for 2 and get -ENOSPC in return.
 		 */
-		nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
+		if (result == -ENOSPC && nr_io_queues == 1)
+			nr_io_queues = 1;
+		else
+			nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
 
 		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
 				nr_io_queues,

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-11-14 17:12 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-14  0:41 [PATCH] nvme: utilize two queue maps, one for reads and one for writes Guenter Roeck
2018-11-14  0:51 ` Jens Axboe
2018-11-14  0:51   ` Jens Axboe
2018-11-14  1:28   ` Mike Snitzer
2018-11-14  1:28     ` Mike Snitzer
2018-11-14  1:36     ` Mike Snitzer
2018-11-14  1:36       ` Mike Snitzer
2018-11-14  4:52   ` [PATCH] " Guenter Roeck
2018-11-14  4:52     ` Guenter Roeck
2018-11-14 17:12     ` Jens Axboe
2018-11-14 17:12       ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.