* [PATCHv2] NVMe: IO Queue NUMA locality
@ 2013-07-08 19:35 Keith Busch
2013-07-09 13:41 ` Matthew Wilcox
0 siblings, 1 reply; 2+ messages in thread
From: Keith Busch @ 2013-07-08 19:35 UTC (permalink / raw)
Allocates queue memory local to the cpu for memory read by the cpu and
local to the device for memory read by the device.
Signed-off-by: Keith Busch <keith.busch@intel.com>
---
I've gotten better at testing this, pinning processes to specific cores
and seeing what happens. I find that no matter how you allocate memory,
running IO from a cpu on the same numa node as the device provides no
measurable change.
There is measurable difference when running IO on a cpu on another
domain; however, my particular device hits its peak performance on
either domain at higher queue depths and block sizes, so I'm only able
to see a difference at lower io depths. The best gains topped out at 2%
improvement with this patch vs the existing code.
No test performed worse.
I understand this method of allocating and mapping memory may not work
for CPUs without cache-coherency, but I'm not sure if there is another
way to allocate coherent memory for a specific NUMA node.
drivers/block/nvme-core.c | 42 +++++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 19 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 711b51c..9cedfa0 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1022,8 +1022,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
{
- dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
- (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+ dma_unmap_single(nvmeq->q_dmadev, nvmeq->cq_dma_addr,
+ CQ_SIZE(nvmeq->q_depth), DMA_FROM_DEVICE);
+ free_pages_exact((void *)nvmeq->cqes, CQ_SIZE(nvmeq->q_depth));
+
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
kfree(nvmeq);
@@ -1055,20 +1057,22 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
}
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
- int depth, int vector)
+ int depth, int vector, int nid)
{
struct device *dmadev = &dev->pci_dev->dev;
unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
sizeof(struct nvme_cmd_info));
- struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
+ struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq) + extra,
+ GFP_KERNEL, nid);
if (!nvmeq)
return NULL;
- nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
- &nvmeq->cq_dma_addr, GFP_KERNEL);
+ nvmeq->cqes = alloc_pages_exact_nid(nid, CQ_SIZE(depth), GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
+ nvmeq->cq_dma_addr = dma_map_single(dmadev, (void *)nvmeq->cqes,
+ CQ_SIZE(depth), DMA_FROM_DEVICE);
nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
&nvmeq->sq_dma_addr, GFP_KERNEL);
@@ -1090,8 +1094,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
return nvmeq;
free_cqdma:
- dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
- nvmeq->cq_dma_addr);
+ dma_unmap_single(nvmeq->q_dmadev, nvmeq->cq_dma_addr,
+ CQ_SIZE(nvmeq->q_depth), DMA_FROM_DEVICE);
+ free_pages_exact((void *)nvmeq->cqes, CQ_SIZE(nvmeq->q_depth));
free_nvmeq:
kfree(nvmeq);
return NULL;
@@ -1110,10 +1115,11 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
}
static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
- int cq_size, int vector)
+ int cq_size, int vector, int nid)
{
int result;
- struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
+ struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector,
+ nid);
if (!nvmeq)
return ERR_PTR(-ENOMEM);
@@ -1200,7 +1206,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
if (result < 0)
return result;
- nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
+ nvmeq = nvme_alloc_queue(dev, 0, 64, 0, -1);
if (!nvmeq)
return -ENOMEM;
@@ -1671,7 +1677,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct pci_dev *pdev = dev->pci_dev;
- int result, cpu, i, vecs, nr_io_queues, db_bar_size, q_depth;
+ int result, cpu, nid, i, vecs, nr_io_queues, db_bar_size, q_depth;
nr_io_queues = num_online_cpus();
result = set_queue_count(dev, nr_io_queues);
@@ -1730,19 +1736,17 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
result = queue_request_irq(dev, dev->queues[0], "nvme admin");
/* XXX: handle failure here */
- cpu = cpumask_first(cpu_online_mask);
- for (i = 0; i < nr_io_queues; i++) {
- irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
-
q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
NVME_Q_DEPTH);
+ cpu = cpumask_first(cpu_online_mask);
for (i = 0; i < nr_io_queues; i++) {
- dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
+ irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
+ nid = cpu_to_node(cpu);
+ dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i, nid);
if (IS_ERR(dev->queues[i + 1]))
return PTR_ERR(dev->queues[i + 1]);
dev->queue_count++;
+ cpu = cpumask_next(cpu, cpu_online_mask);
}
for (; i < num_possible_cpus(); i++) {
--
1.7.10.4
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCHv2] NVMe: IO Queue NUMA locality
2013-07-08 19:35 [PATCHv2] NVMe: IO Queue NUMA locality Keith Busch
@ 2013-07-09 13:41 ` Matthew Wilcox
0 siblings, 0 replies; 2+ messages in thread
From: Matthew Wilcox @ 2013-07-09 13:41 UTC (permalink / raw)
On Mon, Jul 08, 2013 at 01:35:59PM -0600, Keith Busch wrote:
> There is measurable difference when running IO on a cpu on another
> domain; however, my particular device hits its peak performance on
> either domain at higher queue depths and block sizes, so I'm only able
> to see a difference at lower io depths. The best gains topped out at 2%
> improvement with this patch vs the existing code.
That's not too shabby. This is only a two-socket system you're testing
on, so I'd expect larger gains on systems with more sockets.
> I understand this method of allocating and mapping memory may not work
> for CPUs without cache-coherency, but I'm not sure if there is another
> way to allocate coherent memory for a specific NUMA node.
I found a way in the networking drivers:
int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring)
{
int orig_node = dev_to_node(dev);
int numa_node = -1;
...
if (tx_ring->q_vector)
numa_node = tx_ring->q_vector->numa_node;
...
set_dev_node(dev, numa_node);
tx_ring->desc = dma_alloc_coherent(dev,
tx_ring->size,
&tx_ring->dma,
GFP_KERNEL);
set_dev_node(dev, orig_node);
if (!tx_ring->desc)
tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
&tx_ring->dma, GFP_KERNEL);
if (!tx_ring->desc)
goto err;
> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
> index 711b51c..9cedfa0 100644
> --- a/drivers/block/nvme-core.c
> +++ b/drivers/block/nvme-core.c
> @@ -1200,7 +1206,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
> if (result < 0)
> return result;
>
> - nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
> + nvmeq = nvme_alloc_queue(dev, 0, 64, 0, -1);
> if (!nvmeq)
> return -ENOMEM;
>
I suppose we should really have the admin queue allocated on the node
closest to the device, so pass in dev_to_node(dev) instead of -1 here?
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2013-07-09 13:41 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-08 19:35 [PATCHv2] NVMe: IO Queue NUMA locality Keith Busch
2013-07-09 13:41 ` Matthew Wilcox
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.