From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from mga01.intel.com ([192.55.52.88]:30497 "EHLO mga01.intel.com"
        rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751288AbcIAXTk
        (ORCPT ); Thu, 1 Sep 2016 19:19:40 -0400
Date: Thu, 1 Sep 2016 19:30:10 -0400
From: Keith Busch
To: Christoph Hellwig
Cc: axboe@fb.com, linux-block@vger.kernel.org, linux-nvme@lists.infradead.org
Subject: Re: [PATCH 4/7] blk-mq: allow the driver to pass in an affinity mask
Message-ID: <20160901233010.GC10903@localhost.localdomain>
References: <1472468013-29936-1-git-send-email-hch@lst.de>
        <1472468013-29936-5-git-send-email-hch@lst.de>
        <20160831163852.GB5598@localhost.localdomain>
        <20160901084624.GC4115@lst.de>
        <20160901142410.GA10903@localhost.localdomain>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
In-Reply-To: <20160901142410.GA10903@localhost.localdomain>
Sender: linux-block-owner@vger.kernel.org
List-Id: linux-block@vger.kernel.org

On Thu, Sep 01, 2016 at 10:24:10AM -0400, Keith Busch wrote:
> Yeah, I gathered that's what it was providing, but that's just barely
> not enough information to do something useful. The CPUs that aren't set
> have to use a previously assigned vector/queue, but which one?

Unless I'm totally missing how to infer paired CPUs, I think we need
arrays. Here's a stab at that.

I'm using the "old" algorithm the NVMe driver used to pair vectors and
CPUs. It's not the most efficient way of pairing that I know of, but it
is easy to follow (relatively speaking), and it actually utilizes every
available hardware resource, so I get very good CPU <-> queue mappings.
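To make the spreading arithmetic easier to follow in isolation, here is a
throwaway user-space toy of just the "divide the online CPUs evenly across
the vectors" bookkeeping. The CPU and vector counts are made up and there
is no topology awareness; the real code below additionally pulls in thread
siblings, same-core, and NUMA-local CPUs when filling each vector.

#include <stdio.h>

#define NUM_CPUS 12     /* pretend 12 CPUs are online */
#define MAX_VECS 5      /* pretend the device offers 5 vectors/queues */

int main(void)
{
        unsigned int cpu_to_vec[NUM_CPUS];
        unsigned int cpus_per_vec = NUM_CPUS / MAX_VECS;
        /* vectors handed out before the per-vector share grows by one */
        unsigned int remainder = MAX_VECS - (NUM_CPUS - MAX_VECS * cpus_per_vec);
        unsigned int cpu = 0, vec, i;

        for (vec = 0; vec < MAX_VECS; vec++) {
                for (i = 0; i < cpus_per_vec && cpu < NUM_CPUS; i++, cpu++)
                        cpu_to_vec[cpu] = vec;

                if (remainder && !--remainder)
                        cpus_per_vec++; /* later vectors absorb the leftovers */
        }

        for (cpu = 0; cpu < NUM_CPUS; cpu++)
                printf("cpu %2u -> vector %u\n", cpu, cpu_to_vec[cpu]);
        return 0;
}

With 12 CPUs and 5 vectors that prints a 2/2/2/3/3 split, which is the split
the remainder bookkeeping in irq_create_affinity_mask() below ends up with
(ignoring the topology-driven grouping).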
---

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9cc08c6..c5c038e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2283,7 +2283,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
                const struct cpumask *affinity_mask)
 {
-        int queue = -1, cpu = 0;
+        int queue;
 
         set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, GFP_KERNEL,
                        set->numa_node);
@@ -2293,11 +2293,10 @@ static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
         if (!affinity_mask)
                return 0;       /* map all cpus to queue 0 */
 
-        /* If cpus are offline, map them to first hctx */
-        for_each_online_cpu(cpu) {
-                if (cpumask_test_cpu(cpu, affinity_mask))
-                        queue++;
-                if (queue >= 0)
+        for (queue = 0; queue < set->nr_hw_queues; queue++) {
+                int cpu;
+
+                for_each_cpu(cpu, &affinity_mask[queue])
                        set->mq_map[cpu] = queue;
         }
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 98f1222..03a1ffc 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -683,15 +683,11 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 {
         const struct cpumask *mask = NULL;
         struct msi_desc *entry;
-        int cpu = -1, i;
+        int i;
 
         for (i = 0; i < nvec; i++) {
-                if (dev->irq_affinity) {
-                        cpu = cpumask_next(cpu, dev->irq_affinity);
-                        if (cpu >= nr_cpu_ids)
-                                cpu = cpumask_first(dev->irq_affinity);
-                        mask = cpumask_of(cpu);
-                }
+                if (dev->irq_affinity)
+                        mask = &dev->irq_affinity[i];
 
                entry = alloc_msi_entry(&dev->dev);
                if (!entry) {
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfc..9fe548b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,14 +4,47 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 
-static int get_first_sibling(unsigned int cpu)
+static int find_closest_node(int node)
 {
-        unsigned int ret;
+        int n, val, min_val = INT_MAX, best_node = node;
+
+        for_each_online_node(n) {
+                if (n == node)
+                        continue;
+                val = node_distance(node, n);
+                if (val < min_val) {
+                        min_val = val;
+                        best_node = n;
+                }
+        }
+        return best_node;
+}
+
+static void set_vec_cpus(const cpumask_t *qmask, struct cpumask *affinity_mask,
+                int count)
+{
+        int cpu;
+
+        for_each_cpu(cpu, qmask) {
+                if (cpumask_weight(affinity_mask) >= count)
+                        break;
+                cpumask_set_cpu(cpu, affinity_mask);
+        }
+}
+
+static void add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+                const cpumask_t *new_mask, struct cpumask *affinity_mask,
+                int cpus_per_queue)
+{
+        int next_cpu;
+
+        for_each_cpu(next_cpu, new_mask) {
+                cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+                cpumask_or(mask, mask, topology_sibling_cpumask(next_cpu));
+                cpumask_and(mask, mask, unassigned_cpus);
+        }
+        set_vec_cpus(mask, affinity_mask, cpus_per_queue);
 
-        ret = cpumask_first(topology_sibling_cpumask(cpu));
-        if (ret < nr_cpu_ids)
-                return ret;
-        return cpu;
 }
 
 /*
@@ -27,37 +60,76 @@ static int get_first_sibling(unsigned int cpu)
  */
 struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 {
-        struct cpumask *affinity_mask;
-        unsigned int max_vecs = *nr_vecs;
+        struct cpumask *affinity_mask, *masks;
+        unsigned int max_vecs = *nr_vecs, cpu, cpus_per_vec, remainder, i;
+        cpumask_var_t unassigned_cpus;
 
         if (max_vecs == 1)
                return NULL;
 
-        affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-        if (!affinity_mask) {
+        masks = kcalloc(max_vecs, sizeof(*affinity_mask), GFP_KERNEL);
+        if (!masks) {
                *nr_vecs = 1;
                return NULL;
         }
 
         get_online_cpus();
-        if (max_vecs >= num_online_cpus()) {
-                cpumask_copy(affinity_mask, cpu_online_mask);
-                *nr_vecs = num_online_cpus();
-        } else {
-                unsigned int vecs = 0, cpu;
-
-                for_each_online_cpu(cpu) {
-                        if (cpu == get_first_sibling(cpu)) {
-                                cpumask_set_cpu(cpu, affinity_mask);
-                                vecs++;
-                        }
-
-                        if (--max_vecs == 0)
-                                break;
-                }
-                *nr_vecs = vecs;
+
+        cpus_per_vec = num_online_cpus() / max_vecs;
+        remainder = max_vecs - (num_online_cpus() - max_vecs * cpus_per_vec);
+
+        cpumask_copy(unassigned_cpus, cpu_online_mask);
+        cpu = cpumask_first(unassigned_cpus);
+
+        for (i = 0; i < max_vecs; i++) {
+                cpumask_t mask;
+
+                if (!cpumask_weight(unassigned_cpus))
+                        break;
+
+                affinity_mask = &masks[i];
+
+                mask = *get_cpu_mask(cpu);
+                set_vec_cpus(&mask, affinity_mask, cpus_per_vec);
+
+                if (cpumask_weight(&mask) < cpus_per_vec)
+                        add_cpus(&mask, unassigned_cpus,
+                                topology_sibling_cpumask(cpu),
+                                affinity_mask, cpus_per_vec);
+                if (cpumask_weight(&mask) < cpus_per_vec)
+                        add_cpus(&mask, unassigned_cpus,
+                                topology_core_cpumask(cpu),
+                                affinity_mask, cpus_per_vec);
+                if (cpumask_weight(&mask) < cpus_per_vec)
+                        add_cpus(&mask, unassigned_cpus,
+                                cpumask_of_node(cpu_to_node(cpu)),
+                                affinity_mask, cpus_per_vec);
+                if (cpumask_weight(&mask) < cpus_per_vec)
+                        add_cpus(&mask, unassigned_cpus,
+                                cpumask_of_node(
+                                        find_closest_node(
+                                                cpu_to_node(cpu))),
+                                affinity_mask, cpus_per_vec);
+                if (cpumask_weight(&mask) < cpus_per_vec)
+                        add_cpus(&mask, unassigned_cpus,
+                                unassigned_cpus, affinity_mask,
+                                cpus_per_vec);
+
+                cpumask_andnot(unassigned_cpus, unassigned_cpus, affinity_mask);
+                cpu = cpumask_next(cpu, unassigned_cpus);
+
+                if (remainder && !--remainder)
+                        cpus_per_vec++;
         }
         put_online_cpus();
 
-        return affinity_mask;
+        i = 0;
+        cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+        for_each_cpu(cpu, unassigned_cpus) {
+                set_vec_cpus(get_cpu_mask(cpu), &masks[i], ~0);
+                i = (i + 1) % max_vecs;
+        }
+        free_cpumask_var(unassigned_cpus);
+
+        return masks;
 }
--