* [PATCH] numa: select nodes by cpu affinity
@ 2010-08-04 12:04 Andrew Jones
  2010-08-04 12:38 ` Keir Fraser
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Jones @ 2010-08-04 12:04 UTC (permalink / raw)
  To: xen-devel; +Cc: dulloor

[-- Attachment #2: nodemask.patch --]
[-- Type: text/x-patch, Size: 4305 bytes --]

numa: select nodes by cpu affinity

Along the same theme as changeset 21719, but expanded to cover all
nodes whose processors the domain is using. Rather than being as
strict as exact_node_request, simply fall back to the current
behavior on failure. This should help performance when cpu affinity
has been set by NUMA-aware tools.

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -292,17 +292,47 @@
     return needed;
 }
 
+static void get_nodemask_by_cpu_affinity(
+                const struct domain *d, nodemask_t *nodemask)
+{
+    cpumask_t cpumask = CPU_MASK_NONE;
+    struct vcpu *v;
+    unsigned int node;
+
+    nodes_clear(*nodemask);
+
+    if ( d == NULL || num_online_nodes() == 1 )
+        goto all_online_nodes;
+
+    for_each_vcpu(d, v)
+        cpus_or(cpumask, cpumask, v->cpu_affinity);
+
+    if ( cpus_subset(cpu_online_map, cpumask) )
+        goto all_online_nodes;
+
+    for_each_online_node(node)
+        if ( cpus_intersects(node_to_cpumask(node), cpumask) )
+            node_set(node, *nodemask);
+    return;
+
+all_online_nodes:
+    nodes_or(*nodemask, *nodemask, node_online_map);
+    return;
+}
+
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
-    unsigned int node, unsigned int order, unsigned int memflags)
+    unsigned int node, unsigned int order, unsigned int memflags,
+    nodemask_t nodemask)
 {
     unsigned int i, j, zone = 0;
-    unsigned int num_nodes = num_online_nodes();
+    unsigned int num_nodes;
     unsigned long request = 1UL << order;
     bool_t exact_node_request = !!(memflags & MEMF_exact_node);
     cpumask_t extra_cpus_mask, mask;
     struct page_info *pg;
+    int nodemask_retry = 1;
 
     if ( node == NUMA_NO_NODE )
     {
@@ -335,6 +365,15 @@
      * zone before failing, only calc new node value if we fail to find memory 
      * in target node, this avoids needless computation on fast-path.
      */
+    if ( exact_node_request )
+        num_nodes = 1;
+    else
+    {
+        nodes_and(nodemask, nodemask, node_online_map);
+        num_nodes = nodes_weight(nodemask);
+    }
+
+try_nodemask:
     for ( i = 0; i < num_nodes; i++ )
     {
         zone = zone_hi;
@@ -353,9 +392,17 @@
             goto not_found;
 
         /* Pick next node, wrapping around if needed. */
-        node = next_node(node, node_online_map);
+        node = next_node(node, nodemask);
         if (node == MAX_NUMNODES)
-            node = first_node(node_online_map);
+            node = first_node(nodemask);
+    }
+
+    if ( nodemask_retry-- && !nodes_equal(nodemask, node_online_map) )
+    {
+        nodes_andnot(nodemask, node_online_map, nodemask);
+        num_nodes = nodes_weight(nodemask);
+        node = first_node(nodemask);
+        goto try_nodemask;
     }
 
  try_tmem:
@@ -1010,7 +1057,7 @@
     ASSERT(!in_irq());
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-        cpu_to_node(smp_processor_id()), order, memflags);
+        cpu_to_node(smp_processor_id()), order, memflags, node_online_map);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -1154,6 +1201,7 @@
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
     unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
+    nodemask_t nodemask;
 
     ASSERT(!in_irq());
 
@@ -1164,13 +1212,16 @@
     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
         return NULL;
 
+    get_nodemask_by_cpu_affinity(d, &nodemask);
+
     if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags,
+                              nodemask);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
-                                  node, order, memflags)) == NULL)) )
+                                  node, order, memflags, nodemask)) == NULL)) )
          return NULL;
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )



* Re: [PATCH] numa: select nodes by cpu affinity
  2010-08-04 12:04 [PATCH] numa: select nodes by cpu affinity Andrew Jones
@ 2010-08-04 12:38 ` Keir Fraser
  2010-08-04 14:38   ` Keir Fraser
  0 siblings, 1 reply; 5+ messages in thread
From: Keir Fraser @ 2010-08-04 12:38 UTC (permalink / raw)
  To: Andrew Jones, xen-devel; +Cc: dulloor

Good idea. I will take this and rework it a bit and check it in.

 Thanks,
 Keir

On 04/08/2010 13:04, "Andrew Jones" <drjones@redhat.com> wrote:

> 


* Re: [PATCH] numa: select nodes by cpu affinity
  2010-08-04 12:38 ` Keir Fraser
@ 2010-08-04 14:38   ` Keir Fraser
  2010-08-04 16:01     ` Andrew Jones
  0 siblings, 1 reply; 5+ messages in thread
From: Keir Fraser @ 2010-08-04 14:38 UTC (permalink / raw)
  To: Andrew Jones, xen-devel; +Cc: dulloor

Changeset 21913 in http://xenbits.xen.org/staging/xen-unstable.hg

 -- Keir

On 04/08/2010 13:38, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

> Good idea. I will take this and rework it a bit and check it in.
> 
>  Thanks,
>  Keir
> 
> On 04/08/2010 13:04, "Andrew Jones" <drjones@redhat.com> wrote:
> 
>> 
> 


* Re: [PATCH] numa: select nodes by cpu affinity
  2010-08-04 14:38   ` Keir Fraser
@ 2010-08-04 16:01     ` Andrew Jones
  2010-08-04 16:15       ` Keir Fraser
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Jones @ 2010-08-04 16:01 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, dulloor

[-- Attachment #1: Type: text/plain, Size: 1070 bytes --]

On 08/04/2010 04:38 PM, Keir Fraser wrote:
> Changeset 21913 in http://xenbits.xen.org/staging/xen-unstable.hg
> 
>  -- Keir
> 
> On 04/08/2010 13:38, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> 
>> Good idea. I will take this and rework it a bit and check it in.
>>
>>  Thanks,
>>  Keir
>>
>> On 04/08/2010 13:04, "Andrew Jones" <drjones@redhat.com> wrote:
>

I also considered managing the nodemask as new domain state, as you do,
since it may come in useful elsewhere, but my principle-of-least-patch
instincts kept me from doing it...

I'm not sure about keeping track of the last_alloc_node and then always
avoiding it (at least when there's more than one node) by checking it
last. I liked the way it worked before, favoring the node of the
currently running processor, but I don't have any perf numbers to know
which would be better.

I've attached a patch with a couple of minor tweaks. It removes the
unnecessary node clearing from a freshly initialized (already empty)
nodemask, and also moves a couple of domain_update_node_affinity()
calls outside for_each_vcpu loops.

Andrew

[-- Attachment #2: nodemask-tweak.diff --]
[-- Type: text/plain, Size: 1496 bytes --]

diff --git a/xen/common/domain.c b/xen/common/domain.c
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -358,12 +358,8 @@
         cpus_or(cpumask, cpumask, v->cpu_affinity);
 
     for_each_online_node ( node )
-    {
         if ( cpus_intersects(node_to_cpumask(node), cpumask) )
             node_set(node, nodemask);
-        else
-            node_clear(node, nodemask);
-    }
 
     d->node_affinity = nodemask;
     spin_unlock(&d->node_affinity_lock);
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -270,13 +270,13 @@
         SCHED_OP(VCPU2OP(v), destroy_vcpu, v);
 
         cpus_setall(v->cpu_affinity);
-        domain_update_node_affinity(d);
         v->processor = new_p;
         v->sched_priv = vcpu_priv[v->vcpu_id];
         evtchn_move_pirqs(v);
 
         new_p = cycle_cpu(new_p, c->cpu_valid);
     }
+    domain_update_node_affinity(d);
 
     d->cpupool = c;
     SCHED_OP(DOM2OP(d), free_domdata, d->sched_priv);
@@ -478,7 +478,6 @@
                 printk("Breaking vcpu affinity for domain %d vcpu %d\n",
                         v->domain->domain_id, v->vcpu_id);
                 cpus_setall(v->cpu_affinity);
-                domain_update_node_affinity(d);
             }
 
             if ( v->processor == cpu )
@@ -501,6 +500,7 @@
             if ( v->processor == cpu )
                 ret = -EAGAIN;
         }
+        domain_update_node_affinity(d);
     }
     return ret;
 }



* Re: [PATCH] numa: select nodes by cpu affinity
  2010-08-04 16:01     ` Andrew Jones
@ 2010-08-04 16:15       ` Keir Fraser
  0 siblings, 0 replies; 5+ messages in thread
From: Keir Fraser @ 2010-08-04 16:15 UTC (permalink / raw)
  To: Andrew Jones; +Cc: xen-devel, dulloor

On 04/08/2010 17:01, "Andrew Jones" <drjones@redhat.com> wrote:

> I also considered managing the nodemask as new domain state, as you do,
> as it may come in useful elsewhere, but my principle of least patch
> instincts kept me from doing it...

Yeah, I don't fancy iterating over all vcpus for every little bitty
allocation. So it's a perf thing mainly for me.
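
Concretely, something like the following is all the allocator then
needs per allocation (a sketch only; the helper name is mine, but the
field and lock names follow the domain.c hunk in your tweak patch):

/* Sketch, not the applied changeset: snapshot the per-domain node
 * affinity maintained by domain_update_node_affinity(), rather than
 * walking every vcpu's cpu_affinity on each allocation. */
static nodemask_t domain_alloc_nodemask(struct domain *d)
{
    nodemask_t mask;

    if ( d == NULL )
        return node_online_map;

    spin_lock(&d->node_affinity_lock);
    mask = d->node_affinity;
    spin_unlock(&d->node_affinity_lock);

    return mask;
}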

> I'm not sure about keeping track of the last_alloc_node and then always
> avoiding it (at least when there's more than 1 node) by checking it
> last. I liked the way it worked before, favoring the node of the
> currently running processor, but I don't have any perf numbers to know
> what would be better.

Well, you can expect vcpus to move around within their affinity masks over
moderate timescales (say, seconds or minutes). And in fact the original
credit scheduler *loves* to migrate vcpus around the place over much less
reasonable timescales than that (sub-second). It is nice to balance our
allocations rather than hitting one node 'unfairly' hard.

> I've attached a patch with a couple minor tweaks. It removes the
> unnecessary node clearing from an empty initialized nodemask, and also
> moves a couple of domain_update_node_affinity() calls outside
> for_each_vcpu loops.

Thanks, I tweaked your tweaks (just one tiny optimisation) and applied it,
so it should show up in the staging tree rsn.

 -- Keir

> Andrew

