[PATCH hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU assignments within a device
From: Haiyang Zhang @ 2021-07-15 16:10 UTC
  To: linux-hyperv; +Cc: haiyangz, kys, linux-kernel

The vmbus module uses a rotational algorithm to assign target CPUs to
a device's channels. Depending on the timing of different devices' channel
offers, different channels of a device may be assigned to the same CPU.

For example on a VM with 2 CPUs, if the NIC A and B's channels offered
in the following order, the NIC A will have both channels on CPU0, and
NIC B will have both channels on CPU1 -- see below. This kind of
assignments cause RSS spreading loads among different channels ends up
on the same CPU.

Timing of channel offers:
NIC A channel 0
NIC B channel 0
NIC A channel 1
NIC B channel 1

VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
        Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
        Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
        Rel_ID=14, target_cpu=0
        Rel_ID=17, target_cpu=0

VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
        Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
        Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
        Rel_ID=16, target_cpu=1
        Rel_ID=18, target_cpu=1


Update the vmbus' CPU assignment algorithm to avoid duplicate CPU
assignments within a device.

The new algorithm iterates 2 * #NUMA_Node + 1 times. In the first
round over all NUMA nodes, it tries to find CPUs not previously
assigned to this or any other device. If none is available, it clears
the allocated CPU mask.
In the second round, it tries to find CPUs not previously assigned to
the same device.
In the last iteration, it assigns the channel to the first available CPU.
This is not normally expected, because during device probe, we limit the
number of channels of a device to be <= number of online CPUs.
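
Condensed to a sketch (hv_clear_usedcpu() below removes the CPUs already
used by the same device from the mask; see the diff for the exact logic):

	for (i = 1; i <= nr_node_ids * 2 + 1; i++) {
		/* pick the next NUMA node with online CPUs, round robin */
		/* reset the node's alloced map once all its CPUs are taken */
		/* available_mask = node's CPUs not yet in the alloced map */

		if (channel->offermsg.offer.sub_channel_index >=
		    num_online_cpus() ||		/* duplicates unavoidable */
		    i > nr_node_ids * 2 ||		/* final fallback pass */
		    hv_clear_usedcpu(available_mask, channel))
			break;

		/* no CPU unused by this device here; clear the alloced map */
		cpumask_clear(alloced_mask);
	}
	target_cpu = cpumask_first(available_mask);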

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

---
 drivers/hv/channel_mgmt.c | 95 ++++++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 30 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index caf6d0c4bc1b..fbddc4954f57 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	 */
 	mutex_lock(&vmbus_connection.channel_mutex);
 
+	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+		if (guid_equal(&channel->offermsg.offer.if_type,
+			       &newchannel->offermsg.offer.if_type) &&
+		    guid_equal(&channel->offermsg.offer.if_instance,
+			       &newchannel->offermsg.offer.if_instance)) {
+			fnew = false;
+			newchannel->primary_channel = channel;
+			break;
+		}
+	}
+
 	init_vp_index(newchannel);
 
 	/* Remember the channels that should be cleaned up upon suspend. */
@@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	 */
 	atomic_dec(&vmbus_connection.offer_in_progress);
 
-	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
-		if (guid_equal(&channel->offermsg.offer.if_type,
-			       &newchannel->offermsg.offer.if_type) &&
-		    guid_equal(&channel->offermsg.offer.if_instance,
-			       &newchannel->offermsg.offer.if_instance)) {
-			fnew = false;
-			break;
-		}
-	}
-
 	if (fnew) {
 		list_add_tail(&newchannel->listentry,
 			      &vmbus_connection.chn_list);
@@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 		/*
 		 * Process the sub-channel.
 		 */
-		newchannel->primary_channel = channel;
 		list_add_tail(&newchannel->sc_list, &channel->sc_list);
 	}
 
@@ -683,6 +683,29 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	queue_work(wq, &newchannel->add_channel_work);
 }
 
+/*
+ * Clear CPUs used by other channels of the same device.
+ * It should only be called by init_vp_index().
+ */
+static bool hv_clear_usedcpu(struct cpumask *cmask, struct vmbus_channel *chn)
+{
+	struct vmbus_channel *primary = chn->primary_channel;
+	struct vmbus_channel *sc;
+
+	lockdep_assert_held(&vmbus_connection.channel_mutex);
+
+	if (!primary)
+		return !cpumask_empty(cmask);
+
+	cpumask_clear_cpu(primary->target_cpu, cmask);
+
+	list_for_each_entry(sc, &primary->sc_list, sc_list)
+		if (sc != chn)
+			cpumask_clear_cpu(sc->target_cpu, cmask);
+
+	return !cpumask_empty(cmask);
+}
+
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
@@ -705,7 +728,7 @@ static void init_vp_index(struct vmbus_channel *channel)
 	cpumask_var_t available_mask;
 	struct cpumask *alloced_mask;
 	u32 target_cpu;
-	int numa_node;
+	int numa_node, i;
 
 	if ((vmbus_proto_version == VERSION_WS2008) ||
 	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
@@ -724,29 +747,41 @@ static void init_vp_index(struct vmbus_channel *channel)
 		return;
 	}
 
-	while (true) {
-		numa_node = next_numa_node_id++;
-		if (numa_node == nr_node_ids) {
-			next_numa_node_id = 0;
-			continue;
+	for (i = 1; i <= nr_node_ids * 2 + 1; i++) {
+		while (true) {
+			numa_node = next_numa_node_id++;
+			if (numa_node == nr_node_ids) {
+				next_numa_node_id = 0;
+				continue;
+			}
+			if (cpumask_empty(cpumask_of_node(numa_node)))
+				continue;
+			break;
 		}
-		if (cpumask_empty(cpumask_of_node(numa_node)))
-			continue;
-		break;
-	}
-	alloced_mask = &hv_context.hv_numa_map[numa_node];
+		alloced_mask = &hv_context.hv_numa_map[numa_node];
+
+		if (cpumask_weight(alloced_mask) ==
+		    cpumask_weight(cpumask_of_node(numa_node))) {
+			/*
+			 * We have cycled through all the CPUs in the node;
+			 * reset the alloced map.
+			 */
+			cpumask_clear(alloced_mask);
+		}
+
+		cpumask_xor(available_mask, alloced_mask,
+			    cpumask_of_node(numa_node));
+
+		/* Try to avoid duplicate cpus within a device */
+		if (channel->offermsg.offer.sub_channel_index >=
+		    num_online_cpus() ||
+		    i > nr_node_ids * 2 ||
+		    hv_clear_usedcpu(available_mask, channel))
+			break;
 
-	if (cpumask_weight(alloced_mask) ==
-	    cpumask_weight(cpumask_of_node(numa_node))) {
-		/*
-		 * We have cycled through all the CPUs in the node;
-		 * reset the alloced map.
-		 */
 		cpumask_clear(alloced_mask);
 	}
 
-	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
-
 	target_cpu = cpumask_first(available_mask);
 	cpumask_set_cpu(target_cpu, alloced_mask);
 
-- 
2.25.1



RE: [PATCH hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU assignments within a device
From: Michael Kelley @ 2021-07-15 18:18 UTC
  To: Haiyang Zhang, linux-hyperv; +Cc: Haiyang Zhang, KY Srinivasan, linux-kernel

From: LKML haiyangz <lkmlhyz@microsoft.com> On Behalf Of Haiyang Zhang
Sent: Thursday, July 15, 2021 9:11 AM
> 
> The vmbus module uses a rotational algorithm to assign target CPUs to
> a device's channels. Depending on the timing of different devices' channel
> offers, different channels of a device may be assigned to the same CPU.
> 
> For example on a VM with 2 CPUs, if the NIC A and B's channels offered

s/the NIC A/NIC A/
s/offered/are offered/

> in the following order, the NIC A will have both channels on CPU0, and

s/the NIC A/NIC A/

> NIC B will have both channels on CPU1 -- see below. This kind of
> assignments cause RSS spreading loads among different channels ends up

"assignment causes RSS load that is spread across different channels to end up"

> on the same CPU.
> 
> Timing of channel offers:
> NIC A channel 0
> NIC B channel 0
> NIC A channel 1
> NIC B channel 1
> 
> VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
>         Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
>         Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
>         Rel_ID=14, target_cpu=0
>         Rel_ID=17, target_cpu=0
> 
> VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
>         Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
>         Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
>         Rel_ID=16, target_cpu=1
>         Rel_ID=18, target_cpu=1
> 
> 
> Update the vmbus' CPU assignment algorithm to avoid duplicate CPU

s/vmbus'/vmbus/

> assignments within a device.
> 
> The new algorithm iterates 2 * #NUMA_Node + 1 times. In the first
> round over all NUMA nodes, it tries to find CPUs not previously
> assigned to this or any other device. If none is available, it clears
> the allocated CPU mask.
> In the second round, it tries to find CPUs not previously assigned to
> the same device.
> In the last iteration, it assigns the channel to the first available CPU.
> This is not normally expected, because during device probe, we limit the
> number of channels of a device to be <= number of online CPUs.
> 
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> 
> ---
>  drivers/hv/channel_mgmt.c | 95 ++++++++++++++++++++++++++-------------
>  1 file changed, 65 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index caf6d0c4bc1b..fbddc4954f57 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  	 */
>  	mutex_lock(&vmbus_connection.channel_mutex);
> 
> +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> +		if (guid_equal(&channel->offermsg.offer.if_type,
> +			       &newchannel->offermsg.offer.if_type) &&
> +		    guid_equal(&channel->offermsg.offer.if_instance,
> +			       &newchannel->offermsg.offer.if_instance)) {
> +			fnew = false;
> +			newchannel->primary_channel = channel;
> +			break;
> +		}
> +	}
> +
>  	init_vp_index(newchannel);
> 
>  	/* Remember the channels that should be cleaned up upon suspend. */
> @@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  	 */
>  	atomic_dec(&vmbus_connection.offer_in_progress);
> 
> -	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> -		if (guid_equal(&channel->offermsg.offer.if_type,
> -			       &newchannel->offermsg.offer.if_type) &&
> -		    guid_equal(&channel->offermsg.offer.if_instance,
> -			       &newchannel->offermsg.offer.if_instance)) {
> -			fnew = false;
> -			break;
> -		}
> -	}
> -
>  	if (fnew) {
>  		list_add_tail(&newchannel->listentry,
>  			      &vmbus_connection.chn_list);
> @@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  		/*
>  		 * Process the sub-channel.
>  		 */
> -		newchannel->primary_channel = channel;
>  		list_add_tail(&newchannel->sc_list, &channel->sc_list);
>  	}
> 
> @@ -683,6 +683,29 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  	queue_work(wq, &newchannel->add_channel_work);
>  }
> 
> +/*
> + * Clear CPUs used by other channels of the same device.
> + * It should only be called by init_vp_index().
> + */
> +static bool hv_clear_usedcpu(struct cpumask *cmask, struct vmbus_channel *chn)
> +{
> +	struct vmbus_channel *primary = chn->primary_channel;
> +	struct vmbus_channel *sc;
> +
> +	lockdep_assert_held(&vmbus_connection.channel_mutex);
> +
> +	if (!primary)
> +		return !cpumask_empty(cmask);

Minor note:  As this function is currently used, the cmask parameter should
never be empty.  Calling cpumask_empty() doesn't hurt anything but
this line should always return "true".

> +
> +	cpumask_clear_cpu(primary->target_cpu, cmask);
> +
> +	list_for_each_entry(sc, &primary->sc_list, sc_list)
> +		if (sc != chn)
> +			cpumask_clear_cpu(sc->target_cpu, cmask);
> +
> +	return !cpumask_empty(cmask);
> +}
> +
>  /*
>   * We use this state to statically distribute the channel interrupt load.
>   */
> @@ -705,7 +728,7 @@ static void init_vp_index(struct vmbus_channel *channel)
>  	cpumask_var_t available_mask;
>  	struct cpumask *alloced_mask;
>  	u32 target_cpu;
> -	int numa_node;
> +	int numa_node, i;
> 
>  	if ((vmbus_proto_version == VERSION_WS2008) ||
>  	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
> @@ -724,29 +747,41 @@ static void init_vp_index(struct vmbus_channel *channel)
>  		return;
>  	}
> 
> -	while (true) {
> -		numa_node = next_numa_node_id++;
> -		if (numa_node == nr_node_ids) {
> -			next_numa_node_id = 0;
> -			continue;
> +	for (i = 1; i <= nr_node_ids * 2 + 1; i++) {
> +		while (true) {
> +			numa_node = next_numa_node_id++;
> +			if (numa_node == nr_node_ids) {
> +				next_numa_node_id = 0;
> +				continue;
> +			}
> +			if (cpumask_empty(cpumask_of_node(numa_node)))
> +				continue;

This test has the potential to get the next_numa_node_id value out of sync
with the index "i" in the containing "for" loop.  The intent is to go through all
NUMA nodes up to 2 times, plus 1 more time.  But if a NUMA node is encountered
with no online CPUs, next_numa_node_id may get incremented multiple times
in a single iteration of the "for" loop.  So you could end up cycling through some
of the NUMA nodes more than twice before doing the final iteration.
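For instance, with a hypothetical nr_node_ids = 3 where node 1 has no
online CPUs: iterations i = 1..6 land on nodes 0, 2, 0, 2, 0, 2, so each
populated node is already on its third visit before the unconditional
final iteration at i = 7, instead of the intended two rounds.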

> +			break;
>  		}
> -		if (cpumask_empty(cpumask_of_node(numa_node)))
> -			continue;
> -		break;
> -	}
> -	alloced_mask = &hv_context.hv_numa_map[numa_node];
> +		alloced_mask = &hv_context.hv_numa_map[numa_node];
> +
> +		if (cpumask_weight(alloced_mask) ==
> +		    cpumask_weight(cpumask_of_node(numa_node))) {
> +			/*
> +			 * We have cycled through all the CPUs in the node;
> +			 * reset the alloced map.
> +			 */
> +			cpumask_clear(alloced_mask);
> +		}
> +
> +		cpumask_xor(available_mask, alloced_mask,
> +			    cpumask_of_node(numa_node));
> +
> +		/* Try to avoid duplicate cpus within a device */
> +		if (channel->offermsg.offer.sub_channel_index >=
> +		    num_online_cpus() ||
> +		    i > nr_node_ids * 2 ||
> +		    hv_clear_usedcpu(available_mask, channel))
> +			break;
> 
> -	if (cpumask_weight(alloced_mask) ==
> -	    cpumask_weight(cpumask_of_node(numa_node))) {
> -		/*
> -		 * We have cycled through all the CPUs in the node;
> -		 * reset the alloced map.
> -		 */
>  		cpumask_clear(alloced_mask);

This clearing of the "alloced_mask" concerns me.  It is done incrementally
as the NUMA nodes are cycled through the first time.  So if an available CPU
is found in the 3rd NUMA node on the first cycle, the alloced_mask will
have been cleared for NUMA nodes 0, 1, and 2.  A subsequent call to
this function for a different device will find the CPUs in those NUMA nodes
unalloc'ed, and therefore useable.  It seems like this will tend to bunch the CPU
assignments across multiple devices much more toward NUMA node 0 than
the existing algorithm does.  Or am I missing something about how this works?
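To illustrate with a hypothetical sequence: a channel of device X probes
nodes 0, 1, and 2, finds in each that every still-available CPU is already
used by X, clears each node's alloced_mask in turn, and finally breaks on
node 3.  The next channel of a different device Y that rotates around to
node 0 then sees an empty alloced_mask there and takes cpumask_first() of
the whole node, even though those CPUs were just handed out.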

Michael

>  	}
> 
> -	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
> -
>  	target_cpu = cpumask_first(available_mask);
>  	cpumask_set_cpu(target_cpu, alloced_mask);
> 
> --
> 2.25.1



RE: [PATCH hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU assignments within a device
From: Haiyang Zhang @ 2021-07-15 20:30 UTC
  To: Michael Kelley, linux-hyperv; +Cc: KY Srinivasan, linux-kernel



> -----Original Message-----
> From: Michael Kelley <mikelley@microsoft.com>
> Sent: Thursday, July 15, 2021 2:18 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>; linux-
> hyperv@vger.kernel.org
> Cc: Haiyang Zhang <haiyangz@microsoft.com>; KY Srinivasan
> <kys@microsoft.com>; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU
> assignments within a device
> 
> From: LKML haiyangz <lkmlhyz@microsoft.com> On Behalf Of Haiyang Zhang
> Sent: Thursday, July 15, 2021 9:11 AM
> >
> > The vmbus module uses a rotational algorithm to assign target CPUs to
> > a device's channels. Depending on the timing of different devices' channel
> > offers, different channels of a device may be assigned to the same CPU.
> >
> > For example on a VM with 2 CPUs, if the NIC A and B's channels offered
> 
> s/the NIC A/NIC A/
> s/offered/are offered/

Thanks, I will update this and other descriptions.

> 
> > in the following order, the NIC A will have both channels on CPU0, and
> 
> s/the NIC A/NIC A/
> 
> > NIC B will have both channels on CPU1 -- see below. This kind of
> > assignments cause RSS spreading loads among different channels ends up
> 
> "assignment causes RSS load that is spread across different channels to end
> up"
> 
> > on the same CPU.
> >
> > Timing of channel offers:
> > NIC A channel 0
> > NIC B channel 0
> > NIC A channel 1
> > NIC B channel 1
> >
> > VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> >         Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
> >         Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
> >         Rel_ID=14, target_cpu=0
> >         Rel_ID=17, target_cpu=0
> >
> > VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> >         Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
> >         Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
> >         Rel_ID=16, target_cpu=1
> >         Rel_ID=18, target_cpu=1
> >
> >
> > Update the vmbus' CPU assignment algorithm to avoid duplicate CPU
> 
> s/vmbus'/vmbus/

> 
> > assignments within a device.
> >
> > The new algorithm iterates 2 * #NUMA_Node + 1 times. In the first
> > round over all NUMA nodes, it tries to find CPUs not previously
> > assigned to this or any other device. If none is available, it clears
> > the allocated CPU mask.
> > In the second round, it tries to find CPUs not previously assigned to
> > the same device.
> > In the last iteration, it assigns the channel to the first available CPU.
> > This is not normally expected, because during device probe, we limit the
> > number of channels of a device to be <= number of online CPUs.
> >
> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> >
> > ---
> >  drivers/hv/channel_mgmt.c | 95 ++++++++++++++++++++++++++-------------
> >  1 file changed, 65 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > index caf6d0c4bc1b..fbddc4954f57 100644
> > --- a/drivers/hv/channel_mgmt.c
> > +++ b/drivers/hv/channel_mgmt.c
> > @@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  	 */
> >  	mutex_lock(&vmbus_connection.channel_mutex);
> >
> > +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> > +		if (guid_equal(&channel->offermsg.offer.if_type,
> > +			       &newchannel->offermsg.offer.if_type) &&
> > +		    guid_equal(&channel->offermsg.offer.if_instance,
> > +			       &newchannel->offermsg.offer.if_instance)) {
> > +			fnew = false;
> > +			newchannel->primary_channel = channel;
> > +			break;
> > +		}
> > +	}
> > +
> >  	init_vp_index(newchannel);
> >
> >  	/* Remember the channels that should be cleaned up upon suspend. */
> > @@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  	 */
> >  	atomic_dec(&vmbus_connection.offer_in_progress);
> >
> > -	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> > -		if (guid_equal(&channel->offermsg.offer.if_type,
> > -			       &newchannel->offermsg.offer.if_type) &&
> > -		    guid_equal(&channel->offermsg.offer.if_instance,
> > -			       &newchannel->offermsg.offer.if_instance)) {
> > -			fnew = false;
> > -			break;
> > -		}
> > -	}
> > -
> >  	if (fnew) {
> >  		list_add_tail(&newchannel->listentry,
> >  			      &vmbus_connection.chn_list);
> > @@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  		/*
> >  		 * Process the sub-channel.
> >  		 */
> > -		newchannel->primary_channel = channel;
> >  		list_add_tail(&newchannel->sc_list, &channel->sc_list);
> >  	}
> >
> > @@ -683,6 +683,29 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  	queue_work(wq, &newchannel->add_channel_work);
> >  }
> >
> > +/*
> > + * Clear CPUs used by other channels of the same device.
> > + * It should only be called by init_vp_index().
> > + */
> > +static bool hv_clear_usedcpu(struct cpumask *cmask, struct vmbus_channel *chn)
> > +{
> > +	struct vmbus_channel *primary = chn->primary_channel;
> > +	struct vmbus_channel *sc;
> > +
> > +	lockdep_assert_held(&vmbus_connection.channel_mutex);
> > +
> > +	if (!primary)
> > +		return !cpumask_empty(cmask);
> 
> Minor note:  As this function is currently used, the cmask parameter should
> never be empty.  Calling cpumask_empty() doesn't hurt anything but
> this line should always return "true".


> 
> > +
> > +	cpumask_clear_cpu(primary->target_cpu, cmask);
> > +
> > +	list_for_each_entry(sc, &primary->sc_list, sc_list)
> > +		if (sc != chn)
> > +			cpumask_clear_cpu(sc->target_cpu, cmask);
> > +
> > +	return !cpumask_empty(cmask);
> > +}
> > +
> >  /*
> >   * We use this state to statically distribute the channel interrupt load.
> >   */
> > @@ -705,7 +728,7 @@ static void init_vp_index(struct vmbus_channel *channel)
> >  	cpumask_var_t available_mask;
> >  	struct cpumask *alloced_mask;
> >  	u32 target_cpu;
> > -	int numa_node;
> > +	int numa_node, i;
> >
> >  	if ((vmbus_proto_version == VERSION_WS2008) ||
> >  	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
> > @@ -724,29 +747,41 @@ static void init_vp_index(struct vmbus_channel *channel)
> >  		return;
> >  	}
> >
> > -	while (true) {
> > -		numa_node = next_numa_node_id++;
> > -		if (numa_node == nr_node_ids) {
> > -			next_numa_node_id = 0;
> > -			continue;
> > +	for (i = 1; i <= nr_node_ids * 2 + 1; i++) {
> > +		while (true) {
> > +			numa_node = next_numa_node_id++;
> > +			if (numa_node == nr_node_ids) {
> > +				next_numa_node_id = 0;
> > +				continue;
> > +			}
> > +			if (cpumask_empty(cpumask_of_node(numa_node)))
> > +				continue;
> 
> This test has the potential to get the next_numa_node_id value out of sync
> with the index "i" in the containing "for" loop.  The intent is to go through all
> NUMA nodes up to 2 times, plus 1 more time.  But if a NUMA node is
> encountered
> with no online CPUs, next_numa_node_id may get incremented multiple
> times
> in a single iteration of the "for" loop.  So you could end up cycling through
> some
> of the NUMA nodes more than twice before doing the final iteration.

I will switch to an updated algorithm: see below.

> 
> > +			break;
> >  		}
> > -		if (cpumask_empty(cpumask_of_node(numa_node)))
> > -			continue;
> > -		break;
> > -	}
> > -	alloced_mask = &hv_context.hv_numa_map[numa_node];
> > +		alloced_mask = &hv_context.hv_numa_map[numa_node];
> > +
> > +		if (cpumask_weight(alloced_mask) ==
> > +		    cpumask_weight(cpumask_of_node(numa_node))) {
> > +			/*
> > +			 * We have cycled through all the CPUs in the node;
> > +			 * reset the alloced map.
> > +			 */
> > +			cpumask_clear(alloced_mask);
> > +		}
> > +
> > +		cpumask_xor(available_mask, alloced_mask,
> > +			    cpumask_of_node(numa_node));
> > +
> > +		/* Try to avoid duplicate cpus within a device */
> > +		if (channel->offermsg.offer.sub_channel_index >=
> > +		    num_online_cpus() ||
> > +		    i > nr_node_ids * 2 ||
> > +		    hv_clear_usedcpu(available_mask, channel))
> > +			break;
> >
> > -	if (cpumask_weight(alloced_mask) ==
> > -	    cpumask_weight(cpumask_of_node(numa_node))) {
> > -		/*
> > -		 * We have cycled through all the CPUs in the node;
> > -		 * reset the alloced map.
> > -		 */
> >  		cpumask_clear(alloced_mask);
> 
> This clearing of the "alloced_mask" concerns me.  It is done incrementally
> as the NUMA nodes are cycled through the first time.  So if an available CPU
> is found in the 3rd NUMA node on the first cycle, the alloced_mask will
> have been cleared for NUMA nodes 0, 1, and 2.  A subsequent call to
> this function for a different device will find the CPUs in those NUMA nodes
> unalloc'ed, and therefore useable.  It seems like this will tend to bunch the
> CPU
> assignments across multiple devices much more toward NUMA node 0 than
> the existing algorithm does.  Or am I missing something about how this works?

The assignment is a rotational algorithm, and next_numa_node_id is a global 
variable, which keeps the last assignment's numa id. So if the last device used 
numa#3, the next device will try numa#4 (or numa 0, if there are only 4 nodes). 
Then the next channel of this device will try the next numa node #5. And the 
next one will try #6, etc... Because the rotational algorithm keeps rotating, 
there is no preference towards any NUMA node.

But, since we use cpumask_first(available_mask) instead of rotating within
a node, I believe the new algorithm does prefer the "small" CPU numbers
WITHIN a NUMA node, due to the possible "extra" cpumask_clear() calls
compared to the old algorithm.
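(For example, right after node 0's alloced_mask is cleared, several
consecutive assignments would take CPU 0, 1, 2, ... of that node in numeric
order, rather than continuing from where the previous rotation stopped.)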

So I updated the algorithm as below. Basically, it keeps running the
rotational algorithm until it finds a CPU not already used by the same device.
Any new device starts its rotation after the point of the last CPU assignment,
so it doesn't have any preference for any node or CPU.

Existing_find_next() {
	/* get the next NUMA node, round robin */
	/* clear this node's alloced map if all of its CPUs are occupied */

	cpu = the next available cpu of this node;
	cpumask_set_cpu(cpu, alloced_mask);

	return cpu;
}

Updated_find_next() {
	N = num_online_cpus();

	for (i = 1; i <= N + 1; i++) {
		cpu = Existing_find_next();
		if (chn_index >= N ||
		    i > N ||
		    NotUsedBySelf(cpu))
			break;
	}

	channel->target_cpu = cpu;
}
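
(A note on the bounds: chn_index >= N short-circuits when a device has
more channels than online CPUs, where duplicates are unavoidable; i > N
makes the (N+1)-th probe unconditional, so the search always terminates.
Since device probe limits a device to <= N channels, that fallback should
rarely trigger.)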

Thanks,
- Haiyang


