From mboxrd@z Thu Jan 1 00:00:00 1970
From: kys@exchange.microsoft.com
To: gregkh@linuxfoundation.org, linux-kernel@vger.kernel.org,
	devel@linuxdriverproject.org, olaf@aepfle.de, apw@canonical.com,
	vkuznets@redhat.com, jasowang@redhat.com, leann.ogasawara@canonical.com
Cc: "K. Y. Srinivasan" , "K. Y. Srinivasan"
Subject: [PATCH 1/1] Drivers: hv: Introduce a policy for controlling channel affinity
Date: Wed, 6 Jul 2016 18:02:15 -0700
Message-Id: <1467853335-9374-1-git-send-email-kys@exchange.microsoft.com>
X-Mailer: git-send-email 1.7.4.1
Reply-To: kys@microsoft.com
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

From: K. Y. Srinivasan

Introduce a mechanism to control how channels will be affinitized. We will
support two policies:

1. HV_BALANCED: All performance critical channels will be distributed
   evenly amongst all the available NUMA nodes. Once the node is assigned,
   we will assign the CPU based on a simple round-robin scheme.

2. HV_LOCALIZED: Only the primary channels are distributed across all
   NUMA nodes. Sub-channels will be in the same NUMA node as the primary
   channel. This is the current behaviour.

The default policy will be HV_BALANCED, as it can minimize remote memory
access on NUMA machines with applications that span NUMA nodes.

Signed-off-by: K. Y. Srinivasan
Signed-off-by: K. Y. Srinivasan
---
 drivers/hv/channel_mgmt.c |   68 +++++++++++++++++++++++++-------------
 include/linux/hyperv.h    |   23 +++++++++++++++
 2 files changed, 62 insertions(+), 29 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 8345869..aaa2c4b 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -338,8 +338,9 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
	 * We need to free the bit for init_vp_index() to work in the case
	 * of sub-channel, when we reload drivers like hv_netvsc.
	 */
-	cpumask_clear_cpu(channel->target_cpu,
-			  &primary_channel->alloced_cpus_in_node);
+	if (channel->affinity_policy == HV_LOCALIZED)
+		cpumask_clear_cpu(channel->target_cpu,
+				  &primary_channel->alloced_cpus_in_node);

	free_channel(channel);
 }
@@ -524,17 +525,17 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
	}

	/*
-	 * We distribute primary channels evenly across all the available
-	 * NUMA nodes and within the assigned NUMA node we will assign the
-	 * first available CPU to the primary channel.
-	 * The sub-channels will be assigned to the CPUs available in the
-	 * NUMA node evenly.
+	 * Based on the channel affinity policy, we will assign the NUMA
+	 * nodes.
	 */
-	if (!primary) {
+
+	if ((channel->affinity_policy == HV_BALANCED) || (!primary)) {
		while (true) {
			next_node = next_numa_node_id++;
-			if (next_node == nr_node_ids)
+			if (next_node == nr_node_ids) {
				next_node = next_numa_node_id = 0;
+				continue;
+			}
			if (cpumask_empty(cpumask_of_node(next_node)))
				continue;
			break;
@@ -558,15 +559,17 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)

	cur_cpu = -1;

-	/*
-	 * Normally Hyper-V host doesn't create more subchannels than there
-	 * are VCPUs on the node but it is possible when not all present VCPUs
-	 * on the node are initialized by guest. Clear the alloced_cpus_in_node
-	 * to start over.
-	 */
-	if (cpumask_equal(&primary->alloced_cpus_in_node,
-			  cpumask_of_node(primary->numa_node)))
-		cpumask_clear(&primary->alloced_cpus_in_node);
+	if (primary->affinity_policy == HV_LOCALIZED) {
+		/*
+		 * Normally Hyper-V host doesn't create more subchannels
+		 * than there are VCPUs on the node but it is possible when not
+		 * all present VCPUs on the node are initialized by guest.
+		 * Clear the alloced_cpus_in_node to start over.
+		 */
+		if (cpumask_equal(&primary->alloced_cpus_in_node,
+				  cpumask_of_node(primary->numa_node)))
+			cpumask_clear(&primary->alloced_cpus_in_node);
+	}

	while (true) {
		cur_cpu = cpumask_next(cur_cpu, &available_mask);
@@ -577,17 +580,24 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
			continue;
		}

-		/*
-		 * NOTE: in the case of sub-channel, we clear the sub-channel
-		 * related bit(s) in primary->alloced_cpus_in_node in
-		 * hv_process_channel_removal(), so when we reload drivers
-		 * like hv_netvsc in SMP guest, here we're able to re-allocate
-		 * bit from primary->alloced_cpus_in_node.
-		 */
-		if (!cpumask_test_cpu(cur_cpu,
-			&primary->alloced_cpus_in_node)) {
-			cpumask_set_cpu(cur_cpu,
-					&primary->alloced_cpus_in_node);
+		if (primary->affinity_policy == HV_LOCALIZED) {
+			/*
+			 * NOTE: in the case of sub-channel, we clear the
+			 * sub-channel related bit(s) in
+			 * primary->alloced_cpus_in_node in
+			 * hv_process_channel_removal(), so when we
+			 * reload drivers like hv_netvsc in SMP guest, here
+			 * we're able to re-allocate
+			 * bit from primary->alloced_cpus_in_node.
+			 */
+			if (!cpumask_test_cpu(cur_cpu,
+				&primary->alloced_cpus_in_node)) {
+				cpumask_set_cpu(cur_cpu,
+						&primary->alloced_cpus_in_node);
+				cpumask_set_cpu(cur_cpu, alloced_mask);
+				break;
+			}
+		} else {
			cpumask_set_cpu(cur_cpu, alloced_mask);
			break;
		}
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 897e4a7..f0f02ec 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -674,6 +674,11 @@ enum hv_signal_policy {
	HV_SIGNAL_POLICY_EXPLICIT,
 };

+enum hv_numa_policy {
+	HV_BALANCED = 0,
+	HV_LOCALIZED,
+};
+
 enum vmbus_device_type {
	HV_IDE = 0,
	HV_SCSI,
@@ -876,6 +881,18 @@ struct vmbus_channel {
	 */
	bool low_latency;

+	/*
+	 * NUMA distribution policy:
+	 * We support two policies:
+	 * 1) Balanced: Here all performance critical channels are
+	 *    distributed evenly amongst all the NUMA nodes.
+	 *    This policy will be the default policy.
+	 * 2) Localized: All channels of a given instance of a
+	 *    performance critical service will be assigned CPUs
+	 *    within a selected NUMA node.
+	 */
+	enum hv_numa_policy affinity_policy;
+
 };

 static inline void set_channel_lock_state(struct vmbus_channel *c, bool state)
@@ -895,6 +912,12 @@ static inline void set_channel_signal_state(struct vmbus_channel *c,
	c->signal_policy = policy;
 }

+static inline void set_channel_affinity_state(struct vmbus_channel *c,
+					      enum hv_numa_policy policy)
+{
+	c->affinity_policy = policy;
+}
+
 static inline void set_channel_read_state(struct vmbus_channel *c, bool state)
 {
	c->batched_reading = state;
--
1.7.4.1
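
[Editor's note, illustration only, not part of the patch: a VMBus driver that
wants to keep the pre-patch behaviour, i.e. keep its sub-channels on the
primary channel's NUMA node, could override the HV_BALANCED default with the
new set_channel_affinity_state() helper from its probe path. This is a
minimal sketch; the driver and probe function names are hypothetical.]

	#include <linux/hyperv.h>

	static int example_drv_probe(struct hv_device *dev,
				     const struct hv_vmbus_device_id *dev_id)
	{
		/*
		 * Hypothetical example: ask VMBus to keep all channels of
		 * this device within one NUMA node (the pre-patch policy).
		 * HV_BALANCED is the default, so doing nothing keeps the
		 * new balanced distribution.
		 */
		set_channel_affinity_state(dev->channel, HV_LOCALIZED);

		/* ... the rest of the driver's usual probe work ... */
		return 0;
	}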