* [PATCH 1/2] Drivers: hv: vmbus: Resolve race between init_vp_index() and CPU hotplug
2020-05-22 17:18 [PATCH] VMBus channel interrupts reassignment - Fixes Andrea Parri (Microsoft)
@ 2020-05-22 17:19 ` Andrea Parri (Microsoft)
2020-05-22 18:34 ` Michael Kelley
2020-05-22 17:19 ` [PATCH 2/2] Drivers: hv: vmbus: Resolve more races involving init_vp_index() Andrea Parri (Microsoft)
2020-05-23 9:04 ` [PATCH] VMBus channel interrupts reassignment - Fixes Wei Liu
2 siblings, 1 reply; 6+ messages in thread
From: Andrea Parri (Microsoft) @ 2020-05-22 17:19 UTC (permalink / raw)
To: linux-hyperv, linux-kernel
Cc: K . Y . Srinivasan, Haiyang Zhang, Stephen Hemminger, Wei Liu,
Michael Kelley, Andrea Parri (Microsoft)
vmbus_process_offer() does two things (among others):
1) first, it sets the channel's target CPU with cpu_hotplug_lock;
2) it then adds the channel to the channel list(s) with channel_mutex.
Since cpu_hotplug_lock is released before (2), the channel's target CPU
(as designated in (1)) can be deemed "free" by hv_synic_cleanup() and go
offline before the channel is added to the list.
Fix the race condition by "extending" the cpu_hotplug_lock critical
section to include (2) (and (1)), nesting the channel_mutex critical
section within the cpu_hotplug_lock critical section as done elsewhere
(hv_synic_cleanup(), target_cpu_store()) in the hyperv drivers code.
Go even further by extending the channel_mutex critical section to
include (1) (and (2)): this change allows the removal of the (now
redundant) bind_channel_to_cpu_lock, and generally simplifies the
handling of the target CPUs (which are now always modified with
channel_mutex held).
Fixes: d570aec0f2154e ("Drivers: hv: vmbus: Synchronize init_vp_index() vs. CPU hotplug")
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
---
drivers/hv/channel_mgmt.c | 46 +++++++++++++++------------------------
1 file changed, 18 insertions(+), 28 deletions(-)
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index fde806d6525b0..89eaacf069a80 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -554,26 +554,34 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
bool fnew = true;
/*
- * Initialize the target_CPU before inserting the channel in
- * the chn_list and sc_list lists, within the channel_mutex
- * critical section:
+ * Synchronize vmbus_process_offer() and CPU hotplugging:
*
* CPU1 CPU2
*
- * [vmbus_process_offer()] [hv_syninc_cleanup()]
+ * [vmbus_process_offer()] [Hot removal of the CPU]
*
- * STORE target_cpu LOCK channel_mutex
- * LOCK channel_mutex SEARCH chn_list
- * INSERT chn_list LOAD target_cpu
- * UNLOCK channel_mutex UNLOCK channel_mutex
+ * CPUS_READ_LOCK CPUS_WRITE_LOCK
+ * LOAD cpu_online_mask SEARCH chn_list
+ * STORE target_cpu LOAD target_cpu
+ * INSERT chn_list STORE cpu_online_mask
+ * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK
+ *
+ * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
+ * CPU2's SEARCH from *not* seeing CPU1's INSERT
*
* Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
* CPU2's LOAD from *not* seing CPU1's STORE
*/
- init_vp_index(newchannel, hv_get_dev_type(newchannel));
+ cpus_read_lock();
+ /*
+ * Serializes the modifications of the chn_list list as well as
+ * the accesses to next_numa_node_id in init_vp_index().
+ */
mutex_lock(&vmbus_connection.channel_mutex);
+ init_vp_index(newchannel, hv_get_dev_type(newchannel));
+
/* Remember the channels that should be cleaned up upon suspend. */
if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
@@ -623,6 +631,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
vmbus_channel_map_relid(newchannel);
mutex_unlock(&vmbus_connection.channel_mutex);
+ cpus_read_unlock();
/*
* vmbus_process_offer() mustn't call channel->sc_creation_callback()
@@ -655,13 +664,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
* We use this state to statically distribute the channel interrupt load.
*/
static int next_numa_node_id;
-/*
- * init_vp_index() accesses global variables like next_numa_node_id, and
- * it can run concurrently for primary channels and sub-channels: see
- * vmbus_process_offer(), so we need the lock to protect the global
- * variables.
- */
-static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
/*
* Starting with Win8, we can statically distribute the incoming
@@ -700,15 +702,6 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
return;
}
- /* No CPUs can come up or down during this. */
- cpus_read_lock();
-
- /*
- * Serializes the accesses to the global variable next_numa_node_id.
- * See also the header comment of the spin lock declaration.
- */
- spin_lock(&bind_channel_to_cpu_lock);
-
while (true) {
numa_node = next_numa_node_id++;
if (numa_node == nr_node_ids) {
@@ -739,9 +732,6 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
channel->target_cpu = target_cpu;
channel->target_vp = hv_cpu_number_to_vp_number(target_cpu);
- spin_unlock(&bind_channel_to_cpu_lock);
- cpus_read_unlock();
-
free_cpumask_var(available_mask);
}
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/2] Drivers: hv: vmbus: Resolve more races involving init_vp_index()
2020-05-22 17:18 [PATCH] VMBus channel interrupts reassignment - Fixes Andrea Parri (Microsoft)
2020-05-22 17:19 ` [PATCH 1/2] Drivers: hv: vmbus: Resolve race between init_vp_index() and CPU hotplug Andrea Parri (Microsoft)
@ 2020-05-22 17:19 ` Andrea Parri (Microsoft)
2020-05-22 18:41 ` Michael Kelley
2020-05-23 9:04 ` [PATCH] VMBus channel interrupts reassignment - Fixes Wei Liu
2 siblings, 1 reply; 6+ messages in thread
From: Andrea Parri (Microsoft) @ 2020-05-22 17:19 UTC (permalink / raw)
To: linux-hyperv, linux-kernel
Cc: K . Y . Srinivasan, Haiyang Zhang, Stephen Hemminger, Wei Liu,
Michael Kelley, Andrea Parri (Microsoft)
init_vp_index() uses the (per-node) hv_numa_map[] masks to record the
CPUs allocated for channel interrupts at a given time, and distribute
the performance-critical channels across the available CPUs: in
particular, the mask of "candidate" target CPUs in a given NUMA node,
for a newly offered channel, is determined by XOR-ing the node's CPU
mask and the node's hv_numa_map. This operation/mechanism assumes that
no offline CPU is set in the hv_numa_map mask, an assumption that does
not hold since this mask is currently not updated when a channel is
removed or assigned to a different CPU.
To address the issues described above, this adds hooks in the channel
removal path (hv_process_channel_removal()) and in target_cpu_store()
in order to clear and to update, respectively, the hv_numa_map[] masks
as needed. This also adds a (missed) update of the masks in
init_vp_index() (cf., e.g., the memory-allocation failure path in this
function).
Like in the case of init_vp_index(), such hooks need to determine
whether the given channel is performance critical. init_vp_index()
does this by parsing the channel's offer; it cannot rely on the device
data structure (device_obj) to retrieve such information because the
device data structure has not been allocated/linked with the channel
by the time that init_vp_index() executes. A similar situation may
hold in hv_is_alloced_cpu() (defined below); the adopted approach is
to "cache" the device type of the channel, as computed by parsing the
channel's offer, in the channel structure itself.
Fixes: 7527810573436f ("Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type")
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
---
drivers/hv/channel_mgmt.c | 22 +++++++++++++-----
drivers/hv/hyperv_vmbus.h | 48 +++++++++++++++++++++++++++++++++++++++
drivers/hv/vmbus_drv.c | 19 +++++++++++-----
include/linux/hyperv.h | 7 ++++++
4 files changed, 84 insertions(+), 12 deletions(-)
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 89eaacf069a80..417a95e5094dd 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -24,9 +24,9 @@
#include "hyperv_vmbus.h"
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);
+static void init_vp_index(struct vmbus_channel *channel);
-static const struct vmbus_device vmbus_devs[] = {
+const struct vmbus_device vmbus_devs[] = {
/* IDE */
{ .dev_type = HV_IDE,
HV_IDE_GUID,
@@ -431,6 +431,13 @@ void hv_process_channel_removal(struct vmbus_channel *channel)
spin_unlock_irqrestore(&primary_channel->lock, flags);
}
+ /*
+ * If this is a "perf" channel, updates the hv_numa_map[] masks so that
+ * init_vp_index() can (re-)use the CPU.
+ */
+ if (hv_is_perf_channel(channel))
+ hv_clear_alloced_cpu(channel->target_cpu);
+
/*
* Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
* the relid is invalidated; after hibernation, when the user-space app
@@ -497,7 +504,7 @@ static void vmbus_add_channel_work(struct work_struct *work)
if (!newchannel->device_obj)
goto err_deq_chan;
- newchannel->device_obj->device_id = hv_get_dev_type(newchannel);
+ newchannel->device_obj->device_id = newchannel->device_id;
/*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
@@ -580,7 +587,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
*/
mutex_lock(&vmbus_connection.channel_mutex);
- init_vp_index(newchannel, hv_get_dev_type(newchannel));
+ init_vp_index(newchannel);
/* Remember the channels that should be cleaned up upon suspend. */
if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
@@ -676,9 +683,9 @@ static int next_numa_node_id;
* evenly among all the available NUMA nodes. Once the node is assigned,
* we will assign the CPU based on a simple round robin scheme.
*/
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
+static void init_vp_index(struct vmbus_channel *channel)
{
- bool perf_chn = vmbus_devs[dev_type].perf_device;
+ bool perf_chn = hv_is_perf_channel(channel);
cpumask_var_t available_mask;
struct cpumask *alloced_mask;
u32 target_cpu;
@@ -699,6 +706,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
channel->target_cpu = VMBUS_CONNECT_CPU;
channel->target_vp =
hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU);
+ if (perf_chn)
+ hv_set_alloced_cpu(VMBUS_CONNECT_CPU);
return;
}
@@ -862,6 +871,7 @@ static void vmbus_setup_channel_state(struct vmbus_channel *channel,
sizeof(struct vmbus_channel_offer_channel));
channel->monitor_grp = (u8)offer->monitorid / 32;
channel->monitor_bit = (u8)offer->monitorid % 32;
+ channel->device_id = hv_get_dev_type(channel);
}
/*
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 5e5cebe5d048f..40e2b9f91163c 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -395,6 +395,54 @@ enum delay {
MESSAGE_DELAY = 1,
};
+extern const struct vmbus_device vmbus_devs[];
+
+static inline bool hv_is_perf_channel(struct vmbus_channel *channel)
+{
+ return vmbus_devs[channel->device_id].perf_device;
+}
+
+static inline bool hv_is_alloced_cpu(unsigned int cpu)
+{
+ struct vmbus_channel *channel, *sc;
+
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
+ /*
+ * List additions/deletions as well as updates of the target CPUs are
+ * protected by channel_mutex.
+ */
+ list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+ if (!hv_is_perf_channel(channel))
+ continue;
+ if (channel->target_cpu == cpu)
+ return true;
+ list_for_each_entry(sc, &channel->sc_list, sc_list) {
+ if (sc->target_cpu == cpu)
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline void hv_set_alloced_cpu(unsigned int cpu)
+{
+ cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]);
+}
+
+static inline void hv_clear_alloced_cpu(unsigned int cpu)
+{
+ if (hv_is_alloced_cpu(cpu))
+ return;
+ cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]);
+}
+
+static inline void hv_update_alloced_cpus(unsigned int old_cpu,
+ unsigned int new_cpu)
+{
+ hv_set_alloced_cpu(new_cpu);
+ hv_clear_alloced_cpu(old_cpu);
+}
+
#ifdef CONFIG_HYPERV_TESTING
int hv_debug_add_dev_dir(struct hv_device *dev);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index c2a4a7c0b99a0..47747755d2e1d 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1687,8 +1687,8 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
static ssize_t target_cpu_store(struct vmbus_channel *channel,
const char *buf, size_t count)
{
+ u32 target_cpu, origin_cpu;
ssize_t ret = count;
- u32 target_cpu;
if (vmbus_proto_version < VERSION_WIN10_V4_1)
return -EIO;
@@ -1741,7 +1741,8 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
goto cpu_store_unlock;
}
- if (channel->target_cpu == target_cpu)
+ origin_cpu = channel->target_cpu;
+ if (target_cpu == origin_cpu)
goto cpu_store_unlock;
if (vmbus_send_modifychannel(channel->offermsg.child_relid,
@@ -1763,14 +1764,20 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
* in on a CPU that is different from the channel target_cpu value.
*/
- if (channel->change_target_cpu_callback)
- (*channel->change_target_cpu_callback)(channel,
- channel->target_cpu, target_cpu);
-
channel->target_cpu = target_cpu;
channel->target_vp = hv_cpu_number_to_vp_number(target_cpu);
channel->numa_node = cpu_to_node(target_cpu);
+ /* See init_vp_index(). */
+ if (hv_is_perf_channel(channel))
+ hv_update_alloced_cpus(origin_cpu, target_cpu);
+
+ /* Currently set only for storvsc channels. */
+ if (channel->change_target_cpu_callback) {
+ (*channel->change_target_cpu_callback)(channel,
+ origin_cpu, target_cpu);
+ }
+
cpu_store_unlock:
mutex_unlock(&vmbus_connection.channel_mutex);
cpus_read_unlock();
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index d783847d8cb46..40df3103e890b 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -901,6 +901,13 @@ struct vmbus_channel {
bool probe_done;
+ /*
+ * Cache the device ID here for easy access; this is useful, in
+ * particular, in situations where the channel's device_obj has
+ * not been allocated/initialized yet.
+ */
+ u16 device_id;
+
/*
* We must offload the handling of the primary/sub channels
* from the single-threaded vmbus_connection.work_queue to
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread