stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
@ 2022-02-25 20:21 Jacob Keller
  2022-02-25 20:21 ` [PATCH 2/2] ice: fix concurrent reset and removal of VFs Jacob Keller
  2022-02-28 11:10 ` [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Greg KH
  0 siblings, 2 replies; 10+ messages in thread
From: Jacob Keller @ 2022-02-25 20:21 UTC (permalink / raw)
  To: stable; +Cc: Brett Creeley, Konrad Jankowski, Tony Nguyen, Jacob Keller

From: Brett Creeley <brett.creeley@intel.com>

commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.

[I had to fix the cherry-pick manually as the patch added a line around
some context that was missing.]

The VF can be configured via the PF's ndo ops at the same time the PF is
receiving/handling virtchnl messages. This has many issues, with
one of them being the ndo op could be actively resetting a VF (i.e.
resetting it to the default state and deleting/re-adding the VF's VSI)
while a virtchnl message is being handled. The following error was seen
because a VF ndo op was used to change a VF's trust setting while the
VIRTCHNL_OP_CONFIG_VSI_QUEUES was ongoing:

[35274.192484] ice 0000:88:00.0: Failed to set LAN Tx queue context, error: ICE_ERR_PARAM
[35274.193074] ice 0000:88:00.0: VF 0 failed opcode 6, retval: -5
[35274.193640] iavf 0000:88:01.0: PF returned error -5 (IAVF_ERR_PARAM) to our request 6

Fix this by making sure the virtchnl handling and VF ndo ops that
trigger VF resets cannot run concurrently. This is done by adding a
struct mutex cfg_lock to each VF structure. For VF ndo ops, the mutex
will be locked around the critical operations and VFR. Since the ndo ops
will trigger a VFR, the virtchnl thread will use mutex_trylock(). This
is done because if any other thread (i.e. VF ndo op) has the mutex, then
that means the current VF message being handled is no longer valid, so
just ignore it.

This issue can be seen using the following commands:

for i in {0..50}; do
        rmmod ice
        modprobe ice

        sleep 1

        echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs
        echo 1 > /sys/class/net/ens785f1/device/sriov_numvfs

        ip link set ens785f1 vf 0 trust on
        ip link set ens785f0 vf 0 trust on

        sleep 2

        echo 0 > /sys/class/net/ens785f0/device/sriov_numvfs
        echo 0 > /sys/class/net/ens785f1/device/sriov_numvfs
        sleep 1
        echo 1 > /sys/class/net/ens785f0/device/sriov_numvfs
        echo 1 > /sys/class/net/ens785f1/device/sriov_numvfs

        ip link set ens785f1 vf 0 trust on
        ip link set ens785f0 vf 0 trust on
done

Fixes: 7c710869d64e ("ice: Add handlers for VF netdevice operations")
Cc: <stable@vger.kernel.org>
Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  | 25 +++++++++++++++++++
 .../net/ethernet/intel/ice/ice_virtchnl_pf.h  |  5 ++++
 2 files changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index a78e8f00cf71..a0b8436f50ba 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -646,6 +646,8 @@ void ice_free_vfs(struct ice_pf *pf)
 			set_bit(ICE_VF_STATE_DIS, pf->vf[i].vf_states);
 			ice_free_vf_res(&pf->vf[i]);
 		}
+
+		mutex_destroy(&pf->vf[i].cfg_lock);
 	}
 
 	if (ice_sriov_free_msix_res(pf))
@@ -1894,6 +1896,8 @@ static void ice_set_dflt_settings_vfs(struct ice_pf *pf)
 		 */
 		ice_vf_ctrl_invalidate_vsi(vf);
 		ice_vf_fdir_init(vf);
+
+		mutex_init(&vf->cfg_lock);
 	}
 }
 
@@ -4082,6 +4086,8 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos,
 		return 0;
 	}
 
+	mutex_lock(&vf->cfg_lock);
+
 	vf->port_vlan_info = vlanprio;
 
 	if (vf->port_vlan_info)
@@ -4091,6 +4097,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos,
 		dev_info(dev, "Clearing port VLAN on VF %d\n", vf_id);
 
 	ice_vc_reset_vf(vf);
+	mutex_unlock(&vf->cfg_lock);
 
 	return 0;
 }
@@ -4465,6 +4472,15 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
 		return;
 	}
 
+	/* VF is being configured in another context that triggers a VFR, so no
+	 * need to process this message
+	 */
+	if (!mutex_trylock(&vf->cfg_lock)) {
+		dev_info(dev, "VF %u is being configured in another context that will trigger a VFR, so there is no need to handle this message\n",
+			 vf->vf_id);
+		return;
+	}
+
 	switch (v_opcode) {
 	case VIRTCHNL_OP_VERSION:
 		err = ice_vc_get_ver_msg(vf, msg);
@@ -4553,6 +4569,8 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
 		dev_info(dev, "PF failed to honor VF %d, opcode %d, error %d\n",
 			 vf_id, v_opcode, err);
 	}
+
+	mutex_unlock(&vf->cfg_lock);
 }
 
 /**
@@ -4668,6 +4686,8 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
 		return -EINVAL;
 	}
 
+	mutex_lock(&vf->cfg_lock);
+
 	/* VF is notified of its new MAC via the PF's response to the
 	 * VIRTCHNL_OP_GET_VF_RESOURCES message after the VF has been reset
 	 */
@@ -4686,6 +4706,7 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
 	}
 
 	ice_vc_reset_vf(vf);
+	mutex_unlock(&vf->cfg_lock);
 	return 0;
 }
 
@@ -4715,11 +4736,15 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted)
 	if (trusted == vf->trusted)
 		return 0;
 
+	mutex_lock(&vf->cfg_lock);
+
 	vf->trusted = trusted;
 	ice_vc_reset_vf(vf);
 	dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n",
 		 vf_id, trusted ? "" : "un");
 
+	mutex_unlock(&vf->cfg_lock);
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
index 38b4dc82c5c1..a750e9a9d712 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
@@ -74,6 +74,11 @@ struct ice_mdd_vf_events {
 struct ice_vf {
 	struct ice_pf *pf;
 
+	/* Used during virtchnl message handling and NDO ops against the VF
+	 * that will trigger a VFR
+	 */
+	struct mutex cfg_lock;
+
 	u16 vf_id;			/* VF ID in the PF space */
 	u16 lan_vsi_idx;		/* index into PF struct */
 	u16 ctrl_vsi_idx;
-- 
2.35.1.129.gb80121027d12


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] ice: fix concurrent reset and removal of VFs
  2022-02-25 20:21 [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Jacob Keller
@ 2022-02-25 20:21 ` Jacob Keller
  2022-02-28 11:10 ` [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Greg KH
  1 sibling, 0 replies; 10+ messages in thread
From: Jacob Keller @ 2022-02-25 20:21 UTC (permalink / raw)
  To: stable; +Cc: Jacob Keller, Konrad Jankowski, Tony Nguyen

commit fadead80fe4c033b5e514fcbadd20b55c4494112 upstream.

Commit c503e63200c6 ("ice: Stop processing VF messages during teardown")
introduced a driver state flag, ICE_VF_DEINIT_IN_PROGRESS, which is
intended to prevent some issues with concurrently handling messages from
VFs while tearing down the VFs.

This change was motivated by crashes caused while tearing down and
bringing up VFs in rapid succession.

It turns out that the fix actually introduces issues with the VF driver
caused because the PF no longer responds to any messages sent by the VF
during its .remove routine. This results in the VF potentially removing
its DMA memory before the PF has shut down the device queues.

Additionally, the fix doesn't actually resolve concurrency issues within
the ice driver. It is possible for a VF to initiate a reset just prior
to the ice driver removing VFs. This can result in the remove task
concurrently operating while the VF is being reset. This results in
similar memory corruption and panics purportedly fixed by that commit.

Fix this concurrency at its root by protecting both the reset and
removal flows using the existing VF cfg_lock. This ensures that we
cannot remove the VF while any outstanding critical tasks such as a
virtchnl message or a reset are occurring.

This locking change also fixes the root cause originally fixed by commit
c503e63200c6 ("ice: Stop processing VF messages during teardown"), so we
can simply revert it.

Note that I kept these two changes together because simply reverting the
original commit alone would leave the driver vulnerable to worse race
conditions.

Fixes: c503e63200c6 ("ice: Stop processing VF messages during teardown")
Cc: <stable@vger.kernel.org> # e6ba5273d4ed: ice: Fix race conditions between virtchnl handling and VF ndo ops
Cc: <stable@vger.kernel.org>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h          |  1 -
 drivers/net/ethernet/intel/ice/ice_main.c     |  2 +
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  | 42 +++++++++++--------
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index d119812755b7..387322615e08 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -231,7 +231,6 @@ enum ice_pf_state {
 	ICE_VFLR_EVENT_PENDING,
 	ICE_FLTR_OVERFLOW_PROMISC,
 	ICE_VF_DIS,
-	ICE_VF_DEINIT_IN_PROGRESS,
 	ICE_CFG_BUSY,
 	ICE_SERVICE_SCHED,
 	ICE_SERVICE_DIS,
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index ab2dea0d2c1a..8a0c928853e6 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -1679,7 +1679,9 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 				 * reset, so print the event prior to reset.
 				 */
 				ice_print_vf_rx_mdd_event(vf);
+				mutex_lock(&pf->vf[i].cfg_lock);
 				ice_reset_vf(&pf->vf[i], false);
+				mutex_unlock(&pf->vf[i].cfg_lock);
 			}
 		}
 	}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index a0b8436f50ba..4054adb5279c 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -615,8 +615,6 @@ void ice_free_vfs(struct ice_pf *pf)
 	struct ice_hw *hw = &pf->hw;
 	unsigned int tmp, i;
 
-	set_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state);
-
 	if (!pf->vf)
 		return;
 
@@ -632,22 +630,26 @@ void ice_free_vfs(struct ice_pf *pf)
 	else
 		dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n");
 
-	/* Avoid wait time by stopping all VFs at the same time */
-	ice_for_each_vf(pf, i)
-		ice_dis_vf_qs(&pf->vf[i]);
-
 	tmp = pf->num_alloc_vfs;
 	pf->num_qps_per_vf = 0;
 	pf->num_alloc_vfs = 0;
 	for (i = 0; i < tmp; i++) {
-		if (test_bit(ICE_VF_STATE_INIT, pf->vf[i].vf_states)) {
+		struct ice_vf *vf = &pf->vf[i];
+
+		mutex_lock(&vf->cfg_lock);
+
+		ice_dis_vf_qs(vf);
+
+		if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) {
 			/* disable VF qp mappings and set VF disable state */
-			ice_dis_vf_mappings(&pf->vf[i]);
-			set_bit(ICE_VF_STATE_DIS, pf->vf[i].vf_states);
-			ice_free_vf_res(&pf->vf[i]);
+			ice_dis_vf_mappings(vf);
+			set_bit(ICE_VF_STATE_DIS, vf->vf_states);
+			ice_free_vf_res(vf);
 		}
 
-		mutex_destroy(&pf->vf[i].cfg_lock);
+		mutex_unlock(&vf->cfg_lock);
+
+		mutex_destroy(&vf->cfg_lock);
 	}
 
 	if (ice_sriov_free_msix_res(pf))
@@ -683,7 +685,6 @@ void ice_free_vfs(struct ice_pf *pf)
 				i);
 
 	clear_bit(ICE_VF_DIS, pf->state);
-	clear_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state);
 	clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags);
 }
 
@@ -1567,6 +1568,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 	ice_for_each_vf(pf, v) {
 		vf = &pf->vf[v];
 
+		mutex_lock(&vf->cfg_lock);
+
 		vf->driver_caps = 0;
 		ice_vc_set_default_allowlist(vf);
 
@@ -1581,6 +1584,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 		ice_vf_pre_vsi_rebuild(vf);
 		ice_vf_rebuild_vsi(vf);
 		ice_vf_post_vsi_rebuild(vf);
+
+		mutex_unlock(&vf->cfg_lock);
 	}
 
 	ice_flush(hw);
@@ -1627,6 +1632,8 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr)
 	u32 reg;
 	int i;
 
+	lockdep_assert_held(&vf->cfg_lock);
+
 	dev = ice_pf_to_dev(pf);
 
 	if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) {
@@ -2113,9 +2120,12 @@ void ice_process_vflr_event(struct ice_pf *pf)
 		bit_idx = (hw->func_caps.vf_base_id + vf_id) % 32;
 		/* read GLGEN_VFLRSTAT register to find out the flr VFs */
 		reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx));
-		if (reg & BIT(bit_idx))
+		if (reg & BIT(bit_idx)) {
 			/* GLGEN_VFLRSTAT bit will be cleared in ice_reset_vf */
+			mutex_lock(&vf->cfg_lock);
 			ice_reset_vf(vf, true);
+			mutex_unlock(&vf->cfg_lock);
+		}
 	}
 }
 
@@ -2192,7 +2202,9 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event)
 	if (!vf)
 		return;
 
+	mutex_lock(&vf->cfg_lock);
 	ice_vc_reset_vf(vf);
+	mutex_unlock(&vf->cfg_lock);
 }
 
 /**
@@ -4429,10 +4441,6 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
 	struct device *dev;
 	int err = 0;
 
-	/* if de-init is underway, don't process messages from VF */
-	if (test_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state))
-		return;
-
 	dev = ice_pf_to_dev(pf);
 	if (ice_validate_vf_id(pf, vf_id)) {
 		err = -EINVAL;
-- 
2.35.1.129.gb80121027d12


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-02-25 20:21 [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Jacob Keller
  2022-02-25 20:21 ` [PATCH 2/2] ice: fix concurrent reset and removal of VFs Jacob Keller
@ 2022-02-28 11:10 ` Greg KH
  2022-02-28 11:23   ` Greg KH
  2022-02-28 19:13   ` Keller, Jacob E
  1 sibling, 2 replies; 10+ messages in thread
From: Greg KH @ 2022-02-28 11:10 UTC (permalink / raw)
  To: Jacob Keller; +Cc: stable, Brett Creeley, Konrad Jankowski, Tony Nguyen

On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> From: Brett Creeley <brett.creeley@intel.com>
> 
> commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> 
> [I had to fix the cherry-pick manually as the patch added a line around
> some context that was missing.]

What stable tree(s) is this for?

Thanks,

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-02-28 11:10 ` [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Greg KH
@ 2022-02-28 11:23   ` Greg KH
  2022-02-28 19:13     ` Keller, Jacob E
  2022-02-28 20:57     ` Keller, Jacob E
  2022-02-28 19:13   ` Keller, Jacob E
  1 sibling, 2 replies; 10+ messages in thread
From: Greg KH @ 2022-02-28 11:23 UTC (permalink / raw)
  To: Jacob Keller; +Cc: stable, Brett Creeley, Konrad Jankowski, Tony Nguyen

On Mon, Feb 28, 2022 at 12:10:20PM +0100, Greg KH wrote:
> On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> > From: Brett Creeley <brett.creeley@intel.com>
> > 
> > commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> > 
> > [I had to fix the cherry-pick manually as the patch added a line around
> > some context that was missing.]
> 
> What stable tree(s) is this for?$

Looks like it applied only to 5.15.y.  Can you also provide backports
for the other older kernels that these are needed for?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-02-28 11:10 ` [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Greg KH
  2022-02-28 11:23   ` Greg KH
@ 2022-02-28 19:13   ` Keller, Jacob E
  1 sibling, 0 replies; 10+ messages in thread
From: Keller, Jacob E @ 2022-02-28 19:13 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, Brett Creeley, Jankowski, Konrad0, Nguyen, Anthony L



> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Monday, February 28, 2022 3:10 AM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: stable@vger.kernel.org; Brett Creeley <brett.creeley@intel.com>; Jankowski,
> Konrad0 <konrad0.jankowski@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>
> Subject: Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF
> ndo ops
> 
> On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> > From: Brett Creeley <brett.creeley@intel.com>
> >
> > commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> >
> > [I had to fix the cherry-pick manually as the patch added a line around
> > some context that was missing.]
> 
> What stable tree(s) is this for?
> 

Ah, yea this only applied to 5.15

> Thanks,
> 
> greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-02-28 11:23   ` Greg KH
@ 2022-02-28 19:13     ` Keller, Jacob E
  2022-02-28 20:57     ` Keller, Jacob E
  1 sibling, 0 replies; 10+ messages in thread
From: Keller, Jacob E @ 2022-02-28 19:13 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, Brett Creeley, Jankowski, Konrad0, Nguyen, Anthony L



> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Monday, February 28, 2022 3:24 AM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: stable@vger.kernel.org; Brett Creeley <brett.creeley@intel.com>; Jankowski,
> Konrad0 <konrad0.jankowski@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>
> Subject: Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF
> ndo ops
> 
> On Mon, Feb 28, 2022 at 12:10:20PM +0100, Greg KH wrote:
> > On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> > > From: Brett Creeley <brett.creeley@intel.com>
> > >
> > > commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> > >
> > > [I had to fix the cherry-pick manually as the patch added a line around
> > > some context that was missing.]
> >
> > What stable tree(s) is this for?$
> 
> Looks like it applied only to 5.15.y.  Can you also provide backports
> for the other older kernels that these are needed for?
> 

Yes, let me take a closer look at the older ones.

> thanks,
> 
> greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-02-28 11:23   ` Greg KH
  2022-02-28 19:13     ` Keller, Jacob E
@ 2022-02-28 20:57     ` Keller, Jacob E
  2022-03-05 13:43       ` Greg KH
  1 sibling, 1 reply; 10+ messages in thread
From: Keller, Jacob E @ 2022-02-28 20:57 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, Jankowski, Konrad0, Nguyen, Anthony L



> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Monday, February 28, 2022 3:24 AM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: stable@vger.kernel.org; Brett Creeley <brett.creeley@intel.com>; Jankowski,
> Konrad0 <konrad0.jankowski@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>
> Subject: Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF
> ndo ops
> 
> On Mon, Feb 28, 2022 at 12:10:20PM +0100, Greg KH wrote:
> > On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> > > From: Brett Creeley <brett.creeley@intel.com>
> > >
> > > commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> > >
> > > [I had to fix the cherry-pick manually as the patch added a line around
> > > some context that was missing.]
> >
> > What stable tree(s) is this for?$
> 
> Looks like it applied only to 5.15.y.  Can you also provide backports
> for the other older kernels that these are needed for?
> 

Hi Greg!

I sent a series that should apply from v5.8 through 5.12 (excepting one patch that was already on 5.10 but not the other trees, for some reason). I also sent a series for 5.13 and one for 5.14.

The code prior to 5.8 still has this bug, going back to the first release with ice, (4.20), but it is different enough that I had trouble determining what the correct patch is. Especially for the first patch which is necessary for the later fixes, since it introduces the lock we need. I'm going to spend a bit more time later this evening to see if I can sort it out. If not, we may have to live without the fixes on those trees.

Thanks,
Jake

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-02-28 20:57     ` Keller, Jacob E
@ 2022-03-05 13:43       ` Greg KH
  2022-03-07 18:38         ` Keller, Jacob E
  0 siblings, 1 reply; 10+ messages in thread
From: Greg KH @ 2022-03-05 13:43 UTC (permalink / raw)
  To: Keller, Jacob E; +Cc: stable, Jankowski, Konrad0, Nguyen, Anthony L

On Mon, Feb 28, 2022 at 08:57:50PM +0000, Keller, Jacob E wrote:
> 
> 
> > -----Original Message-----
> > From: Greg KH <gregkh@linuxfoundation.org>
> > Sent: Monday, February 28, 2022 3:24 AM
> > To: Keller, Jacob E <jacob.e.keller@intel.com>
> > Cc: stable@vger.kernel.org; Brett Creeley <brett.creeley@intel.com>; Jankowski,
> > Konrad0 <konrad0.jankowski@intel.com>; Nguyen, Anthony L
> > <anthony.l.nguyen@intel.com>
> > Subject: Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF
> > ndo ops
> > 
> > On Mon, Feb 28, 2022 at 12:10:20PM +0100, Greg KH wrote:
> > > On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> > > > From: Brett Creeley <brett.creeley@intel.com>
> > > >
> > > > commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> > > >
> > > > [I had to fix the cherry-pick manually as the patch added a line around
> > > > some context that was missing.]
> > >
> > > What stable tree(s) is this for?$
> > 
> > Looks like it applied only to 5.15.y.  Can you also provide backports
> > for the other older kernels that these are needed for?
> > 
> 
> Hi Greg!
> 
> I sent a series that should apply from v5.8 through 5.12 (excepting one patch that was already on 5.10 but not the other trees, for some reason). I also sent a series for 5.13 and one for 5.14.
> 
> The code prior to 5.8 still has this bug, going back to the first release with ice, (4.20), but it is different enough that I had trouble determining what the correct patch is. Especially for the first patch which is necessary for the later fixes, since it introduces the lock we need. I'm going to spend a bit more time later this evening to see if I can sort it out. If not, we may have to live without the fixes on those trees.

I've applied the 2 missing changes to the 5.10.y tree.  The other ones
are all end-of-life, sorry.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops
  2022-03-05 13:43       ` Greg KH
@ 2022-03-07 18:38         ` Keller, Jacob E
  0 siblings, 0 replies; 10+ messages in thread
From: Keller, Jacob E @ 2022-03-07 18:38 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, Jankowski, Konrad0, Nguyen, Anthony L



> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Saturday, March 05, 2022 5:43 AM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: stable@vger.kernel.org; Jankowski, Konrad0
> <konrad0.jankowski@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>
> Subject: Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF
> ndo ops
> 
> On Mon, Feb 28, 2022 at 08:57:50PM +0000, Keller, Jacob E wrote:
> >
> >
> > > -----Original Message-----
> > > From: Greg KH <gregkh@linuxfoundation.org>
> > > Sent: Monday, February 28, 2022 3:24 AM
> > > To: Keller, Jacob E <jacob.e.keller@intel.com>
> > > Cc: stable@vger.kernel.org; Brett Creeley <brett.creeley@intel.com>;
> Jankowski,
> > > Konrad0 <konrad0.jankowski@intel.com>; Nguyen, Anthony L
> > > <anthony.l.nguyen@intel.com>
> > > Subject: Re: [PATCH 1/2] ice: Fix race conditions between virtchnl handling
> and VF
> > > ndo ops
> > >
> > > On Mon, Feb 28, 2022 at 12:10:20PM +0100, Greg KH wrote:
> > > > On Fri, Feb 25, 2022 at 12:21:00PM -0800, Jacob Keller wrote:
> > > > > From: Brett Creeley <brett.creeley@intel.com>
> > > > >
> > > > > commit e6ba5273d4ede03d075d7a116b8edad1f6115f4d upstream.
> > > > >
> > > > > [I had to fix the cherry-pick manually as the patch added a line around
> > > > > some context that was missing.]
> > > >
> > > > What stable tree(s) is this for?$
> > >
> > > Looks like it applied only to 5.15.y.  Can you also provide backports
> > > for the other older kernels that these are needed for?
> > >
> >
> > Hi Greg!
> >
> > I sent a series that should apply from v5.8 through 5.12 (excepting one patch
> that was already on 5.10 but not the other trees, for some reason). I also sent a
> series for 5.13 and one for 5.14.
> >
> > The code prior to 5.8 still has this bug, going back to the first release with ice,
> (4.20), but it is different enough that I had trouble determining what the correct
> patch is. Especially for the first patch which is necessary for the later fixes, since it
> introduces the lock we need. I'm going to spend a bit more time later this evening
> to see if I can sort it out. If not, we may have to live without the fixes on those
> trees.
> 
> I've applied the 2 missing changes to the 5.10.y tree.  The other ones
> are all end-of-life, sorry.
> 
> thanks,
> 
> greg k-h

Thanks Greg!

-Jake

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/2] ice: fix concurrent reset and removal of VFs
  2022-02-28 20:46 Jacob Keller
@ 2022-02-28 20:47 ` Jacob Keller
  0 siblings, 0 replies; 10+ messages in thread
From: Jacob Keller @ 2022-02-28 20:47 UTC (permalink / raw)
  To: stable; +Cc: Jacob Keller

commit fadead80fe4c033b5e514fcbadd20b55c4494112 upstream.

Commit c503e63200c6 ("ice: Stop processing VF messages during teardown")
introduced a driver state flag, ICE_VF_DEINIT_IN_PROGRESS, which is
intended to prevent some issues with concurrently handling messages from
VFs while tearing down the VFs.

This change was motivated by crashes caused while tearing down and
bringing up VFs in rapid succession.

It turns out that the fix actually introduces issues with the VF driver
caused because the PF no longer responds to any messages sent by the VF
during its .remove routine. This results in the VF potentially removing
its DMA memory before the PF has shut down the device queues.

Additionally, the fix doesn't actually resolve concurrency issues within
the ice driver. It is possible for a VF to initiate a reset just prior
to the ice driver removing VFs. This can result in the remove task
concurrently operating while the VF is being reset. This results in
similar memory corruption and panics purportedly fixed by that commit.

Fix this concurrency at its root by protecting both the reset and
removal flows using the existing VF cfg_lock. This ensures that we
cannot remove the VF while any outstanding critical tasks such as a
virtchnl message or a reset are occurring.

This locking change also fixes the root cause originally fixed by commit
c503e63200c6 ("ice: Stop processing VF messages during teardown"), so we
can simply revert it.

Note that I kept these two changes together because simply reverting the
original commit alone would leave the driver vulnerable to worse race
conditions.

Fixes: c503e63200c6 ("ice: Stop processing VF messages during teardown")
Cc: <stable@vger.kernel.org> # 5.14.x: e6ba5273d4ed: ice: Fix race conditions between virtchnl handling and VF ndo ops
Cc: <stable@vger.kernel.org> # 5.14.x
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
This should apply to 5.14.x

 drivers/net/ethernet/intel/ice/ice.h          |  1 -
 drivers/net/ethernet/intel/ice/ice_main.c     |  2 +
 .../net/ethernet/intel/ice/ice_virtchnl_pf.c  | 42 +++++++++++--------
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 8b23fbf3cdf4..0d6df020a63d 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -234,7 +234,6 @@ enum ice_pf_state {
 	ICE_VFLR_EVENT_PENDING,
 	ICE_FLTR_OVERFLOW_PROMISC,
 	ICE_VF_DIS,
-	ICE_VF_DEINIT_IN_PROGRESS,
 	ICE_CFG_BUSY,
 	ICE_SERVICE_SCHED,
 	ICE_SERVICE_DIS,
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 2a20881d07e8..8808ede1b2bf 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -1679,7 +1679,9 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 				 * reset, so print the event prior to reset.
 				 */
 				ice_print_vf_rx_mdd_event(vf);
+				mutex_lock(&pf->vf[i].cfg_lock);
 				ice_reset_vf(&pf->vf[i], false);
+				mutex_unlock(&pf->vf[i].cfg_lock);
 			}
 		}
 	}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index d2f79d579745..ed8f47a901c7 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -615,8 +615,6 @@ void ice_free_vfs(struct ice_pf *pf)
 	struct ice_hw *hw = &pf->hw;
 	unsigned int tmp, i;
 
-	set_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state);
-
 	if (!pf->vf)
 		return;
 
@@ -632,22 +630,26 @@ void ice_free_vfs(struct ice_pf *pf)
 	else
 		dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n");
 
-	/* Avoid wait time by stopping all VFs at the same time */
-	ice_for_each_vf(pf, i)
-		ice_dis_vf_qs(&pf->vf[i]);
-
 	tmp = pf->num_alloc_vfs;
 	pf->num_qps_per_vf = 0;
 	pf->num_alloc_vfs = 0;
 	for (i = 0; i < tmp; i++) {
-		if (test_bit(ICE_VF_STATE_INIT, pf->vf[i].vf_states)) {
+		struct ice_vf *vf = &pf->vf[i];
+
+		mutex_lock(&vf->cfg_lock);
+
+		ice_dis_vf_qs(vf);
+
+		if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) {
 			/* disable VF qp mappings and set VF disable state */
-			ice_dis_vf_mappings(&pf->vf[i]);
-			set_bit(ICE_VF_STATE_DIS, pf->vf[i].vf_states);
-			ice_free_vf_res(&pf->vf[i]);
+			ice_dis_vf_mappings(vf);
+			set_bit(ICE_VF_STATE_DIS, vf->vf_states);
+			ice_free_vf_res(vf);
 		}
 
-		mutex_destroy(&pf->vf[i].cfg_lock);
+		mutex_unlock(&vf->cfg_lock);
+
+		mutex_destroy(&vf->cfg_lock);
 	}
 
 	if (ice_sriov_free_msix_res(pf))
@@ -683,7 +685,6 @@ void ice_free_vfs(struct ice_pf *pf)
 				i);
 
 	clear_bit(ICE_VF_DIS, pf->state);
-	clear_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state);
 	clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags);
 }
 
@@ -1567,6 +1568,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 	ice_for_each_vf(pf, v) {
 		vf = &pf->vf[v];
 
+		mutex_lock(&vf->cfg_lock);
+
 		vf->driver_caps = 0;
 		ice_vc_set_default_allowlist(vf);
 
@@ -1580,6 +1583,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 		ice_vf_pre_vsi_rebuild(vf);
 		ice_vf_rebuild_vsi(vf);
 		ice_vf_post_vsi_rebuild(vf);
+
+		mutex_unlock(&vf->cfg_lock);
 	}
 
 	ice_flush(hw);
@@ -1626,6 +1631,8 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr)
 	u32 reg;
 	int i;
 
+	lockdep_assert_held(&vf->cfg_lock);
+
 	dev = ice_pf_to_dev(pf);
 
 	if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) {
@@ -2111,9 +2118,12 @@ void ice_process_vflr_event(struct ice_pf *pf)
 		bit_idx = (hw->func_caps.vf_base_id + vf_id) % 32;
 		/* read GLGEN_VFLRSTAT register to find out the flr VFs */
 		reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx));
-		if (reg & BIT(bit_idx))
+		if (reg & BIT(bit_idx)) {
 			/* GLGEN_VFLRSTAT bit will be cleared in ice_reset_vf */
+			mutex_lock(&vf->cfg_lock);
 			ice_reset_vf(vf, true);
+			mutex_unlock(&vf->cfg_lock);
+		}
 	}
 }
 
@@ -2190,7 +2200,9 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event)
 	if (!vf)
 		return;
 
+	mutex_lock(&vf->cfg_lock);
 	ice_vc_reset_vf(vf);
+	mutex_unlock(&vf->cfg_lock);
 }
 
 /**
@@ -4427,10 +4439,6 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
 	struct device *dev;
 	int err = 0;
 
-	/* if de-init is underway, don't process messages from VF */
-	if (test_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state))
-		return;
-
 	dev = ice_pf_to_dev(pf);
 	if (ice_validate_vf_id(pf, vf_id)) {
 		err = -EINVAL;
-- 
2.35.1.355.ge7e302376dd6


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-03-07 18:39 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-25 20:21 [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Jacob Keller
2022-02-25 20:21 ` [PATCH 2/2] ice: fix concurrent reset and removal of VFs Jacob Keller
2022-02-28 11:10 ` [PATCH 1/2] ice: Fix race conditions between virtchnl handling and VF ndo ops Greg KH
2022-02-28 11:23   ` Greg KH
2022-02-28 19:13     ` Keller, Jacob E
2022-02-28 20:57     ` Keller, Jacob E
2022-03-05 13:43       ` Greg KH
2022-03-07 18:38         ` Keller, Jacob E
2022-02-28 19:13   ` Keller, Jacob E
2022-02-28 20:46 Jacob Keller
2022-02-28 20:47 ` [PATCH 2/2] ice: fix concurrent reset and removal of VFs Jacob Keller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).