linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Majd Dibbiny <majd@mellanox.com>, Moni Shoua <monis@mellanox.com>,
	Leon Romanovsky <leonro@mellanox.com>,
	Jason Gunthorpe <jgg@mellanox.com>,
	Sasha Levin <sashal@kernel.org>,
	linux-rdma@vger.kernel.org
Subject: [PATCH AUTOSEL 4.19 084/205] IB/mlx5: Change TX affinity assignment in RoCE LAG mode
Date: Fri,  8 Nov 2019 06:35:51 -0500	[thread overview]
Message-ID: <20191108113752.12502-84-sashal@kernel.org> (raw)
In-Reply-To: <20191108113752.12502-1-sashal@kernel.org>

From: Majd Dibbiny <majd@mellanox.com>

[ Upstream commit c6a21c3864fc7f5febae7d096cd136f397c791f2 ]

In the current code, the TX affinity is per RoCE device, which can cause
unfairness between different contexts. e.g. if we open two contexts, and
each open 10 QPs concurrently, all of the QPs of the first context might
end up on the first port instead of distributed on the two ports as
expected

To overcome this unfairness between processes, we maintain per device TX
affinity, and per process TX affinity.

The allocation algorithm is as follow:

1. Hold two tx_port_affinity atomic variables, one per RoCE device and one
   per ucontext. Both initialized to 0.

2. In mlx5_ib_alloc_ucontext do:
 2.1. ucontext.tx_port_affinity = device.tx_port_affinity
 2.2. device.tx_port_affinity += 1

3. In modify QP INIT2RST:
 3.1. qp.tx_port_affinity = ucontext.tx_port_affinity % MLX5_PORT_NUM
 3.2. ucontext.tx_port_affinity += 1

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c    |  8 ++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  4 ++-
 drivers/infiniband/hw/mlx5/qp.c      | 37 +++++++++++++++++++++++++---
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index c05eae93170eb..f4ffdc588ea07 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1823,6 +1823,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	context->lib_caps = req.lib_caps;
 	print_lib_caps(dev, context->lib_caps);
 
+	if (mlx5_lag_is_active(dev->mdev)) {
+		u8 port = mlx5_core_native_port_num(dev->mdev);
+
+		atomic_set(&context->tx_port_affinity,
+			   atomic_add_return(
+				   1, &dev->roce[port].tx_port_affinity));
+	}
+
 	return &context->ibucontext;
 
 out_mdev:
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 941d1df54631a..6a060c84598fe 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -139,6 +139,8 @@ struct mlx5_ib_ucontext {
 	u64			lib_caps;
 	DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES);
 	u16			devx_uid;
+	/* For RoCE LAG TX affinity */
+	atomic_t		tx_port_affinity;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -700,7 +702,7 @@ struct mlx5_roce {
 	rwlock_t		netdev_lock;
 	struct net_device	*netdev;
 	struct notifier_block	nb;
-	atomic_t		next_port;
+	atomic_t		tx_port_affinity;
 	enum ib_port_state last_port_state;
 	struct mlx5_ib_dev	*dev;
 	u8			native_port_num;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 77b1f3fd086ad..69669c3b13d85 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2908,6 +2908,37 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	return 0;
 }
 
+static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
+				    struct mlx5_ib_pd *pd,
+				    struct mlx5_ib_qp_base *qp_base,
+				    u8 port_num)
+{
+	struct mlx5_ib_ucontext *ucontext = NULL;
+	unsigned int tx_port_affinity;
+
+	if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context)
+		ucontext = to_mucontext(pd->ibpd.uobject->context);
+
+	if (ucontext) {
+		tx_port_affinity = (unsigned int)atomic_add_return(
+					   1, &ucontext->tx_port_affinity) %
+					   MLX5_MAX_PORTS +
+				   1;
+		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n",
+				tx_port_affinity, qp_base->mqp.qpn, ucontext);
+	} else {
+		tx_port_affinity =
+			(unsigned int)atomic_add_return(
+				1, &dev->roce[port_num].tx_port_affinity) %
+				MLX5_MAX_PORTS +
+			1;
+		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n",
+				tx_port_affinity, qp_base->mqp.qpn);
+	}
+
+	return tx_port_affinity;
+}
+
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state,
@@ -2973,6 +3004,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (!context)
 		return -ENOMEM;
 
+	pd = get_pd(qp);
 	context->flags = cpu_to_be32(mlx5_st << 16);
 
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
@@ -3001,9 +3033,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
 			if (mlx5_lag_is_active(dev->mdev)) {
 				u8 p = mlx5_core_native_port_num(dev->mdev);
-				tx_affinity = (unsigned int)atomic_add_return(1,
-						&dev->roce[p].next_port) %
-						MLX5_MAX_PORTS + 1;
+				tx_affinity = get_tx_affinity(dev, pd, base, p);
 				context->flags |= cpu_to_be32(tx_affinity << 24);
 			}
 		}
@@ -3061,7 +3091,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			goto out;
 	}
 
-	pd = get_pd(qp);
 	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
 		&send_cq, &recv_cq);
 
-- 
2.20.1


  parent reply	other threads:[~2019-11-08 12:09 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20191108113752.12502-1-sashal@kernel.org>
2019-11-08 11:35 ` [PATCH AUTOSEL 4.19 054/205] IB/rxe: avoid back-to-back retries Sasha Levin
2019-11-08 11:35 ` [PATCH AUTOSEL 4.19 055/205] IB/rxe: fixes for rdma read retry Sasha Levin
2019-11-08 11:35 ` [PATCH AUTOSEL 4.19 075/205] net/mlx5: Fix atomic_mode enum values Sasha Levin
2019-11-08 11:35 ` Sasha Levin [this message]
2019-11-08 11:36 ` [PATCH AUTOSEL 4.19 096/205] IB/mlx5: Don't hold spin lock while checking device state Sasha Levin
2019-11-08 11:36 ` [PATCH AUTOSEL 4.19 097/205] IB/ipoib: Ensure that MTU isn't less than minimum permitted Sasha Levin
2019-11-08 11:36 ` [PATCH AUTOSEL 4.19 098/205] RDMA/core: Rate limit MAD error messages Sasha Levin
2019-11-08 11:36 ` [PATCH AUTOSEL 4.19 099/205] RDMA/core: Follow correct unregister order between sysfs and cgroup Sasha Levin
2019-11-08 11:36 ` [PATCH AUTOSEL 4.19 129/205] RDMA/hns: Fix an error code in hns_roce_v2_init_eq_table() Sasha Levin
2019-11-08 11:36 ` [PATCH AUTOSEL 4.19 130/205] IB/hfi1: Missing return value in error path for user sdma Sasha Levin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191108113752.12502-84-sashal@kernel.org \
    --to=sashal@kernel.org \
    --cc=jgg@mellanox.com \
    --cc=leonro@mellanox.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=majd@mellanox.com \
    --cc=monis@mellanox.com \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).