* [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
@ 2016-01-12 10:32 Sagi Grimberg
       [not found] ` <1452594732-9573-1-git-send-email-sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Sagi Grimberg @ 2016-01-12 10:32 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA; +Cc: Matan Barak, Leon Romanovsky

mlx5 keeps a lot of internal accounting for wr processing.
mlx5_ib_wq consists of multiple arrays:
struct mlx5_ib_wq {
	u64		       *wrid;
	u32		       *wr_data;
	struct wr_list	       *w_list;
	unsigned	       *wqe_head;
	...
}

Each access to any of these arrays, even for a single index, fetches a
separate cacheline. Reduce cacheline bouncing by grouping these members
in a cacheline-aligned struct (swr_ctx) and allocating a single array of
those, so that accessing one entry fetches all of the members in a
single shot.

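For example, completing a single send wqe in cq.c currently does:

	wc->wr_id = wq->wrid[idx];
	wq->tail = wq->wqe_head[idx] + 1;

on top of the wr_data[] and w_list[] accesses on other paths, with each
array access potentially costing its own cacheline.
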
Since the receive queue needs only the wrid, use an anonymous union in
which the rwr_ctx variant has only the wrid member.

Signed-off-by: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/hw/mlx5/cq.c      | 18 +++++++--------
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +++++++++++++----
 drivers/infiniband/hw/mlx5/qp.c      | 45 +++++++++++++++---------------------
 3 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index b14316603e44..5eb0fcac72b1 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -102,7 +102,7 @@ static void *next_cqe_sw(struct mlx5_ib_cq *cq)
 
 static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
 {
-	switch (wq->wr_data[idx]) {
+	switch (wq->swr_ctx[idx].wr_data) {
 	case MLX5_IB_WR_UMR:
 		return 0;
 
@@ -194,7 +194,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
 		}
 	} else {
 		wq	  = &qp->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid;
 		++wq->tail;
 	}
 	wc->byte_len = be32_to_cpu(cqe->byte_cnt);
@@ -378,9 +378,9 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
 		if (idx == head)
 			break;
 
-		tail = qp->sq.w_list[idx].next;
+		tail = qp->sq.swr_ctx[idx].w_list.next;
 	} while (1);
-	tail = qp->sq.w_list[idx].next;
+	tail = qp->sq.swr_ctx[idx].w_list.next;
 	qp->sq.last_poll = tail;
 }
 
@@ -490,8 +490,8 @@ repoll:
 		idx = wqe_ctr & (wq->wqe_cnt - 1);
 		handle_good_req(wc, cqe64, wq, idx);
 		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
-		wc->wr_id = wq->wrid[idx];
-		wq->tail = wq->wqe_head[idx] + 1;
+		wc->wr_id = wq->swr_ctx[idx].wrid;
+		wq->tail = wq->swr_ctx[idx].wqe_head + 1;
 		wc->status = IB_WC_SUCCESS;
 		break;
 	case MLX5_CQE_RESP_WR_IMM:
@@ -516,8 +516,8 @@ repoll:
 			wq = &(*cur_qp)->sq;
 			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
 			idx = wqe_ctr & (wq->wqe_cnt - 1);
-			wc->wr_id = wq->wrid[idx];
-			wq->tail = wq->wqe_head[idx] + 1;
+			wc->wr_id = wq->swr_ctx[idx].wrid;
+			wq->tail = wq->swr_ctx[idx].wqe_head + 1;
 		} else {
 			struct mlx5_ib_srq *srq;
 
@@ -528,7 +528,7 @@ repoll:
 				mlx5_ib_free_srq_wqe(srq, wqe_ctr);
 			} else {
 				wq = &(*cur_qp)->rq;
-				wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+				wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid;
 				++wq->tail;
 			}
 		}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d4b227126265..84cb8fc072a1 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -129,11 +129,24 @@ struct wr_list {
 	u16	next;
 };
 
+/* Please don't let this exceed a single cacheline */
+struct swr_ctx {
+	u64		wrid;
+	u32		wr_data;
+	struct wr_list	w_list;
+	u32		wqe_head;
+	u8		rsvd[12];
+}__packed;
+
+struct rwr_ctx {
+	u64		       wrid;
+}__packed;
+
 struct mlx5_ib_wq {
-	u64		       *wrid;
-	u32		       *wr_data;
-	struct wr_list	       *w_list;
-	unsigned	       *wqe_head;
+	union {
+		struct swr_ctx *swr_ctx;
+		struct rwr_ctx *rwr_ctx;
+	};
 	u16		        unsig_count;
 
 	/* serialize post to the work queue
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 1ea049ed87da..a6b88902d7af 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -794,14 +794,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 		goto err_free;
 	}
 
-	qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
-	qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
-	qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
-	qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
-	qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
-
-	if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
-	    !qp->sq.w_list || !qp->sq.wqe_head) {
+	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
+				 GFP_KERNEL);
+	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->sq.rwr_ctx),
+				 GFP_KERNEL);
+	if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) {
 		err = -ENOMEM;
 		goto err_wrid;
 	}
@@ -811,11 +808,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 
 err_wrid:
 	mlx5_db_free(dev->mdev, &qp->db);
-	kfree(qp->sq.wqe_head);
-	kfree(qp->sq.w_list);
-	kfree(qp->sq.wrid);
-	kfree(qp->sq.wr_data);
-	kfree(qp->rq.wrid);
+	kfree(qp->sq.swr_ctx);
+	kfree(qp->rq.rwr_ctx);
 
 err_free:
 	kvfree(*in);
@@ -831,11 +825,8 @@ err_uuar:
 static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 {
 	mlx5_db_free(dev->mdev, &qp->db);
-	kfree(qp->sq.wqe_head);
-	kfree(qp->sq.w_list);
-	kfree(qp->sq.wrid);
-	kfree(qp->sq.wr_data);
-	kfree(qp->rq.wrid);
+	kfree(qp->sq.swr_ctx);
+	kfree(qp->rq.rwr_ctx);
 	mlx5_buf_free(dev->mdev, &qp->buf);
 	free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn);
 }
@@ -2623,11 +2614,11 @@ static void finish_wqe(struct mlx5_ib_qp *qp,
 	if (unlikely(qp->wq_sig))
 		ctrl->signature = wq_sig(ctrl);
 
-	qp->sq.wrid[idx] = wr_id;
-	qp->sq.w_list[idx].opcode = mlx5_opcode;
-	qp->sq.wqe_head[idx] = qp->sq.head + nreq;
+	qp->sq.swr_ctx[idx].wrid = wr_id;
+	qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode;
+	qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq;
 	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
-	qp->sq.w_list[idx].next = qp->sq.cur_post;
+	qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post;
 }
 
 
@@ -2708,7 +2699,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 			case IB_WR_LOCAL_INV:
 				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
-				qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
+				qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV;
 				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
 				set_linv_wr(qp, &seg, &size);
 				num_sge = 0;
@@ -2716,7 +2707,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 
 			case IB_WR_REG_MR:
 				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
-				qp->sq.wr_data[idx] = IB_WR_REG_MR;
+				qp->sq.swr_ctx[idx].wr_data = IB_WR_REG_MR;
 				ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
 				err = set_reg_wr(qp, reg_wr(wr), &seg, &size);
 				if (err) {
@@ -2727,7 +2718,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 				break;
 
 			case IB_WR_REG_SIG_MR:
-				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
+				qp->sq.swr_ctx[idx].wr_data = IB_WR_REG_SIG_MR;
 				mr = to_mmr(sig_handover_wr(wr)->sig_mr);
 
 				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
@@ -2829,7 +2820,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 				mlx5_ib_warn(dev, "bad opcode\n");
 				goto out;
 			}
-			qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
+			qp->sq.swr_ctx[idx].wr_data = MLX5_IB_WR_UMR;
 			ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey);
 			set_reg_umr_segment(seg, wr);
 			seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
@@ -2977,7 +2968,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
 		}
 
-		qp->rq.wrid[ind] = wr->wr_id;
+		qp->rq.rwr_ctx[ind].wrid = wr->wr_id;
 
 		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
 	}
-- 
1.8.4.3


* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found] ` <1452594732-9573-1-git-send-email-sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2016-01-12 12:16   ` Or Gerlitz
       [not found]     ` <5694EEAA.3050600-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2016-01-12 14:37   ` Yann Droneaud
  1 sibling, 1 reply; 12+ messages in thread
From: Or Gerlitz @ 2016-01-12 12:16 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky

On 1/12/2016 12:32 PM, Sagi Grimberg wrote:
> mlx5 keeps a lot of internal accounting for wr processing.
> mlx5_ib_wq consists of multiple arrays:
> struct mlx5_ib_wq {
> 	u64		       *wrid;
> 	u32		       *wr_data;
> 	struct wr_list	       *w_list;
> 	unsigned	       *wqe_head;
> 	...
> }
>
> Each time we access each of these arrays, even for a single index
> we fetch a cacheline. Reduce cacheline bounces by fitting these members
> in a cacheline aligned struct (swr_ctx) and allocate an array. Accessing
> this array will fetch all of these members in a single shot.
>
> Since the receive queue needs only the wrid we use a nameless union
> where in the rwr_ctx we only have wrid member.

Do you have performance numbers from before/after this patch to support
the proposed change?

Also, I have asked you the same question re the iser remote invalidation
series [1]; this data is needed there too.

Or.

[1] http://marc.info/?l=linux-rdma&m=145188949807652&w=2

--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -129,11 +129,24 @@ struct wr_list {
  	u16	next;
  };
  
+/* Please don't let this exceed a single cacheline */
+struct swr_ctx {
+	u64		wrid;
+	u32		wr_data;
+	struct wr_list	w_list;
+	u32		wqe_head;
+	u8		rsvd[12];
+}__packed;


What is the role of the reserved field, is that for alignment purposes?
If yes, maybe a better name would be "align".

Nit, (same) checkpatch error here and below:

ERROR: space required after that close brace '}'
#111: FILE: drivers/infiniband/hw/mlx5/mlx5_ib.h:139:
+}__packed;

> +
> +struct rwr_ctx {
> +	u64		       wrid;
> +}__packed;
> +


* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found] ` <1452594732-9573-1-git-send-email-sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2016-01-12 12:16   ` Or Gerlitz
@ 2016-01-12 14:37   ` Yann Droneaud
       [not found]     ` <1452609431.9500.24.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 12+ messages in thread
From: Yann Droneaud @ 2016-01-12 14:37 UTC (permalink / raw)
  To: Sagi Grimberg, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Matan Barak, Leon Romanovsky

Hi,

On Tuesday, 12 January 2016 at 12:32 +0200, Sagi Grimberg wrote:
> mlx5 keeps a lot of internal accounting for wr processing.
> mlx5_ib_wq consists of multiple arrays:
> struct mlx5_ib_wq {
> 	u64		       *wrid;
> 	u32		       *wr_data;
> 	struct wr_list	       *w_list;
> 	unsigned	       *wqe_head;
> 	...
> }
> 
> Each time we access each of these arrays, even for a single index
> we fetch a cacheline. Reduce cacheline bounces by fitting these
> members
> in a cacheline aligned struct (swr_ctx) and allocate an array.
> Accessing
> this array will fetch all of these members in a single shot.
> 
> Since the receive queue needs only the wrid we use a nameless union
> where in the rwr_ctx we only have wrid member.
> 
> Signed-off-by: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
>  drivers/infiniband/hw/mlx5/cq.c      | 18 +++++++--------
>  drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +++++++++++++----
>  drivers/infiniband/hw/mlx5/qp.c      | 45 +++++++++++++++-----------
> ----------
>  3 files changed, 44 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index d4b227126265..84cb8fc072a1 100644
> --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -129,11 +129,24 @@ struct wr_list {
>  	u16	next;
>  };
>  
> +/* Please don't let this exceed a single cacheline */

Don't add a comment, add a compile time assert:

BUILD_BUG_ON(sizeof(struct swr_ctx) > L1_CACHE_BYTES);
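
(That needs to sit in function scope; one possible spot, just as a
sketch, is the beginning of create_kernel_qp():

	static int create_kernel_qp(struct mlx5_ib_dev *dev, ...)
	{
		/* fails the build if swr_ctx ever outgrows a cacheline */
		BUILD_BUG_ON(sizeof(struct swr_ctx) > L1_CACHE_BYTES);
		...

or any other function in the driver that is always compiled.)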

> +struct swr_ctx {
> +	u64		wrid;
> +	u32		wr_data;
> +	struct wr_list	w_list;
> +	u32		wqe_head;
> +	u8		rsvd[12];
> +}__packed;
> +

Packing the structure might make some fields unaligned and, on some
architectures, unaligned accesses are unwelcome.

What about

struct swr_ctx {
	u64		wrid;
	u32		wr_data;
	struct wr_list	w_list;
	u32		wqe_head;
} ____cacheline_aligned;


> +struct rwr_ctx {
> +	u64		       wrid;
> +}__packed;
> +
>  struct mlx5_ib_wq {
> -	u64		       *wrid;
> -	u32		       *wr_data;
> -	struct wr_list	       *w_list;
> -	unsigned	       *wqe_head;
> +	union {
> +		struct swr_ctx *swr_ctx;
> +		struct rwr_ctx *rwr_ctx;
> +	};
>  	u16		        unsig_count;

Check that the structure layout is the one you expect with pahole.
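
(Something along the lines of
"pahole -C mlx5_ib_wq drivers/infiniband/hw/mlx5/mlx5_ib.ko" on a module
built with debug info; exact invocation from memory.)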


diff --git a/drivers/infiniband/hw/mlx5/qp.c
> b/drivers/infiniband/hw/mlx5/qp.c
> index 1ea049ed87da..a6b88902d7af 100644
> --- a/drivers/infiniband/hw/mlx5/qp.c
> +++ b/drivers/infiniband/hw/mlx5/qp.c
> @@ -794,14 +794,11 @@ static int create_kernel_qp(struct mlx5_ib_dev
> *dev,
>  		goto err_free;
>  	}
>  
> -	qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
> -	qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
> -	qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
> -	qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
> -	qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
> -
> -	if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
> -	    !qp->sq.w_list || !qp->sq.wqe_head) {
> +	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
> +				 GFP_KERNEL);
> +	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->sq.rwr_ctx),
> +				 GFP_KERNEL);
> 

Anyway, I'm not sure about the alignment of the memory returned by
kcalloc(); I should know this by now, but I haven't found the time to
figure out how it's handled by the various SL*B allocator
implementations.
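
A cheap way to see what you actually get, just as a sketch, would be a
one-off check right after the allocation, e.g.:

	WARN_ON_ONCE(!IS_ALIGNED((unsigned long)qp->sq.swr_ctx,
				 sizeof(struct swr_ctx)));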

Regards.

-- 
Yann Droneaud
OPTEYA



* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]     ` <5694EEAA.3050600-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2016-01-12 14:44       ` Sagi Grimberg
       [not found]         ` <56951130.60802-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Sagi Grimberg @ 2016-01-12 14:44 UTC (permalink / raw)
  To: Or Gerlitz, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky


>> Each time we access each of these arrays, even for a single index
>> we fetch a cacheline. Reduce cacheline bounces by fitting these members
>> in a cacheline aligned struct (swr_ctx) and allocate an array. Accessing
>> this array will fetch all of these members in a single shot.
>>
>> Since the receive queue needs only the wrid we use a nameless union
>> where in the rwr_ctx we only have wrid member.
>
> Have some performance numbers before/after this patch to support the
> proposed change?

I didn't take the time to measure cache hits/misses. I just noticed it
a while ago and it's been bugging me for some time, so I figured I'd
send it out...

> Also, I have asked you the same question re the iser remote invalidation
> series [1], this datais needed there too.

I didn't get to it; we had rough initial numbers, but we have yet to do
a full evaluation on different devices.

> +/* Please don't let this exceed a single cacheline */
> +struct swr_ctx {
> +    u64        wrid;
> +    u32        wr_data;
> +    struct wr_list    w_list;
> +    u32        wqe_head;
> +    u8        rsvd[12];
> +}__packed;
>
>
> what the role of the reserved field, is that for alignment purposes? if
> yes, maybe better namewould be "align"

OK. I can change it.

>
> Nit, (same) checkpatch error here and below
>
> ERROR: space required after that close brace '}'
> #111: FILE: drivers/infiniband/hw/mlx5/mlx5_ib.h:139:
> +}__packed;

Will fix.

* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]         ` <56951130.60802-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2016-01-12 14:53           ` Or Gerlitz
       [not found]             ` <56951375.1050704-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Or Gerlitz @ 2016-01-12 14:53 UTC (permalink / raw)
  To: Sagi Grimberg, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky

On 1/12/2016 4:44 PM, Sagi Grimberg wrote:
>
>>> Each time we access each of these arrays, even for a single index
>>> we fetch a cacheline. Reduce cacheline bounces by fitting these members
>>> in a cacheline aligned struct (swr_ctx) and allocate an array. 
>>> Accessing
>>> this array will fetch all of these members in a single shot.
>>>
>>> Since the receive queue needs only the wrid we use a nameless union
>>> where in the rwr_ctx we only have wrid member.
>>
>> Have some performance numbers before/after this patch to support the
>> proposed change?
>
> I didn't took the time to measure cache hit/miss. I just noticed it
> a while ago and it's been bugging me for some time so I figured I'd
> send it out... 

The thing is that for data-path changes on high performance network
drivers, we at least need to know that the perf is as good as it was
before the change. So you could run your iser perf IOPS test
before/after the change and post 1-2 lines with the results as part of
the change-log.

* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]             ` <56951375.1050704-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2016-01-12 14:58               ` Sagi Grimberg
       [not found]                 ` <569514A5.1050705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Sagi Grimberg @ 2016-01-12 14:58 UTC (permalink / raw)
  To: Or Gerlitz, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky


>> I didn't took the time to measure cache hit/miss. I just noticed it
>> a while ago and it's been bugging me for some time so I figured I'd
>> send it out...
>
> The thing is that for data-path changes on high performance network
> drivers, we @ least need to know that the perf is as good as it was
> before the change. So you could run your iser perf IOPS test
> before/after the change and post 1-2 lines with results as part of the
> change-log.

I tested iser perf and it seems to hold up. I didn't see any major
difference, but I really don't think block storage iops/latency is the
correct way to evaluate this change.

* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]                 ` <569514A5.1050705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2016-01-12 15:05                   ` Or Gerlitz
       [not found]                     ` <56951634.5030307-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2016-01-13  8:55                   ` Or Gerlitz
  1 sibling, 1 reply; 12+ messages in thread
From: Or Gerlitz @ 2016-01-12 15:05 UTC (permalink / raw)
  To: Sagi Grimberg, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky

On 1/12/2016 4:58 PM, Sagi Grimberg wrote:
>
>>> I didn't took the time to measure cache hit/miss. I just noticed it
>>> a while ago and it's been bugging me for some time so I figured I'd
>>> send it out...
>>
>> The thing is that for data-path changes on high performance network
>> drivers, we @ least need to know that the perf is as good as it was
>> before the change. So you could run your iser perf IOPS test
>> before/after the change and post 1-2 lines with results as part of the
>> change-log.
>
> I tested iser perf and it seems to sustain. I didn't see any major
> difference but I really don't think block storage iops/latency is not
> the correct way to evaluate this change. 

Maybe one nice option would be to take a look at how things are
organized in libmlx5 in that area.

* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]                     ` <56951634.5030307-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2016-01-12 15:09                       ` Sagi Grimberg
  0 siblings, 0 replies; 12+ messages in thread
From: Sagi Grimberg @ 2016-01-12 15:09 UTC (permalink / raw)
  To: Or Gerlitz, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky


> maybe one nice option would be to take a look on how things are
> organized in libmlx5 around that corner

We don't have this accounting in libmlx5, although I suspect we'll have
it at some point.

* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]     ` <1452609431.9500.24.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
@ 2016-01-13  8:44       ` Sagi Grimberg
       [not found]         ` <56960E5C.5000607-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Sagi Grimberg @ 2016-01-13  8:44 UTC (permalink / raw)
  To: Yann Droneaud, Sagi Grimberg, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Matan Barak, Leon Romanovsky


> Hi,

Hi Yann,

>>
>> +/* Please don't let this exceed a single cacheline */
>
> Don't add a comment, add a compile time assert:
>
> BUILD_BUG_ON(sizeof(struct swr_ctx) > L1_CACHE_BYTES);

Sure, I'll do that.

>> +struct swr_ctx {
>> +	u64		wrid;
>> +	u32		wr_data;
>> +	struct wr_list	w_list;
>> +	u32		wqe_head;
>> +	u8		rsvd[12];
>> +}__packed;
>> +
>
> Packing the structure might make some fields unaligned and, on some
> architecture, unaligned access are likely unwelcomed.

I manually align, so I can remove the packing...

> What about
>
> struct swr_ctx {
> 	u64		wrid;
> 	u32		wr_data;
> 	struct wr_list	w_list;
> 	u32		wqe_head;
> } ____cacheline_aligned;

The reason I didn't use ____cacheline_aligned is that on 64B-cacheline
architectures I'd get padding to 64B, while I only need to pad to 32B.
With 32B entries I get the benefit of fetching two entries per
cacheline, which is sort of an implicit prefetch (useful for the next
post_send).

Do you know of any annotation that can do this for me?
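
Maybe a plain __aligned(32) on the struct would express it, i.e.
something like:

	struct swr_ctx {
		u64		wrid;
		u32		wr_data;
		struct wr_list	w_list;
		u32		wqe_head;
	} __aligned(32);

(untested), unless there's something nicer?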

>> +struct rwr_ctx {
>> +	u64		       wrid;
>> +}__packed;
>> +
>>   struct mlx5_ib_wq {
>> -	u64		       *wrid;
>> -	u32		       *wr_data;
>> -	struct wr_list	       *w_list;
>> -	unsigned	       *wqe_head;
>> +	union {
>> +		struct swr_ctx *swr_ctx;
>> +		struct rwr_ctx *rwr_ctx;
>> +	};
>>   	u16		        unsig_count;
>
> Check the structure layout is the one you expect with pahole.

I did. Given that I simply removed 3 pointers, I get the same layout
(2B hole after unsig_count). A nice side effect of this is that
mlx5_ib_wq is now smaller than a 64B cacheline (reduced from 80B to
56B).

>> +	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
>> +				 GFP_KERNEL);
>> +	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->sq.rwr_ctx),
>> +				 GFP_KERNEL);
>>
>
> Anyway, I'm not sure about the alignment of the memory returned by
> kcalloc(), I should have known by the time but don't find time to
> figure how it's handled on various SL*B allocator implementation.

Can you explain why I should worry about the alignment of kcalloc
here?

* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]                 ` <569514A5.1050705-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  2016-01-12 15:05                   ` Or Gerlitz
@ 2016-01-13  8:55                   ` Or Gerlitz
  1 sibling, 0 replies; 12+ messages in thread
From: Or Gerlitz @ 2016-01-13  8:55 UTC (permalink / raw)
  To: Sagi Grimberg, Sagi Grimberg
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Leon Romanovsky

On 1/12/2016 4:58 PM, Sagi Grimberg wrote:
> I tested iser perf and it seems to sustain. 

That's good. I also think we should move forward with the patch once it
passes review, even if we don't have the means to evaluate the gain;
the future will tell.

> I didn't see any major difference but I really don't think block 
> storage iops/latency is not the correct way to evaluate this change. 

Yes... pktgen doesn't work for IPoIB (if it did, we could use 64B UDP
packets w/o csum to see the impact). Maybe RDS (the rds-stress tool)
drives more ops/sec than a conventional block RDMA driver?

Or.


* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]         ` <56960E5C.5000607-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2016-01-13  9:26           ` Yann Droneaud
       [not found]             ` <1452677194.9500.27.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Yann Droneaud @ 2016-01-13  9:26 UTC (permalink / raw)
  To: Sagi Grimberg, Sagi Grimberg, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Matan Barak, Leon Romanovsky

On Wednesday, 13 January 2016 at 10:44 +0200, Sagi Grimberg wrote:
> > Anyway, I'm not sure about the alignment of the memory returned by
> > kcalloc(), I should have known by the time but don't find time to
> > figure how it's handled on various SL*B allocator implementation.
> 
> Can you explain why should I worry about the alignment of kcalloc
> here?

Say it returns memory aligned to only 16 bytes; then accessing some
items in the array is going to touch 2 cachelines, defeating the
purpose of the padding or the alignment attribute added to the item
structure. For example, with the 32-byte swr_ctx, a base at offset 16
into a cacheline puts entry 1 at bytes 48-79, straddling a 64-byte
line.

Regards.

-- 
Yann Droneaud
OPTEYA


* Re: [PATCH] IB/mlx5: Reduce mlx5_ib_wq cacheline bouncing
       [not found]             ` <1452677194.9500.27.camel-RlY5vtjFyJ3QT0dZR+AlfA@public.gmane.org>
@ 2016-01-13  9:37               ` Sagi Grimberg
  0 siblings, 0 replies; 12+ messages in thread
From: Sagi Grimberg @ 2016-01-13  9:37 UTC (permalink / raw)
  To: Yann Droneaud, Sagi Grimberg, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Matan Barak, Leon Romanovsky



>>> Anyway, I'm not sure about the alignment of the memory returned by
>>> kcalloc(), I should have known by the time but don't find time to
>>> figure how it's handled on various SL*B allocator implementation.
>>
>> Can you explain why should I worry about the alignment of kcalloc
>> here?
>
> Say it returns memory aligned on 16 bytes, then accessing some items in
> the array is going to touch 2 cachelines, defeating the purpose of the
> padding or the alignment attribute added to the item structure.

I see. I think the alignment is ARCH_KMALLOC_MINALIGN, and I'm not
aware of an aligned array allocation. Do you have a suggestion for how
I can align the allocation? I can manually pad and align, but that
would mean keeping two pointers around (the aligned pointer plus the
original one for kfree()).
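
Just to illustrate the manual variant (untested sketch, with a
hypothetical swr_ctx_buf field kept only so kfree() gets the original
pointer back):

	void *buf = kcalloc(qp->sq.wqe_cnt + 1, sizeof(struct swr_ctx),
			    GFP_KERNEL);

	qp->sq.swr_ctx_buf = buf;	/* hypothetical field, kept for kfree() */
	qp->sq.swr_ctx = PTR_ALIGN(buf, sizeof(struct swr_ctx));

which is exactly the kind of clutter I'd prefer to avoid.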
