Linux-RDMA Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
@ 2019-10-09 16:54 Chuck Lever
  2019-10-11 17:26 ` Parav Pandit
  0 siblings, 1 reply; 5+ messages in thread
From: Chuck Lever @ 2019-10-09 16:54 UTC (permalink / raw)
  To: linux-rdma

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 drivers/infiniband/core/Makefile |    2 
 drivers/infiniband/core/cq.c     |   29 +++--
 drivers/infiniband/core/trace.c  |   14 ++
 include/rdma/ib_verbs.h          |    2 
 include/trace/events/rdma_core.h |  218 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 255 insertions(+), 10 deletions(-)
 create mode 100644 drivers/infiniband/core/trace.c
 create mode 100644 include/trace/events/rdma_core.h

Changes since v2:
- Removed extraneous changes to include/trace/events/rdma.h

Changes since RFC:
- Addressed comments from Parav Pandit <parav@mellanox.com>


diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 09881bd..68d9e27 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
-				nldev.o restrack.o counters.o
+				nldev.o restrack.o counters.o trace.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index bbfded6..bcde992 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -7,6 +7,8 @@
 #include <linux/slab.h>
 #include <rdma/ib_verbs.h>
 
+#include <trace/events/rdma_core.h>
+
 /* # of WCs to poll for with a single call to ib_poll_cq */
 #define IB_POLL_BATCH			16
 #define IB_POLL_BATCH_DIRECT		8
@@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
 
 	dim->state = DIM_START_MEASURE;
 
+	trace_cq_modify(cq, comps, usec);
 	cq->device->ops.modify_cq(cq, comps, usec);
 }
 
@@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 {
 	int i, n, completed = 0;
 
-	/*
-	 * budget might be (-1) if the caller does not
-	 * want to bound this call, thus we need unsigned
-	 * minimum here.
-	 */
-	while ((n = ib_poll_cq(cq, min_t(u32, batch,
-					 budget - completed), wcs)) > 0) {
+	trace_cq_process(cq);
+	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
+		trace_cq_poll(cq, batch, n);
 		for (i = 0; i < n; i++) {
 			struct ib_wc *wc = &wcs[i];
 
@@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 		}
 
 		completed += n;
-
 		if (n != batch || (budget != -1 && completed >= budget))
 			break;
+
+		/*
+		 * budget might be (-1) if the caller does not
+		 * want to bound this call, thus we need unsigned
+		 * minimum here.
+		 */
+		batch = min_t(u32, batch, budget - completed);
 	}
 
 	return completed;
@@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
 	if (completed < budget) {
 		irq_poll_complete(&cq->iop);
-		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
+			trace_cq_reschedule(cq);
 			irq_poll_sched(&cq->iop);
+		}
 	}
 
 	if (dim)
@@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
 
 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 {
+	trace_cq_schedule(cq);
 	irq_poll_sched(&cq->iop);
 }
 
@@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 {
+	trace_cq_schedule(cq);
 	queue_work(cq->comp_wq, &cq->work);
 }
 
@@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 		goto out_destroy_cq;
 	}
 
+	trace_cq_alloc(cq, comp_vector, poll_ctx);
 	return cq;
 
 out_destroy_cq:
@@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 		WARN_ON_ONCE(1);
 	}
 
+	trace_cq_free(cq);
 	rdma_restrack_del(&cq->res);
 	cq->device->ops.destroy_cq(cq, udata);
 	if (cq->dim)
diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
new file mode 100644
index 0000000..6c3514b
--- /dev/null
+++ b/drivers/infiniband/core/trace.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Trace points for core RDMA functions.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#define CREATE_TRACE_POINTS
+
+#include <rdma/ib_verbs.h>
+
+#include <trace/events/rdma_core.h>
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6a47ba8..95a6bce 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1555,6 +1555,8 @@ struct ib_cq {
 	};
 	struct workqueue_struct *comp_wq;
 	struct dim *dim;
+	ktime_t timestamp;
+	bool interrupt;
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
diff --git a/include/trace/events/rdma_core.h b/include/trace/events/rdma_core.h
new file mode 100644
index 0000000..c1397a3
--- /dev/null
+++ b/include/trace/events/rdma_core.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Trace point definitions for core RDMA functions.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rdma_core
+
+#if !defined(_TRACE_RDMA_CORE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RDMA_CORE_H
+
+#include <linux/tracepoint.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+
+/*
+ * enum ib_poll_context, from include/rdma/ib_verbs.h
+ */
+#define IB_POLL_CTX_LIST			\
+	ib_poll_ctx(DIRECT)			\
+	ib_poll_ctx(SOFTIRQ)			\
+	ib_poll_ctx(WORKQUEUE)			\
+	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
+
+#undef ib_poll_ctx
+#undef ib_poll_ctx_end
+
+#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
+#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
+
+IB_POLL_CTX_LIST
+
+#undef ib_poll_ctx
+#undef ib_poll_ctx_end
+
+#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
+#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
+
+#define rdma_show_ib_poll_ctx(x) \
+		__print_symbolic(x, IB_POLL_CTX_LIST)
+
+/**
+ ** Completion Queue events
+ **/
+
+TRACE_EVENT(cq_schedule,
+	TP_PROTO(
+		struct ib_cq *cq
+	),
+
+	TP_ARGS(cq),
+
+	TP_STRUCT__entry(
+		__field(u32, id)
+	),
+
+	TP_fast_assign(
+		cq->timestamp = ktime_get();
+		cq->interrupt = true;
+
+		__entry->id = cq->res.id;
+	),
+
+	TP_printk("id %u", __entry->id)
+);
+
+TRACE_EVENT(cq_reschedule,
+	TP_PROTO(
+		struct ib_cq *cq
+	),
+
+	TP_ARGS(cq),
+
+	TP_STRUCT__entry(
+		__field(u32, id)
+	),
+
+	TP_fast_assign(
+		cq->timestamp = ktime_get();
+		cq->interrupt = false;
+
+		__entry->id = cq->res.id;
+	),
+
+	TP_printk("id %u", __entry->id)
+);
+
+TRACE_EVENT(cq_process,
+	TP_PROTO(
+		const struct ib_cq *cq
+	),
+
+	TP_ARGS(cq),
+
+	TP_STRUCT__entry(
+		__field(s64, latency)
+		__field(u32, id)
+		__field(bool, interrupt)
+	),
+
+	TP_fast_assign(
+		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
+
+		__entry->id = cq->res.id;
+		__entry->latency = ktime_to_us(latency);
+		__entry->interrupt = cq->interrupt;
+	),
+
+	TP_printk("id %u wake-up took %lld [us] from %s",
+		__entry->id, __entry->latency,
+		__entry->interrupt ? "interrupt" : "reschedule"
+	)
+);
+
+TRACE_EVENT(cq_poll,
+	TP_PROTO(
+		const struct ib_cq *cq,
+		int requested,
+		int rc
+	),
+
+	TP_ARGS(cq, requested, rc),
+
+	TP_STRUCT__entry(
+		__field(u32, id)
+		__field(int, requested)
+		__field(int, rc)
+	),
+
+	TP_fast_assign(
+		__entry->id = cq->res.id;
+		__entry->requested = requested;
+		__entry->rc = rc;
+	),
+
+	TP_printk("id %u requested %d, returned %d",
+		__entry->id, __entry->requested, __entry->rc
+	)
+);
+
+TRACE_EVENT(cq_modify,
+	TP_PROTO(
+		const struct ib_cq *cq,
+		u16 comps,
+		u16 usec
+	),
+
+	TP_ARGS(cq, comps, usec),
+
+	TP_STRUCT__entry(
+		__field(u32, id)
+		__field(unsigned int, comps)
+		__field(unsigned int, usec)
+	),
+
+	TP_fast_assign(
+		__entry->id = cq->res.id;
+		__entry->comps = comps;
+		__entry->usec = usec;
+	),
+
+	TP_printk("id %u comps=%u usec=%u",
+		__entry->id, __entry->comps, __entry->usec
+	)
+);
+
+TRACE_EVENT(cq_alloc,
+	TP_PROTO(
+		const struct ib_cq *cq,
+		int comp_vector,
+		enum ib_poll_context poll_ctx
+	),
+
+	TP_ARGS(cq, comp_vector, poll_ctx),
+
+	TP_STRUCT__entry(
+		__field(u32, id)
+		__field(int, comp_vector)
+		__field(unsigned long, poll_ctx)
+	),
+
+	TP_fast_assign(
+		__entry->id = cq->res.id;
+		__entry->comp_vector = comp_vector;
+		__entry->poll_ctx = poll_ctx;
+	),
+
+	TP_printk("id %u comp_vector=%d poll_ctx=%s",
+		__entry->id, __entry->comp_vector,
+		rdma_show_ib_poll_ctx(__entry->poll_ctx)
+	)
+);
+
+TRACE_EVENT(cq_free,
+	TP_PROTO(
+		const struct ib_cq *cq
+	),
+
+	TP_ARGS(cq),
+
+	TP_STRUCT__entry(
+		__field(u32, id)
+	),
+
+	TP_fast_assign(
+		__entry->id = cq->res.id;
+	),
+
+	TP_printk("id %u", __entry->id)
+);
+
+#endif /* _TRACE_RDMA_CORE_H */
+
+#include <trace/define_trace.h>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
  2019-10-09 16:54 [PATCH v3] IB/core: Trace points for diagnosing completion queue issues Chuck Lever
@ 2019-10-11 17:26 ` Parav Pandit
  2019-10-11 17:38   ` Chuck Lever
  0 siblings, 1 reply; 5+ messages in thread
From: Parav Pandit @ 2019-10-11 17:26 UTC (permalink / raw)
  To: Chuck Lever, linux-rdma



> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Chuck Lever
> Sent: Wednesday, October 9, 2019 11:55 AM
> To: linux-rdma@vger.kernel.org
> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue
> issues
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  drivers/infiniband/core/Makefile |    2
>  drivers/infiniband/core/cq.c     |   29 +++--
>  drivers/infiniband/core/trace.c  |   14 ++
>  include/rdma/ib_verbs.h          |    2
>  include/trace/events/rdma_core.h |  218
> ++++++++++++++++++++++++++++++++++++++
>  5 files changed, 255 insertions(+), 10 deletions(-)  create mode 100644
> drivers/infiniband/core/trace.c  create mode 100644
> include/trace/events/rdma_core.h
> 
> Changes since v2:
> - Removed extraneous changes to include/trace/events/rdma.h
> 
> Changes since RFC:
> - Addressed comments from Parav Pandit <parav@mellanox.com>
> 
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index 09881bd..68d9e27 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
> verbs.o cq.o rw.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o \
>  				roce_gid_mgmt.o mr_pool.o addr.o
> sa_query.o \
>  				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> -				nldev.o restrack.o counters.o
> +				nldev.o restrack.o counters.o trace.o
> 
>  ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>  ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
> bbfded6..bcde992 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -7,6 +7,8 @@
>  #include <linux/slab.h>
>  #include <rdma/ib_verbs.h>
> 
> +#include <trace/events/rdma_core.h>
> +
>  /* # of WCs to poll for with a single call to ib_poll_cq */
>  #define IB_POLL_BATCH			16
>  #define IB_POLL_BATCH_DIRECT		8
> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
> 
>  	dim->state = DIM_START_MEASURE;
> 
> +	trace_cq_modify(cq, comps, usec);
>  	cq->device->ops.modify_cq(cq, comps, usec);  }
> 
> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
> struct ib_wc *wcs,  {
>  	int i, n, completed = 0;
> 
> -	/*
> -	 * budget might be (-1) if the caller does not
> -	 * want to bound this call, thus we need unsigned
> -	 * minimum here.
> -	 */
> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
> -					 budget - completed), wcs)) > 0) {
> +	trace_cq_process(cq);
> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
Before this change, on first attempt to poll the cq, it will poll for min(batch, budget).
With this change, it will poll for batch.
This is functional change than just adding the trace points.
I am not sure if this has any effect on the overall polling.
But it may be worth to keep such functional change in pre-patch which consist of this change, moving comment section, batch recalculation.

> +		trace_cq_poll(cq, batch, n);
>  		for (i = 0; i < n; i++) {
>  			struct ib_wc *wc = &wcs[i];
> 
> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
> struct ib_wc *wcs,
>  		}
> 
>  		completed += n;
> -
>  		if (n != batch || (budget != -1 && completed >= budget))
>  			break;
> +
> +		/*
> +		 * budget might be (-1) if the caller does not
> +		 * want to bound this call, thus we need unsigned
> +		 * minimum here.
> +		 */
> +		batch = min_t(u32, batch, budget - completed);
>  	}
> 
>  	return completed;
> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
> budget)
>  	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>  	if (completed < budget) {
>  		irq_poll_complete(&cq->iop);
> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> +			trace_cq_reschedule(cq);
>  			irq_poll_sched(&cq->iop);
> +		}
>  	}
> 
>  	if (dim)
> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
> budget)
> 
>  static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)  {
> +	trace_cq_schedule(cq);
>  	irq_poll_sched(&cq->iop);
>  }
> 
> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
> 
>  static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)  {
> +	trace_cq_schedule(cq);
>  	queue_work(cq->comp_wq, &cq->work);
>  }
> 
> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
> void *private,
>  		goto out_destroy_cq;
>  	}
> 
> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
>  	return cq;
> 
>  out_destroy_cq:
> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
> *udata)
>  		WARN_ON_ONCE(1);
>  	}
> 
> +	trace_cq_free(cq);
>  	rdma_restrack_del(&cq->res);
>  	cq->device->ops.destroy_cq(cq, udata);
>  	if (cq->dim)
> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
> new file mode 100644 index 0000000..6c3514b
> --- /dev/null
> +++ b/drivers/infiniband/core/trace.c
> @@ -0,0 +1,14 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Trace points for core RDMA functions.
> + *
> + * Author: Chuck Lever <chuck.lever@oracle.com>
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +
> +#define CREATE_TRACE_POINTS
> +
> +#include <rdma/ib_verbs.h>
> +
> +#include <trace/events/rdma_core.h>
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> 6a47ba8..95a6bce 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1555,6 +1555,8 @@ struct ib_cq {
>  	};
>  	struct workqueue_struct *comp_wq;
>  	struct dim *dim;
> +	ktime_t timestamp;
> +	bool interrupt;
Its unclear when to update timestamp and interrupt. Comment will help.
>  	/*
>  	 * Implementation details of the RDMA core, don't use in drivers:
>  	 */
> diff --git a/include/trace/events/rdma_core.h
> b/include/trace/events/rdma_core.h
> new file mode 100644
> index 0000000..c1397a3
> --- /dev/null
> +++ b/include/trace/events/rdma_core.h
> @@ -0,0 +1,218 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Trace point definitions for core RDMA functions.
> + *
> + * Author: Chuck Lever <chuck.lever@oracle.com>
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM rdma_core
> +
> +#if !defined(_TRACE_RDMA_CORE_H) ||
> defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_RDMA_CORE_H
> +
> +#include <linux/tracepoint.h>
> +#include <rdma/ib_verbs.h>
> +#include <rdma/restrack.h>
> +
> +/*
> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
> +#define IB_POLL_CTX_LIST			\
> +	ib_poll_ctx(DIRECT)			\
> +	ib_poll_ctx(SOFTIRQ)			\
> +	ib_poll_ctx(WORKQUEUE)			\
> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
> +
> +#undef ib_poll_ctx
> +#undef ib_poll_ctx_end
> +
> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
> +
> +IB_POLL_CTX_LIST
> +
> +#undef ib_poll_ctx
> +#undef ib_poll_ctx_end
> +
> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
> +
> +#define rdma_show_ib_poll_ctx(x) \
> +		__print_symbolic(x, IB_POLL_CTX_LIST)
> +
> +/**
> + ** Completion Queue events
> + **/
> +
> +TRACE_EVENT(cq_schedule,
> +	TP_PROTO(
> +		struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +	),
> +
> +	TP_fast_assign(
> +		cq->timestamp = ktime_get();
> +		cq->interrupt = true;
> +
> +		__entry->id = cq->res.id;
> +	),
> +
> +	TP_printk("id %u", __entry->id)
> +);
> +
> +TRACE_EVENT(cq_reschedule,
> +	TP_PROTO(
> +		struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +	),
> +
> +	TP_fast_assign(
> +		cq->timestamp = ktime_get();
> +		cq->interrupt = false;
> +
> +		__entry->id = cq->res.id;
> +	),
> +
> +	TP_printk("id %u", __entry->id)
> +);
> +
> +TRACE_EVENT(cq_process,
> +	TP_PROTO(
> +		const struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(s64, latency)
> +		__field(u32, id)
> +		__field(bool, interrupt)
> +	),
> +
> +	TP_fast_assign(
> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
> +
> +		__entry->id = cq->res.id;
> +		__entry->latency = ktime_to_us(latency);
> +		__entry->interrupt = cq->interrupt;
> +	),
> +
> +	TP_printk("id %u wake-up took %lld [us] from %s",
It might be better to prefix 'id' with 'cq', so that in future rdma wide trace points, we can have multiple resource id's printed consistently as qpid, cqid, mrid etc; and don't have to rely on the function where it is used to decode what that id means.

I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id.


> +		__entry->id, __entry->latency,
> +		__entry->interrupt ? "interrupt" : "reschedule"
> +	)
> +);
> +
> +TRACE_EVENT(cq_poll,
> +	TP_PROTO(
> +		const struct ib_cq *cq,
> +		int requested,
> +		int rc
> +	),
> +
> +	TP_ARGS(cq, requested, rc),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +		__field(int, requested)
> +		__field(int, rc)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +		__entry->requested = requested;
> +		__entry->rc = rc;
> +	),
> +
> +	TP_printk("id %u requested %d, returned %d",
> +		__entry->id, __entry->requested, __entry->rc
> +	)
> +);
> +
> +TRACE_EVENT(cq_modify,
> +	TP_PROTO(
> +		const struct ib_cq *cq,
> +		u16 comps,
> +		u16 usec
> +	),
> +
> +	TP_ARGS(cq, comps, usec),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +		__field(unsigned int, comps)
> +		__field(unsigned int, usec)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +		__entry->comps = comps;
> +		__entry->usec = usec;
> +	),
> +
> +	TP_printk("id %u comps=%u usec=%u",
> +		__entry->id, __entry->comps, __entry->usec
> +	)
> +);
> +
> +TRACE_EVENT(cq_alloc,
> +	TP_PROTO(
> +		const struct ib_cq *cq,
> +		int comp_vector,
> +		enum ib_poll_context poll_ctx
> +	),
> +
> +	TP_ARGS(cq, comp_vector, poll_ctx),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +		__field(int, comp_vector)
> +		__field(unsigned long, poll_ctx)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +		__entry->comp_vector = comp_vector;
> +		__entry->poll_ctx = poll_ctx;
> +	),
> +
> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
> +		__entry->id, __entry->comp_vector,
> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
> +	)
> +);
> +
> +TRACE_EVENT(cq_free,
> +	TP_PROTO(
> +		const struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +	),
> +
> +	TP_printk("id %u", __entry->id)
> +);
> +
> +#endif /* _TRACE_RDMA_CORE_H */
> +
> +#include <trace/define_trace.h>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
  2019-10-11 17:26 ` Parav Pandit
@ 2019-10-11 17:38   ` Chuck Lever
  2019-10-11 17:44     ` Chuck Lever
  2019-10-11 17:46     ` Parav Pandit
  0 siblings, 2 replies; 5+ messages in thread
From: Chuck Lever @ 2019-10-11 17:38 UTC (permalink / raw)
  To: Parav Pandit; +Cc: linux-rdma



> On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@mellanox.com> wrote:
> 
> 
> 
>> -----Original Message-----
>> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
>> owner@vger.kernel.org> On Behalf Of Chuck Lever
>> Sent: Wednesday, October 9, 2019 11:55 AM
>> To: linux-rdma@vger.kernel.org
>> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue
>> issues
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> drivers/infiniband/core/Makefile |    2
>> drivers/infiniband/core/cq.c     |   29 +++--
>> drivers/infiniband/core/trace.c  |   14 ++
>> include/rdma/ib_verbs.h          |    2
>> include/trace/events/rdma_core.h |  218
>> ++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 255 insertions(+), 10 deletions(-)  create mode 100644
>> drivers/infiniband/core/trace.c  create mode 100644
>> include/trace/events/rdma_core.h
>> 
>> Changes since v2:
>> - Removed extraneous changes to include/trace/events/rdma.h
>> 
>> Changes since RFC:
>> - Addressed comments from Parav Pandit <parav@mellanox.com>
>> 
>> 
>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>> index 09881bd..68d9e27 100644
>> --- a/drivers/infiniband/core/Makefile
>> +++ b/drivers/infiniband/core/Makefile
>> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
>> verbs.o cq.o rw.o sysfs.o \
>> 				device.o fmr_pool.o cache.o netlink.o \
>> 				roce_gid_mgmt.o mr_pool.o addr.o
>> sa_query.o \
>> 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>> -				nldev.o restrack.o counters.o
>> +				nldev.o restrack.o counters.o trace.o
>> 
>> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
>> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
>> bbfded6..bcde992 100644
>> --- a/drivers/infiniband/core/cq.c
>> +++ b/drivers/infiniband/core/cq.c
>> @@ -7,6 +7,8 @@
>> #include <linux/slab.h>
>> #include <rdma/ib_verbs.h>
>> 
>> +#include <trace/events/rdma_core.h>
>> +
>> /* # of WCs to poll for with a single call to ib_poll_cq */
>> #define IB_POLL_BATCH			16
>> #define IB_POLL_BATCH_DIRECT		8
>> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
>> 
>> 	dim->state = DIM_START_MEASURE;
>> 
>> +	trace_cq_modify(cq, comps, usec);
>> 	cq->device->ops.modify_cq(cq, comps, usec);  }
>> 
>> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>> struct ib_wc *wcs,  {
>> 	int i, n, completed = 0;
>> 
>> -	/*
>> -	 * budget might be (-1) if the caller does not
>> -	 * want to bound this call, thus we need unsigned
>> -	 * minimum here.
>> -	 */
>> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
>> -					 budget - completed), wcs)) > 0) {
>> +	trace_cq_process(cq);
>> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
> Before this change, on first attempt to poll the cq, it will poll for min(batch, budget).
> With this change, it will poll for batch.
> This is functional change than just adding the trace points.
> I am not sure if this has any effect on the overall polling.
> But it may be worth to keep such functional change in pre-patch which consist of this change, moving comment section, batch recalculation.

Or find a way to add the trace point without the functional change.


>> +		trace_cq_poll(cq, batch, n);
>> 		for (i = 0; i < n; i++) {
>> 			struct ib_wc *wc = &wcs[i];
>> 
>> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>> struct ib_wc *wcs,
>> 		}
>> 
>> 		completed += n;
>> -
>> 		if (n != batch || (budget != -1 && completed >= budget))
>> 			break;
>> +
>> +		/*
>> +		 * budget might be (-1) if the caller does not
>> +		 * want to bound this call, thus we need unsigned
>> +		 * minimum here.
>> +		 */
>> +		batch = min_t(u32, batch, budget - completed);
>> 	}
>> 
>> 	return completed;
>> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
>> budget)
>> 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>> 	if (completed < budget) {
>> 		irq_poll_complete(&cq->iop);
>> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
>> +			trace_cq_reschedule(cq);
>> 			irq_poll_sched(&cq->iop);
>> +		}
>> 	}
>> 
>> 	if (dim)
>> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
>> budget)
>> 
>> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)  {
>> +	trace_cq_schedule(cq);
>> 	irq_poll_sched(&cq->iop);
>> }
>> 
>> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
>> 
>> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)  {
>> +	trace_cq_schedule(cq);
>> 	queue_work(cq->comp_wq, &cq->work);
>> }
>> 
>> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
>> void *private,
>> 		goto out_destroy_cq;
>> 	}
>> 
>> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
>> 	return cq;
>> 
>> out_destroy_cq:
>> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
>> *udata)
>> 		WARN_ON_ONCE(1);
>> 	}
>> 
>> +	trace_cq_free(cq);
>> 	rdma_restrack_del(&cq->res);
>> 	cq->device->ops.destroy_cq(cq, udata);
>> 	if (cq->dim)
>> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
>> new file mode 100644 index 0000000..6c3514b
>> --- /dev/null
>> +++ b/drivers/infiniband/core/trace.c
>> @@ -0,0 +1,14 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Trace points for core RDMA functions.
>> + *
>> + * Author: Chuck Lever <chuck.lever@oracle.com>
>> + *
>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>> + */
>> +
>> +#define CREATE_TRACE_POINTS
>> +
>> +#include <rdma/ib_verbs.h>
>> +
>> +#include <trace/events/rdma_core.h>
>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
>> 6a47ba8..95a6bce 100644
>> --- a/include/rdma/ib_verbs.h
>> +++ b/include/rdma/ib_verbs.h
>> @@ -1555,6 +1555,8 @@ struct ib_cq {
>> 	};
>> 	struct workqueue_struct *comp_wq;
>> 	struct dim *dim;
>> +	ktime_t timestamp;
>> +	bool interrupt;
> Its unclear when to update timestamp and interrupt. Comment will help.

These are both updated only in the new trace points. Is more than
that needed in a comment?


>> 	/*
>> 	 * Implementation details of the RDMA core, don't use in drivers:
>> 	 */
>> diff --git a/include/trace/events/rdma_core.h
>> b/include/trace/events/rdma_core.h
>> new file mode 100644
>> index 0000000..c1397a3
>> --- /dev/null
>> +++ b/include/trace/events/rdma_core.h
>> @@ -0,0 +1,218 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Trace point definitions for core RDMA functions.
>> + *
>> + * Author: Chuck Lever <chuck.lever@oracle.com>
>> + *
>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>> + */
>> +
>> +#undef TRACE_SYSTEM
>> +#define TRACE_SYSTEM rdma_core
>> +
>> +#if !defined(_TRACE_RDMA_CORE_H) ||
>> defined(TRACE_HEADER_MULTI_READ)
>> +#define _TRACE_RDMA_CORE_H
>> +
>> +#include <linux/tracepoint.h>
>> +#include <rdma/ib_verbs.h>
>> +#include <rdma/restrack.h>
>> +
>> +/*
>> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
>> +#define IB_POLL_CTX_LIST			\
>> +	ib_poll_ctx(DIRECT)			\
>> +	ib_poll_ctx(SOFTIRQ)			\
>> +	ib_poll_ctx(WORKQUEUE)			\
>> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
>> +
>> +#undef ib_poll_ctx
>> +#undef ib_poll_ctx_end
>> +
>> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
>> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
>> +
>> +IB_POLL_CTX_LIST
>> +
>> +#undef ib_poll_ctx
>> +#undef ib_poll_ctx_end
>> +
>> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
>> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
>> +
>> +#define rdma_show_ib_poll_ctx(x) \
>> +		__print_symbolic(x, IB_POLL_CTX_LIST)
>> +
>> +/**
>> + ** Completion Queue events
>> + **/
>> +
>> +TRACE_EVENT(cq_schedule,
>> +	TP_PROTO(
>> +		struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		cq->timestamp = ktime_get();
>> +		cq->interrupt = true;
>> +
>> +		__entry->id = cq->res.id;
>> +	),
>> +
>> +	TP_printk("id %u", __entry->id)
>> +);
>> +
>> +TRACE_EVENT(cq_reschedule,
>> +	TP_PROTO(
>> +		struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		cq->timestamp = ktime_get();
>> +		cq->interrupt = false;
>> +
>> +		__entry->id = cq->res.id;
>> +	),
>> +
>> +	TP_printk("id %u", __entry->id)
>> +);
>> +
>> +TRACE_EVENT(cq_process,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(s64, latency)
>> +		__field(u32, id)
>> +		__field(bool, interrupt)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
>> +
>> +		__entry->id = cq->res.id;
>> +		__entry->latency = ktime_to_us(latency);
>> +		__entry->interrupt = cq->interrupt;
>> +	),
>> +
>> +	TP_printk("id %u wake-up took %lld [us] from %s",
> It might be better to prefix 'id' with 'cq', so that in future rdma wide trace points, we can have multiple resource id's printed consistently as qpid, cqid, mrid etc; and don't have to rely on the function where it is used to decode what that id means.

I left out the "cq" here because the trace point names are prefixed
with "cq_". However, now that you bring it up, I can imagine cases
where a trace point might report information about two different
resources that both have a restrack ID.

How about "cq.id=%u" ?


> I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
> I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id.

On the other hand, a trace point might someday want to report the
value of a field in struct ib_cq.


>> +		__entry->id, __entry->latency,
>> +		__entry->interrupt ? "interrupt" : "reschedule"
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_poll,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq,
>> +		int requested,
>> +		int rc
>> +	),
>> +
>> +	TP_ARGS(cq, requested, rc),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +		__field(int, requested)
>> +		__field(int, rc)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +		__entry->requested = requested;
>> +		__entry->rc = rc;
>> +	),
>> +
>> +	TP_printk("id %u requested %d, returned %d",
>> +		__entry->id, __entry->requested, __entry->rc
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_modify,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq,
>> +		u16 comps,
>> +		u16 usec
>> +	),
>> +
>> +	TP_ARGS(cq, comps, usec),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +		__field(unsigned int, comps)
>> +		__field(unsigned int, usec)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +		__entry->comps = comps;
>> +		__entry->usec = usec;
>> +	),
>> +
>> +	TP_printk("id %u comps=%u usec=%u",
>> +		__entry->id, __entry->comps, __entry->usec
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_alloc,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq,
>> +		int comp_vector,
>> +		enum ib_poll_context poll_ctx
>> +	),
>> +
>> +	TP_ARGS(cq, comp_vector, poll_ctx),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +		__field(int, comp_vector)
>> +		__field(unsigned long, poll_ctx)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +		__entry->comp_vector = comp_vector;
>> +		__entry->poll_ctx = poll_ctx;
>> +	),
>> +
>> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
>> +		__entry->id, __entry->comp_vector,
>> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
>> +	)
>> +);
>> +
>> +TRACE_EVENT(cq_free,
>> +	TP_PROTO(
>> +		const struct ib_cq *cq
>> +	),
>> +
>> +	TP_ARGS(cq),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(u32, id)
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->id = cq->res.id;
>> +	),
>> +
>> +	TP_printk("id %u", __entry->id)
>> +);
>> +
>> +#endif /* _TRACE_RDMA_CORE_H */
>> +
>> +#include <trace/define_trace.h>

--
Chuck Lever




^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
  2019-10-11 17:38   ` Chuck Lever
@ 2019-10-11 17:44     ` Chuck Lever
  2019-10-11 17:46     ` Parav Pandit
  1 sibling, 0 replies; 5+ messages in thread
From: Chuck Lever @ 2019-10-11 17:44 UTC (permalink / raw)
  To: Parav Pandit; +Cc: linux-rdma



> On Oct 11, 2019, at 1:38 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
> 
> 
> 
>> On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@mellanox.com> wrote:
>> 
>> 
>> 
>>> -----Original Message-----
>>> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
>>> owner@vger.kernel.org> On Behalf Of Chuck Lever
>>> Sent: Wednesday, October 9, 2019 11:55 AM
>>> To: linux-rdma@vger.kernel.org
>>> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue
>>> issues
>>> 
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>> drivers/infiniband/core/Makefile |    2
>>> drivers/infiniband/core/cq.c     |   29 +++--
>>> drivers/infiniband/core/trace.c  |   14 ++
>>> include/rdma/ib_verbs.h          |    2
>>> include/trace/events/rdma_core.h |  218
>>> ++++++++++++++++++++++++++++++++++++++
>>> 5 files changed, 255 insertions(+), 10 deletions(-)  create mode 100644
>>> drivers/infiniband/core/trace.c  create mode 100644
>>> include/trace/events/rdma_core.h
>>> 
>>> Changes since v2:
>>> - Removed extraneous changes to include/trace/events/rdma.h
>>> 
>>> Changes since RFC:
>>> - Addressed comments from Parav Pandit <parav@mellanox.com>
>>> 
>>> 
>>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>>> index 09881bd..68d9e27 100644
>>> --- a/drivers/infiniband/core/Makefile
>>> +++ b/drivers/infiniband/core/Makefile
>>> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
>>> verbs.o cq.o rw.o sysfs.o \
>>> 				device.o fmr_pool.o cache.o netlink.o \
>>> 				roce_gid_mgmt.o mr_pool.o addr.o
>>> sa_query.o \
>>> 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
>>> -				nldev.o restrack.o counters.o
>>> +				nldev.o restrack.o counters.o trace.o
>>> 
>>> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>>> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
>>> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
>>> bbfded6..bcde992 100644
>>> --- a/drivers/infiniband/core/cq.c
>>> +++ b/drivers/infiniband/core/cq.c
>>> @@ -7,6 +7,8 @@
>>> #include <linux/slab.h>
>>> #include <rdma/ib_verbs.h>
>>> 
>>> +#include <trace/events/rdma_core.h>
>>> +
>>> /* # of WCs to poll for with a single call to ib_poll_cq */
>>> #define IB_POLL_BATCH			16
>>> #define IB_POLL_BATCH_DIRECT		8
>>> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
>>> 
>>> 	dim->state = DIM_START_MEASURE;
>>> 
>>> +	trace_cq_modify(cq, comps, usec);
>>> 	cq->device->ops.modify_cq(cq, comps, usec);  }
>>> 
>>> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>>> struct ib_wc *wcs,  {
>>> 	int i, n, completed = 0;
>>> 
>>> -	/*
>>> -	 * budget might be (-1) if the caller does not
>>> -	 * want to bound this call, thus we need unsigned
>>> -	 * minimum here.
>>> -	 */
>>> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
>>> -					 budget - completed), wcs)) > 0) {
>>> +	trace_cq_process(cq);
>>> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
>> Before this change, on first attempt to poll the cq, it will poll for min(batch, budget).
>> With this change, it will poll for batch.
>> This is functional change than just adding the trace points.
>> I am not sure if this has any effect on the overall polling.
>> But it may be worth to keep such functional change in pre-patch which consist of this change, moving comment section, batch recalculation.
> 
> Or find a way to add the trace point without the functional change.
> 
> 
>>> +		trace_cq_poll(cq, batch, n);
>>> 		for (i = 0; i < n; i++) {
>>> 			struct ib_wc *wc = &wcs[i];
>>> 
>>> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>>> struct ib_wc *wcs,
>>> 		}
>>> 
>>> 		completed += n;
>>> -
>>> 		if (n != batch || (budget != -1 && completed >= budget))
>>> 			break;
>>> +
>>> +		/*
>>> +		 * budget might be (-1) if the caller does not
>>> +		 * want to bound this call, thus we need unsigned
>>> +		 * minimum here.
>>> +		 */
>>> +		batch = min_t(u32, batch, budget - completed);
>>> 	}
>>> 
>>> 	return completed;
>>> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
>>> budget)
>>> 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>>> 	if (completed < budget) {
>>> 		irq_poll_complete(&cq->iop);
>>> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>>> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
>>> +			trace_cq_reschedule(cq);
>>> 			irq_poll_sched(&cq->iop);
>>> +		}
>>> 	}
>>> 
>>> 	if (dim)
>>> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
>>> budget)
>>> 
>>> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)  {
>>> +	trace_cq_schedule(cq);
>>> 	irq_poll_sched(&cq->iop);
>>> }
>>> 
>>> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
>>> 
>>> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)  {
>>> +	trace_cq_schedule(cq);
>>> 	queue_work(cq->comp_wq, &cq->work);
>>> }
>>> 
>>> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
>>> void *private,
>>> 		goto out_destroy_cq;
>>> 	}
>>> 
>>> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
>>> 	return cq;
>>> 
>>> out_destroy_cq:
>>> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
>>> *udata)
>>> 		WARN_ON_ONCE(1);
>>> 	}
>>> 
>>> +	trace_cq_free(cq);
>>> 	rdma_restrack_del(&cq->res);
>>> 	cq->device->ops.destroy_cq(cq, udata);
>>> 	if (cq->dim)
>>> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
>>> new file mode 100644 index 0000000..6c3514b
>>> --- /dev/null
>>> +++ b/drivers/infiniband/core/trace.c
>>> @@ -0,0 +1,14 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/*
>>> + * Trace points for core RDMA functions.
>>> + *
>>> + * Author: Chuck Lever <chuck.lever@oracle.com>
>>> + *
>>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>>> + */
>>> +
>>> +#define CREATE_TRACE_POINTS
>>> +
>>> +#include <rdma/ib_verbs.h>
>>> +
>>> +#include <trace/events/rdma_core.h>
>>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
>>> 6a47ba8..95a6bce 100644
>>> --- a/include/rdma/ib_verbs.h
>>> +++ b/include/rdma/ib_verbs.h
>>> @@ -1555,6 +1555,8 @@ struct ib_cq {
>>> 	};
>>> 	struct workqueue_struct *comp_wq;
>>> 	struct dim *dim;
>>> +	ktime_t timestamp;
>>> +	bool interrupt;
>> Its unclear when to update timestamp and interrupt. Comment will help.
> 
> These are both updated only in the new trace points. Is more than
> that needed in a comment?
> 
> 
>>> 	/*
>>> 	 * Implementation details of the RDMA core, don't use in drivers:
>>> 	 */
>>> diff --git a/include/trace/events/rdma_core.h
>>> b/include/trace/events/rdma_core.h
>>> new file mode 100644
>>> index 0000000..c1397a3
>>> --- /dev/null
>>> +++ b/include/trace/events/rdma_core.h
>>> @@ -0,0 +1,218 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +/*
>>> + * Trace point definitions for core RDMA functions.
>>> + *
>>> + * Author: Chuck Lever <chuck.lever@oracle.com>
>>> + *
>>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>>> + */
>>> +
>>> +#undef TRACE_SYSTEM
>>> +#define TRACE_SYSTEM rdma_core
>>> +
>>> +#if !defined(_TRACE_RDMA_CORE_H) ||
>>> defined(TRACE_HEADER_MULTI_READ)
>>> +#define _TRACE_RDMA_CORE_H
>>> +
>>> +#include <linux/tracepoint.h>
>>> +#include <rdma/ib_verbs.h>
>>> +#include <rdma/restrack.h>
>>> +
>>> +/*
>>> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
>>> +#define IB_POLL_CTX_LIST			\
>>> +	ib_poll_ctx(DIRECT)			\
>>> +	ib_poll_ctx(SOFTIRQ)			\
>>> +	ib_poll_ctx(WORKQUEUE)			\
>>> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
>>> +
>>> +#undef ib_poll_ctx
>>> +#undef ib_poll_ctx_end
>>> +
>>> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
>>> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
>>> +
>>> +IB_POLL_CTX_LIST
>>> +
>>> +#undef ib_poll_ctx
>>> +#undef ib_poll_ctx_end
>>> +
>>> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
>>> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
>>> +
>>> +#define rdma_show_ib_poll_ctx(x) \
>>> +		__print_symbolic(x, IB_POLL_CTX_LIST)
>>> +
>>> +/**
>>> + ** Completion Queue events
>>> + **/
>>> +
>>> +TRACE_EVENT(cq_schedule,
>>> +	TP_PROTO(
>>> +		struct ib_cq *cq
>>> +	),
>>> +
>>> +	TP_ARGS(cq),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(u32, id)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		cq->timestamp = ktime_get();
>>> +		cq->interrupt = true;
>>> +
>>> +		__entry->id = cq->res.id;
>>> +	),
>>> +
>>> +	TP_printk("id %u", __entry->id)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_reschedule,
>>> +	TP_PROTO(
>>> +		struct ib_cq *cq
>>> +	),
>>> +
>>> +	TP_ARGS(cq),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(u32, id)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		cq->timestamp = ktime_get();
>>> +		cq->interrupt = false;
>>> +
>>> +		__entry->id = cq->res.id;
>>> +	),
>>> +
>>> +	TP_printk("id %u", __entry->id)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_process,
>>> +	TP_PROTO(
>>> +		const struct ib_cq *cq
>>> +	),
>>> +
>>> +	TP_ARGS(cq),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(s64, latency)
>>> +		__field(u32, id)
>>> +		__field(bool, interrupt)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
>>> +
>>> +		__entry->id = cq->res.id;
>>> +		__entry->latency = ktime_to_us(latency);
>>> +		__entry->interrupt = cq->interrupt;
>>> +	),
>>> +
>>> +	TP_printk("id %u wake-up took %lld [us] from %s",
>> It might be better to prefix 'id' with 'cq', so that in future rdma wide trace points, we can have multiple resource id's printed consistently as qpid, cqid, mrid etc; and don't have to rely on the function where it is used to decode what that id means.
> 
> I left out the "cq" here because the trace point names are prefixed
> with "cq_". However, now that you bring it up, I can imagine cases
> where a trace point might report information about two different
> resources that both have a restrack ID.
> 
> How about "cq.id=%u" ?
> 
> 
>> I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
>> I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id.
> 
> On the other hand, a trace point might someday want to report the
> value of a field in struct ib_cq.

Btw, is it worth adding a helper function to return res.id?

u32 rdma_restrack_id(const struct rdma_restrack_entry *res)
{
	return res->id;
}

Or is this just noise?


>>> +		__entry->id, __entry->latency,
>>> +		__entry->interrupt ? "interrupt" : "reschedule"
>>> +	)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_poll,
>>> +	TP_PROTO(
>>> +		const struct ib_cq *cq,
>>> +		int requested,
>>> +		int rc
>>> +	),
>>> +
>>> +	TP_ARGS(cq, requested, rc),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(u32, id)
>>> +		__field(int, requested)
>>> +		__field(int, rc)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		__entry->id = cq->res.id;
>>> +		__entry->requested = requested;
>>> +		__entry->rc = rc;
>>> +	),
>>> +
>>> +	TP_printk("id %u requested %d, returned %d",
>>> +		__entry->id, __entry->requested, __entry->rc
>>> +	)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_modify,
>>> +	TP_PROTO(
>>> +		const struct ib_cq *cq,
>>> +		u16 comps,
>>> +		u16 usec
>>> +	),
>>> +
>>> +	TP_ARGS(cq, comps, usec),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(u32, id)
>>> +		__field(unsigned int, comps)
>>> +		__field(unsigned int, usec)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		__entry->id = cq->res.id;
>>> +		__entry->comps = comps;
>>> +		__entry->usec = usec;
>>> +	),
>>> +
>>> +	TP_printk("id %u comps=%u usec=%u",
>>> +		__entry->id, __entry->comps, __entry->usec
>>> +	)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_alloc,
>>> +	TP_PROTO(
>>> +		const struct ib_cq *cq,
>>> +		int comp_vector,
>>> +		enum ib_poll_context poll_ctx
>>> +	),
>>> +
>>> +	TP_ARGS(cq, comp_vector, poll_ctx),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(u32, id)
>>> +		__field(int, comp_vector)
>>> +		__field(unsigned long, poll_ctx)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		__entry->id = cq->res.id;
>>> +		__entry->comp_vector = comp_vector;
>>> +		__entry->poll_ctx = poll_ctx;
>>> +	),
>>> +
>>> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
>>> +		__entry->id, __entry->comp_vector,
>>> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
>>> +	)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_free,
>>> +	TP_PROTO(
>>> +		const struct ib_cq *cq
>>> +	),
>>> +
>>> +	TP_ARGS(cq),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(u32, id)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		__entry->id = cq->res.id;
>>> +	),
>>> +
>>> +	TP_printk("id %u", __entry->id)
>>> +);
>>> +
>>> +#endif /* _TRACE_RDMA_CORE_H */
>>> +
>>> +#include <trace/define_trace.h>
> 
> --
> Chuck Lever

--
Chuck Lever




^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
  2019-10-11 17:38   ` Chuck Lever
  2019-10-11 17:44     ` Chuck Lever
@ 2019-10-11 17:46     ` Parav Pandit
  1 sibling, 0 replies; 5+ messages in thread
From: Parav Pandit @ 2019-10-11 17:46 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma



> -----Original Message-----
> From: Chuck Lever <chuck.lever@oracle.com>
> Sent: Friday, October 11, 2019 12:39 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: linux-rdma@vger.kernel.org
> Subject: Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue
> issues
> 
> 
> 
> > On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@mellanox.com> wrote:
> >
> >
> >
> >> -----Original Message-----
> >> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> >> owner@vger.kernel.org> On Behalf Of Chuck Lever
> >> Sent: Wednesday, October 9, 2019 11:55 AM
> >> To: linux-rdma@vger.kernel.org
> >> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion
> >> queue issues
> >>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> drivers/infiniband/core/Makefile |    2
> >> drivers/infiniband/core/cq.c     |   29 +++--
> >> drivers/infiniband/core/trace.c  |   14 ++
> >> include/rdma/ib_verbs.h          |    2
> >> include/trace/events/rdma_core.h |  218
> >> ++++++++++++++++++++++++++++++++++++++
> >> 5 files changed, 255 insertions(+), 10 deletions(-)  create mode
> >> 100644 drivers/infiniband/core/trace.c  create mode 100644
> >> include/trace/events/rdma_core.h
> >>
> >> Changes since v2:
> >> - Removed extraneous changes to include/trace/events/rdma.h
> >>
> >> Changes since RFC:
> >> - Addressed comments from Parav Pandit <parav@mellanox.com>
> >>
> >>
> >> diff --git a/drivers/infiniband/core/Makefile
> >> b/drivers/infiniband/core/Makefile
> >> index 09881bd..68d9e27 100644
> >> --- a/drivers/infiniband/core/Makefile
> >> +++ b/drivers/infiniband/core/Makefile
> >> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
> >> verbs.o cq.o rw.o sysfs.o \
> >> 				device.o fmr_pool.o cache.o netlink.o \
> >> 				roce_gid_mgmt.o mr_pool.o addr.o
> >> sa_query.o \
> >> 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> >> -				nldev.o restrack.o counters.o
> >> +				nldev.o restrack.o counters.o trace.o
> >>
> >> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
> >> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
> >> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
> >> bbfded6..bcde992 100644
> >> --- a/drivers/infiniband/core/cq.c
> >> +++ b/drivers/infiniband/core/cq.c
> >> @@ -7,6 +7,8 @@
> >> #include <linux/slab.h>
> >> #include <rdma/ib_verbs.h>
> >>
> >> +#include <trace/events/rdma_core.h>
> >> +
> >> /* # of WCs to poll for with a single call to ib_poll_cq */
> >> #define IB_POLL_BATCH			16
> >> #define IB_POLL_BATCH_DIRECT		8
> >> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct
> >> *w)
> >>
> >> 	dim->state = DIM_START_MEASURE;
> >>
> >> +	trace_cq_modify(cq, comps, usec);
> >> 	cq->device->ops.modify_cq(cq, comps, usec);  }
> >>
> >> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int
> >> budget, struct ib_wc *wcs,  {
> >> 	int i, n, completed = 0;
> >>
> >> -	/*
> >> -	 * budget might be (-1) if the caller does not
> >> -	 * want to bound this call, thus we need unsigned
> >> -	 * minimum here.
> >> -	 */
> >> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
> >> -					 budget - completed), wcs)) > 0) {
> >> +	trace_cq_process(cq);
> >> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
> > Before this change, on first attempt to poll the cq, it will poll for min(batch,
> budget).
> > With this change, it will poll for batch.
> > This is functional change than just adding the trace points.
> > I am not sure if this has any effect on the overall polling.
> > But it may be worth to keep such functional change in pre-patch which consist
> of this change, moving comment section, batch recalculation.
> 
> Or find a way to add the trace point without the functional change.
> 
> 
:-) yes.

> >> +		trace_cq_poll(cq, batch, n);
> >> 		for (i = 0; i < n; i++) {
> >> 			struct ib_wc *wc = &wcs[i];
> >>
> >> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int
> >> budget, struct ib_wc *wcs,
> >> 		}
> >>
> >> 		completed += n;
> >> -
> >> 		if (n != batch || (budget != -1 && completed >= budget))
> >> 			break;
> >> +
> >> +		/*
> >> +		 * budget might be (-1) if the caller does not
> >> +		 * want to bound this call, thus we need unsigned
> >> +		 * minimum here.
> >> +		 */
> >> +		batch = min_t(u32, batch, budget - completed);
> >> 	}
> >>
> >> 	return completed;
> >> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop,
> >> int
> >> budget)
> >> 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
> >> 	if (completed < budget) {
> >> 		irq_poll_complete(&cq->iop);
> >> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> >> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> >> +			trace_cq_reschedule(cq);
> >> 			irq_poll_sched(&cq->iop);
> >> +		}
> >> 	}
> >>
> >> 	if (dim)
> >> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop,
> >> int
> >> budget)
> >>
> >> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
> >> {
> >> +	trace_cq_schedule(cq);
> >> 	irq_poll_sched(&cq->iop);
> >> }
> >>
> >> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct
> >> *work)
> >>
> >> static void ib_cq_completion_workqueue(struct ib_cq *cq, void
> >> *private)  {
> >> +	trace_cq_schedule(cq);
> >> 	queue_work(cq->comp_wq, &cq->work);
> >> }
> >>
> >> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device
> >> *dev, void *private,
> >> 		goto out_destroy_cq;
> >> 	}
> >>
> >> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
> >> 	return cq;
> >>
> >> out_destroy_cq:
> >> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct
> >> ib_udata
> >> *udata)
> >> 		WARN_ON_ONCE(1);
> >> 	}
> >>
> >> +	trace_cq_free(cq);
> >> 	rdma_restrack_del(&cq->res);
> >> 	cq->device->ops.destroy_cq(cq, udata);
> >> 	if (cq->dim)
> >> diff --git a/drivers/infiniband/core/trace.c
> >> b/drivers/infiniband/core/trace.c new file mode 100644 index
> >> 0000000..6c3514b
> >> --- /dev/null
> >> +++ b/drivers/infiniband/core/trace.c
> >> @@ -0,0 +1,14 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +/*
> >> + * Trace points for core RDMA functions.
> >> + *
> >> + * Author: Chuck Lever <chuck.lever@oracle.com>
> >> + *
> >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> >> + */
> >> +
> >> +#define CREATE_TRACE_POINTS
> >> +
> >> +#include <rdma/ib_verbs.h>
> >> +
> >> +#include <trace/events/rdma_core.h>
> >> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> >> 6a47ba8..95a6bce 100644
> >> --- a/include/rdma/ib_verbs.h
> >> +++ b/include/rdma/ib_verbs.h
> >> @@ -1555,6 +1555,8 @@ struct ib_cq {
> >> 	};
> >> 	struct workqueue_struct *comp_wq;
> >> 	struct dim *dim;
> >> +	ktime_t timestamp;
> >> +	bool interrupt;
> > Its unclear when to update timestamp and interrupt. Comment will help.
> 
> These are both updated only in the new trace points. Is more than that needed
> in a comment?
> 
No. It wasn't clear to me that when to update the timestamp.
But I guess its obvious..

> 
> >> 	/*
> >> 	 * Implementation details of the RDMA core, don't use in drivers:
> >> 	 */
> >> diff --git a/include/trace/events/rdma_core.h
> >> b/include/trace/events/rdma_core.h
> >> new file mode 100644
> >> index 0000000..c1397a3
> >> --- /dev/null
> >> +++ b/include/trace/events/rdma_core.h
> >> @@ -0,0 +1,218 @@
> >> +/* SPDX-License-Identifier: GPL-2.0-only */
> >> +/*
> >> + * Trace point definitions for core RDMA functions.
> >> + *
> >> + * Author: Chuck Lever <chuck.lever@oracle.com>
> >> + *
> >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> >> + */
> >> +
> >> +#undef TRACE_SYSTEM
> >> +#define TRACE_SYSTEM rdma_core
> >> +
> >> +#if !defined(_TRACE_RDMA_CORE_H) ||
> >> defined(TRACE_HEADER_MULTI_READ)
> >> +#define _TRACE_RDMA_CORE_H
> >> +
> >> +#include <linux/tracepoint.h>
> >> +#include <rdma/ib_verbs.h>
> >> +#include <rdma/restrack.h>
> >> +
> >> +/*
> >> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
> >> +#define IB_POLL_CTX_LIST			\
> >> +	ib_poll_ctx(DIRECT)			\
> >> +	ib_poll_ctx(SOFTIRQ)			\
> >> +	ib_poll_ctx(WORKQUEUE)			\
> >> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
> >> +
> >> +#undef ib_poll_ctx
> >> +#undef ib_poll_ctx_end
> >> +
> >> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
> >> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
> >> +
> >> +IB_POLL_CTX_LIST
> >> +
> >> +#undef ib_poll_ctx
> >> +#undef ib_poll_ctx_end
> >> +
> >> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
> >> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
> >> +
> >> +#define rdma_show_ib_poll_ctx(x) \
> >> +		__print_symbolic(x, IB_POLL_CTX_LIST)
> >> +
> >> +/**
> >> + ** Completion Queue events
> >> + **/
> >> +
> >> +TRACE_EVENT(cq_schedule,
> >> +	TP_PROTO(
> >> +		struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		cq->timestamp = ktime_get();
> >> +		cq->interrupt = true;
> >> +
> >> +		__entry->id = cq->res.id;
> >> +	),
> >> +
> >> +	TP_printk("id %u", __entry->id)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_reschedule,
> >> +	TP_PROTO(
> >> +		struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		cq->timestamp = ktime_get();
> >> +		cq->interrupt = false;
> >> +
> >> +		__entry->id = cq->res.id;
> >> +	),
> >> +
> >> +	TP_printk("id %u", __entry->id)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_process,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(s64, latency)
> >> +		__field(u32, id)
> >> +		__field(bool, interrupt)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
> >> +
> >> +		__entry->id = cq->res.id;
> >> +		__entry->latency = ktime_to_us(latency);
> >> +		__entry->interrupt = cq->interrupt;
> >> +	),
> >> +
> >> +	TP_printk("id %u wake-up took %lld [us] from %s",
> > It might be better to prefix 'id' with 'cq', so that in future rdma wide trace
> points, we can have multiple resource id's printed consistently as qpid, cqid,
> mrid etc; and don't have to rely on the function where it is used to decode what
> that id means.
> 
> I left out the "cq" here because the trace point names are prefixed with "cq_".
> However, now that you bring it up, I can imagine cases where a trace point
> might report information about two different resources that both have a
> restrack ID.
> 
> How about "cq.id=%u" ?
> 
Looks good.

> 
> > I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
> > I was thinking of rdma_restrack_entry*, as it makes future code for other
> resources also anchored on the resource id.
> 
> On the other hand, a trace point might someday want to report the value of a
> field in struct ib_cq.
> 
Yes. that is exactly why I had mix thoughts. So I think ib_cq is ok.

> 
> >> +		__entry->id, __entry->latency,
> >> +		__entry->interrupt ? "interrupt" : "reschedule"
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_poll,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq,
> >> +		int requested,
> >> +		int rc
> >> +	),
> >> +
> >> +	TP_ARGS(cq, requested, rc),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +		__field(int, requested)
> >> +		__field(int, rc)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +		__entry->requested = requested;
> >> +		__entry->rc = rc;
> >> +	),
> >> +
> >> +	TP_printk("id %u requested %d, returned %d",
> >> +		__entry->id, __entry->requested, __entry->rc
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_modify,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq,
> >> +		u16 comps,
> >> +		u16 usec
> >> +	),
> >> +
> >> +	TP_ARGS(cq, comps, usec),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +		__field(unsigned int, comps)
> >> +		__field(unsigned int, usec)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +		__entry->comps = comps;
> >> +		__entry->usec = usec;
> >> +	),
> >> +
> >> +	TP_printk("id %u comps=%u usec=%u",
> >> +		__entry->id, __entry->comps, __entry->usec
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_alloc,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq,
> >> +		int comp_vector,
> >> +		enum ib_poll_context poll_ctx
> >> +	),
> >> +
> >> +	TP_ARGS(cq, comp_vector, poll_ctx),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +		__field(int, comp_vector)
> >> +		__field(unsigned long, poll_ctx)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +		__entry->comp_vector = comp_vector;
> >> +		__entry->poll_ctx = poll_ctx;
> >> +	),
> >> +
> >> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
> >> +		__entry->id, __entry->comp_vector,
> >> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_free,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +	),
> >> +
> >> +	TP_printk("id %u", __entry->id)
> >> +);
> >> +
> >> +#endif /* _TRACE_RDMA_CORE_H */
> >> +
> >> +#include <trace/define_trace.h>
> 
> --
> Chuck Lever
> 
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-09 16:54 [PATCH v3] IB/core: Trace points for diagnosing completion queue issues Chuck Lever
2019-10-11 17:26 ` Parav Pandit
2019-10-11 17:38   ` Chuck Lever
2019-10-11 17:44     ` Chuck Lever
2019-10-11 17:46     ` Parav Pandit

Linux-RDMA Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rdma/0 linux-rdma/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rdma linux-rdma/ https://lore.kernel.org/linux-rdma \
		linux-rdma@vger.kernel.org linux-rdma@archiver.kernel.org
	public-inbox-index linux-rdma

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rdma


AGPL code for this site: git clone https://public-inbox.org/ public-inbox