Linux-RDMA Archive on lore.kernel.org
 help / color / Atom feed
From: Parav Pandit <parav@mellanox.com>
To: Chuck Lever <chuck.lever@oracle.com>,
	"linux-rdma@vger.kernel.org" <linux-rdma@vger.kernel.org>
Subject: RE: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
Date: Fri, 11 Oct 2019 17:26:28 +0000
Message-ID: <AM0PR05MB48665D73CD796FC65A29C356D1970@AM0PR05MB4866.eurprd05.prod.outlook.com> (raw)
In-Reply-To: <20191009165219.2202.56785.stgit@manet.1015granger.net>



> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Chuck Lever
> Sent: Wednesday, October 9, 2019 11:55 AM
> To: linux-rdma@vger.kernel.org
> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue
> issues
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  drivers/infiniband/core/Makefile |    2
>  drivers/infiniband/core/cq.c     |   29 +++--
>  drivers/infiniband/core/trace.c  |   14 ++
>  include/rdma/ib_verbs.h          |    2
>  include/trace/events/rdma_core.h |  218
> ++++++++++++++++++++++++++++++++++++++
>  5 files changed, 255 insertions(+), 10 deletions(-)  create mode 100644
> drivers/infiniband/core/trace.c  create mode 100644
> include/trace/events/rdma_core.h
> 
> Changes since v2:
> - Removed extraneous changes to include/trace/events/rdma.h
> 
> Changes since RFC:
> - Addressed comments from Parav Pandit <parav@mellanox.com>
> 
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index 09881bd..68d9e27 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
> verbs.o cq.o rw.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o \
>  				roce_gid_mgmt.o mr_pool.o addr.o
> sa_query.o \
>  				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> -				nldev.o restrack.o counters.o
> +				nldev.o restrack.o counters.o trace.o
> 
>  ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>  ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
> bbfded6..bcde992 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -7,6 +7,8 @@
>  #include <linux/slab.h>
>  #include <rdma/ib_verbs.h>
> 
> +#include <trace/events/rdma_core.h>
> +
>  /* # of WCs to poll for with a single call to ib_poll_cq */
>  #define IB_POLL_BATCH			16
>  #define IB_POLL_BATCH_DIRECT		8
> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
> 
>  	dim->state = DIM_START_MEASURE;
> 
> +	trace_cq_modify(cq, comps, usec);
>  	cq->device->ops.modify_cq(cq, comps, usec);  }
> 
> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
> struct ib_wc *wcs,  {
>  	int i, n, completed = 0;
> 
> -	/*
> -	 * budget might be (-1) if the caller does not
> -	 * want to bound this call, thus we need unsigned
> -	 * minimum here.
> -	 */
> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
> -					 budget - completed), wcs)) > 0) {
> +	trace_cq_process(cq);
> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
Before this change, on first attempt to poll the cq, it will poll for min(batch, budget).
With this change, it will poll for batch.
This is functional change than just adding the trace points.
I am not sure if this has any effect on the overall polling.
But it may be worth to keep such functional change in pre-patch which consist of this change, moving comment section, batch recalculation.

> +		trace_cq_poll(cq, batch, n);
>  		for (i = 0; i < n; i++) {
>  			struct ib_wc *wc = &wcs[i];
> 
> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
> struct ib_wc *wcs,
>  		}
> 
>  		completed += n;
> -
>  		if (n != batch || (budget != -1 && completed >= budget))
>  			break;
> +
> +		/*
> +		 * budget might be (-1) if the caller does not
> +		 * want to bound this call, thus we need unsigned
> +		 * minimum here.
> +		 */
> +		batch = min_t(u32, batch, budget - completed);
>  	}
> 
>  	return completed;
> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
> budget)
>  	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>  	if (completed < budget) {
>  		irq_poll_complete(&cq->iop);
> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> +			trace_cq_reschedule(cq);
>  			irq_poll_sched(&cq->iop);
> +		}
>  	}
> 
>  	if (dim)
> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
> budget)
> 
>  static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)  {
> +	trace_cq_schedule(cq);
>  	irq_poll_sched(&cq->iop);
>  }
> 
> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
> 
>  static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)  {
> +	trace_cq_schedule(cq);
>  	queue_work(cq->comp_wq, &cq->work);
>  }
> 
> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
> void *private,
>  		goto out_destroy_cq;
>  	}
> 
> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
>  	return cq;
> 
>  out_destroy_cq:
> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
> *udata)
>  		WARN_ON_ONCE(1);
>  	}
> 
> +	trace_cq_free(cq);
>  	rdma_restrack_del(&cq->res);
>  	cq->device->ops.destroy_cq(cq, udata);
>  	if (cq->dim)
> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
> new file mode 100644 index 0000000..6c3514b
> --- /dev/null
> +++ b/drivers/infiniband/core/trace.c
> @@ -0,0 +1,14 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Trace points for core RDMA functions.
> + *
> + * Author: Chuck Lever <chuck.lever@oracle.com>
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +
> +#define CREATE_TRACE_POINTS
> +
> +#include <rdma/ib_verbs.h>
> +
> +#include <trace/events/rdma_core.h>
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> 6a47ba8..95a6bce 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1555,6 +1555,8 @@ struct ib_cq {
>  	};
>  	struct workqueue_struct *comp_wq;
>  	struct dim *dim;
> +	ktime_t timestamp;
> +	bool interrupt;
Its unclear when to update timestamp and interrupt. Comment will help.
>  	/*
>  	 * Implementation details of the RDMA core, don't use in drivers:
>  	 */
> diff --git a/include/trace/events/rdma_core.h
> b/include/trace/events/rdma_core.h
> new file mode 100644
> index 0000000..c1397a3
> --- /dev/null
> +++ b/include/trace/events/rdma_core.h
> @@ -0,0 +1,218 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Trace point definitions for core RDMA functions.
> + *
> + * Author: Chuck Lever <chuck.lever@oracle.com>
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM rdma_core
> +
> +#if !defined(_TRACE_RDMA_CORE_H) ||
> defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_RDMA_CORE_H
> +
> +#include <linux/tracepoint.h>
> +#include <rdma/ib_verbs.h>
> +#include <rdma/restrack.h>
> +
> +/*
> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
> +#define IB_POLL_CTX_LIST			\
> +	ib_poll_ctx(DIRECT)			\
> +	ib_poll_ctx(SOFTIRQ)			\
> +	ib_poll_ctx(WORKQUEUE)			\
> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
> +
> +#undef ib_poll_ctx
> +#undef ib_poll_ctx_end
> +
> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
> +
> +IB_POLL_CTX_LIST
> +
> +#undef ib_poll_ctx
> +#undef ib_poll_ctx_end
> +
> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
> +
> +#define rdma_show_ib_poll_ctx(x) \
> +		__print_symbolic(x, IB_POLL_CTX_LIST)
> +
> +/**
> + ** Completion Queue events
> + **/
> +
> +TRACE_EVENT(cq_schedule,
> +	TP_PROTO(
> +		struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +	),
> +
> +	TP_fast_assign(
> +		cq->timestamp = ktime_get();
> +		cq->interrupt = true;
> +
> +		__entry->id = cq->res.id;
> +	),
> +
> +	TP_printk("id %u", __entry->id)
> +);
> +
> +TRACE_EVENT(cq_reschedule,
> +	TP_PROTO(
> +		struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +	),
> +
> +	TP_fast_assign(
> +		cq->timestamp = ktime_get();
> +		cq->interrupt = false;
> +
> +		__entry->id = cq->res.id;
> +	),
> +
> +	TP_printk("id %u", __entry->id)
> +);
> +
> +TRACE_EVENT(cq_process,
> +	TP_PROTO(
> +		const struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(s64, latency)
> +		__field(u32, id)
> +		__field(bool, interrupt)
> +	),
> +
> +	TP_fast_assign(
> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
> +
> +		__entry->id = cq->res.id;
> +		__entry->latency = ktime_to_us(latency);
> +		__entry->interrupt = cq->interrupt;
> +	),
> +
> +	TP_printk("id %u wake-up took %lld [us] from %s",
It might be better to prefix 'id' with 'cq', so that in future rdma wide trace points, we can have multiple resource id's printed consistently as qpid, cqid, mrid etc; and don't have to rely on the function where it is used to decode what that id means.

I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id.


> +		__entry->id, __entry->latency,
> +		__entry->interrupt ? "interrupt" : "reschedule"
> +	)
> +);
> +
> +TRACE_EVENT(cq_poll,
> +	TP_PROTO(
> +		const struct ib_cq *cq,
> +		int requested,
> +		int rc
> +	),
> +
> +	TP_ARGS(cq, requested, rc),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +		__field(int, requested)
> +		__field(int, rc)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +		__entry->requested = requested;
> +		__entry->rc = rc;
> +	),
> +
> +	TP_printk("id %u requested %d, returned %d",
> +		__entry->id, __entry->requested, __entry->rc
> +	)
> +);
> +
> +TRACE_EVENT(cq_modify,
> +	TP_PROTO(
> +		const struct ib_cq *cq,
> +		u16 comps,
> +		u16 usec
> +	),
> +
> +	TP_ARGS(cq, comps, usec),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +		__field(unsigned int, comps)
> +		__field(unsigned int, usec)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +		__entry->comps = comps;
> +		__entry->usec = usec;
> +	),
> +
> +	TP_printk("id %u comps=%u usec=%u",
> +		__entry->id, __entry->comps, __entry->usec
> +	)
> +);
> +
> +TRACE_EVENT(cq_alloc,
> +	TP_PROTO(
> +		const struct ib_cq *cq,
> +		int comp_vector,
> +		enum ib_poll_context poll_ctx
> +	),
> +
> +	TP_ARGS(cq, comp_vector, poll_ctx),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +		__field(int, comp_vector)
> +		__field(unsigned long, poll_ctx)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +		__entry->comp_vector = comp_vector;
> +		__entry->poll_ctx = poll_ctx;
> +	),
> +
> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
> +		__entry->id, __entry->comp_vector,
> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
> +	)
> +);
> +
> +TRACE_EVENT(cq_free,
> +	TP_PROTO(
> +		const struct ib_cq *cq
> +	),
> +
> +	TP_ARGS(cq),
> +
> +	TP_STRUCT__entry(
> +		__field(u32, id)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->id = cq->res.id;
> +	),
> +
> +	TP_printk("id %u", __entry->id)
> +);
> +
> +#endif /* _TRACE_RDMA_CORE_H */
> +
> +#include <trace/define_trace.h>


  reply index

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-09 16:54 Chuck Lever
2019-10-11 17:26 ` Parav Pandit [this message]
2019-10-11 17:38   ` Chuck Lever
2019-10-11 17:44     ` Chuck Lever
2019-10-11 17:46     ` Parav Pandit

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=AM0PR05MB48665D73CD796FC65A29C356D1970@AM0PR05MB4866.eurprd05.prod.outlook.com \
    --to=parav@mellanox.com \
    --cc=chuck.lever@oracle.com \
    --cc=linux-rdma@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-RDMA Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rdma/0 linux-rdma/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rdma linux-rdma/ https://lore.kernel.org/linux-rdma \
		linux-rdma@vger.kernel.org linux-rdma@archiver.kernel.org
	public-inbox-index linux-rdma

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rdma


AGPL code for this site: git clone https://public-inbox.org/ public-inbox