Linux-RDMA Archive on lore.kernel.org
 help / color / Atom feed
From: Parav Pandit <parav@mellanox.com>
To: Chuck Lever <chuck.lever@oracle.com>
Cc: "linux-rdma@vger.kernel.org" <linux-rdma@vger.kernel.org>
Subject: RE: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
Date: Fri, 11 Oct 2019 17:46:12 +0000
Message-ID: <AM0PR05MB48668F96396D3317D1DC6A30D1970@AM0PR05MB4866.eurprd05.prod.outlook.com> (raw)
In-Reply-To: <54FEAA5A-AF8D-4604-9AEE-3B61DB26325B@oracle.com>



> -----Original Message-----
> From: Chuck Lever <chuck.lever@oracle.com>
> Sent: Friday, October 11, 2019 12:39 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: linux-rdma@vger.kernel.org
> Subject: Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue
> issues
> 
> 
> 
> > On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@mellanox.com> wrote:
> >
> >
> >
> >> -----Original Message-----
> >> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> >> owner@vger.kernel.org> On Behalf Of Chuck Lever
> >> Sent: Wednesday, October 9, 2019 11:55 AM
> >> To: linux-rdma@vger.kernel.org
> >> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion
> >> queue issues
> >>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >> ---
> >> drivers/infiniband/core/Makefile |    2
> >> drivers/infiniband/core/cq.c     |   29 +++--
> >> drivers/infiniband/core/trace.c  |   14 ++
> >> include/rdma/ib_verbs.h          |    2
> >> include/trace/events/rdma_core.h |  218
> >> ++++++++++++++++++++++++++++++++++++++
> >> 5 files changed, 255 insertions(+), 10 deletions(-)  create mode
> >> 100644 drivers/infiniband/core/trace.c  create mode 100644
> >> include/trace/events/rdma_core.h
> >>
> >> Changes since v2:
> >> - Removed extraneous changes to include/trace/events/rdma.h
> >>
> >> Changes since RFC:
> >> - Addressed comments from Parav Pandit <parav@mellanox.com>
> >>
> >>
> >> diff --git a/drivers/infiniband/core/Makefile
> >> b/drivers/infiniband/core/Makefile
> >> index 09881bd..68d9e27 100644
> >> --- a/drivers/infiniband/core/Makefile
> >> +++ b/drivers/infiniband/core/Makefile
> >> @@ -11,7 +11,7 @@ ib_core-y :=			packer.o ud_header.o
> >> verbs.o cq.o rw.o sysfs.o \
> >> 				device.o fmr_pool.o cache.o netlink.o \
> >> 				roce_gid_mgmt.o mr_pool.o addr.o
> >> sa_query.o \
> >> 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
> >> -				nldev.o restrack.o counters.o
> >> +				nldev.o restrack.o counters.o trace.o
> >>
> >> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
> >> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
> >> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
> >> bbfded6..bcde992 100644
> >> --- a/drivers/infiniband/core/cq.c
> >> +++ b/drivers/infiniband/core/cq.c
> >> @@ -7,6 +7,8 @@
> >> #include <linux/slab.h>
> >> #include <rdma/ib_verbs.h>
> >>
> >> +#include <trace/events/rdma_core.h>
> >> +
> >> /* # of WCs to poll for with a single call to ib_poll_cq */
> >> #define IB_POLL_BATCH			16
> >> #define IB_POLL_BATCH_DIRECT		8
> >> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct
> >> *w)
> >>
> >> 	dim->state = DIM_START_MEASURE;
> >>
> >> +	trace_cq_modify(cq, comps, usec);
> >> 	cq->device->ops.modify_cq(cq, comps, usec);  }
> >>
> >> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int
> >> budget, struct ib_wc *wcs,  {
> >> 	int i, n, completed = 0;
> >>
> >> -	/*
> >> -	 * budget might be (-1) if the caller does not
> >> -	 * want to bound this call, thus we need unsigned
> >> -	 * minimum here.
> >> -	 */
> >> -	while ((n = ib_poll_cq(cq, min_t(u32, batch,
> >> -					 budget - completed), wcs)) > 0) {
> >> +	trace_cq_process(cq);
> >> +	while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
> > Before this change, on first attempt to poll the cq, it will poll for min(batch,
> budget).
> > With this change, it will poll for batch.
> > This is functional change than just adding the trace points.
> > I am not sure if this has any effect on the overall polling.
> > But it may be worth to keep such functional change in pre-patch which consist
> of this change, moving comment section, batch recalculation.
> 
> Or find a way to add the trace point without the functional change.
> 
> 
:-) yes.

> >> +		trace_cq_poll(cq, batch, n);
> >> 		for (i = 0; i < n; i++) {
> >> 			struct ib_wc *wc = &wcs[i];
> >>
> >> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int
> >> budget, struct ib_wc *wcs,
> >> 		}
> >>
> >> 		completed += n;
> >> -
> >> 		if (n != batch || (budget != -1 && completed >= budget))
> >> 			break;
> >> +
> >> +		/*
> >> +		 * budget might be (-1) if the caller does not
> >> +		 * want to bound this call, thus we need unsigned
> >> +		 * minimum here.
> >> +		 */
> >> +		batch = min_t(u32, batch, budget - completed);
> >> 	}
> >>
> >> 	return completed;
> >> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop,
> >> int
> >> budget)
> >> 	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
> >> 	if (completed < budget) {
> >> 		irq_poll_complete(&cq->iop);
> >> -		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> >> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> >> +			trace_cq_reschedule(cq);
> >> 			irq_poll_sched(&cq->iop);
> >> +		}
> >> 	}
> >>
> >> 	if (dim)
> >> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop,
> >> int
> >> budget)
> >>
> >> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
> >> {
> >> +	trace_cq_schedule(cq);
> >> 	irq_poll_sched(&cq->iop);
> >> }
> >>
> >> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct
> >> *work)
> >>
> >> static void ib_cq_completion_workqueue(struct ib_cq *cq, void
> >> *private)  {
> >> +	trace_cq_schedule(cq);
> >> 	queue_work(cq->comp_wq, &cq->work);
> >> }
> >>
> >> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device
> >> *dev, void *private,
> >> 		goto out_destroy_cq;
> >> 	}
> >>
> >> +	trace_cq_alloc(cq, comp_vector, poll_ctx);
> >> 	return cq;
> >>
> >> out_destroy_cq:
> >> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct
> >> ib_udata
> >> *udata)
> >> 		WARN_ON_ONCE(1);
> >> 	}
> >>
> >> +	trace_cq_free(cq);
> >> 	rdma_restrack_del(&cq->res);
> >> 	cq->device->ops.destroy_cq(cq, udata);
> >> 	if (cq->dim)
> >> diff --git a/drivers/infiniband/core/trace.c
> >> b/drivers/infiniband/core/trace.c new file mode 100644 index
> >> 0000000..6c3514b
> >> --- /dev/null
> >> +++ b/drivers/infiniband/core/trace.c
> >> @@ -0,0 +1,14 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +/*
> >> + * Trace points for core RDMA functions.
> >> + *
> >> + * Author: Chuck Lever <chuck.lever@oracle.com>
> >> + *
> >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> >> + */
> >> +
> >> +#define CREATE_TRACE_POINTS
> >> +
> >> +#include <rdma/ib_verbs.h>
> >> +
> >> +#include <trace/events/rdma_core.h>
> >> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> >> 6a47ba8..95a6bce 100644
> >> --- a/include/rdma/ib_verbs.h
> >> +++ b/include/rdma/ib_verbs.h
> >> @@ -1555,6 +1555,8 @@ struct ib_cq {
> >> 	};
> >> 	struct workqueue_struct *comp_wq;
> >> 	struct dim *dim;
> >> +	ktime_t timestamp;
> >> +	bool interrupt;
> > Its unclear when to update timestamp and interrupt. Comment will help.
> 
> These are both updated only in the new trace points. Is more than that needed
> in a comment?
> 
No. It wasn't clear to me that when to update the timestamp.
But I guess its obvious..

> 
> >> 	/*
> >> 	 * Implementation details of the RDMA core, don't use in drivers:
> >> 	 */
> >> diff --git a/include/trace/events/rdma_core.h
> >> b/include/trace/events/rdma_core.h
> >> new file mode 100644
> >> index 0000000..c1397a3
> >> --- /dev/null
> >> +++ b/include/trace/events/rdma_core.h
> >> @@ -0,0 +1,218 @@
> >> +/* SPDX-License-Identifier: GPL-2.0-only */
> >> +/*
> >> + * Trace point definitions for core RDMA functions.
> >> + *
> >> + * Author: Chuck Lever <chuck.lever@oracle.com>
> >> + *
> >> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> >> + */
> >> +
> >> +#undef TRACE_SYSTEM
> >> +#define TRACE_SYSTEM rdma_core
> >> +
> >> +#if !defined(_TRACE_RDMA_CORE_H) ||
> >> defined(TRACE_HEADER_MULTI_READ)
> >> +#define _TRACE_RDMA_CORE_H
> >> +
> >> +#include <linux/tracepoint.h>
> >> +#include <rdma/ib_verbs.h>
> >> +#include <rdma/restrack.h>
> >> +
> >> +/*
> >> + * enum ib_poll_context, from include/rdma/ib_verbs.h  */
> >> +#define IB_POLL_CTX_LIST			\
> >> +	ib_poll_ctx(DIRECT)			\
> >> +	ib_poll_ctx(SOFTIRQ)			\
> >> +	ib_poll_ctx(WORKQUEUE)			\
> >> +	ib_poll_ctx_end(UNBOUND_WORKQUEUE)
> >> +
> >> +#undef ib_poll_ctx
> >> +#undef ib_poll_ctx_end
> >> +
> >> +#define ib_poll_ctx(x)		TRACE_DEFINE_ENUM(IB_POLL_##x);
> >> +#define ib_poll_ctx_end(x)	TRACE_DEFINE_ENUM(IB_POLL_##x);
> >> +
> >> +IB_POLL_CTX_LIST
> >> +
> >> +#undef ib_poll_ctx
> >> +#undef ib_poll_ctx_end
> >> +
> >> +#define ib_poll_ctx(x)		{ IB_POLL_##x, #x },
> >> +#define ib_poll_ctx_end(x)	{ IB_POLL_##x, #x }
> >> +
> >> +#define rdma_show_ib_poll_ctx(x) \
> >> +		__print_symbolic(x, IB_POLL_CTX_LIST)
> >> +
> >> +/**
> >> + ** Completion Queue events
> >> + **/
> >> +
> >> +TRACE_EVENT(cq_schedule,
> >> +	TP_PROTO(
> >> +		struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		cq->timestamp = ktime_get();
> >> +		cq->interrupt = true;
> >> +
> >> +		__entry->id = cq->res.id;
> >> +	),
> >> +
> >> +	TP_printk("id %u", __entry->id)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_reschedule,
> >> +	TP_PROTO(
> >> +		struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		cq->timestamp = ktime_get();
> >> +		cq->interrupt = false;
> >> +
> >> +		__entry->id = cq->res.id;
> >> +	),
> >> +
> >> +	TP_printk("id %u", __entry->id)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_process,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(s64, latency)
> >> +		__field(u32, id)
> >> +		__field(bool, interrupt)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
> >> +
> >> +		__entry->id = cq->res.id;
> >> +		__entry->latency = ktime_to_us(latency);
> >> +		__entry->interrupt = cq->interrupt;
> >> +	),
> >> +
> >> +	TP_printk("id %u wake-up took %lld [us] from %s",
> > It might be better to prefix 'id' with 'cq', so that in future rdma wide trace
> points, we can have multiple resource id's printed consistently as qpid, cqid,
> mrid etc; and don't have to rely on the function where it is used to decode what
> that id means.
> 
> I left out the "cq" here because the trace point names are prefixed with "cq_".
> However, now that you bring it up, I can imagine cases where a trace point
> might report information about two different resources that both have a
> restrack ID.
> 
> How about "cq.id=%u" ?
> 
Looks good.

> 
> > I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
> > I was thinking of rdma_restrack_entry*, as it makes future code for other
> resources also anchored on the resource id.
> 
> On the other hand, a trace point might someday want to report the value of a
> field in struct ib_cq.
> 
Yes. that is exactly why I had mix thoughts. So I think ib_cq is ok.

> 
> >> +		__entry->id, __entry->latency,
> >> +		__entry->interrupt ? "interrupt" : "reschedule"
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_poll,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq,
> >> +		int requested,
> >> +		int rc
> >> +	),
> >> +
> >> +	TP_ARGS(cq, requested, rc),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +		__field(int, requested)
> >> +		__field(int, rc)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +		__entry->requested = requested;
> >> +		__entry->rc = rc;
> >> +	),
> >> +
> >> +	TP_printk("id %u requested %d, returned %d",
> >> +		__entry->id, __entry->requested, __entry->rc
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_modify,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq,
> >> +		u16 comps,
> >> +		u16 usec
> >> +	),
> >> +
> >> +	TP_ARGS(cq, comps, usec),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +		__field(unsigned int, comps)
> >> +		__field(unsigned int, usec)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +		__entry->comps = comps;
> >> +		__entry->usec = usec;
> >> +	),
> >> +
> >> +	TP_printk("id %u comps=%u usec=%u",
> >> +		__entry->id, __entry->comps, __entry->usec
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_alloc,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq,
> >> +		int comp_vector,
> >> +		enum ib_poll_context poll_ctx
> >> +	),
> >> +
> >> +	TP_ARGS(cq, comp_vector, poll_ctx),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +		__field(int, comp_vector)
> >> +		__field(unsigned long, poll_ctx)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +		__entry->comp_vector = comp_vector;
> >> +		__entry->poll_ctx = poll_ctx;
> >> +	),
> >> +
> >> +	TP_printk("id %u comp_vector=%d poll_ctx=%s",
> >> +		__entry->id, __entry->comp_vector,
> >> +		rdma_show_ib_poll_ctx(__entry->poll_ctx)
> >> +	)
> >> +);
> >> +
> >> +TRACE_EVENT(cq_free,
> >> +	TP_PROTO(
> >> +		const struct ib_cq *cq
> >> +	),
> >> +
> >> +	TP_ARGS(cq),
> >> +
> >> +	TP_STRUCT__entry(
> >> +		__field(u32, id)
> >> +	),
> >> +
> >> +	TP_fast_assign(
> >> +		__entry->id = cq->res.id;
> >> +	),
> >> +
> >> +	TP_printk("id %u", __entry->id)
> >> +);
> >> +
> >> +#endif /* _TRACE_RDMA_CORE_H */
> >> +
> >> +#include <trace/define_trace.h>
> 
> --
> Chuck Lever
> 
> 


      parent reply index

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-09 16:54 Chuck Lever
2019-10-11 17:26 ` Parav Pandit
2019-10-11 17:38   ` Chuck Lever
2019-10-11 17:44     ` Chuck Lever
2019-10-11 17:46     ` Parav Pandit [this message]

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=AM0PR05MB48668F96396D3317D1DC6A30D1970@AM0PR05MB4866.eurprd05.prod.outlook.com \
    --to=parav@mellanox.com \
    --cc=chuck.lever@oracle.com \
    --cc=linux-rdma@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-RDMA Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-rdma/0 linux-rdma/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-rdma linux-rdma/ https://lore.kernel.org/linux-rdma \
		linux-rdma@vger.kernel.org linux-rdma@archiver.kernel.org
	public-inbox-index linux-rdma

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-rdma


AGPL code for this site: git clone https://public-inbox.org/ public-inbox