From: Chuck Lever <chuck.lever@oracle.com>
To: Parav Pandit <parav@mellanox.com>
Cc: "linux-rdma@vger.kernel.org" <linux-rdma@vger.kernel.org>
Subject: Re: [PATCH v3] IB/core: Trace points for diagnosing completion queue issues
Date: Fri, 11 Oct 2019 13:44:11 -0400 [thread overview]
Message-ID: <AFA11C46-9CA3-4B34-921C-1E11EECA4743@oracle.com> (raw)
In-Reply-To: <54FEAA5A-AF8D-4604-9AEE-3B61DB26325B@oracle.com>
> On Oct 11, 2019, at 1:38 PM, Chuck Lever <chuck.lever@oracle.com> wrote:
>
>
>
>> On Oct 11, 2019, at 1:26 PM, Parav Pandit <parav@mellanox.com> wrote:
>>
>>
>>
>>> -----Original Message-----
>>> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
>>> owner@vger.kernel.org> On Behalf Of Chuck Lever
>>> Sent: Wednesday, October 9, 2019 11:55 AM
>>> To: linux-rdma@vger.kernel.org
>>> Subject: [PATCH v3] IB/core: Trace points for diagnosing completion queue
>>> issues
>>>
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>> drivers/infiniband/core/Makefile | 2
>>> drivers/infiniband/core/cq.c | 29 +++--
>>> drivers/infiniband/core/trace.c | 14 ++
>>> include/rdma/ib_verbs.h | 2
>>> include/trace/events/rdma_core.h | 218
>>> ++++++++++++++++++++++++++++++++++++++
>>> 5 files changed, 255 insertions(+), 10 deletions(-) create mode 100644
>>> drivers/infiniband/core/trace.c create mode 100644
>>> include/trace/events/rdma_core.h
>>>
>>> Changes since v2:
>>> - Removed extraneous changes to include/trace/events/rdma.h
>>>
>>> Changes since RFC:
>>> - Addressed comments from Parav Pandit <parav@mellanox.com>
>>>
>>>
>>> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
>>> index 09881bd..68d9e27 100644
>>> --- a/drivers/infiniband/core/Makefile
>>> +++ b/drivers/infiniband/core/Makefile
>>> @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o
>>> verbs.o cq.o rw.o sysfs.o \
>>> device.o fmr_pool.o cache.o netlink.o \
>>> roce_gid_mgmt.o mr_pool.o addr.o
>>> sa_query.o \
>>> multicast.o mad.o smi.o agent.o mad_rmpp.o \
>>> - nldev.o restrack.o counters.o
>>> + nldev.o restrack.o counters.o trace.o
>>>
>>> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
>>> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
>>> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
>>> bbfded6..bcde992 100644
>>> --- a/drivers/infiniband/core/cq.c
>>> +++ b/drivers/infiniband/core/cq.c
>>> @@ -7,6 +7,8 @@
>>> #include <linux/slab.h>
>>> #include <rdma/ib_verbs.h>
>>>
>>> +#include <trace/events/rdma_core.h>
>>> +
>>> /* # of WCs to poll for with a single call to ib_poll_cq */
>>> #define IB_POLL_BATCH 16
>>> #define IB_POLL_BATCH_DIRECT 8
>>> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
>>>
>>> dim->state = DIM_START_MEASURE;
>>>
>>> + trace_cq_modify(cq, comps, usec);
>>> cq->device->ops.modify_cq(cq, comps, usec); }
>>>
>>> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>>> struct ib_wc *wcs, {
>>> int i, n, completed = 0;
>>>
>>> - /*
>>> - * budget might be (-1) if the caller does not
>>> - * want to bound this call, thus we need unsigned
>>> - * minimum here.
>>> - */
>>> - while ((n = ib_poll_cq(cq, min_t(u32, batch,
>>> - budget - completed), wcs)) > 0) {
>>> + trace_cq_process(cq);
>>> + while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
>> Before this change, on first attempt to poll the cq, it will poll for min(batch, budget).
>> With this change, it will poll for batch.
>> This is a functional change rather than just the addition of trace points.
>> I am not sure if this has any effect on the overall polling.
>> But it may be worth keeping such a functional change in a pre-patch which consists of this change, moving the comment section, and the batch recalculation.
>
> Or find a way to add the trace point without the functional change.
>
>
>>> + trace_cq_poll(cq, batch, n);
>>> for (i = 0; i < n; i++) {
>>> struct ib_wc *wc = &wcs[i];
>>>
>>> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
>>> struct ib_wc *wcs,
>>> }
>>>
>>> completed += n;
>>> -
>>> if (n != batch || (budget != -1 && completed >= budget))
>>> break;
>>> +
>>> + /*
>>> + * budget might be (-1) if the caller does not
>>> + * want to bound this call, thus we need unsigned
>>> + * minimum here.
>>> + */
>>> + batch = min_t(u32, batch, budget - completed);
>>> }
>>>
>>> return completed;
>>> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
>>> budget)
>>> completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>>> if (completed < budget) {
>>> irq_poll_complete(&cq->iop);
>>> - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>>> + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
>>> + trace_cq_reschedule(cq);
>>> irq_poll_sched(&cq->iop);
>>> + }
>>> }
>>>
>>> if (dim)
>>> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
>>> budget)
>>>
>>> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) {
>>> + trace_cq_schedule(cq);
>>> irq_poll_sched(&cq->iop);
>>> }
>>>
>>> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
>>>
>>> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) {
>>> + trace_cq_schedule(cq);
>>> queue_work(cq->comp_wq, &cq->work);
>>> }
>>>
>>> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
>>> void *private,
>>> goto out_destroy_cq;
>>> }
>>>
>>> + trace_cq_alloc(cq, comp_vector, poll_ctx);
>>> return cq;
>>>
>>> out_destroy_cq:
>>> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
>>> *udata)
>>> WARN_ON_ONCE(1);
>>> }
>>>
>>> + trace_cq_free(cq);
>>> rdma_restrack_del(&cq->res);
>>> cq->device->ops.destroy_cq(cq, udata);
>>> if (cq->dim)
>>> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
>>> new file mode 100644 index 0000000..6c3514b
>>> --- /dev/null
>>> +++ b/drivers/infiniband/core/trace.c
>>> @@ -0,0 +1,14 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/*
>>> + * Trace points for core RDMA functions.
>>> + *
>>> + * Author: Chuck Lever <chuck.lever@oracle.com>
>>> + *
>>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>>> + */
>>> +
>>> +#define CREATE_TRACE_POINTS
>>> +
>>> +#include <rdma/ib_verbs.h>
>>> +
>>> +#include <trace/events/rdma_core.h>
>>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
>>> 6a47ba8..95a6bce 100644
>>> --- a/include/rdma/ib_verbs.h
>>> +++ b/include/rdma/ib_verbs.h
>>> @@ -1555,6 +1555,8 @@ struct ib_cq {
>>> };
>>> struct workqueue_struct *comp_wq;
>>> struct dim *dim;
>>> + ktime_t timestamp;
>>> + bool interrupt;
>> It's unclear when timestamp and interrupt are updated. A comment would help.
>
> These are both updated only in the new trace points. Is more than
> that needed in a comment?
>
>
>>> /*
>>> * Implementation details of the RDMA core, don't use in drivers:
>>> */
>>> diff --git a/include/trace/events/rdma_core.h
>>> b/include/trace/events/rdma_core.h
>>> new file mode 100644
>>> index 0000000..c1397a3
>>> --- /dev/null
>>> +++ b/include/trace/events/rdma_core.h
>>> @@ -0,0 +1,218 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +/*
>>> + * Trace point definitions for core RDMA functions.
>>> + *
>>> + * Author: Chuck Lever <chuck.lever@oracle.com>
>>> + *
>>> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
>>> + */
>>> +
>>> +#undef TRACE_SYSTEM
>>> +#define TRACE_SYSTEM rdma_core
>>> +
>>> +#if !defined(_TRACE_RDMA_CORE_H) ||
>>> defined(TRACE_HEADER_MULTI_READ)
>>> +#define _TRACE_RDMA_CORE_H
>>> +
>>> +#include <linux/tracepoint.h>
>>> +#include <rdma/ib_verbs.h>
>>> +#include <rdma/restrack.h>
>>> +
>>> +/*
>>> + * enum ib_poll_context, from include/rdma/ib_verbs.h */
>>> +#define IB_POLL_CTX_LIST \
>>> + ib_poll_ctx(DIRECT) \
>>> + ib_poll_ctx(SOFTIRQ) \
>>> + ib_poll_ctx(WORKQUEUE) \
>>> + ib_poll_ctx_end(UNBOUND_WORKQUEUE)
>>> +
>>> +#undef ib_poll_ctx
>>> +#undef ib_poll_ctx_end
>>> +
>>> +#define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x);
>>> +#define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x);
>>> +
>>> +IB_POLL_CTX_LIST
>>> +
>>> +#undef ib_poll_ctx
>>> +#undef ib_poll_ctx_end
>>> +
>>> +#define ib_poll_ctx(x) { IB_POLL_##x, #x },
>>> +#define ib_poll_ctx_end(x) { IB_POLL_##x, #x }
>>> +
>>> +#define rdma_show_ib_poll_ctx(x) \
>>> + __print_symbolic(x, IB_POLL_CTX_LIST)
>>> +
>>> +/**
>>> + ** Completion Queue events
>>> + **/
>>> +
>>> +TRACE_EVENT(cq_schedule,
>>> + TP_PROTO(
>>> + struct ib_cq *cq
>>> + ),
>>> +
>>> + TP_ARGS(cq),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(u32, id)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + cq->timestamp = ktime_get();
>>> + cq->interrupt = true;
>>> +
>>> + __entry->id = cq->res.id;
>>> + ),
>>> +
>>> + TP_printk("id %u", __entry->id)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_reschedule,
>>> + TP_PROTO(
>>> + struct ib_cq *cq
>>> + ),
>>> +
>>> + TP_ARGS(cq),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(u32, id)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + cq->timestamp = ktime_get();
>>> + cq->interrupt = false;
>>> +
>>> + __entry->id = cq->res.id;
>>> + ),
>>> +
>>> + TP_printk("id %u", __entry->id)
>>> +);
>>> +
>>> +TRACE_EVENT(cq_process,
>>> + TP_PROTO(
>>> + const struct ib_cq *cq
>>> + ),
>>> +
>>> + TP_ARGS(cq),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(s64, latency)
>>> + __field(u32, id)
>>> + __field(bool, interrupt)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
>>> +
>>> + __entry->id = cq->res.id;
>>> + __entry->latency = ktime_to_us(latency);
>>> + __entry->interrupt = cq->interrupt;
>>> + ),
>>> +
>>> + TP_printk("id %u wake-up took %lld [us] from %s",
>> It might be better to prefix 'id' with 'cq', so that in future RDMA-wide trace points we can have multiple resource IDs printed consistently as qpid, cqid, mrid, etc., and don't have to rely on the function where it is used to decode what that ID means.
>
> I left out the "cq" here because the trace point names are prefixed
> with "cq_". However, now that you bring it up, I can imagine cases
> where a trace point might report information about two different
> resources that both have a restrack ID.
>
> How about "cq.id=%u" ?
>
>
>> I had mixed thoughts on whether to pass ib_cq* or rdma_restrack_entry*.
>> I was thinking of rdma_restrack_entry*, as it makes future code for other resources also anchored on the resource id.
>
> On the other hand, a trace point might someday want to report the
> value of a field in struct ib_cq.
Btw, is it worth adding a helper function to return res.id?
u32 rdma_restrack_id(const struct rdma_restrack_entry *res)
{
return res->id;
}
Or is this just noise?
>>> + __entry->id, __entry->latency,
>>> + __entry->interrupt ? "interrupt" : "reschedule"
>>> + )
>>> +);
>>> +
>>> +TRACE_EVENT(cq_poll,
>>> + TP_PROTO(
>>> + const struct ib_cq *cq,
>>> + int requested,
>>> + int rc
>>> + ),
>>> +
>>> + TP_ARGS(cq, requested, rc),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(u32, id)
>>> + __field(int, requested)
>>> + __field(int, rc)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->id = cq->res.id;
>>> + __entry->requested = requested;
>>> + __entry->rc = rc;
>>> + ),
>>> +
>>> + TP_printk("id %u requested %d, returned %d",
>>> + __entry->id, __entry->requested, __entry->rc
>>> + )
>>> +);
>>> +
>>> +TRACE_EVENT(cq_modify,
>>> + TP_PROTO(
>>> + const struct ib_cq *cq,
>>> + u16 comps,
>>> + u16 usec
>>> + ),
>>> +
>>> + TP_ARGS(cq, comps, usec),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(u32, id)
>>> + __field(unsigned int, comps)
>>> + __field(unsigned int, usec)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->id = cq->res.id;
>>> + __entry->comps = comps;
>>> + __entry->usec = usec;
>>> + ),
>>> +
>>> + TP_printk("id %u comps=%u usec=%u",
>>> + __entry->id, __entry->comps, __entry->usec
>>> + )
>>> +);
>>> +
>>> +TRACE_EVENT(cq_alloc,
>>> + TP_PROTO(
>>> + const struct ib_cq *cq,
>>> + int comp_vector,
>>> + enum ib_poll_context poll_ctx
>>> + ),
>>> +
>>> + TP_ARGS(cq, comp_vector, poll_ctx),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(u32, id)
>>> + __field(int, comp_vector)
>>> + __field(unsigned long, poll_ctx)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->id = cq->res.id;
>>> + __entry->comp_vector = comp_vector;
>>> + __entry->poll_ctx = poll_ctx;
>>> + ),
>>> +
>>> + TP_printk("id %u comp_vector=%d poll_ctx=%s",
>>> + __entry->id, __entry->comp_vector,
>>> + rdma_show_ib_poll_ctx(__entry->poll_ctx)
>>> + )
>>> +);
>>> +
>>> +TRACE_EVENT(cq_free,
>>> + TP_PROTO(
>>> + const struct ib_cq *cq
>>> + ),
>>> +
>>> + TP_ARGS(cq),
>>> +
>>> + TP_STRUCT__entry(
>>> + __field(u32, id)
>>> + ),
>>> +
>>> + TP_fast_assign(
>>> + __entry->id = cq->res.id;
>>> + ),
>>> +
>>> + TP_printk("id %u", __entry->id)
>>> +);
>>> +
>>> +#endif /* _TRACE_RDMA_CORE_H */
>>> +
>>> +#include <trace/define_trace.h>
>
> --
> Chuck Lever
--
Chuck Lever
next prev parent reply other threads:[~2019-10-11 17:45 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-10-09 16:54 [PATCH v3] IB/core: Trace points for diagnosing completion queue issues Chuck Lever
2019-10-11 17:26 ` Parav Pandit
2019-10-11 17:38 ` Chuck Lever
2019-10-11 17:44 ` Chuck Lever [this message]
2019-10-11 17:46 ` Parav Pandit
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=AFA11C46-9CA3-4B34-921C-1E11EECA4743@oracle.com \
--to=chuck.lever@oracle.com \
--cc=linux-rdma@vger.kernel.org \
--cc=parav@mellanox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).