From: Parav Pandit <parav@mellanox.com>
To: Chuck Lever <chuck.lever@oracle.com>,
"linux-rdma@vger.kernel.org" <linux-rdma@vger.kernel.org>
Subject: RE: [PATCH RFC] IB/core: Trace points for diagnosing completion queue issues
Date: Mon, 7 Oct 2019 18:41:46 +0000 [thread overview]
Message-ID: <AM0PR05MB486682846486A8F22B1B1FDDD19B0@AM0PR05MB4866.eurprd05.prod.outlook.com> (raw)
In-Reply-To: <20191004135721.2488.63359.stgit@manet.1015granger.net>
Hi Chuck,
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Chuck Lever
> Sent: Friday, October 4, 2019 8:57 AM
> To: linux-rdma@vger.kernel.org
> Subject: [PATCH RFC] IB/core: Trace points for diagnosing completion queue
> issues
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> drivers/infiniband/core/Makefile | 2
> drivers/infiniband/core/cq.c | 29 ++++--
> drivers/infiniband/core/trace.c | 15 +++
> include/rdma/ib_verbs.h | 2
> include/trace/events/rdma.h | 89 ++++++++++++++++++
> include/trace/events/rdma_core.h | 192
> ++++++++++++++++++++++++++++++++++++++
> 6 files changed, 319 insertions(+), 10 deletions(-) create mode 100644
> drivers/infiniband/core/trace.c create mode 100644
> include/trace/events/rdma_core.h
>
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index 09881bd..68d9e27 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o
> verbs.o cq.o rw.o sysfs.o \
> device.o fmr_pool.o cache.o netlink.o \
> roce_gid_mgmt.o mr_pool.o addr.o
> sa_query.o \
> multicast.o mad.o smi.o agent.o mad_rmpp.o \
> - nldev.o restrack.o counters.o
> + nldev.o restrack.o counters.o trace.o
>
> ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
> ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git
> a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index
> bbfded6..bcde992 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -7,6 +7,8 @@
> #include <linux/slab.h>
> #include <rdma/ib_verbs.h>
>
> +#include <trace/events/rdma_core.h>
> +
> /* # of WCs to poll for with a single call to ib_poll_cq */
> #define IB_POLL_BATCH 16
> #define IB_POLL_BATCH_DIRECT 8
> @@ -41,6 +43,7 @@ static void ib_cq_rdma_dim_work(struct work_struct *w)
>
> dim->state = DIM_START_MEASURE;
>
> + trace_cq_modify(cq, comps, usec);
> cq->device->ops.modify_cq(cq, comps, usec); }
>
> @@ -70,13 +73,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
> struct ib_wc *wcs, {
> int i, n, completed = 0;
>
> - /*
> - * budget might be (-1) if the caller does not
> - * want to bound this call, thus we need unsigned
> - * minimum here.
> - */
> - while ((n = ib_poll_cq(cq, min_t(u32, batch,
> - budget - completed), wcs)) > 0) {
> + trace_cq_process(cq);
> + while ((n = ib_poll_cq(cq, batch, wcs)) > 0) {
> + trace_cq_poll(cq, batch, n);
> for (i = 0; i < n; i++) {
> struct ib_wc *wc = &wcs[i];
>
> @@ -87,9 +86,15 @@ static int __ib_process_cq(struct ib_cq *cq, int budget,
> struct ib_wc *wcs,
> }
>
> completed += n;
> -
> if (n != batch || (budget != -1 && completed >= budget))
> break;
> +
> + /*
> + * budget might be (-1) if the caller does not
> + * want to bound this call, thus we need unsigned
> + * minimum here.
> + */
> + batch = min_t(u32, batch, budget - completed);
> }
>
> return completed;
> @@ -131,8 +136,10 @@ static int ib_poll_handler(struct irq_poll *iop, int
> budget)
> completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
> if (completed < budget) {
> irq_poll_complete(&cq->iop);
> - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
> + trace_cq_reschedule(cq);
> irq_poll_sched(&cq->iop);
> + }
> }
>
> if (dim)
> @@ -143,6 +150,7 @@ static int ib_poll_handler(struct irq_poll *iop, int
> budget)
>
> static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) {
> + trace_cq_schedule(cq);
> irq_poll_sched(&cq->iop);
> }
>
> @@ -162,6 +170,7 @@ static void ib_cq_poll_work(struct work_struct *work)
>
> static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) {
> + trace_cq_schedule(cq);
> queue_work(cq->comp_wq, &cq->work);
> }
>
> @@ -239,6 +248,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev,
> void *private,
> goto out_destroy_cq;
> }
>
> + trace_cq_alloc(cq, comp_vector, poll_ctx);
> return cq;
>
> out_destroy_cq:
> @@ -304,6 +314,7 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata
> *udata)
> WARN_ON_ONCE(1);
> }
>
> + trace_cq_free(cq);
> rdma_restrack_del(&cq->res);
> cq->device->ops.destroy_cq(cq, udata);
> if (cq->dim)
> diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
> new file mode 100644 index 0000000..568f57d
> --- /dev/null
> +++ b/drivers/infiniband/core/trace.c
> @@ -0,0 +1,15 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Trace points for core RDMA functions.
> + *
> + * Author: Chuck Lever <chuck.lever@oracle.com>
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +
> +#define CREATE_TRACE_POINTS
> +
> +#include <rdma/ib_verbs.h>
> +#include <rdma/rdma_cm.h>
> +
I don't think the rdma_cm.h include is needed here.
> +#include <trace/events/rdma_core.h>
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> 6a47ba8..95a6bce 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1555,6 +1555,8 @@ struct ib_cq {
> };
> struct workqueue_struct *comp_wq;
> struct dim *dim;
> + ktime_t timestamp;
> + bool interrupt;
> /*
> * Implementation details of the RDMA core, don't use in drivers:
> */
> diff --git a/include/trace/events/rdma.h b/include/trace/events/rdma.h index
> aa19afc..d5e5fa7a 100644
> --- a/include/trace/events/rdma.h
> +++ b/include/trace/events/rdma.h
> @@ -127,3 +127,92 @@
>
> #define rdma_show_cm_event(x) \
> __print_symbolic(x, RDMA_CM_EVENT_LIST)
> +
> +/*
> + * enum ib_poll_context, from include/rdma/ib_verbs.h */
> +#define IB_POLL_CTX_LIST \
> + ib_poll_ctx(DIRECT) \
> + ib_poll_ctx(SOFTIRQ) \
> + ib_poll_ctx(WORKQUEUE) \
> + ib_poll_ctx_end(UNBOUND_WORKQUEUE)
> +
> +#undef ib_poll_ctx
> +#undef ib_poll_ctx_end
> +
> +#define ib_poll_ctx(x) TRACE_DEFINE_ENUM(IB_POLL_##x);
> +#define ib_poll_ctx_end(x) TRACE_DEFINE_ENUM(IB_POLL_##x);
> +
> +IB_POLL_CTX_LIST
> +
> +#undef ib_poll_ctx
> +#undef ib_poll_ctx_end
> +
> +#define ib_poll_ctx(x) { IB_POLL_##x, #x },
> +#define ib_poll_ctx_end(x) { IB_POLL_##x, #x }
> +
> +#define rdma_show_ib_poll_ctx(x) \
> + __print_symbolic(x, IB_POLL_CTX_LIST)
> +
> +/*
> + * enum ib_wc_opcode, from include/rdma/ib_verbs.h */
> +#define IB_WC_OPCODE_LIST \
> + ib_wc_opcode(SEND) \
> + ib_wc_opcode(RDMA_WRITE) \
> + ib_wc_opcode(RDMA_READ) \
> + ib_wc_opcode(COMP_SWAP) \
> + ib_wc_opcode(FETCH_ADD) \
> + ib_wc_opcode(LSO) \
> + ib_wc_opcode(LOCAL_INV) \
> + ib_wc_opcode(REG_MR) \
> + ib_wc_opcode(MASKED_COMP_SWAP) \
> + ib_wc_opcode(MASKED_FETCH_ADD) \
> + ib_wc_opcode(RECV) \
> + ib_wc_opcode_end(RECV_RDMA_WITH_IMM)
> +
> +#undef ib_wc_opcode
> +#undef ib_wc_opcode_end
> +
> +#define ib_wc_opcode(x) TRACE_DEFINE_ENUM(IB_WC_##x);
> +#define ib_wc_opcode_end(x) TRACE_DEFINE_ENUM(IB_WC_##x);
> +
> +IB_WC_OPCODE_LIST
> +
> +#undef ib_wc_opcode
> +#undef ib_wc_opcode_end
> +
> +#define ib_wc_opcode(x) { IB_WC_##x, #x },
> +#define ib_wc_opcode_end(x) { IB_WC_##x, #x }
> +
> +#define rdma_show_wc_opcode(x) \
> + __print_symbolic(x, IB_WC_OPCODE_LIST)
> +
> +/*
> + * enum ib_wc_flags, from include/rdma/ib_verbs.h */
> +#define IB_WC_FLAGS_LIST \
> + ib_wc_flags(GRH) \
> + ib_wc_flags(WITH_IMM) \
> + ib_wc_flags(WITH_INVALIDATE) \
> + ib_wc_flags(IP_CSUM_OK) \
> + ib_wc_flags(WITH_SMAC) \
> + ib_wc_flags(WITH_VLAN) \
> + ib_wc_flags_end(WITH_NETWORK_HDR_TYPE)
> +
> +#undef ib_wc_flags
> +#undef ib_wc_flags_end
> +
> +#define ib_wc_flags(x) TRACE_DEFINE_ENUM(IB_WC_##x);
> +#define ib_wc_flags_end(x) TRACE_DEFINE_ENUM(IB_WC_##x);
> +
> +IB_WC_FLAGS_LIST
> +
> +#undef ib_wc_flags
> +#undef ib_wc_flags_end
> +
> +#define ib_wc_flags(x) { IB_WC_##x, #x },
> +#define ib_wc_flags_end(x) { IB_WC_##x, #x }
> +
> +#define rdma_show_wc_flags(x) \
> + __print_symbolic(x, IB_WC_FLAGS_LIST)
> diff --git a/include/trace/events/rdma_core.h
> b/include/trace/events/rdma_core.h
> new file mode 100644
> index 0000000..d5cafe8
> --- /dev/null
> +++ b/include/trace/events/rdma_core.h
> @@ -0,0 +1,192 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Trace point definitions for core RDMA functions.
> + *
> + * Author: Chuck Lever <chuck.lever@oracle.com>
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM rdma_core
> +
> +#if !defined(_TRACE_RDMA_CORE_H) ||
> defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_RDMA_CORE_H
> +
> +#include <linux/tracepoint.h>
> +#include <trace/events/rdma.h>
> +#include <rdma/ib_verbs.h>
> +
> +/**
> + ** Completion Queue events
> + **/
> +
> +TRACE_EVENT(cq_schedule,
> + TP_PROTO(
> + struct ib_cq *cq
> + ),
> +
> + TP_ARGS(cq),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + ),
> +
> + TP_fast_assign(
> + cq->timestamp = ktime_get();
> + cq->interrupt = true;
> +
> + __entry->cq = cq;
> + ),
> +
> + TP_printk("cq=%p", __entry->cq)
> +);
Instead of printing the kernel address, it would be better to print cq->res.id.
> +
> +TRACE_EVENT(cq_reschedule,
> + TP_PROTO(
> + struct ib_cq *cq
> + ),
> +
> + TP_ARGS(cq),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + ),
> +
> + TP_fast_assign(
> + cq->timestamp = ktime_get();
> + cq->interrupt = false;
> +
> + __entry->cq = cq;
> + ),
> +
> + TP_printk("cq=%p", __entry->cq)
Use cq->res.id here as well, and in a few more places below.
> +);
> +
> +TRACE_EVENT(cq_process,
> + TP_PROTO(
> + const struct ib_cq *cq
> + ),
> +
> + TP_ARGS(cq),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + __field(s64, latency)
> + __field(bool, interrupt)
> + ),
> +
> + TP_fast_assign(
> + ktime_t latency = ktime_sub(ktime_get(), cq->timestamp);
> +
> + __entry->cq = cq;
> + __entry->latency = ktime_to_us(latency);
> + __entry->interrupt = cq->interrupt;
> + ),
> +
> + TP_printk("cq=%p: wake-up took %lld [us] from %s",
> + __entry->cq, __entry->latency,
> + __entry->interrupt ? "interrupt" : "reschedule"
> + )
> +);
> +
> +TRACE_EVENT(cq_poll,
> + TP_PROTO(
> + const struct ib_cq *cq,
> + int requested,
> + int rc
> + ),
> +
> + TP_ARGS(cq, requested, rc),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + __field(int, requested)
> + __field(int, rc)
> + ),
> +
> + TP_fast_assign(
> + __entry->cq = cq;
> + __entry->requested = requested;
> + __entry->rc = rc;
> + ),
> +
> + TP_printk("cq=%p: requested %d, returned %d",
> + __entry->cq, __entry->requested, __entry->rc
> + )
> +);
> +
> +TRACE_EVENT(cq_modify,
> + TP_PROTO(
> + const struct ib_cq *cq,
> + u16 comps,
> + u16 usec
> + ),
> +
> + TP_ARGS(cq, comps, usec),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + __field(unsigned int, comps)
> + __field(unsigned int, usec)
> + ),
> +
> + TP_fast_assign(
> + __entry->cq = cq;
> + __entry->comps = comps;
> + __entry->usec = usec;
> + ),
> +
> + TP_printk("cq=%p: comps=%u usec=%u",
> + __entry->cq, __entry->comps, __entry->usec
> + )
> +);
> +
> +TRACE_EVENT(cq_alloc,
> + TP_PROTO(
> + const struct ib_cq *cq,
> + int comp_vector,
> + enum ib_poll_context poll_ctx
> + ),
> +
> + TP_ARGS(cq, comp_vector, poll_ctx),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + __field(int, comp_vector)
> + __field(unsigned long, poll_ctx)
> + ),
> +
> + TP_fast_assign(
> + __entry->cq = cq;
> + __entry->comp_vector = comp_vector;
> + __entry->poll_ctx = poll_ctx;
> + ),
> +
> + TP_printk("cq=%p: comp_vector=%d poll_ctx=%s",
> + __entry->cq, __entry->comp_vector,
> + rdma_show_ib_poll_ctx(__entry->poll_ctx)
> + )
> +);
> +
> +TRACE_EVENT(cq_free,
> + TP_PROTO(
> + const struct ib_cq *cq
> + ),
> +
> + TP_ARGS(cq),
> +
> + TP_STRUCT__entry(
> + __field(const void *, cq)
> + ),
> +
> + TP_fast_assign(
> + __entry->cq = cq;
> + ),
> +
> + TP_printk("cq=%p", __entry->cq)
> +);
> +
> +#endif /* _TRACE_RDMA_CORE_H */
> +
> +#include <trace/define_trace.h>
prev parent reply other threads:[~2019-10-07 18:41 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-10-04 13:57 [PATCH RFC] IB/core: Trace points for diagnosing completion queue issues Chuck Lever
2019-10-07 18:41 ` Parav Pandit [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=AM0PR05MB486682846486A8F22B1B1FDDD19B0@AM0PR05MB4866.eurprd05.prod.outlook.com \
--to=parav@mellanox.com \
--cc=chuck.lever@oracle.com \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).