From: Haris Iqbal <haris.iqbal@ionos.com>
To: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
Cc: linux-rdma@vger.kernel.org, leonro@nvidia.com, jgg@nvidia.com,
	 zyjzyj2000@gmail.com, nvdimm@lists.linux.dev,
	linux-kernel@vger.kernel.org,  rpearsonhpe@gmail.com,
	yangx.jy@fujitsu.com, lizhijian@fujitsu.com,  y-goto@fujitsu.com,
	haris iqbal <haris.phnx@gmail.com>
Subject: Re: [RFC PATCH 5/7] RDMA/rxe: Allow registering MRs for On-Demand Paging
Date: Thu, 8 Sep 2022 18:57:54 +0200	[thread overview]
Message-ID: <CAJpMwyg7S-NErAfyJohWkGNXiWuSKnu9NrsjnbyYHMAOojjPpw@mail.gmail.com> (raw)
In-Reply-To: <b416663effb891cc63fff4ea11d0a8d24ba1706e.1662461897.git.matsuda-daisuke@fujitsu.com>

On Wed, Sep 7, 2022 at 4:45 AM Daisuke Matsuda
<matsuda-daisuke@fujitsu.com> wrote:
>
> Allow applications to register an ODP-enabled MR, in which case the flag
> IB_ACCESS_ON_DEMAND is passed to rxe_reg_user_mr(). However, no RDMA
> operations are supported yet; they will be enabled in the two
> subsequent patches.
>
> rxe_odp_do_pagefault() is called to initialize an ODP-enabled MR here.
> It syncs the process address space from the CPU page table to the
> driver page table (dma_list/pfn_list in umem_odp) when called with the
> RXE_PAGEFAULT_SNAPSHOT flag. Additionally, it can be used to trigger a
> page fault when pages being accessed are not present or do not have
> the proper read/write permissions, and possibly to prefetch pages in
> the future.
>
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> ---
>  drivers/infiniband/sw/rxe/rxe.c       |  7 +++
>  drivers/infiniband/sw/rxe/rxe_loc.h   |  5 ++
>  drivers/infiniband/sw/rxe/rxe_mr.c    |  7 ++-
>  drivers/infiniband/sw/rxe/rxe_odp.c   | 80 +++++++++++++++++++++++++++
>  drivers/infiniband/sw/rxe/rxe_resp.c  | 21 +++++--
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  8 ++-
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  2 +
>  7 files changed, 121 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 51daac5c4feb..0719f451253c 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -73,6 +73,13 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>                         rxe->ndev->dev_addr);
>
>         rxe->max_ucontext                       = RXE_MAX_UCONTEXT;
> +
> +       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
> +               rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
> +
> +               /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
> +               rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
> +       }
>  }
>
>  /* initialize port attributes */
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index 0f8cb9e38cc9..03b4078b90a3 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -64,6 +64,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
>
>  /* rxe_mr.c */
>  u8 rxe_get_next_key(u32 last_key);
> +void rxe_mr_init(int access, struct rxe_mr *mr);
>  void rxe_mr_init_dma(struct rxe_pd *pd, int access, struct rxe_mr *mr);
>  int rxe_mr_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova,
>                      int access, struct rxe_mr *mr);
> @@ -188,4 +189,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
>         return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
>  }
>
> +/* rxe_odp.c */
> +int rxe_create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova,
> +                          int access_flags, struct rxe_mr *mr);
> +
>  #endif /* RXE_LOC_H */
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index 814116ec4778..0ae72a4516be 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -48,7 +48,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
>                                 | IB_ACCESS_REMOTE_WRITE        \
>                                 | IB_ACCESS_REMOTE_ATOMIC)
>
> -static void rxe_mr_init(int access, struct rxe_mr *mr)
> +void rxe_mr_init(int access, struct rxe_mr *mr)
>  {
>         u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1);
>         u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;
> @@ -438,7 +438,10 @@ int copy_data(
>                 if (bytes > 0) {
>                         iova = sge->addr + offset;
>
> -                       err = rxe_mr_copy(mr, iova, addr, bytes, dir);
> +                       if (mr->odp_enabled)
> +                               err = -EOPNOTSUPP;
> +                       else
> +                               err = rxe_mr_copy(mr, iova, addr, bytes, dir);
>                         if (err)
>                                 goto err2;
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index 0f702787a66e..1f6930ba714c 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -5,6 +5,8 @@
>
>  #include <rdma/ib_umem_odp.h>
>
> +#include "rxe.h"
> +
>  bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
>                              const struct mmu_notifier_range *range,
>                              unsigned long cur_seq)
> @@ -32,3 +34,81 @@ bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
>  const struct mmu_interval_notifier_ops rxe_mn_ops = {
>         .invalidate = rxe_ib_invalidate_range,
>  };
> +
> +#define RXE_PAGEFAULT_RDONLY BIT(1)
> +#define RXE_PAGEFAULT_SNAPSHOT BIT(2)
> +static int rxe_odp_do_pagefault(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags)
> +{
> +       int np;
> +       u64 access_mask;
> +       bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
> +       struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +
> +       access_mask = ODP_READ_ALLOWED_BIT;
> +       if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
> +               access_mask |= ODP_WRITE_ALLOWED_BIT;
> +
> +       /*
> +        * umem mutex is held after return from ib_umem_odp_map_dma_and_lock().
> +        * Release it when access to user MR is done or not required.
> +        */
> +       np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
> +                                         access_mask, fault);
> +
> +       return np;
> +}
> +
> +static int rxe_init_odp_mr(struct rxe_mr *mr)
> +{
> +       int ret;
> +       struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +
> +       ret = rxe_odp_do_pagefault(mr, mr->umem->address, mr->umem->length,
> +                                  RXE_PAGEFAULT_SNAPSHOT);
> +       mutex_unlock(&umem_odp->umem_mutex);
> +
> +       return ret >= 0 ? 0 : ret;
> +}
> +
> +int rxe_create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova,
> +                          int access_flags, struct rxe_mr *mr)
> +{
> +       int err;
> +       struct ib_umem_odp *umem_odp;
> +       struct rxe_dev *dev = container_of(pd->device, struct rxe_dev, ib_dev);
> +
> +       if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
> +               return -EOPNOTSUPP;
> +
> +       rxe_mr_init(access_flags, mr);
> +
> +       if (!start && length == U64_MAX) {
> +               if (iova != 0)
> +                       return -EINVAL;
> +               if (!(dev->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
> +                       return -EINVAL;
> +
> +               /* Never reach here, for implicit ODP is not implemented. */
> +       }
> +
> +       umem_odp = ib_umem_odp_get(pd->device, start, length, access_flags,
> +                                  &rxe_mn_ops);
> +       if (IS_ERR(umem_odp))
> +               return PTR_ERR(umem_odp);
> +
> +       umem_odp->private = mr;
> +
> +       mr->odp_enabled = true;
> +       mr->ibmr.pd = pd;
> +       mr->umem = &umem_odp->umem;
> +       mr->access = access_flags;
> +       mr->length = length;
> +       mr->iova = iova;
> +       mr->offset = ib_umem_offset(&umem_odp->umem);
> +       mr->state = RXE_MR_STATE_VALID;
> +       mr->type = IB_MR_TYPE_USER;
> +
> +       err = rxe_init_odp_mr(mr);
> +
> +       return err;
> +}
> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> index cadc8fa64dd0..dd8632e783f6 100644
> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> @@ -535,8 +535,12 @@ static enum resp_states write_data_in(struct rxe_qp *qp,
>         int     err;
>         int data_len = payload_size(pkt);
>
> -       err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset,
> -                         payload_addr(pkt), data_len, RXE_TO_MR_OBJ);
> +       if (qp->resp.mr->odp_enabled)

You cannot use qp->resp.mr here, because for zero-byte operations
resp.mr is not set in check_rkey(), so it is still NULL when this code
dereferences it.

The code fails for RTRS with the following stack trace:

[Thu Sep  8 20:12:22 2022] BUG: kernel NULL pointer dereference,
address: 0000000000000158
[Thu Sep  8 20:12:22 2022] #PF: supervisor read access in kernel mode
[Thu Sep  8 20:12:22 2022] #PF: error_code(0x0000) - not-present page
[Thu Sep  8 20:12:22 2022] PGD 0 P4D 0
[Thu Sep  8 20:12:22 2022] Oops: 0000 [#1] PREEMPT SMP
[Thu Sep  8 20:12:22 2022] CPU: 3 PID: 38 Comm: kworker/u8:1 Not
tainted 6.0.0-rc2-pserver+ #17
[Thu Sep  8 20:12:22 2022] Hardware name: QEMU Standard PC (i440FX +
PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014
[Thu Sep  8 20:12:22 2022] Workqueue: rxe_resp rxe_do_work [rdma_rxe]
[Thu Sep  8 20:12:22 2022] RIP: 0010:rxe_responder+0x1910/0x1d90 [rdma_rxe]
[Thu Sep  8 20:12:22 2022] Code: 06 48 63 88 fc 15 63 c0 0f b6 46 01
83 ea 04 c0 e8 04 29 ca 83 e0 03 29 c2 49 8b 87 08 05 00 00 49 03 87
00 05 00 00 4c 63 ea <80> bf 58 01 00 00 00 48 8d 14 0e 48 89 c6 4d 89
ee 44 89 e9 0f 84
[Thu Sep  8 20:12:22 2022] RSP: 0018:ffffb0358015fd80 EFLAGS: 00010246
[Thu Sep  8 20:12:22 2022] RAX: 0000000000000000 RBX: ffff9af4839b5e28
RCX: 0000000000000020
[Thu Sep  8 20:12:22 2022] RDX: 0000000000000000 RSI: ffff9af485094a6a
RDI: 0000000000000000
[Thu Sep  8 20:12:22 2022] RBP: ffff9af488bd7128 R08: 0000000000000000
R09: 0000000000000000
[Thu Sep  8 20:12:22 2022] R10: ffff9af4808eaf7c R11: 0000000000000001
R12: 0000000000000008
[Thu Sep  8 20:12:22 2022] R13: 0000000000000000 R14: ffff9af488bd7380
R15: ffff9af488bd7000
[Thu Sep  8 20:12:22 2022] FS:  0000000000000000(0000)
GS:ffff9af5b7d80000(0000) knlGS:0000000000000000
[Thu Sep  8 20:12:22 2022] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[Thu Sep  8 20:12:22 2022] CR2: 0000000000000158 CR3: 000000004a60a000
CR4: 00000000000006e0
[Thu Sep  8 20:12:22 2022] DR0: 0000000000000000 DR1: 0000000000000000
DR2: 0000000000000000
[Thu Sep  8 20:12:22 2022] DR3: 0000000000000000 DR6: 00000000fffe0ff0
DR7: 0000000000000400
[Thu Sep  8 20:12:22 2022] Call Trace:
[Thu Sep  8 20:12:22 2022]  <TASK>
[Thu Sep  8 20:12:22 2022]  ? newidle_balance+0x2e5/0x400
[Thu Sep  8 20:12:22 2022]  ? _raw_spin_unlock+0x12/0x30
[Thu Sep  8 20:12:22 2022]  ? finish_task_switch+0x91/0x2a0
[Thu Sep  8 20:12:22 2022]  rxe_do_work+0x86/0x110 [rdma_rxe]
[Thu Sep  8 20:12:22 2022]  process_one_work+0x1dc/0x3a0
[Thu Sep  8 20:12:22 2022]  worker_thread+0x4a/0x3b0
[Thu Sep  8 20:12:22 2022]  ? process_one_work+0x3a0/0x3a0
[Thu Sep  8 20:12:22 2022]  kthread+0xe7/0x110
[Thu Sep  8 20:12:22 2022]  ? kthread_complete_and_exit+0x20/0x20
[Thu Sep  8 20:12:22 2022]  ret_from_fork+0x22/0x30
[Thu Sep  8 20:12:22 2022]  </TASK>
[Thu Sep  8 20:12:22 2022] Modules linked in: rnbd_server rtrs_server
rtrs_core rdma_ucm rdma_cm iw_cm ib_cm crc32_generic rdma_rxe
ip6_udp_tunnel udp_tunnel ib_uverbs ib_core loop null_blk
[Thu Sep  8 20:12:22 2022] CR2: 0000000000000158
[Thu Sep  8 20:12:22 2022] ---[ end trace 0000000000000000 ]---
[Thu Sep  8 20:12:22 2022] BUG: kernel NULL pointer dereference,
address: 0000000000000158
[Thu Sep  8 20:12:22 2022] RIP: 0010:rxe_responder+0x1910/0x1d90 [rdma_rxe]
[Thu Sep  8 20:12:22 2022] #PF: supervisor read access in kernel mode
[Thu Sep  8 20:12:22 2022] Code: 06 48 63 88 fc 15 63 c0 0f b6 46 01
83 ea 04 c0 e8 04 29 ca 83 e0 03 29 c2 49 8b 87 08 05 00 00 49 03 87
00 05 00 00 4c 63 ea <80> bf 58 01 00 00 00 48 8d 14 0e 48 89 c6 4d 89
ee 44 89 e9 0f 84
[Thu Sep  8 20:12:22 2022] #PF: error_code(0x0000) - not-present page
[Thu Sep  8 20:12:22 2022] RSP: 0018:ffffb0358015fd80 EFLAGS: 00010246
[Thu Sep  8 20:12:22 2022] PGD 0 P4D 0

Technically, for zero-length operations the code does not need to do
any of the *_mr_copy() calls at all and can simply carry on with
success. So maybe you can check data_len first and only do the copy
(and the mr dereference) when there is actually data; a rough sketch
follows.
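Something along these lines, as an untested sketch against the hunk
quoted above (note that err has to be initialized to 0 so the existing
"if (err)" check still works, and the exact context in write_data_in()
may differ from what I show here):

	int err = 0;
	int data_len = payload_size(pkt);

	/*
	 * Zero-length writes carry no payload; skip the copy so we
	 * never dereference qp->resp.mr, which check_rkey() leaves
	 * NULL in that case.
	 */
	if (data_len) {
		if (qp->resp.mr->odp_enabled)
			err = -EOPNOTSUPP;
		else
			err = rxe_mr_copy(qp->resp.mr,
					  qp->resp.va + qp->resp.offset,
					  payload_addr(pkt), data_len,
					  RXE_TO_MR_OBJ);
	}

	if (err) {
		rc = RESPST_ERR_RKEY_VIOLATION;
		goto out;
	}

The same pattern would presumably be needed at the other rxe_mr_copy()
call sites this patch touches, wherever a zero-length payload can reach
them.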


> +               err = -EOPNOTSUPP;
> +       else
> +               err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset,
> +                                 payload_addr(pkt), data_len, RXE_TO_MR_OBJ);
> +
>         if (err) {
>                 rc = RESPST_ERR_RKEY_VIOLATION;
>                 goto out;
> @@ -667,7 +671,10 @@ static enum resp_states rxe_atomic_reply(struct rxe_qp *qp,
>                 if (mr->state != RXE_MR_STATE_VALID)
>                         return RESPST_ERR_RKEY_VIOLATION;
>
> -               ret = rxe_atomic_ops(qp, pkt, mr);
> +               if (mr->odp_enabled)
> +                       ret = RESPST_ERR_UNSUPPORTED_OPCODE;
> +               else
> +                       ret = rxe_atomic_ops(qp, pkt, mr);
>         } else
>                 ret = RESPST_ACKNOWLEDGE;
>
> @@ -831,8 +838,12 @@ static enum resp_states read_reply(struct rxe_qp *qp,
>         if (!skb)
>                 return RESPST_ERR_RNR;
>
> -       err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt),
> -                         payload, RXE_FROM_MR_OBJ);
> +       if (mr->odp_enabled)
> +               err = -EOPNOTSUPP;
> +       else
> +               err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt),
> +                                 payload, RXE_FROM_MR_OBJ);
> +
>         if (err)
>                 pr_err("Failed copying memory\n");
>         if (mr)
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 7510f25c5ea3..b00e9b847382 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -926,10 +926,14 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
>                 goto err2;
>         }
>
> -
>         rxe_get(pd);
>
> -       err = rxe_mr_init_user(pd, start, length, iova, access, mr);
> +       if (access & IB_ACCESS_ON_DEMAND)
> +               err = rxe_create_user_odp_mr(&pd->ibpd, start, length, iova,
> +                                            access, mr);
> +       else
> +               err = rxe_mr_init_user(pd, start, length, iova, access, mr);
> +
>         if (err)
>                 goto err3;
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index b09b4cb9897a..98d2bb737ebc 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -324,6 +324,8 @@ struct rxe_mr {
>         atomic_t                num_mw;
>
>         struct rxe_map          **map;
> +
> +       bool                    odp_enabled;
>  };
>
>  enum rxe_mw_state {
> --
> 2.31.1
>

Thread overview: 25+ messages
2022-09-07  2:42 [RFC PATCH 0/7] RDMA/rxe: On-Demand Paging on SoftRoCE Daisuke Matsuda
2022-09-07  2:42 ` [RFC PATCH 1/7] IB/mlx5: Change ib_umem_odp_map_dma_single_page() to retain umem_mutex Daisuke Matsuda
2022-09-07  2:43 ` [RFC PATCH 2/7] RDMA/rxe: Convert the triple tasklets to workqueues Daisuke Matsuda
2022-09-09 19:39   ` Bob Pearson
2022-09-12  8:27     ` matsuda-daisuke
2022-09-11  7:10   ` Yanjun Zhu
2022-09-11 15:08     ` Bart Van Assche
2022-09-12  7:58       ` matsuda-daisuke
2022-09-12  8:29         ` Yanjun Zhu
2022-09-12 19:52         ` Bob Pearson
2022-09-28  6:40           ` matsuda-daisuke
2022-09-12  8:25       ` Yanjun Zhu
2022-09-07  2:43 ` [RFC PATCH 3/7] RDMA/rxe: Cleanup code for responder Atomic operations Daisuke Matsuda
2022-09-07  2:43 ` [RFC PATCH 4/7] RDMA/rxe: Add page invalidation support Daisuke Matsuda
2022-09-07  2:43 ` [RFC PATCH 5/7] RDMA/rxe: Allow registering MRs for On-Demand Paging Daisuke Matsuda
2022-09-08 16:57   ` Haris Iqbal [this message]
2022-09-09  0:55     ` matsuda-daisuke
2022-09-07  2:43 ` [RFC PATCH 6/7] RDMA/rxe: Add support for Send/Recv/Write/Read operations with ODP Daisuke Matsuda
2022-09-08  8:29   ` Leon Romanovsky
2022-09-09  2:45     ` matsuda-daisuke
2022-09-07  2:43 ` [RFC PATCH 7/7] RDMA/rxe: Add support for the traditional Atomic " Daisuke Matsuda
2022-09-08  8:40 ` [RFC PATCH 0/7] RDMA/rxe: On-Demand Paging on SoftRoCE Zhu Yanjun
2022-09-08 10:25   ` matsuda-daisuke
2022-09-09  3:07 ` Li Zhijian
2022-09-12  9:21   ` matsuda-daisuke
