* [RFC] libibverbs IB Device Memory support
@ 2017-05-10 11:25 ahmad omary
       [not found] ` <CADWppnpd9Up7wUTxRgSinhgx3kt3+0bKwgt6P_d3CV1oA90isQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: ahmad omary @ 2017-05-10 11:25 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Ahmad Omary, Leon Romanovsky, Yishai Hadas, Tzahi Oved,
	Alex Rosenbaum, Ariel Levkovich, Liran Liss

Introduction
-------------------------------------------------------------------------------

Many types of user space applications can gain real performance by using the
internal memory of an IB device. Examples include decreasing the latency of a
trading operation when the data is already placed in device memory, saving a
PCI round trip when performing remote atomic operations on semaphores, and
saving a PCI round trip when the device modifies received traffic that should
be transmitted immediately after the modification.

The problem
-------------------------------------------------------------------------------

Today there is no API in libibverbs that allows a user space application to
manage the internal memory of IB devices.

We considered using mmap(), but the size of device memory may be limited, the
way the host CPU accesses it may differ from vendor to vendor, mmap() imposes
a 4KB (page) alignment restriction, and in some cases the user must not access
the device memory directly. Therefore a wrapper access API is needed that
allows allocating and managing chunks that are smaller than 4KB and not
necessarily aligned to 4KB (page size).

Suggested Solution
-------------------------------------------------------------------------------

In order for user space applications to use internal device memory, we suggest
updating libibverbs so that applications can allocate, free, and register
device memory, and copy data between it and host memory.
After device memory is allocated for a process, it can be registered using
ibv_reg_mr_ex. The registered memory can then be used for any operation, just
as if the mkey had been registered on host memory: post_send, post_receive,
RDMA WRITE/READ, and atomic operations.

New suggested verbs:

ibv_alloc_dm: allocates device memory and returns an identifier structure that
describes the allocated device memory.

ibv_free_dm: frees device memory.

ibv_memcpy_dm: copies from device memory to host memory or from host memory
to device memory.

ibv_reg_mr_ex: extended version of ibv_reg_mr that also allows registering
device memory.



API changes
-------------------------------------------------------------------------------

/* New verb for allocation */

struct ibv_dm *ibv_alloc_dm(struct ibv_context *context,
                            struct ibv_alloc_dm_attr *dm_attr);

struct ibv_alloc_dm_attr {
	size_t   length;
	uint32_t comp_mask; /* enable future extensions */
};


struct ibv_dm {
	struct ibv_context *context;
	uint32_t handle;
	uint32_t comp_mask; /* enable future extensions */
};



/* New verb for free */

int ibv_free_dm(struct ibv_dm *dm);

/* New verb for mem-copy */

int ibv_memcpy_dm(struct ibv_dm *dm,
                  struct ibv_memcpy_dm_attr *memcpy_dm_attr);

struct ibv_memcpy_dm_attr {
	enum ibv_dev_memcpy_type memcpy_type;
	void    *host_addr;  /* VA of the host memory to copy from/to */
	uint64_t dm_offset;  /* offset into the ibv_dm to copy from/to */
	size_t   length;
	uint32_t comp_mask;  /* enable future extensions */
};

enum ibv_dev_memcpy_type { /* for memcpy_type */
	IBV_DM_CPY_HOST_TO_DEVICE,
	IBV_DM_CPY_DEVICE_TO_HOST,
};
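
For example, under this proposal, reading the contents of a device memory
region back into a host buffer could look roughly as follows (a minimal
sketch only; dm is assumed to have been returned by ibv_alloc_dm):

/* Sketch: copy 64 bytes from device memory into a host buffer */
char host_buf[64];
struct ibv_memcpy_dm_attr attr = {
	.memcpy_type = IBV_DM_CPY_DEVICE_TO_HOST,
	.host_addr   = host_buf,
	.dm_offset   = 0,
	.length      = sizeof(host_buf),
	.comp_mask   = 0,
};

if (ibv_memcpy_dm(dm, &attr))
	return -1; /* copy from device memory failed */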

/* New verb for memory registration: ibv_reg_mr_ex */

struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_attr *mr_attr);

struct ibv_mr_attr {
	enum ibv_mem_type type; /* memory type */
	size_t length;
	int access;
	union {
		struct {
			void *addr;
		} host_mem;
		struct {
			struct ibv_dm *dm;
			uint64_t offset; /* start offset in the ibv_dm */
		} dev_mem;
	} mem_type;
	uint32_t comp_mask; /* enable future extensions */
};

enum ibv_mem_type {
	IBV_HOST_MEM,
	IBV_DEV_MEM,
};




/* Update ibv_query_device_ex */

struct ibv_device_attr_ex {
	...
	/* Maximum size of device memory supported by the device */
	uint64_t max_dm_size;
};
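
Under this proposal an application would first query how much device memory
the device exposes. A minimal sketch (ibv_query_device_ex is the existing
verb; max_dm_size is the new field proposed above; ctx is the device's
ibv_context):

/* Sketch: check the proposed max_dm_size before allocating */
struct ibv_device_attr_ex attr_ex;

if (ibv_query_device_ex(ctx, NULL, &attr_ex))
	return -1; /* query failed */

if (attr_ex.max_dm_size < 64)
	return -1; /* device does not expose enough device memory */

struct ibv_alloc_dm_attr dm_attr = {/*length*/64, /*comp_mask*/0};
struct ibv_dm *dm = ibv_alloc_dm(ctx, &dm_attr);
if (!dm)
	return -1; /* allocation failed */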

Example
-------------------------------------------------------------------------------
/* Pseudo-code example: an HPC application allocates a 64-byte counting   */
/* semaphore in device memory, distributes the mkey to all peers, and the */
/* peers perform a couple of RDMA atomic increment operations on that DM. */

/* Assuming the application already has: */
struct ibv_context *ctx;
struct ibv_pd *pd;
struct ibv_qp *qp;
struct ibv_cq *cq;
struct ibv_sge sge[1]; /* local memory registered for RDMA READ operations */

/* The application allocates a DM range */
struct ibv_alloc_dm_attr dm_attr = {/*length*/64, /*comp_mask*/0};
struct ibv_dm *dm = ibv_alloc_dm(ctx, &dm_attr);

/* Clear the counting sem */
char sem_value[64] = {0};

struct ibv_memcpy_dm_attr memcpy_dm_attr =
{/*memcpy_type*/IBV_DM_CPY_HOST_TO_DEVICE, /*host_addr*/sem_value,
/*dm_offset*/0, /*length*/64, /*comp_mask*/0};

ibv_memcpy_dm(dm, &memcpy_dm_attr);

/* Register the DM as an MR for remote access */
struct ibv_mr_attr mr_attr = {/*type*/IBV_DEV_MEM, /*length*/64,
/*access*/IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ,
/*mem_type.dev_mem*/{dm, 0}, /*comp_mask*/0};

struct ibv_mr *mr = ibv_reg_mr_ex(pd, &mr_attr);

/* Distribute to peers: Send the DM <addr,rkey> to peer OOB */
send_sem_to_all_peers(mr->rkey, mr->addr);

/* All peers will do multiple remote RDMA atomic operations to increase */
/* the sem that we created on DM */

/* wait for peers to complete sem updates */
block_notify_from_all_peers();

/* Read the counting sem back via RDMA READ */
struct ibv_send_wr *bad_wr = NULL;

struct ibv_send_wr wr = {/*wr_id*/0, /*next*/NULL, /*sg_list*/sge, /*num_sge*/1,
/*opcode*/IBV_WR_RDMA_READ, /*send_flags*/IBV_SEND_SIGNALED, /*imm_data*/0,
/*wr.rdma*/{/*remote_addr*/(uint64_t)mr->addr, /*rkey*/mr->rkey}};

ibv_post_send(qp, &wr, &bad_wr);

/* wait for CQ on RDMA operation*/
wait_for_completion(cq);


/* free resources */
ibv_dereg_mr(mr);
ibv_free_dm(dm);
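
For completeness, a rough sketch of what each peer's side of this example
might look like: a signaled fetch-and-add on the 8-byte counter at the start
of the semaphore, using only existing verbs. peer_addr/peer_rkey are the
<addr,rkey> received out of band, and sge points at an 8-byte local buffer
registered for the returned original value (names are illustrative):

/* Peer side (sketch): RDMA atomic fetch-and-add on the DM semaphore */
struct ibv_send_wr *peer_bad_wr = NULL;
struct ibv_send_wr peer_wr = {0};

peer_wr.wr_id                 = 1;
peer_wr.sg_list               = sge;       /* 8-byte buffer for the old value */
peer_wr.num_sge               = 1;
peer_wr.opcode                = IBV_WR_ATOMIC_FETCH_AND_ADD;
peer_wr.send_flags            = IBV_SEND_SIGNALED;
peer_wr.wr.atomic.remote_addr = peer_addr; /* DM address received OOB */
peer_wr.wr.atomic.rkey        = peer_rkey; /* rkey received OOB */
peer_wr.wr.atomic.compare_add = 1;         /* increment by one */

ibv_post_send(qp, &peer_wr, &peer_bad_wr);
wait_for_completion(cq);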

* Re: [RFC] libibverbs IB Device Memory support
       [not found] ` <CADWppnpd9Up7wUTxRgSinhgx3kt3+0bKwgt6P_d3CV1oA90isQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-05 16:44   ` Christoph Lameter
       [not found]     ` <alpine.DEB.2.20.1706051141230.26831-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
  2017-06-05 17:20   ` Jason Gunthorpe
  2017-06-06  7:35   ` Sagi Grimberg
  2 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2017-06-05 16:44 UTC (permalink / raw)
  To: ahmad omary
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary, Leon Romanovsky,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss

On Wed, 10 May 2017, ahmad omary wrote:

> We have considered using mmap(), but As the size of device memory may be limited
> ,the way to access it from host cpu may differ from vendor to vendor, due to
> the 4K (page) aligment limitation of mmap() and the need not to directly
> allow user to access the device memory, there is a need for a wrapper access
> methods API that allows allocating and managing chunks that are smaller than
> 4KB and not necessarily aligned to 4KB (page size).

Why are 4k sized chunks a problem given that there are megabytes of memory
in these devices? We are using various adapters already with an mmapped
solution here.

And I would prefer direct user space access to the memory. Fast access to
the data stored in the NIC is important and it would be best not to have
an intermediate layer that requires memcpy.


* Re: [RFC] libibverbs IB Device Memory support
       [not found]     ` <alpine.DEB.2.20.1706051141230.26831-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
@ 2017-06-05 17:08       ` Leon Romanovsky
       [not found]         ` <20170605170825.GP6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
  2017-06-06  7:10       ` Christoph Hellwig
  1 sibling, 1 reply; 17+ messages in thread
From: Leon Romanovsky @ 2017-06-05 17:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss

On Mon, Jun 05, 2017 at 11:44:00AM -0500, Christoph Lameter wrote:
> On Wed, 10 May 2017, ahmad omary wrote:
>
> > We have considered using mmap(), but As the size of device memory may be limited
> > ,the way to access it from host cpu may differ from vendor to vendor, due to
> > the 4K (page) aligment limitation of mmap() and the need not to directly
> > allow user to access the device memory, there is a need for a wrapper access
> > methods API that allows allocating and managing chunks that are smaller than
> > 4KB and not necessarily aligned to 4KB (page size).
>
> Why are 4k sized chunks a problem given that there are megabytes of memory
> in these devices? We are using various adapters already with an mmapped
> solution here.

Ahmad presented a use case where he needs access to small objects
(semaphores) at large scale (MPI). 1MB at a granularity of 4k gives us only
256 chunks, and that is definitely not enough.

>
> And I would prefer direct user space access to the memory. Fast access to
> the data stored in the NIC is important and it would be best not to have
> an intermediate layer that requires memcpy.


* Re: [RFC] libibverbs IB Device Memory support
       [not found]         ` <20170605170825.GP6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
@ 2017-06-05 17:17           ` Jason Gunthorpe
       [not found]             ` <20170605171749.GA20477-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2017-06-05 17:17 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Christoph Lameter, ahmad omary,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary, Yishai Hadas,
	Tzahi Oved, Alex Rosenbaum, Ariel Levkovich, Liran Liss

On Mon, Jun 05, 2017 at 08:08:25PM +0300, Leon Romanovsky wrote:
> On Mon, Jun 05, 2017 at 11:44:00AM -0500, Christoph Lameter wrote:
> > On Wed, 10 May 2017, ahmad omary wrote:
> >
> > > We have considered using mmap(), but As the size of device memory may be limited
> > > ,the way to access it from host cpu may differ from vendor to vendor, due to
> > > the 4K (page) aligment limitation of mmap() and the need not to directly
> > > allow user to access the device memory, there is a need for a wrapper access
> > > methods API that allows allocating and managing chunks that are smaller than
> > > 4KB and not necessarily aligned to 4KB (page size).
> >
> > Why are 4k sized chunks a problem given that there are megabytes of memory
> > in these devices? We are using various adapters already with an mmapped
> > solution here.
> 
> Ahmad presented use case where he needs access to small objects
> (semaphores) in large scale (MPI). 1MB in the granularity of 4k will give us
> 256 chunks only, and it is definitely not enough.

Is 256 chunks per machine not enough? A single process could carve out
smaller regions from the 4k kernel allocation.
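
Just to illustrate, user-space sub-allocation along those lines can be a few
lines of code (a sketch only; dm_base stands for whatever pointer the
mmap-style interface hands back, and the names are hypothetical):

/* Illustrative only: bump-allocate small objects out of one 4k device mapping */
#include <stddef.h>
#include <stdint.h>

static uint8_t *dm_base;  /* assumed: start of the mmap()ed 4k chunk */
static size_t   dm_used;  /* bytes handed out so far */

static void *dm_suballoc(size_t len, size_t align)
{
	size_t off = (dm_used + align - 1) & ~(align - 1);

	if (off + len > 4096)
		return NULL;  /* chunk exhausted */

	dm_used = off + len;
	return dm_base + off;
}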

Jason

* Re: [RFC] libibverbs IB Device Memory support
       [not found] ` <CADWppnpd9Up7wUTxRgSinhgx3kt3+0bKwgt6P_d3CV1oA90isQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-06-05 16:44   ` Christoph Lameter
@ 2017-06-05 17:20   ` Jason Gunthorpe
       [not found]     ` <20170605172058.GB20477-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2017-06-06  7:35   ` Sagi Grimberg
  2 siblings, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2017-06-05 17:20 UTC (permalink / raw)
  To: ahmad omary
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary, Leon Romanovsky,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss

On Wed, May 10, 2017 at 02:25:23PM +0300, ahmad omary wrote:

> struct ibv_dm *ibv_alloc_dm(struct ibv_context *context,
>    struct ibv_alloc_dm_attr *dm_attr);
> 
> struct ibv_alloc_dm_attr{
> size_t length;
> uint32_t comp_mask; /*enable future extensions*/
> }

Can we please stop this madness where every function call needs to
accept a struct?

It is perfectly fine to add new function calls if new arguments are
needed someday.

> int ibv_memcpy_dm(struct ibv_dm *dm, struct ibv_memcpy_dm_attr *
> memcpy_dm_attr);

To be clear, this has to be a syscall, right?

Jason

* Re: [RFC] libibverbs IB Device Memory support
       [not found]             ` <20170605171749.GA20477-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2017-06-05 17:41               ` Leon Romanovsky
       [not found]                 ` <20170605174151.GR6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Leon Romanovsky @ 2017-06-05 17:41 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Christoph Lameter, ahmad omary,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary, Yishai Hadas,
	Tzahi Oved, Alex Rosenbaum, Ariel Levkovich, Liran Liss

On Mon, Jun 05, 2017 at 11:17:49AM -0600, Jason Gunthorpe wrote:
> On Mon, Jun 05, 2017 at 08:08:25PM +0300, Leon Romanovsky wrote:
> > On Mon, Jun 05, 2017 at 11:44:00AM -0500, Christoph Lameter wrote:
> > > On Wed, 10 May 2017, ahmad omary wrote:
> > >
> > > > We have considered using mmap(), but As the size of device memory may be limited
> > > > ,the way to access it from host cpu may differ from vendor to vendor, due to
> > > > the 4K (page) aligment limitation of mmap() and the need not to directly
> > > > allow user to access the device memory, there is a need for a wrapper access
> > > > methods API that allows allocating and managing chunks that are smaller than
> > > > 4KB and not necessarily aligned to 4KB (page size).
> > >
> > > Why are 4k sized chunks a problem given that there are megabytes of memory
> > > in these devices? We are using various adapters already with an mmapped
> > > solution here.
> >
> > Ahmad presented use case where he needs access to small objects
> > (semaphores) in large scale (MPI). 1MB in the granularity of 4k will give us
> > 256 chunks only, and it is definitely not enough.
>
> Is 256 chunks per machine not enough? A single process could carve out
> smaller regions from the 4k kernel allocation.

It is a rough calculation for 1MB; when I asked Ahmad about this
limitation (4K), he explained to me that the exposed device memory is less
than 1MB.

Thanks

>
> Jason

* Re: [RFC] libibverbs IB Device Memory support
       [not found]                 ` <20170605174151.GR6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
@ 2017-06-05 22:11                   ` Christoph Lameter
       [not found]                     ` <alpine.DEB.2.20.1706051705350.886-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2017-06-05 22:11 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Jason Gunthorpe, ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Ahmad Omary, Yishai Hadas, Tzahi Oved, Alex Rosenbaum,
	Ariel Levkovich, Liran Liss

On Mon, 5 Jun 2017, Leon Romanovsky wrote:

> It is rough calculation for 1MB, when I asked Ahmad about this
> limitation (4K) he explained to me that exposed device memory is less
> than 1MB.

Still, doesn't that mean more than 256 MPI instances or so per node?

The use case for a semaphore indicates that a 4k page would be
shared between multiple processes? Therefore there is even less of a need
for multiple pages.

You may not be able to avoid the 4k page since page
protection works only on a 4k level. The kernel futexes rely on 4k page
protection tricks.

Please come up with a reasonable use case here.... We do not run MPI but
our use cases work fine with mmapped 4k pages. There are some who actually
would like 2M pages for that use case since some of the adapters have
quite a bit of memory available.

A small object allocator with the need to go through an intermediate
layer seems to be not very productive.



* Re: [RFC] libibverbs IB Device Memory support
       [not found]     ` <20170605172058.GB20477-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2017-06-06  5:52       ` Leon Romanovsky
       [not found]         ` <20170606055229.GT6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Leon Romanovsky @ 2017-06-06  5:52 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss

On Mon, Jun 05, 2017 at 11:20:58AM -0600, Jason Gunthorpe wrote:
> On Wed, May 10, 2017 at 02:25:23PM +0300, ahmad omary wrote:
>
> > struct ibv_dm *ibv_alloc_dm(struct ibv_context *context,
> >    struct ibv_alloc_dm_attr *dm_attr);
> >
> > struct ibv_alloc_dm_attr{
> > size_t length;
> > uint32_t comp_mask; /*enable future extensions*/
> > }
>
> Can we please stop this madness where every function call needs to
> accept a struct?
>
> It is perfectly fine to add new function calls if new arguments are
> needed someday.

IMHO, generally speaking, it is better and cleaner to have similar function
signatures for all APIs that are exported to the user.

>
> > int ibv_memcpy_dm(struct ibv_dm *dm, struct ibv_memcpy_dm_attr *
> > memcpy_dm_attr);
>
> To be clear, this is has to be a sycall, right?

Yes

>
> Jason

* Re: [RFC] libibverbs IB Device Memory support
       [not found]         ` <20170606055229.GT6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
@ 2017-06-06  6:31           ` Leon Romanovsky
  2017-06-06 16:34           ` Jason Gunthorpe
  1 sibling, 0 replies; 17+ messages in thread
From: Leon Romanovsky @ 2017-06-06  6:31 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss

On Tue, Jun 06, 2017 at 08:52:29AM +0300, Leon Romanovsky wrote:
> On Mon, Jun 05, 2017 at 11:20:58AM -0600, Jason Gunthorpe wrote:
> > On Wed, May 10, 2017 at 02:25:23PM +0300, ahmad omary wrote:
> >
> > > struct ibv_dm *ibv_alloc_dm(struct ibv_context *context,
> > >    struct ibv_alloc_dm_attr *dm_attr);
> > >
> > > struct ibv_alloc_dm_attr{
> > > size_t length;
> > > uint32_t comp_mask; /*enable future extensions*/
> > > }
> >
> > Can we please stop this madness where every function call needs to
> > accept a struct?
> >
> > It is perfectly fine to add new function calls if new arguments are
> > needed someday.
>
> IMHO, generally speaking, it is better and cleaner to have similar functions
> signatures for all APIs which is exported to user.
>
> >
> > > int ibv_memcpy_dm(struct ibv_dm *dm, struct ibv_memcpy_dm_attr *
> > > memcpy_dm_attr);
> >
> > To be clear, this is has to be a sycall, right?
>
> Yes

Or to be more precise, it depends on the memory type exposed and the specific
vendor implementation (memcpy, etc.).

Thanks

>
> >
> > Jason

* Re: [RFC] libibverbs IB Device Memory support
       [not found]     ` <alpine.DEB.2.20.1706051141230.26831-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
  2017-06-05 17:08       ` Leon Romanovsky
@ 2017-06-06  7:10       ` Christoph Hellwig
  1 sibling, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2017-06-06  7:10 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary,
	Leon Romanovsky, Yishai Hadas, Tzahi Oved, Alex Rosenbaum,
	Ariel Levkovich, Liran Liss

On Mon, Jun 05, 2017 at 11:44:00AM -0500, Christoph Lameter wrote:
> On Wed, 10 May 2017, ahmad omary wrote:
> 
> > We have considered using mmap(), but As the size of device memory may be limited
> > ,the way to access it from host cpu may differ from vendor to vendor, due to
> > the 4K (page) aligment limitation of mmap() and the need not to directly
> > allow user to access the device memory, there is a need for a wrapper access
> > methods API that allows allocating and managing chunks that are smaller than
> > 4KB and not necessarily aligned to 4KB (page size).
> 
> Why are 4k sized chunks a problem given that there are megabytes of memory
> in these devices? We are using various adapters already with an mmapped
> solution here.
> 
> And I would prefer direct user space access to the memory. Fast access to
> the data stored in the NIC is important and it would be best not to have
> an intermediate layer that requires memcpy.

Agreed.  The current design looks incredibly stupid.  Also make sure
it works with the cxgb4 version of this feature that has been posted
as part of the NVMe P2P patches.

* Re: [RFC] libibverbs IB Device Memory support
       [not found] ` <CADWppnpd9Up7wUTxRgSinhgx3kt3+0bKwgt6P_d3CV1oA90isQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-06-05 16:44   ` Christoph Lameter
  2017-06-05 17:20   ` Jason Gunthorpe
@ 2017-06-06  7:35   ` Sagi Grimberg
       [not found]     ` <46b1a06d-5936-799c-7743-097c579971ef-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
  2 siblings, 1 reply; 17+ messages in thread
From: Sagi Grimberg @ 2017-06-06  7:35 UTC (permalink / raw)
  To: ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Ahmad Omary, Leon Romanovsky, Yishai Hadas, Tzahi Oved,
	Alex Rosenbaum, Ariel Levkovich, Liran Liss, Stephen Bates,
	Logan Gunthorpe



On 10/05/17 14:25, ahmad omary wrote:
> Introduction
> -------------------------------------------------------------------------------
> 
> Many types of user space application can get a real performance gain by using
> the internal memory of an IB device. This can be useful to decrease latency of
> a trading operation where data is already allocated in the device memory, to
> save the PCI round trip when doing atomic operations on semaphores remotely
> and also to save the PCI round trip when performing modification by the device
> on received traffic that should be transmitted directly after this modification.
> 
> The problem
> -------------------------------------------------------------------------------
> 
> Today there is no API in libibverbs that allow user space application to manage
> internal memory of IB devices.
> 
> We have considered using mmap(), but As the size of device memory may be limited
> ,the way to access it from host cpu may differ from vendor to vendor, due to
> the 4K (page) aligment limitation of mmap() and the need not to directly
> allow user to access the device memory, there is a need for a wrapper access
> methods API that allows allocating and managing chunks that are smaller than
> 4KB and not necessarily aligned to 4KB (page size).
> 
> Suggested Solution
> -------------------------------------------------------------------------------
> 
> In order for user space applications to use the internal device memory, we
> suggest to update libibverbs so it provides these applications access to
> allocate, free, register and memcopy operations from/to host memory.
> After the device memory is allocated for a process, it can be registered
> using ibv_reg_mr_ex. The registered memory can be used to process any operation
> like if it was registered on host memory mkey. It can be used for post_send,
> post_receive, RDMA WRITE/READ and atomic operations.
> 
> New suggested verbs:
> 
> ibv_alloc_dm: allocates device memory and returns an identifier structure that
> identify and define the allocated device memory
> 
> ibv_free_dm: free device memory.
> 
> ibv_memcpy_dm: Copy from device memory to host memory and from host memory
> to device memory.
> 
> ibv_reg_mr_ex: Extended to ibv_reg_mr that allow registering device memory.
> 
> 

This is useful to kernel consumers too. Adding Stephen and Logan to the
party.

Personally I agree with Christoph Lameter that we are far better off
sticking to standard mmap to expose it to user-space.

From the thread I don't really understand the problem the API is trying
to solve; AFAICT it boils down to a plain BAR, and as such it needs to
be treated as such.

* RE: [RFC] libibverbs IB Device Memory support
       [not found]                     ` <alpine.DEB.2.20.1706051705350.886-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
@ 2017-06-06 11:51                       ` Ahmad Omary
       [not found]                         ` <AM4PR0501MB1956B596450F3BC38806E775C4CB0-dp/nxUn679ggcKT3UpMsY8DSnupUy6xnnBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Ahmad Omary @ 2017-06-06 11:51 UTC (permalink / raw)
  To: Christoph Lameter, Leon Romanovsky
  Cc: Jason Gunthorpe, ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss


> -----Original Message-----
> From: Christoph Lameter [mailto:cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org]
> Sent: Tuesday, June 06, 2017 1:11 AM
> To: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Cc: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>; ahmad omary
> <ahmad151084-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Ahmad Omary
> <ahmad-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>; Yishai Hadas <yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>; Tzahi
> Oved <tzahio-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>; Alex Rosenbaum <alexr-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>;
> Ariel Levkovich <lariel-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>; Liran Liss <liranl-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Subject: Re: [RFC] libibverbs IB Device Memory support
> 
> On Mon, 5 Jun 2017, Leon Romanovsky wrote:
> 
> > It is rough calculation for 1MB, when I asked Ahmad about this
> > limitation (4K) he explained to me that exposed device memory is less
> > than 1MB.

Some of the devices support less than 1MB of internal device memory (256KB).
 
> Still doesnt that mean more than 256 MPI instances or so per node?
>

In the above use case, the device can support only 64 processes per node,
which is critical for HPC.
 
> The use case for a semaphore indicates that a 4k page would be shared
> between multiple processes? Therefore there is even less of a need of
> multiple pages.
> 
> You may not be able to avoid the 4k page since page protection works only
> on a 4k level. The kernel futexes rely on 4k page protection tricks.
> 

The vendor driver still allocates and maps at 4KB page granularity, but in case
the HW device supports allocations smaller than 4KB, the HW must provide the
required protection.

> Please come up with a reasonable use case here.... We do not run MPI but
> our use cases work fine with mmapped 4k pages. There are some who
> actually would like 2M pages for that use case since some of the adapters
> have quite a bit of memory available.
> 
> A small object allocator with the need to go through an intermediate layer
> seems to be not very productive.
> 

Note that the device memory does not necessarily have to be mapped to the CPU,
i.e. it is not necessarily accessible over PCI and may only be accessible by
RDMA. This is why we can't use mmap() for all cases, and dedicated allocation
and copy functions are needed.

Ahmad Omary

 





* RE: [RFC] libibverbs IB Device Memory support
       [not found]                         ` <AM4PR0501MB1956B596450F3BC38806E775C4CB0-dp/nxUn679ggcKT3UpMsY8DSnupUy6xnnBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-06-06 15:25                           ` Liran Liss
  2017-06-06 15:30                           ` Christoph Lameter
  1 sibling, 0 replies; 17+ messages in thread
From: Liran Liss @ 2017-06-06 15:25 UTC (permalink / raw)
  To: Ahmad Omary, Christoph Lameter, Leon Romanovsky
  Cc: Jason Gunthorpe, ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich

> From: Ahmad Omary

> 
> Note that the device memory does not necessary have to be mapped to the
> CPU.
> i.e. is not necessary accessible by PCI, and can only be accessed by RDMA.
> This is why we can't use MMAP for all cases  and a dedicated allocation and
> copy functions are needed.
> 

You can also automate efficient copies, barriers, and flushes even when you do map a BAR directly.
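
For instance, a minimal sketch of the kind of copy helper such a wrapper
could provide (x86 assumed; bar_dst is a pointer into a write-combining
BAR mapping, and the names are illustrative):

/* Sketch: copy into a write-combining mapped BAR and flush the WC buffer */
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>  /* _mm_sfence(), x86 only */

static void bar_memcpy_flush(volatile void *bar_dst, const void *src, size_t len)
{
	memcpy((void *)bar_dst, src, len);

	/* ensure the write-combining stores are pushed out before returning */
	_mm_sfence();
}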

Note that we are holding a concall today (*) to further discuss this topic.
--Liran

* See concall details at: http://marc.info/?l=linux-rdma&m=149660439926108&w=2


* RE: [RFC] libibverbs IB Device Memory support
       [not found]                         ` <AM4PR0501MB1956B596450F3BC38806E775C4CB0-dp/nxUn679ggcKT3UpMsY8DSnupUy6xnnBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  2017-06-06 15:25                           ` Liran Liss
@ 2017-06-06 15:30                           ` Christoph Lameter
       [not found]                             ` <alpine.DEB.2.20.1706061022270.16819-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2017-06-06 15:30 UTC (permalink / raw)
  To: Ahmad Omary
  Cc: Leon Romanovsky, Jason Gunthorpe, ahmad omary,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Yishai Hadas, Tzahi Oved,
	Alex Rosenbaum, Ariel Levkovich, Liran Liss, Christoph Hellwig

On Tue, 6 Jun 2017, Ahmad Omary wrote:

> In the above use case device, we can have only 64 processes per node which
> Is critical for HPC.

You can have 64 pages that are mapped by any number of processes. You can
map a single page into multiple processes, which may be a requirement if you
want to implement your own synchronization primitives, as the mention of
semaphores suggests.

> Vendor driver still allocates and map 4KB pages granularity. But in case the
> HW device supports less than 4KB, then the HW must provide the required protection.


The OS needs to provide the protection if that is the case, and then it
probably is not an HPC device. We are talking about high-end RDMA devices
here. The design here is for performance and low latency. I don't know of
any devices in use in HPC or in HFT that have these tiny memory sizes.
Mostly these devices are already engineered for mmapping.

Is this for some kind of embedded device?

> Note that the device memory does not necessary have to be mapped to the CPU.
> i.e. is not necessary accessible by PCI, and can only be accessed by RDMA.
> This is why we can't use MMAP for all cases  and a dedicated allocation and
>  copy functions are needed.

Can we come up with some sort of ioctl API then to write to the device's
inaccessible memory? There must be other drivers outside of the RDMA tree
that have similar requirements and that may already have implemented some
version of it.

This seems to be a very specialized application that may be device
specific. ioctls are usually used then.


* Re: [RFC] libibverbs IB Device Memory support
       [not found]         ` <20170606055229.GT6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
  2017-06-06  6:31           ` Leon Romanovsky
@ 2017-06-06 16:34           ` Jason Gunthorpe
  1 sibling, 0 replies; 17+ messages in thread
From: Jason Gunthorpe @ 2017-06-06 16:34 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Ahmad Omary,
	Yishai Hadas, Tzahi Oved, Alex Rosenbaum, Ariel Levkovich,
	Liran Liss

On Tue, Jun 06, 2017 at 08:52:29AM +0300, Leon Romanovsky wrote:
> On Mon, Jun 05, 2017 at 11:20:58AM -0600, Jason Gunthorpe wrote:
> > On Wed, May 10, 2017 at 02:25:23PM +0300, ahmad omary wrote:
> >
> > > struct ibv_dm *ibv_alloc_dm(struct ibv_context *context,
> > >    struct ibv_alloc_dm_attr *dm_attr);
> > >
> > > struct ibv_alloc_dm_attr{
> > > size_t length;
> > > uint32_t comp_mask; /*enable future extensions*/
> > > }
> >
> > Can we please stop this madness where every function call needs to
> > accept a struct?
> >
> > It is perfectly fine to add new function calls if new arguments are
> > needed someday.
> 
> IMHO, generally speaking, it is better and cleaner to have similar functions
> signatures for all APIs which is exported to user.

No, I don't think so.. The APIs should be simple to use first, in this
case we don't need a struct to pass 1 value.

Jason

* Re: [RFC] libibverbs IB Device Memory support
       [not found]     ` <46b1a06d-5936-799c-7743-097c579971ef-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
@ 2017-06-06 16:49       ` Logan Gunthorpe
  0 siblings, 0 replies; 17+ messages in thread
From: Logan Gunthorpe @ 2017-06-06 16:49 UTC (permalink / raw)
  To: Sagi Grimberg, ahmad omary, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: Ahmad Omary, Leon Romanovsky, Yishai Hadas, Tzahi Oved,
	Alex Rosenbaum, Ariel Levkovich, Liran Liss, Stephen Bates

Hi,

On 06/06/17 01:35 AM, Sagi Grimberg wrote:
> This is useful to kernel consumers too, Adding Stephen and Logan to the
> party.

Thanks for copying us.

> Personally I agree with Christoph Lamater that we are far better off
> sticking to standard mmap to expose it to user-space.
> From the thread I don't really understand the problem the API is trying
> to solve, AFAICT it boils down to a plain bar, and as such it needs to
> be treated as such.

We always get push back on IO safety (and for good reason). If the PCI
bar is exposed to userspace through mmap, users should be reading and
writing it through IO operations and if the buffers get used in any
kernel calls, the kernel also has to know that it is now dealing with IO
memory. The kernel is really not ready for this. A kernel-only API would
be able to do this safely and may make sense here.

However, without being able to do P2P DMAs with this memory (which is
what we have been working on) I don't really see the point. My intuition
is that copying from system to IO memory, or having the CPU populate
data structures in IO memory, would overall be slower than just doing it
in system memory and letting the DMA transfer be slightly slower.

Also, I have been told that, at least the two major RDMA NIC vendors
don't currently have working memory available in hardware to expose for
such uses. If there is hardware, I sure would like to get my hands on some.

In any case, this all seems slightly premature.

Logan





* RE: [RFC] libibverbs IB Device Memory support
       [not found]                             ` <alpine.DEB.2.20.1706061022270.16819-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
@ 2017-06-08 13:10                               ` Liran Liss
  0 siblings, 0 replies; 17+ messages in thread
From: Liran Liss @ 2017-06-08 13:10 UTC (permalink / raw)
  To: Christoph Lameter, Ahmad Omary
  Cc: Leon Romanovsky, Jason Gunthorpe, ahmad omary,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Yishai Hadas, Tzahi Oved,
	Alex Rosenbaum, Ariel Levkovich, Christoph Hellwig

> From: Christoph Lameter [mailto:cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org]
> Sent: Tuesday, June 06, 2017 6:30 PM

Hi Christoph,

> The OS needs to provide the protection if that is the case and then it probably is
> not a HPC device. We are talking about RDMA high end devices here. The design
> here is for performance and low latency. I dont know of any devices in use in
> HPC or in HFT that have these tiny memory sizes.
> Mostly these devices are already engineeded for mmapping.
> 
> Is this for some kind of embedded device?
> 

In our case, this is the HCA.
Device memory could be a scarce resource, and even allocating a 4K page per process could be wasteful in some systems.
An allocation API allows allocations of any size.

> > Note that the device memory does not necessary have to be mapped to
> > the CPU= .
> > i.e. is not necessary accessible by PCI, and can only be accessed by RDMA.
> > This is why we can't use MMAP for all cases  and a dedicated
> > allocation and  copy functions are needed.
> 
> Can we come up with some sort of ioctl API then to write to the devices
> inaccessible memory? There must be other drives outside of the RDMA tree that
> have similar requirements and that may already have implemented some
> version of it.
> 
> This seems to be a very specialized application that may be device specific. ioctls
> are usually used then.

The host to dev memcopy is often a user-space only action, so ioctl() doesn't work for us.
The memcpy API allows efficient data transfer regardless of the implementation.

I also don't think that always mandating a direct mmap() is the solution.
First, such a mapping does not exist in all cases.
Some devices have only limited BAR space, which can only be dynamically mapped
to a small window at a time; in this case, you might need to move the window
before the access. Other devices might not be able to map device memory to the
CPU at all.

Even if a direct mapping is possible, you don't expose mmap()'d IO memory
directly to the application. There are details such as optimizing for
write-combining, locking, and flushing that the application shouldn't be
aware of.
--Liran

end of thread, other threads:[~2017-06-08 13:10 UTC | newest]

Thread overview: 17+ messages
2017-05-10 11:25 [RFC] libibverbs IB Device Memory support ahmad omary
     [not found] ` <CADWppnpd9Up7wUTxRgSinhgx3kt3+0bKwgt6P_d3CV1oA90isQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-05 16:44   ` Christoph Lameter
     [not found]     ` <alpine.DEB.2.20.1706051141230.26831-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
2017-06-05 17:08       ` Leon Romanovsky
     [not found]         ` <20170605170825.GP6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
2017-06-05 17:17           ` Jason Gunthorpe
     [not found]             ` <20170605171749.GA20477-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2017-06-05 17:41               ` Leon Romanovsky
     [not found]                 ` <20170605174151.GR6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
2017-06-05 22:11                   ` Christoph Lameter
     [not found]                     ` <alpine.DEB.2.20.1706051705350.886-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
2017-06-06 11:51                       ` Ahmad Omary
     [not found]                         ` <AM4PR0501MB1956B596450F3BC38806E775C4CB0-dp/nxUn679ggcKT3UpMsY8DSnupUy6xnnBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-06-06 15:25                           ` Liran Liss
2017-06-06 15:30                           ` Christoph Lameter
     [not found]                             ` <alpine.DEB.2.20.1706061022270.16819-wcBtFHqTun5QOdAKl3ChDw@public.gmane.org>
2017-06-08 13:10                               ` Liran Liss
2017-06-06  7:10       ` Christoph Hellwig
2017-06-05 17:20   ` Jason Gunthorpe
     [not found]     ` <20170605172058.GB20477-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2017-06-06  5:52       ` Leon Romanovsky
     [not found]         ` <20170606055229.GT6868-U/DQcQFIOTAAJjI8aNfphQ@public.gmane.org>
2017-06-06  6:31           ` Leon Romanovsky
2017-06-06 16:34           ` Jason Gunthorpe
2017-06-06  7:35   ` Sagi Grimberg
     [not found]     ` <46b1a06d-5936-799c-7743-097c579971ef-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
2017-06-06 16:49       ` Logan Gunthorpe
