* Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
       [not found] <20210307221034.568606-1-yanjun.zhu@intel.com>
@ 2021-03-07 14:28 ` Zhu Yanjun
  2021-03-07 17:20   ` Leon Romanovsky
                     ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-07 14:28 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe, Leon Romanovsky, RDMA mailing list, maorg

From: Zhu Yanjun <zyjzyj2000@gmail.com>

After the commit ("RDMA/umem: Move to allocate SG table from pages"),
the sg list returned by ib_umem_get looks like the following:
"
sg_dma_address(sg):0x4b3c1ce000
sg_dma_address(sg):0x4c3c1cd000
sg_dma_address(sg):0x4d3c1cc000
sg_dma_address(sg):0x4e3c1cb000
"

But sometimes we need an sg list like the following:
"
sg_dma_address(sg):0x203b400000
sg_dma_address(sg):0x213b200000
sg_dma_address(sg):0x223b000000
sg_dma_address(sg):0x233ae00000
sg_dma_address(sg):0x243ac00000
"
The function ib_umem_add_sg_table can provide an sg list like the
second one, but it was removed in the commit ("RDMA/umem: Move to
allocate SG table from pages"). Add it back.

The new function ib_umem_hugepage_get is based on ib_umem_get and
calls ib_umem_add_sg_table.

The function ib_umem_hugepage_get can build sg lists whose dma
addresses come at 4K or 2M granularity.
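
For reference, dumps like the ones above can be produced by walking the
mapped table, roughly as follows (illustrative sketch only, not part of
this patch):
"
	/* Illustrative only: print the DMA address and length of every
	 * mapped sg entry of a umem.
	 */
	struct scatterlist *sg;
	int i;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
		pr_info("sg_dma_address(sg):0x%llx sg_dma_len(sg):%u\n",
			(unsigned long long)sg_dma_address(sg),
			sg_dma_len(sg));
"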

Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages")
Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
---
 drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h         |   3 +
 2 files changed, 200 insertions(+)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 2dde99a9ba07..af8733788b1e 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -62,6 +62,203 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
        sg_free_table(&umem->sg_head);
 }

+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+                                               struct page **page_list,
+                                               unsigned long npages,
+                                               unsigned int max_seg_sz,
+                                               int *nents)
+{
+       unsigned long first_pfn;
+       unsigned long i = 0;
+       bool update_cur_sg = false;
+       bool first = !sg_page(sg);
+
+       /* Check if new page_list is contiguous with end of previous page_list.
+        * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+        */
+       if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+                       page_to_pfn(page_list[0])))
+               update_cur_sg = true;
+
+       while (i != npages) {
+               unsigned long len;
+               struct page *first_page = page_list[i];
+
+               first_pfn = page_to_pfn(first_page);
+
+               /* Compute the number of contiguous pages we have starting
+                * at i
+                */
+               for (len = 0; i != npages &&
+                               first_pfn + len == page_to_pfn(page_list[i]) &&
+                               len < (max_seg_sz >> PAGE_SHIFT);
+                    len++)
+                       i++;
+
+               /* Squash N contiguous pages from page_list into current sge */
+               if (update_cur_sg) {
+                       if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
+                               sg_set_page(sg, sg_page(sg),
+                                           sg->length + (len << PAGE_SHIFT),
+                                           0);
+                               update_cur_sg = false;
+                               continue;
+                       }
+                       update_cur_sg = false;
+               }
+
+               /* Squash N contiguous pages into next sge or first sge */
+               if (!first)
+                       sg = sg_next(sg);
+
+               (*nents)++;
+               sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+               first = false;
+       }
+
+       return sg;
+}
+
+/**
+ * ib_umem_hugepage_get - Pin and DMA map userspace memory.
+ *
+ * @device: IB device to connect UMEM
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ */
+struct ib_umem *ib_umem_hugepage_get(struct ib_device *device,
+                                    unsigned long addr,
+                                    size_t size, int access)
+{
+       struct ib_umem *umem;
+       struct page **page_list;
+       unsigned long lock_limit;
+       unsigned long new_pinned;
+       unsigned long cur_base;
+       unsigned long dma_attr = 0;
+       struct mm_struct *mm;
+       unsigned long npages;
+       int ret;
+       struct scatterlist *sg;
+       unsigned int gup_flags = FOLL_WRITE;
+
+       /*
+        * If the combination of the addr and size requested for this memory
+        * region causes an integer overflow, return error.
+        */
+       if (((addr + size) < addr) ||
+           PAGE_ALIGN(addr + size) < (addr + size))
+               return ERR_PTR(-EINVAL);
+
+       if (!can_do_mlock())
+               return ERR_PTR(-EPERM);
+
+       if (access & IB_ACCESS_ON_DEMAND)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+       if (!umem)
+               return ERR_PTR(-ENOMEM);
+       umem->ibdev      = device;
+       umem->length     = size;
+       umem->address    = addr;
+       umem->writable   = ib_access_writable(access);
+       umem->owning_mm = mm = current->mm;
+       mmgrab(mm);
+
+       page_list = (struct page **) __get_free_page(GFP_KERNEL);
+       if (!page_list) {
+               ret = -ENOMEM;
+               goto umem_kfree;
+       }
+
+       npages = ib_umem_num_pages(umem);
+       if (npages == 0 || npages > UINT_MAX) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+       if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+               atomic64_sub(npages, &mm->pinned_vm);
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       cur_base = addr & PAGE_MASK;
+
+       ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+       if (ret)
+               goto vma;
+
+       if (!umem->writable)
+               gup_flags |= FOLL_FORCE;
+
+       sg = umem->sg_head.sgl;
+
+       while (npages) {
+               cond_resched();
+               ret = pin_user_pages_fast(cur_base,
+                                         min_t(unsigned long, npages,
+                                               PAGE_SIZE /
+                                               sizeof(struct page *)),
+                                         gup_flags | FOLL_LONGTERM, page_list);
+               if (ret < 0)
+                       goto umem_release;
+
+               cur_base += ret * PAGE_SIZE;
+               npages   -= ret;
+
+               sg = ib_umem_add_sg_table(sg, page_list, ret,
+                               dma_get_max_seg_size(device->dma_device),
+                               &umem->sg_nents);
+       }
+
+       sg_mark_end(sg);
+
+       if (access & IB_ACCESS_RELAXED_ORDERING)
+               dma_attr |= DMA_ATTR_WEAK_ORDERING;
+
+       umem->nmap =
+               ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
+                                   DMA_BIDIRECTIONAL, dma_attr);
+
+       if (!umem->nmap) {
+               ret = -ENOMEM;
+               goto umem_release;
+       }
+
+       ret = 0;
+       goto out;
+
+umem_release:
+       __ib_umem_release(device, umem, 0);
+vma:
+       atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
+out:
+       free_page((unsigned long) page_list);
+umem_kfree:
+       if (ret) {
+               mmdrop(umem->owning_mm);
+               kfree(umem);
+       }
+       return ret ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_hugepage_get);
+
 /**
  * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
  *
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 676c57f5ca80..fc6350ff21e6 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -96,6 +96,9 @@ static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
             __rdma_block_iter_next(biter);)

 #ifdef CONFIG_INFINIBAND_USER_MEM
+struct ib_umem *ib_umem_hugepage_get(struct ib_device *device,
+                                    unsigned long addr,
+                                    size_t size, int access);

 struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
                            size_t size, int access);
--
2.25.1
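
As a usage sketch (not part of the patch; ibdev, start and length are
placeholder names), a driver that wants the coalesced sg list would call
the new helper in place of ib_umem_get and release it as usual:
"
	struct ib_umem *umem;

	/* Pin and DMA map with the new helper instead of ib_umem_get(). */
	umem = ib_umem_hugepage_get(ibdev, start, length,
				    IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	/* ... program the HW with the coalesced sg list ... */

	ib_umem_release(umem);
"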


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-07 14:28 ` Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list Zhu Yanjun
@ 2021-03-07 17:20   ` Leon Romanovsky
  2021-03-07 17:22   ` Leon Romanovsky
  2021-03-08  0:44   ` Jason Gunthorpe
  2 siblings, 0 replies; 30+ messages in thread
From: Leon Romanovsky @ 2021-03-07 17:20 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Doug Ledford, Jason Gunthorpe, RDMA mailing list, maorg

Please use git send-email to send patches.

Also, don't write 1/1 for a single patch.

Thanks

On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> From: Zhu Yanjun <zyjzyj2000@gmail.com>
>
> After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> the sg list from ib_ume_get is like the following:
> "
> sg_dma_address(sg):0x4b3c1ce000
> sg_dma_address(sg):0x4c3c1cd000
> sg_dma_address(sg):0x4d3c1cc000
> sg_dma_address(sg):0x4e3c1cb000
> "
>
> But sometimes, we need sg list like the following:
> "
> sg_dma_address(sg):0x203b400000
> sg_dma_address(sg):0x213b200000
> sg_dma_address(sg):0x223b000000
> sg_dma_address(sg):0x233ae00000
> sg_dma_address(sg):0x243ac00000
> "
> The function ib_umem_add_sg_table can provide the sg list like the
> second. And this function is removed in the commit ("RDMA/umem: Move
> to allocate SG table from pages"). Now I add it back.
>
> The new function is ib_umem_get to ib_umem_hugepage_get that calls
> ib_umem_add_sg_table.
>
> This function ib_umem_huagepage_get can get 4K, 2M sg list dma address.
>
> Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages")
> Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
> ---
>  drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         |   3 +
>  2 files changed, 200 insertions(+)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 2dde99a9ba07..af8733788b1e 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -62,6 +62,203 @@ static void __ib_umem_release(struct ib_device
> *dev, struct ib_umem *umem, int d
>         sg_free_table(&umem->sg_head);
>  }
>
> +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> + *
> + * sg: current scatterlist entry
> + * page_list: array of npage struct page pointers
> + * npages: number of pages in page_list
> + * max_seg_sz: maximum segment size in bytes
> + * nents: [out] number of entries in the scatterlist
> + *
> + * Return new end of scatterlist
> + */
> +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> +                                               struct page **page_list,
> +                                               unsigned long npages,
> +                                               unsigned int max_seg_sz,
> +                                               int *nents)
> +{
> +       unsigned long first_pfn;
> +       unsigned long i = 0;
> +       bool update_cur_sg = false;
> +       bool first = !sg_page(sg);
> +
> +       /* Check if new page_list is contiguous with end of previous page_list.
> +        * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
> +        */
> +       if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
> +                       page_to_pfn(page_list[0])))
> +               update_cur_sg = true;
> +
> +       while (i != npages) {
> +               unsigned long len;
> +               struct page *first_page = page_list[i];
> +
> +               first_pfn = page_to_pfn(first_page);
> +
> +               /* Compute the number of contiguous pages we have starting
> +                * at i
> +                */
> +               for (len = 0; i != npages &&
> +                               first_pfn + len == page_to_pfn(page_list[i]) &&
> +                               len < (max_seg_sz >> PAGE_SHIFT);
> +                    len++)
> +                       i++;
> +
> +               /* Squash N contiguous pages from page_list into current sge */
> +               if (update_cur_sg) {
> +                       if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
> +                               sg_set_page(sg, sg_page(sg),
> +                               sg->length + (len << PAGE_SHIFT),
> +                               0);
> +                               update_cur_sg = false;
> +                               continue;
> +                       }
> +                       update_cur_sg = false;
> +               }
> +
> +               /* Squash N contiguous pages into next sge or first sge */
> +               if (!first)
> +                       sg = sg_next(sg);
> +
> +               (*nents)++;
> +               sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
> +               first = false;
> +       }
> +
> +       return sg;
> +}
> +
> +/**
> + * ib_umem_hugepage_get - Pin and DMA map userspace memory.
> + *
> + * @device: IB device to connect UMEM
> + * @addr: userspace virtual address to start at
> + * @size: length of region to pin
> + * @access: IB_ACCESS_xxx flags for memory being pinned
> + */
> +struct ib_umem *ib_umem_hugepage_get(struct ib_device *device,
> +                                    unsigned long addr,
> +                                    size_t size, int access)
> +{
> +       struct ib_umem *umem;
> +       struct page **page_list;
> +       unsigned long lock_limit;
> +       unsigned long new_pinned;
> +       unsigned long cur_base;
> +       unsigned long dma_attr = 0;
> +       struct mm_struct *mm;
> +       unsigned long npages;
> +       int ret;
> +       struct scatterlist *sg;
> +       unsigned int gup_flags = FOLL_WRITE;
> +
> +       /*
> +        * If the combination of the addr and size requested for this memory
> +        * region causes an integer overflow, return error.
> +        */
> +       if (((addr + size) < addr) ||
> +           PAGE_ALIGN(addr + size) < (addr + size))
> +               return ERR_PTR(-EINVAL);
> +
> +       if (!can_do_mlock())
> +               return ERR_PTR(-EPERM);
> +
> +       if (access & IB_ACCESS_ON_DEMAND)
> +               return ERR_PTR(-EOPNOTSUPP);
> +
> +       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +       if (!umem)
> +               return ERR_PTR(-ENOMEM);
> +       umem->ibdev      = device;
> +       umem->length     = size;
> +       umem->address    = addr;
> +       umem->writable   = ib_access_writable(access);
> +       umem->owning_mm = mm = current->mm;
> +       mmgrab(mm);
> +
> +       page_list = (struct page **) __get_free_page(GFP_KERNEL);
> +       if (!page_list) {
> +               ret = -ENOMEM;
> +               goto umem_kfree;
> +       }
> +
> +       npages = ib_umem_num_pages(umem);
> +       if (npages == 0 || npages > UINT_MAX) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +       new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
> +       if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
> +               atomic64_sub(npages, &mm->pinned_vm);
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +
> +       cur_base = addr & PAGE_MASK;
> +
> +       ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
> +       if (ret)
> +               goto vma;
> +
> +       if (!umem->writable)
> +               gup_flags |= FOLL_FORCE;
> +
> +       sg = umem->sg_head.sgl;
> +
> +       while (npages) {
> +               cond_resched();
> +               ret = pin_user_pages_fast(cur_base,
> +                                         min_t(unsigned long, npages,
> +                                               PAGE_SIZE /
> +                                         sizeof(struct page *)),
> +                                         gup_flags | FOLL_LONGTERM, page_list);
> +               if (ret < 0)
> +                       goto umem_release;
> +
> +               cur_base += ret * PAGE_SIZE;
> +               npages   -= ret;
> +
> +               sg = ib_umem_add_sg_table(sg, page_list, ret,
> +                               dma_get_max_seg_size(device->dma_device),
> +                               &umem->sg_nents);
> +       }
> +
> +       sg_mark_end(sg);
> +
> +       if (access & IB_ACCESS_RELAXED_ORDERING)
> +               dma_attr |= DMA_ATTR_WEAK_ORDERING;
> +
> +       umem->nmap =
> +               ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
> +                                   DMA_BIDIRECTIONAL, dma_attr);
> +
> +       if (!umem->nmap) {
> +               ret = -ENOMEM;
> +               goto umem_release;
> +       }
> +
> +       ret = 0;
> +       goto out;
> +
> +umem_release:
> +       __ib_umem_release(device, umem, 0);
> +vma:
> +       atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
> +out:
> +       free_page((unsigned long) page_list);
> +umem_kfree:
> +       if (ret) {
> +               mmdrop(umem->owning_mm);
> +               kfree(umem);
> +       }
> +       return ret ? ERR_PTR(ret) : umem;
> +}
> +EXPORT_SYMBOL(ib_umem_hugepage_get);
> +
>  /**
>   * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
>   *
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 676c57f5ca80..fc6350ff21e6 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -96,6 +96,9 @@ static inline void
> __rdma_umem_block_iter_start(struct ib_block_iter *biter,
>              __rdma_block_iter_next(biter);)
>
>  #ifdef CONFIG_INFINIBAND_USER_MEM
> +struct ib_umem *ib_umem_hugepage_get(struct ib_device *device,
> +                                    unsigned long addr,
> +                                    size_t size, int access);
>
>  struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
>                             size_t size, int access);
> --
> 2.25.1


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-07 14:28 ` Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list Zhu Yanjun
  2021-03-07 17:20   ` Leon Romanovsky
@ 2021-03-07 17:22   ` Leon Romanovsky
  2021-03-08  2:44     ` Zhu Yanjun
  2021-03-08 10:13     ` Zhu Yanjun
  2021-03-08  0:44   ` Jason Gunthorpe
  2 siblings, 2 replies; 30+ messages in thread
From: Leon Romanovsky @ 2021-03-07 17:22 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Doug Ledford, Jason Gunthorpe, RDMA mailing list, maorg

On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> From: Zhu Yanjun <zyjzyj2000@gmail.com>
>
> After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> the sg list from ib_ume_get is like the following:
> "
> sg_dma_address(sg):0x4b3c1ce000
> sg_dma_address(sg):0x4c3c1cd000
> sg_dma_address(sg):0x4d3c1cc000
> sg_dma_address(sg):0x4e3c1cb000
> "
>
> But sometimes, we need sg list like the following:
> "
> sg_dma_address(sg):0x203b400000
> sg_dma_address(sg):0x213b200000
> sg_dma_address(sg):0x223b000000
> sg_dma_address(sg):0x233ae00000
> sg_dma_address(sg):0x243ac00000
> "
> The function ib_umem_add_sg_table can provide the sg list like the
> second. And this function is removed in the commit ("RDMA/umem: Move
> to allocate SG table from pages"). Now I add it back.
>
> The new function is ib_umem_get to ib_umem_hugepage_get that calls
> ib_umem_add_sg_table.
>
> This function ib_umem_huagepage_get can get 4K, 2M sg list dma address.
>
> Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages")
> Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
> ---
>  drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         |   3 +
>  2 files changed, 200 insertions(+)

You didn't use the newly introduced function and didn't really explain
why it is needed.

Thanks


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-07 14:28 ` Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list Zhu Yanjun
  2021-03-07 17:20   ` Leon Romanovsky
  2021-03-07 17:22   ` Leon Romanovsky
@ 2021-03-08  0:44   ` Jason Gunthorpe
  2021-03-08  2:47     ` Zhu Yanjun
  2 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-08  0:44 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, maorg

On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> From: Zhu Yanjun <zyjzyj2000@gmail.com>
> 
> After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> the sg list from ib_ume_get is like the following:

If you are not getting huge pages, then you need to fix
__sg_alloc_table_from_pages().

But as far as I know it is not buggy like you are describing.

> The new function is ib_umem_get to ib_umem_hugepage_get that calls
> ib_umem_add_sg_table.

Nope

Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-07 17:22   ` Leon Romanovsky
@ 2021-03-08  2:44     ` Zhu Yanjun
  2021-03-08 10:13     ` Zhu Yanjun
  1 sibling, 0 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-08  2:44 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Doug Ledford, Jason Gunthorpe, RDMA mailing list, maorg

On Mon, Mar 8, 2021 at 1:22 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> > From: Zhu Yanjun <zyjzyj2000@gmail.com>
> >
> > After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> > the sg list from ib_ume_get is like the following:
> > "
> > sg_dma_address(sg):0x4b3c1ce000
> > sg_dma_address(sg):0x4c3c1cd000
> > sg_dma_address(sg):0x4d3c1cc000
> > sg_dma_address(sg):0x4e3c1cb000
> > "
> >
> > But sometimes, we need sg list like the following:
> > "
> > sg_dma_address(sg):0x203b400000
> > sg_dma_address(sg):0x213b200000
> > sg_dma_address(sg):0x223b000000
> > sg_dma_address(sg):0x233ae00000
> > sg_dma_address(sg):0x243ac00000
> > "
> > The function ib_umem_add_sg_table can provide the sg list like the
> > second. And this function is removed in the commit ("RDMA/umem: Move
> > to allocate SG table from pages"). Now I add it back.
> >
> > The new function is ib_umem_get to ib_umem_hugepage_get that calls
> > ib_umem_add_sg_table.
> >
> > This function ib_umem_huagepage_get can get 4K, 2M sg list dma address.
> >
> > Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages")
> > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
> > ---
> >  drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++
> >  include/rdma/ib_umem.h         |   3 +
> >  2 files changed, 200 insertions(+)
>
> You didn't use newly introduced function and didn't really explain why
> it is needed.

OK. I will explain later.

Zhu Yanjun

>
> Thanks


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-08  0:44   ` Jason Gunthorpe
@ 2021-03-08  2:47     ` Zhu Yanjun
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-08  2:47 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list, maorg

On Mon, Mar 8, 2021 at 8:44 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> > From: Zhu Yanjun <zyjzyj2000@gmail.com>
> >
> > After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> > the sg list from ib_ume_get is like the following:
>
> If you are not getting huge pages then you need to fix
> __sg_alloc_table_from_pages()
>
> But as far as I know it is not buggy like you are describing.

On the same host and in the same test, both __sg_alloc_table_from_pages
and ib_umem_add_sg_table are called at the same time, but the returned
sg lists are different. It is weird.

Zhu Yanjun

>
> > The new function is ib_umem_get to ib_umem_hugepage_get that calls
> > ib_umem_add_sg_table.
>
> Nope
>
> Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-07 17:22   ` Leon Romanovsky
  2021-03-08  2:44     ` Zhu Yanjun
@ 2021-03-08 10:13     ` Zhu Yanjun
  2021-03-08 12:16       ` Jason Gunthorpe
  1 sibling, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-08 10:13 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Doug Ledford, Jason Gunthorpe, RDMA mailing list, maorg

On Mon, Mar 8, 2021 at 1:22 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Sun, Mar 07, 2021 at 10:28:33PM +0800, Zhu Yanjun wrote:
> > From: Zhu Yanjun <zyjzyj2000@gmail.com>
> >
> > After the commit ("RDMA/umem: Move to allocate SG table from pages"),
> > the sg list from ib_ume_get is like the following:
> > "
> > sg_dma_address(sg):0x4b3c1ce000
> > sg_dma_address(sg):0x4c3c1cd000
> > sg_dma_address(sg):0x4d3c1cc000
> > sg_dma_address(sg):0x4e3c1cb000
> > "
> >
> > But sometimes, we need sg list like the following:
> > "
> > sg_dma_address(sg):0x203b400000
> > sg_dma_address(sg):0x213b200000
> > sg_dma_address(sg):0x223b000000
> > sg_dma_address(sg):0x233ae00000
> > sg_dma_address(sg):0x243ac00000
> > "
> > The function ib_umem_add_sg_table can provide the sg list like the
> > second. And this function is removed in the commit ("RDMA/umem: Move
> > to allocate SG table from pages"). Now I add it back.
> >
> > The new function is ib_umem_get to ib_umem_hugepage_get that calls
> > ib_umem_add_sg_table.
> >
> > This function ib_umem_huagepage_get can get 4K, 2M sg list dma address.
> >
> > Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages")
> > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
> > ---
> >  drivers/infiniband/core/umem.c | 197 +++++++++++++++++++++++++++++++++
> >  include/rdma/ib_umem.h         |   3 +
> >  2 files changed, 200 insertions(+)
>
> You didn't use newly introduced function and didn't really explain why

Thanks Leon and Jason.
Now I set ib_dma_max_seg_size to SZ_2M, and this problem is fixed.
This problem is caused by the capacity parameter of our device.

Originally, ib_dma_max_seg_size was set to UINT_MAX, and then the sg dma
addresses were different from the ones before the commit ("RDMA/umem:
Move to allocate SG table from pages").

I delved into the source code of __sg_alloc_table_from_pages and found
that this function is related to ib_dma_max_seg_size. When
ib_dma_max_seg_size is set to UINT_MAX, the sg entries are 4K (one
page); when ib_dma_max_seg_size is set to SZ_2M, the sg entries are 2M.

The function ib_umem_add_sg_table is not related to
ib_dma_max_seg_size, so the value of ib_dma_max_seg_size does not
affect its sg dma addresses.

Anyway, this problem is not caused by the function __sg_alloc_table_from_pages.
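
For reference, the workaround boils down to capping the device's DMA
segment size before the umem is built, roughly like this (sketch only;
err and ibdev are placeholders, and the SZ_2M value and the probe-time
placement are specific to our device):
"
	/* Sketch: cap the max DMA segment size once, e.g. at probe time,
	 * so that __sg_alloc_table_from_pages() never merges beyond SZ_2M.
	 */
	err = dma_set_max_seg_size(ibdev->dma_device, SZ_2M);
	if (err)
		return err;
"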

Please ignore this commit.

Zhu Yanjun

> it is needed.
>
> Thanks


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-08 10:13     ` Zhu Yanjun
@ 2021-03-08 12:16       ` Jason Gunthorpe
  2021-03-11 10:41         ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-08 12:16 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:

> And I delved into the source code of __sg_alloc_table_from_pages. I
> found that this function is related with ib_dma_max_seg_size. So
> when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> address is 2M now.

That seems like a bug, you should fix it

Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-08 12:16       ` Jason Gunthorpe
@ 2021-03-11 10:41         ` Zhu Yanjun
  2021-03-12  0:25           ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-11 10:41 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
>
> > And I delved into the source code of __sg_alloc_table_from_pages. I
> > found that this function is related with ib_dma_max_seg_size. So
> > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > address is 2M now.
>
> That seems like a bug, you should fix it

Hi, Jason && Leon

I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
In __sg_alloc_table_from_pages:

"
 449         if (prv) {
 450                 unsigned long paddr = (page_to_pfn(sg_page(prv)) * PAGE_SIZE +
 451                                        prv->offset + prv->length) /
 452                                       PAGE_SIZE;
 453
 454                 if (WARN_ON(offset))
 455                         return ERR_PTR(-EINVAL);
 456
 457                 /* Merge contiguous pages into the last SG */
 458                 prv_len = prv->length;
 459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
 460                         if (prv->length + PAGE_SIZE > max_segment)
 461                                 break;
 462                         prv->length += PAGE_SIZE;
 463                         paddr++;
 464                         pages++;
 465                         n_pages--;
 466                 }
 467                 if (!n_pages)
 468                         goto out;
 469         }

"
If prv->length + PAGE_SIZE > max_segment, then another sg entry is
started. In the commit "RDMA/umem: Move to allocate SG table from
pages", max_segment is dma_get_max_seg_size(), which is normally
UINT_MAX. So on my host, prv->length + PAGE_SIZE is usually less than
max_segment, since length is an unsigned int.

So the following has very little chance to execute.
"
 485         for (i = 0; i < chunks; i++) {
 ...
 490                 for (j = cur_page + 1; j < n_pages; j++) {
 491                         seg_len += PAGE_SIZE;
 492                         if (seg_len >= max_segment ||
 493                             page_to_pfn(pages[j]) !=
 494                             page_to_pfn(pages[j - 1]) + 1)
 495                                 break;
 496                 }
 497
 498                 /* Pass how many chunks might be left */
 499                 s = get_next_sg(sgt, s, chunks - i + left_pages, gfp_mask);
...
 510                 sg_set_page(s, pages[cur_page],
 511                             min_t(unsigned long, size, chunk_size), offset);
 512                 added_nents++;
...
516         }
"

In the function ib_umem_add_sg_table, max_seg_sz is used as below:
"
               /* Squash N contiguous pages from page_list into current sge */
               if (update_cur_sg) {
                       if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
                               sg_set_page(sg, sg_page(sg),
                                           sg->length + (len << PAGE_SHIFT),
                                           0);
                               update_cur_sg = false;
                               continue;
                       }
                       update_cur_sg = false;
               }

               /* Squash N contiguous pages into next sge or first sge */
               if (!first)
                       sg = sg_next(sg);

               (*nents)++;
               sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
"
If (max_seg_sz - sg->length) >= (len << PAGE_SHIFT), the functions
sg_next and sg_set_page are called several times.

The different usage of max_seg_sz/max_segment is the root cause of the
different sg_dma_addresses returned by __sg_alloc_table_from_pages and
ib_umem_add_sg_table.
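
For the record, the size of each pinning batch is what bounds the cap in
the patch below; on a 64-bit host with 4K pages the numbers work out like
this (sketch of the arithmetic only):
"
/* page_list is one page of struct page pointers (see ib_umem_get), so
 * each pin_user_pages_fast() batch returns at most 512 pages, i.e.
 * ret * PAGE_SIZE is at most 2 MiB per batch.
 */
enum {
	SKETCH_PAGE_SIZE     = 4096,                    /* assumed 4K pages */
	SKETCH_PTRS_PER_PAGE = SKETCH_PAGE_SIZE / 8,    /* 512 pointers on 64-bit */
	SKETCH_MAX_BATCH     = SKETCH_PTRS_PER_PAGE * SKETCH_PAGE_SIZE, /* 2 MiB */
};
"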

The following commit tries to fix this problem by capping max_segment
at the size of each pinned batch; please comment on it.

From 65472ef21146b0fb72d5eb3e2fe1277380d29446 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <zyjzyj2000@gmail.com>
Date: Thu, 11 Mar 2021 12:35:40 -0500
Subject: [PATCH 1/1] RDMA/umem: fix different sg_dma_address problem

After the commit 0c16d9635e3a ("RDMA/umem: Move to allocate SG table
from pages"), the returned sg list has different dma addresses compared
with the ones produced by the removed function ib_umem_add_sg_table.

The root cause is that max_seg_sz/max_segment is used differently in
the functions ib_umem_add_sg_table and __sg_alloc_table_from_pages.

Fixes: 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from pages")
Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
---
 drivers/infiniband/core/umem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 2dde99a9ba07..71188edbb45f 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -232,7 +232,9 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
                npages -= ret;
                sg = __sg_alloc_table_from_pages(&umem->sg_head, page_list, ret,
                                0, ret << PAGE_SHIFT,
-                               ib_dma_max_seg_size(device), sg, npages,
+                               min_t(unsigned int,
+                               ib_dma_max_seg_size(device), ret * PAGE_SIZE),
+                               sg, npages,
                                GFP_KERNEL);
                umem->sg_nents = umem->sg_head.nents;
                if (IS_ERR(sg)) {
--
2.27.0

>
> Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-11 10:41         ` Zhu Yanjun
@ 2021-03-12  0:25           ` Jason Gunthorpe
  2021-03-12  8:04             ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-12  0:25 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote:
> On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
> >
> > > And I delved into the source code of __sg_alloc_table_from_pages. I
> > > found that this function is related with ib_dma_max_seg_size. So
> > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > > address is 2M now.
> >
> > That seems like a bug, you should fix it
> 
> Hi, Jason && Leon
> 
> I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
> In __sg_alloc_table_from_pages:
> 
> "
>  449         if (prv) {
>  450                 unsigned long paddr = (page_to_pfn(sg_page(prv))
> * PAGE_SIZE +
>  451                                        prv->offset + prv->length) /
>  452                                       PAGE_SIZE;
>  453
>  454                 if (WARN_ON(offset))
>  455                         return ERR_PTR(-EINVAL);
>  456
>  457                 /* Merge contiguous pages into the last SG */
>  458                 prv_len = prv->length;
>  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
>  460                         if (prv->length + PAGE_SIZE > max_segment)
>  461                                 break;
>  462                         prv->length += PAGE_SIZE;
>  463                         paddr++;
>  464                         pages++;
>  465                         n_pages--;
>  466                 }
>  467                 if (!n_pages)
>  468                         goto out;
>  469         }
> 
> "
> if prv->length + PAGE_SIZE > max_segment, then set another sg.
> In the commit "RDMA/umem: Move to allocate SG table from pages",
> max_segment is dma_get_max_seg_size.
> Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is
> usually less than max_segment
> since length is unsigned int.

I don't understand what you are trying to say

  460                         if (prv->length + PAGE_SIZE > max_segment)

max_segment should be a very big number and "prv->length + PAGE_SIZE" should
always be < max_segment, so it should always be increasing the size of
prv->length; 'prv' here is the sgl.

The other loops are the same.

Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12  0:25           ` Jason Gunthorpe
@ 2021-03-12  8:04             ` Zhu Yanjun
  2021-03-12  8:05               ` Zhu Yanjun
  2021-03-12 13:02               ` Jason Gunthorpe
  0 siblings, 2 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-12  8:04 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote:
> > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
> > >
> > > > And I delved into the source code of __sg_alloc_table_from_pages. I
> > > > found that this function is related with ib_dma_max_seg_size. So
> > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > > > address is 2M now.
> > >
> > > That seems like a bug, you should fix it
> >
> > Hi, Jason && Leon
> >
> > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
> > In __sg_alloc_table_from_pages:
> >
> > "
> >  449         if (prv) {
> >  450                 unsigned long paddr = (page_to_pfn(sg_page(prv))
> > * PAGE_SIZE +
> >  451                                        prv->offset + prv->length) /
> >  452                                       PAGE_SIZE;
> >  453
> >  454                 if (WARN_ON(offset))
> >  455                         return ERR_PTR(-EINVAL);
> >  456
> >  457                 /* Merge contiguous pages into the last SG */
> >  458                 prv_len = prv->length;
> >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> >  460                         if (prv->length + PAGE_SIZE > max_segment)
> >  461                                 break;
> >  462                         prv->length += PAGE_SIZE;
> >  463                         paddr++;
> >  464                         pages++;
> >  465                         n_pages--;
> >  466                 }
> >  467                 if (!n_pages)
> >  468                         goto out;
> >  469         }
> >
> > "
> > if prv->length + PAGE_SIZE > max_segment, then set another sg.
> > In the commit "RDMA/umem: Move to allocate SG table from pages",
> > max_segment is dma_get_max_seg_size.
> > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is
> > usually less than max_segment
> > since length is unsigned int.
>
> I don't understand what you are trying to say
>
>   460                         if (prv->length + PAGE_SIZE > max_segment)
>
> max_segment should be a very big number and "prv->length + PAGE_SIZE" should
> always be < max_segment so it should always be increasing the size of
> prv->length and 'rpv' here is the sgl.

Sure.

When max_segment is UINT_MAX, prv->length can grow up to UINT_MAX - PAGE_SIZE.
It is big. That is, the segment size is up to UINT_MAX - PAGE_SIZE.

But from the function ib_umem_add_sg_table, the prv->length is SZ_2M.
That is, the segment size is SZ_2M.

It is the difference between the 2 functions.

Zhu Yanjun


>
> The other loops are the same.
>
> Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12  8:04             ` Zhu Yanjun
@ 2021-03-12  8:05               ` Zhu Yanjun
  2021-03-12 13:02               ` Jason Gunthorpe
  1 sibling, 0 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-12  8:05 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 4:04 PM Zhu Yanjun <zyjzyj2000@gmail.com> wrote:
>
> On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote:
> > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
> > > >
> > > > > And I delved into the source code of __sg_alloc_table_from_pages. I
> > > > > found that this function is related with ib_dma_max_seg_size. So
> > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > > > > address is 2M now.
> > > >
> > > > That seems like a bug, you should fix it
> > >
> > > Hi, Jason && Leon
> > >
> > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
> > > In __sg_alloc_table_from_pages:
> > >
> > > "
> > >  449         if (prv) {
> > >  450                 unsigned long paddr = (page_to_pfn(sg_page(prv))
> > > * PAGE_SIZE +
> > >  451                                        prv->offset + prv->length) /
> > >  452                                       PAGE_SIZE;
> > >  453
> > >  454                 if (WARN_ON(offset))
> > >  455                         return ERR_PTR(-EINVAL);
> > >  456
> > >  457                 /* Merge contiguous pages into the last SG */
> > >  458                 prv_len = prv->length;
> > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > >  460                         if (prv->length + PAGE_SIZE > max_segment)
> > >  461                                 break;
> > >  462                         prv->length += PAGE_SIZE;
> > >  463                         paddr++;
> > >  464                         pages++;
> > >  465                         n_pages--;
> > >  466                 }
> > >  467                 if (!n_pages)
> > >  468                         goto out;
> > >  469         }
> > >
> > > "
> > > if prv->length + PAGE_SIZE > max_segment, then set another sg.
> > > In the commit "RDMA/umem: Move to allocate SG table from pages",
> > > max_segment is dma_get_max_seg_size.
> > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is
> > > usually less than max_segment
> > > since length is unsigned int.
> >
> > I don't understand what you are trying to say
> >
> >   460                         if (prv->length + PAGE_SIZE > max_segment)
> >
> > max_segment should be a very big number and "prv->length + PAGE_SIZE" should
> > always be < max_segment so it should always be increasing the size of
> > prv->length and 'rpv' here is the sgl.
>
> Sure.
>
> When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE.
> It is big. That is, segment size is UINT_MAX - PAGE_SIZE.
>
> But from the function ib_umem_add_sg_table, the prv->length is SZ_2M.
> That is, segment size if SZ_2M.
>
> It is the difference between the 2 functions.

BTW, my commit is to fix this difference.

Zhu Yanjun

>
> Zhu Yanjun
>
>
> >
> > The other loops are the same.
> >
> > Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12  8:04             ` Zhu Yanjun
  2021-03-12  8:05               ` Zhu Yanjun
@ 2021-03-12 13:02               ` Jason Gunthorpe
  2021-03-12 13:42                 ` Zhu Yanjun
  1 sibling, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-12 13:02 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 04:04:35PM +0800, Zhu Yanjun wrote:
> On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote:
> > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
> > > >
> > > > > And I delved into the source code of __sg_alloc_table_from_pages. I
> > > > > found that this function is related with ib_dma_max_seg_size. So
> > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > > > > address is 2M now.
> > > >
> > > > That seems like a bug, you should fix it
> > >
> > > Hi, Jason && Leon
> > >
> > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
> > > In __sg_alloc_table_from_pages:
> > >
> > > "
> > >  449         if (prv) {
> > >  450                 unsigned long paddr = (page_to_pfn(sg_page(prv))
> > > * PAGE_SIZE +
> > >  451                                        prv->offset + prv->length) /
> > >  452                                       PAGE_SIZE;
> > >  453
> > >  454                 if (WARN_ON(offset))
> > >  455                         return ERR_PTR(-EINVAL);
> > >  456
> > >  457                 /* Merge contiguous pages into the last SG */
> > >  458                 prv_len = prv->length;
> > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > >  460                         if (prv->length + PAGE_SIZE > max_segment)
> > >  461                                 break;
> > >  462                         prv->length += PAGE_SIZE;
> > >  463                         paddr++;
> > >  464                         pages++;
> > >  465                         n_pages--;
> > >  466                 }
> > >  467                 if (!n_pages)
> > >  468                         goto out;
> > >  469         }
> > >
> > > "
> > > if prv->length + PAGE_SIZE > max_segment, then set another sg.
> > > In the commit "RDMA/umem: Move to allocate SG table from pages",
> > > max_segment is dma_get_max_seg_size.
> > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is
> > > usually less than max_segment
> > > since length is unsigned int.
> >
> > I don't understand what you are trying to say
> >
> >   460                         if (prv->length + PAGE_SIZE > max_segment)
> >
> > max_segment should be a very big number and "prv->length + PAGE_SIZE" should
> > always be < max_segment so it should always be increasing the size of
> > prv->length and 'rpv' here is the sgl.
> 
> Sure.
> 
> When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE.
> It is big. That is, segment size is UINT_MAX - PAGE_SIZE.
> 
> But from the function ib_umem_add_sg_table, the prv->length is SZ_2M.
> That is, segment size if SZ_2M.
> 
> It is the difference between the 2 functions.

I still have no idea what you are trying to say. Why would prv->length
be 'UINT - PAGE_SIZE'? That sounds like max_segment

Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12 13:02               ` Jason Gunthorpe
@ 2021-03-12 13:42                 ` Zhu Yanjun
  2021-03-12 13:49                   ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-12 13:42 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 9:02 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Fri, Mar 12, 2021 at 04:04:35PM +0800, Zhu Yanjun wrote:
> > On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote:
> > > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > >
> > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
> > > > >
> > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I
> > > > > > found that this function is related with ib_dma_max_seg_size. So
> > > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > > > > > address is 2M now.
> > > > >
> > > > > That seems like a bug, you should fix it
> > > >
> > > > Hi, Jason && Leon
> > > >
> > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
> > > > In __sg_alloc_table_from_pages:
> > > >
> > > > "
> > > >  449         if (prv) {
> > > >  450                 unsigned long paddr = (page_to_pfn(sg_page(prv))
> > > > * PAGE_SIZE +
> > > >  451                                        prv->offset + prv->length) /
> > > >  452                                       PAGE_SIZE;
> > > >  453
> > > >  454                 if (WARN_ON(offset))
> > > >  455                         return ERR_PTR(-EINVAL);
> > > >  456
> > > >  457                 /* Merge contiguous pages into the last SG */
> > > >  458                 prv_len = prv->length;
> > > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > >  460                         if (prv->length + PAGE_SIZE > max_segment)
> > > >  461                                 break;
> > > >  462                         prv->length += PAGE_SIZE;
> > > >  463                         paddr++;
> > > >  464                         pages++;
> > > >  465                         n_pages--;
> > > >  466                 }
> > > >  467                 if (!n_pages)
> > > >  468                         goto out;
> > > >  469         }
> > > >
> > > > "
> > > > if prv->length + PAGE_SIZE > max_segment, then set another sg.
> > > > In the commit "RDMA/umem: Move to allocate SG table from pages",
> > > > max_segment is dma_get_max_seg_size.
> > > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is
> > > > usually less than max_segment
> > > > since length is unsigned int.
> > >
> > > I don't understand what you are trying to say
> > >
> > >   460                         if (prv->length + PAGE_SIZE > max_segment)
> > >
> > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should
> > > always be < max_segment so it should always be increasing the size of
> > > prv->length and 'rpv' here is the sgl.
> >
> > Sure.
> >
> > When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE.
> > It is big. That is, segment size is UINT_MAX - PAGE_SIZE.
> >
> > But from the function ib_umem_add_sg_table, the prv->length is SZ_2M.
> > That is, segment size if SZ_2M.
> >
> > It is the difference between the 2 functions.
>
> I still have no idea what you are trying to say. Why would prv->length
> be 'UINT - PAGE_SIZE'? That sounds like max_segment

Sure. Whatever; this prv->length from __sg_alloc_table_from_pages
is greater than the sg->length from the function ib_umem_add_sg_table.

__sg_alloc_table_from_pages:
"
 457                 /* Merge contiguous pages into the last SG */
 458                 prv_len = prv->length;
 459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
 460                         if (prv->length + PAGE_SIZE > max_segment)  <-------------- prv->length > max_segment - PAGE_SIZE
 461                                 break;
 462                         prv->length += PAGE_SIZE;
 463                         paddr++;
 464                         pages++;
 465                         n_pages--;
 466                 }
 467                 if (!n_pages)
 468                         goto out;
"

Zhu Yanjun
>
> Jason


* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12 13:42                 ` Zhu Yanjun
@ 2021-03-12 13:49                   ` Zhu Yanjun
  2021-03-12 14:01                     ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-12 13:49 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 9:42 PM Zhu Yanjun <zyjzyj2000@gmail.com> wrote:
>
> On Fri, Mar 12, 2021 at 9:02 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Fri, Mar 12, 2021 at 04:04:35PM +0800, Zhu Yanjun wrote:
> > > On Fri, Mar 12, 2021 at 8:25 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Thu, Mar 11, 2021 at 06:41:43PM +0800, Zhu Yanjun wrote:
> > > > > On Mon, Mar 8, 2021 at 8:16 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 08, 2021 at 06:13:52PM +0800, Zhu Yanjun wrote:
> > > > > >
> > > > > > > And I delved into the source code of __sg_alloc_table_from_pages. I
> > > > > > > found that this function is related with ib_dma_max_seg_size. So
> > > > > > > when ib_dma_max_seg_size is set to UINT_MAX, the sg dma address is
> > > > > > > 4K (one page). When ib_dma_max_seg_size is set to SZ_2M, the sg dma
> > > > > > > address is 2M now.
> > > > > >
> > > > > > That seems like a bug, you should fix it
> > > > >
> > > > > Hi, Jason && Leon
> > > > >
> > > > > I compared the function __sg_alloc_table_from_pages with ib_umem_add_sg_table.
> > > > > In __sg_alloc_table_from_pages:
> > > > >
> > > > > "
> > > > >  449         if (prv) {
> > > > >  450                 unsigned long paddr = (page_to_pfn(sg_page(prv))
> > > > > * PAGE_SIZE +
> > > > >  451                                        prv->offset + prv->length) /
> > > > >  452                                       PAGE_SIZE;
> > > > >  453
> > > > >  454                 if (WARN_ON(offset))
> > > > >  455                         return ERR_PTR(-EINVAL);
> > > > >  456
> > > > >  457                 /* Merge contiguous pages into the last SG */
> > > > >  458                 prv_len = prv->length;
> > > > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > > >  460                         if (prv->length + PAGE_SIZE > max_segment)
> > > > >  461                                 break;
> > > > >  462                         prv->length += PAGE_SIZE;
> > > > >  463                         paddr++;
> > > > >  464                         pages++;
> > > > >  465                         n_pages--;
> > > > >  466                 }
> > > > >  467                 if (!n_pages)
> > > > >  468                         goto out;
> > > > >  469         }
> > > > >
> > > > > "
> > > > > if prv->length + PAGE_SIZE > max_segment, then set another sg.
> > > > > In the commit "RDMA/umem: Move to allocate SG table from pages",
> > > > > max_segment is dma_get_max_seg_size.
> > > > > Normally it is UINT_MAX. So in my host, prv->length + PAGE_SIZE is
> > > > > usually less than max_segment
> > > > > since length is unsigned int.
> > > >
> > > > I don't understand what you are trying to say
> > > >
> > > >   460                         if (prv->length + PAGE_SIZE > max_segment)
> > > >
> > > > max_segment should be a very big number and "prv->length + PAGE_SIZE" should
> > > > always be < max_segment so it should always be increasing the size of
> > > > prv->length and 'rpv' here is the sgl.
> > >
> > > Sure.
> > >
> > > When max_segment is UINT_MAX, prv->length is UINT_MAX - PAGE_SIZE.
> > > It is big. That is, segment size is UINT_MAX - PAGE_SIZE.
> > >
> > > But from the function ib_umem_add_sg_table, the prv->length is SZ_2M.
> > > That is, segment size if SZ_2M.
> > >
> > > It is the difference between the 2 functions.
> >
> > I still have no idea what you are trying to say. Why would prv->length
> > be 'UINT - PAGE_SIZE'? That sounds like max_segment
>
> Sure. whatever, this prv->length from __sg_alloc_table_from_pages
> is greater than sg->length from the function ib_umem_add_sg_table.

In short, the sg list from __sg_alloc_table_from_pages is different
from the sg list from ib_umem_add_sg_table.
This difference changes the later behavior.

Zhu Yanjun

>
> __sg_alloc_table_from_pages:
> "
>  457                 /* Merge contiguous pages into the last SG */
>  458                 prv_len = prv->length;
>  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
>  460                         if (prv->length + PAGE_SIZE >
> max_segment)  <-------------- prv->length > max_segment - PAGE_SIZE
>  461                                 break;
>  462                         prv->length += PAGE_SIZE;
>  463                         paddr++;
>  464                         pages++;
>  465                         n_pages--;
>  466                 }
>  467                 if (!n_pages)
>  468                         goto out;
> "
>
> Zhu Yanjun
> >
> > Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12 13:49                   ` Zhu Yanjun
@ 2021-03-12 14:01                     ` Jason Gunthorpe
  2021-03-13  3:02                       ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-12 14:01 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> In short, the sg list from __sg_alloc_table_from_pages is different
> from the sg list from ib_umem_add_sg_table.

I don't care about different. Tell me what is wrong with what we have
today.

I thought your first message said the sgl's were too small, but now
you seem to say they are too big?

I still have no idea what this problem is

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-12 14:01                     ` Jason Gunthorpe
@ 2021-03-13  3:02                       ` Zhu Yanjun
  2021-03-19 13:00                         ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-13  3:02 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > In short, the sg list from __sg_alloc_table_from_pages is different
> > from the sg list from ib_umem_add_sg_table.
>
> I don't care about different. Tell me what is wrong with what we have
> today.
>
> I thought your first message said the sgl's were too small, but now
> you seem to say they are too big?

Sure.

The sg list from __sg_alloc_table_from_pages, length of sg is too big.
And the dma address is like the followings:

"
sg_dma_address(sg):0x4b3c1ce000
sg_dma_address(sg):0x4c3c1cd000
sg_dma_address(sg):0x4d3c1cc000
sg_dma_address(sg):0x4e3c1cb000
"

In the sg list from ib_umem_add_sg_table, the length of each sg is 2M.
And the dma addresses are like the following:
"
sg_dma_address(sg):0x203b400000
sg_dma_address(sg):0x213b200000
sg_dma_address(sg):0x223b000000
sg_dma_address(sg):0x233ae00000
sg_dma_address(sg):0x243ac00000
"
The above 2 sg lists are both handled by the function ib_umem_find_best_pgsz,
and the values it returns for them are different.
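
To illustrate, here is a small stand-alone sketch (a simplified model, not
the kernel's ib_umem_find_best_pgsz; the segment lengths below are assumed,
since the dumps above only show addresses): a candidate page size is usable
only when every interior segment boundary is aligned to it, so the 4K list
cannot give more than SZ_4K while the 2M list gives SZ_2M.
"
#include <stdio.h>
#include <stdint.h>

#define SZ_4K 0x1000ULL
#define SZ_2M 0x200000ULL
#define SZ_1G 0x40000000ULL

/* Toy model: a page size p is usable when each segment start (except the
 * first) and each segment length (except the last) is a multiple of p.
 */
static uint64_t best_pgsz(const uint64_t *addr, const uint64_t *len, int n,
                          const uint64_t *cand, int ncand)
{
        uint64_t best = 0;
        int c, i;

        for (c = 0; c < ncand; c++) {
                uint64_t p = cand[c];
                int ok = 1;

                for (i = 0; i < n && ok; i++) {
                        if (i != 0 && (addr[i] & (p - 1)))
                                ok = 0;
                        if (i != n - 1 && (len[i] & (p - 1)))
                                ok = 0;
                }
                if (ok && p > best)
                        best = p;
        }
        return best;
}

int main(void)
{
        /* 4K segments, like the first dump above (lengths assumed) */
        uint64_t a1[] = { 0x4b3c1ce000, 0x4c3c1cd000, 0x4d3c1cc000, 0x4e3c1cb000 };
        uint64_t l1[] = { SZ_4K, SZ_4K, SZ_4K, SZ_4K };
        /* 2M segments, like the second dump above (lengths assumed) */
        uint64_t a2[] = { 0x203b400000, 0x213b200000, 0x223b000000, 0x233ae00000 };
        uint64_t l2[] = { SZ_2M, SZ_2M, SZ_2M, SZ_2M };
        uint64_t cand[] = { SZ_4K, SZ_2M, SZ_1G };

        printf("4K list -> best page size 0x%llx\n",
               (unsigned long long)best_pgsz(a1, l1, 4, cand, 3));
        printf("2M list -> best page size 0x%llx\n",
               (unsigned long long)best_pgsz(a2, l2, 4, cand, 3));
        return 0;
}
"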

Zhu Yanjun
>
> I still have no idea what this problem is
>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-13  3:02                       ` Zhu Yanjun
@ 2021-03-19 13:00                         ` Jason Gunthorpe
  2021-03-19 13:33                           ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-19 13:00 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > from the sg list from ib_umem_add_sg_table.
> >
> > I don't care about different. Tell me what is wrong with what we have
> > today.
> >
> > I thought your first message said the sgl's were too small, but now
> > you seem to say they are too big?
> 
> Sure.
> 
> The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> And the dma address is like the followings:
> 
> "
> sg_dma_address(sg):0x4b3c1ce000
> sg_dma_address(sg):0x4c3c1cd000
> sg_dma_address(sg):0x4d3c1cc000
> sg_dma_address(sg):0x4e3c1cb000
> "

Ok, so how does too big a dma segment size cause
__sg_alloc_table_from_pages() to return sg elements that are too
small?

I assume there is some kind of maths overflow here?

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-19 13:00                         ` Jason Gunthorpe
@ 2021-03-19 13:33                           ` Zhu Yanjun
  2021-03-19 13:48                             ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-19 13:33 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > > from the sg list from ib_umem_add_sg_table.
> > >
> > > I don't care about different. Tell me what is wrong with what we have
> > > today.
> > >
> > > I thought your first message said the sgl's were too small, but now
> > > you seem to say they are too big?
> >
> > Sure.
> >
> > The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> > And the dma address is like the followings:
> >
> > "
> > sg_dma_address(sg):0x4b3c1ce000
> > sg_dma_address(sg):0x4c3c1cd000
> > sg_dma_address(sg):0x4d3c1cc000
> > sg_dma_address(sg):0x4e3c1cb000
> > "
>
> Ok, so how does too big a dma segment side cause
> __sg_alloc_table_from_pages() to return sg elements that are too
> small?
>
> I assume there is some kind of maths overflow here?
Please check this function __sg_alloc_table_from_pages
"
...
 457                 /* Merge contiguous pages into the last SG */
 458                 prv_len = prv->length;
 459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
 460                         if (prv->length + PAGE_SIZE >
max_segment)    <--max_segment is too big. So n_pages will be 0. Then
the function will goto out to exit.
 461                                 break;
 462                         prv->length += PAGE_SIZE;
 463                         paddr++;
 464                         pages++;
 465                         n_pages--;
 466                 }
 467                 if (!n_pages)
 468                         goto out;
...
"

>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-19 13:33                           ` Zhu Yanjun
@ 2021-03-19 13:48                             ` Jason Gunthorpe
  2021-03-20  3:38                               ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-19 13:48 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote:
> On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > > > from the sg list from ib_umem_add_sg_table.
> > > >
> > > > I don't care about different. Tell me what is wrong with what we have
> > > > today.
> > > >
> > > > I thought your first message said the sgl's were too small, but now
> > > > you seem to say they are too big?
> > >
> > > Sure.
> > >
> > > The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> > > And the dma address is like the followings:
> > >
> > > "
> > > sg_dma_address(sg):0x4b3c1ce000
> > > sg_dma_address(sg):0x4c3c1cd000
> > > sg_dma_address(sg):0x4d3c1cc000
> > > sg_dma_address(sg):0x4e3c1cb000
> > > "
> >
> > Ok, so how does too big a dma segment side cause
> > __sg_alloc_table_from_pages() to return sg elements that are too
> > small?
> >
> > I assume there is some kind of maths overflow here?
> Please check this function __sg_alloc_table_from_pages
> "
> ...
>  457                 /* Merge contiguous pages into the last SG */
>  458                 prv_len = prv->length;
>  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
>  460                         if (prv->length + PAGE_SIZE >
> max_segment)    <--max_segment is too big. So n_pages will be 0. Then
> the function will goto out to exit.

You already said this. 

You are reporting 4k pages, if max_segment is larger than 4k there is
no such thing as "too big"

I assume it is "too small" because of some maths overflow.

You should add some prints and find out what is going on.

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-19 13:48                             ` Jason Gunthorpe
@ 2021-03-20  3:38                               ` Zhu Yanjun
  2021-03-20  3:49                                 ` Zhu Yanjun
  2021-03-20 20:38                                 ` Jason Gunthorpe
  0 siblings, 2 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-20  3:38 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote:
> > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > >
> > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > > > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > > > > from the sg list from ib_umem_add_sg_table.
> > > > >
> > > > > I don't care about different. Tell me what is wrong with what we have
> > > > > today.
> > > > >
> > > > > I thought your first message said the sgl's were too small, but now
> > > > > you seem to say they are too big?
> > > >
> > > > Sure.
> > > >
> > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> > > > And the dma address is like the followings:
> > > >
> > > > "
> > > > sg_dma_address(sg):0x4b3c1ce000
> > > > sg_dma_address(sg):0x4c3c1cd000
> > > > sg_dma_address(sg):0x4d3c1cc000
> > > > sg_dma_address(sg):0x4e3c1cb000
> > > > "
> > >
> > > Ok, so how does too big a dma segment side cause
> > > __sg_alloc_table_from_pages() to return sg elements that are too
> > > small?
> > >
> > > I assume there is some kind of maths overflow here?
> > Please check this function __sg_alloc_table_from_pages
> > "
> > ...
> >  457                 /* Merge contiguous pages into the last SG */
> >  458                 prv_len = prv->length;
> >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> >  460                         if (prv->length + PAGE_SIZE >
> > max_segment)    <--max_segment is too big. So n_pages will be 0. Then
> > the function will goto out to exit.
>
> You already said this.
>
> You are reporting 4k pages, if max_segment is larger than 4k there is
> no such thing as "too big"
>
> I assume it is "too small" because of some maths overflow.

 459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
 460                         if (prv->length + PAGE_SIZE >
max_segment)  <--it max_segment is big, n_pages is zero.
 461                                 break;
 462                         prv->length += PAGE_SIZE;
 463                         paddr++;
 464                         pages++;
 465                         n_pages--;
 466                 }
 467                 if (!n_pages)   <---here, this function will goto out.
 468                         goto out;
...
 509                 chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
 510                 sg_set_page(s, pages[cur_page],
 511                             min_t(unsigned long, size,
chunk_size), offset); <----this function will not be called many times
if max_segment is big.
 512                 added_nents++;
 513                 size -= chunk_size;

If max_segment is not that big, for example SZ_2M, sg_set_page will be
called every SZ_2M.
So far, I have not found any math overflow.
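
To put numbers on this, here is a small stand-alone sketch of just the
merging idea (a simplification, not the kernel's
__sg_alloc_table_from_pages; the page count is made up): with
max_segment = SZ_2M a contiguous run is split into 2M entries, while
with UINT_MAX it collapses into a single entry.
"
#include <stdio.h>
#include <limits.h>

#define PAGE_SIZE 4096UL
#define SZ_2M (2UL * 1024 * 1024)

/* Count how many sg entries a run of physically contiguous pages produces
 * when each entry's length is capped at max_segment.
 */
static unsigned long count_sg_entries(unsigned long npages,
                                      unsigned long max_segment)
{
        unsigned long entries = 0;
        unsigned long cur_len = 0;

        while (npages--) {
                if (cur_len == 0 || cur_len + PAGE_SIZE > max_segment) {
                        entries++;      /* start a new sg entry */
                        cur_len = 0;
                }
                cur_len += PAGE_SIZE;
        }
        return entries;
}

int main(void)
{
        unsigned long npages = 1024;    /* 4 MB of contiguous pages */

        printf("max_segment=UINT_MAX -> %lu entries\n",
               count_sg_entries(npages, UINT_MAX));
        printf("max_segment=SZ_2M    -> %lu entries\n",
               count_sg_entries(npages, SZ_2M));
        return 0;
}
"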

Zhu Yanjun
>
> You should add some prints and find out what is going on.
>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-20  3:38                               ` Zhu Yanjun
@ 2021-03-20  3:49                                 ` Zhu Yanjun
  2021-03-20 20:38                                 ` Jason Gunthorpe
  1 sibling, 0 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-20  3:49 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sat, Mar 20, 2021 at 11:38 AM Zhu Yanjun <zyjzyj2000@gmail.com> wrote:
>
> On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote:
> > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > > > > > from the sg list from ib_umem_add_sg_table.
> > > > > >
> > > > > > I don't care about different. Tell me what is wrong with what we have
> > > > > > today.
> > > > > >
> > > > > > I thought your first message said the sgl's were too small, but now
> > > > > > you seem to say they are too big?
> > > > >
> > > > > Sure.
> > > > >
> > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> > > > > And the dma address is like the followings:
> > > > >
> > > > > "
> > > > > sg_dma_address(sg):0x4b3c1ce000
> > > > > sg_dma_address(sg):0x4c3c1cd000
> > > > > sg_dma_address(sg):0x4d3c1cc000
> > > > > sg_dma_address(sg):0x4e3c1cb000
> > > > > "
> > > >
> > > > Ok, so how does too big a dma segment side cause
> > > > __sg_alloc_table_from_pages() to return sg elements that are too
> > > > small?
> > > >
> > > > I assume there is some kind of maths overflow here?
> > > Please check this function __sg_alloc_table_from_pages
> > > "
> > > ...
> > >  457                 /* Merge contiguous pages into the last SG */
> > >  458                 prv_len = prv->length;
> > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > >  460                         if (prv->length + PAGE_SIZE >
> > > max_segment)    <--max_segment is too big. So n_pages will be 0. Then
> > > the function will goto out to exit.
> >
> > You already said this.
> >
> > You are reporting 4k pages, if max_segment is larger than 4k there is
> > no such thing as "too big"
> >
> > I assume it is "too small" because of some maths overflow.
>
>  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
>  460                         if (prv->length + PAGE_SIZE >
> max_segment)  <--it max_segment is big, n_pages is zero.
>  461                                 break;
>  462                         prv->length += PAGE_SIZE;
>  463                         paddr++;
>  464                         pages++;
>  465                         n_pages--;
>  466                 }
>  467                 if (!n_pages)   <---here, this function will goto out.
>  468                         goto out;
> ...
>  509                 chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
>  510                 sg_set_page(s, pages[cur_page],
>  511                             min_t(unsigned long, size,
> chunk_size), offset); <----this function will not be called many times
> if max_segment is big.
>  512                 added_nents++;
>  513                 size -= chunk_size;
>
> If max_segment is not that big, for example SZ_2M, sg_set_page will be
> called every SZ_2M.
> So far, I have not found any math overflow.

147 struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
148                             size_t size, int access)
149 {
...
244         umem->nmap =
245                 ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
246                                     DMA_BIDIRECTIONAL, dma_attr);
...

And after the function ib_dma_map_sg_attrs, the dma addresses are set.
So far, I cannot find any math overflow.
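
For reference, this is a debug sketch of the kind of print that produces
dumps like the ones above; it would sit in ib_umem_get right after the
ib_dma_map_sg_attrs call (sketch only, not meant to be merged):
"
{
        struct scatterlist *sg;
        int i;

        /* Dump the mapped sg list: one line per DMA segment */
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
                pr_info("sg_dma_address(sg):0x%llx sg_dma_len(sg):%u\n",
                        (unsigned long long)sg_dma_address(sg),
                        sg_dma_len(sg));
}
"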

Zhu Yanjun


>
> Zhu Yanjun
> >
> > You should add some prints and find out what is going on.
> >
> > Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-20  3:38                               ` Zhu Yanjun
  2021-03-20  3:49                                 ` Zhu Yanjun
@ 2021-03-20 20:38                                 ` Jason Gunthorpe
  2021-03-21  8:06                                   ` Zhu Yanjun
  1 sibling, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-20 20:38 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sat, Mar 20, 2021 at 11:38:26AM +0800, Zhu Yanjun wrote:
> On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote:
> > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > >
> > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > > > > > from the sg list from ib_umem_add_sg_table.
> > > > > >
> > > > > > I don't care about different. Tell me what is wrong with what we have
> > > > > > today.
> > > > > >
> > > > > > I thought your first message said the sgl's were too small, but now
> > > > > > you seem to say they are too big?
> > > > >
> > > > > Sure.
> > > > >
> > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> > > > > And the dma address is like the followings:
> > > > >
> > > > > "
> > > > > sg_dma_address(sg):0x4b3c1ce000
> > > > > sg_dma_address(sg):0x4c3c1cd000
> > > > > sg_dma_address(sg):0x4d3c1cc000
> > > > > sg_dma_address(sg):0x4e3c1cb000
> > > > > "
> > > >
> > > > Ok, so how does too big a dma segment side cause
> > > > __sg_alloc_table_from_pages() to return sg elements that are too
> > > > small?
> > > >
> > > > I assume there is some kind of maths overflow here?
> > > Please check this function __sg_alloc_table_from_pages
> > > "
> > > ...
> > >  457                 /* Merge contiguous pages into the last SG */
> > >  458                 prv_len = prv->length;
> > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > >  460                         if (prv->length + PAGE_SIZE >
> > > max_segment)    <--max_segment is too big. So n_pages will be 0. Then
> > > the function will goto out to exit.
> >
> > You already said this.
> >
> > You are reporting 4k pages, if max_segment is larger than 4k there is
> > no such thing as "too big"
> >
> > I assume it is "too small" because of some maths overflow.
> 
>  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
>  460                         if (prv->length + PAGE_SIZE >
> max_segment)  <--it max_segment is big, n_pages is zero.

What does n_pages have to do with max_segment?

Please try to explain clearly.

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-20 20:38                                 ` Jason Gunthorpe
@ 2021-03-21  8:06                                   ` Zhu Yanjun
  2021-03-21 12:07                                     ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-21  8:06 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sun, Mar 21, 2021 at 4:38 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sat, Mar 20, 2021 at 11:38:26AM +0800, Zhu Yanjun wrote:
> > On Fri, Mar 19, 2021 at 9:48 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Fri, Mar 19, 2021 at 09:33:13PM +0800, Zhu Yanjun wrote:
> > > > On Fri, Mar 19, 2021 at 9:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > >
> > > > > On Sat, Mar 13, 2021 at 11:02:41AM +0800, Zhu Yanjun wrote:
> > > > > > On Fri, Mar 12, 2021 at 10:01 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 12, 2021 at 09:49:52PM +0800, Zhu Yanjun wrote:
> > > > > > > > In short, the sg list from __sg_alloc_table_from_pages is different
> > > > > > > > from the sg list from ib_umem_add_sg_table.
> > > > > > >
> > > > > > > I don't care about different. Tell me what is wrong with what we have
> > > > > > > today.
> > > > > > >
> > > > > > > I thought your first message said the sgl's were too small, but now
> > > > > > > you seem to say they are too big?
> > > > > >
> > > > > > Sure.
> > > > > >
> > > > > > The sg list from __sg_alloc_table_from_pages, length of sg is too big.
> > > > > > And the dma address is like the followings:
> > > > > >
> > > > > > "
> > > > > > sg_dma_address(sg):0x4b3c1ce000
> > > > > > sg_dma_address(sg):0x4c3c1cd000
> > > > > > sg_dma_address(sg):0x4d3c1cc000
> > > > > > sg_dma_address(sg):0x4e3c1cb000
> > > > > > "
> > > > >
> > > > > Ok, so how does too big a dma segment side cause
> > > > > __sg_alloc_table_from_pages() to return sg elements that are too
> > > > > small?
> > > > >
> > > > > I assume there is some kind of maths overflow here?
> > > > Please check this function __sg_alloc_table_from_pages
> > > > "
> > > > ...
> > > >  457                 /* Merge contiguous pages into the last SG */
> > > >  458                 prv_len = prv->length;
> > > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > >  460                         if (prv->length + PAGE_SIZE >
> > > > max_segment)    <--max_segment is too big. So n_pages will be 0. Then
> > > > the function will goto out to exit.
> > >
> > > You already said this.
> > >
> > > You are reporting 4k pages, if max_segment is larger than 4k there is
> > > no such thing as "too big"
> > >
> > > I assume it is "too small" because of some maths overflow.
> >
> >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> >  460                         if (prv->length + PAGE_SIZE >
> > max_segment)  <--it max_segment is big, n_pages is zero.
>
> What does n_pages have to do with max_segment?

With the following snippet
"
        struct ib_umem *region;
        region = ib_umem_get(pd->device, start, len, access);

        page_size = ib_umem_find_best_pgsz(region,
                                           SZ_4K | SZ_2M | SZ_1G,
                                           virt);
"
Before commit 0c16d9635e3a ("RDMA/umem: Move to allocate SG table from
pages"), the variable page_size is SZ_2M.
After that commit, page_size is SZ_4K.

IMHO, you can reproduce this problem in your local host.

Zhu Yanjun
>
> Please try to explain clearly.
>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-21  8:06                                   ` Zhu Yanjun
@ 2021-03-21 12:07                                     ` Jason Gunthorpe
  2021-03-21 12:54                                       ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-21 12:07 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote:
> > > > You are reporting 4k pages, if max_segment is larger than 4k there is
> > > > no such thing as "too big"
> > > >
> > > > I assume it is "too small" because of some maths overflow.
> > >
> > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > >  460                         if (prv->length + PAGE_SIZE >
> > > max_segment)  <--it max_segment is big, n_pages is zero.
> >
> > What does n_pages have to do with max_segment?
> 
> With the following snippet
> "
>         struct ib_umem *region;
>         region = ib_umem_get(pd->device, start, len, access);
> 
>         page_size = ib_umem_find_best_pgsz(region,
>                                            SZ_4K | SZ_2M | SZ_1G,
>                                            virt);
> "

Can you please stop posting random bits of code that do nothing to
answer the question?

> IMHO, you can reproduce this problem in your local host.

Uh, no, you need to communicate effectively or stop posting please.

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-21 12:07                                     ` Jason Gunthorpe
@ 2021-03-21 12:54                                       ` Zhu Yanjun
  2021-03-21 13:03                                         ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-21 12:54 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sun, Mar 21, 2021 at 8:07 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote:
> > > > > You are reporting 4k pages, if max_segment is larger than 4k there is
> > > > > no such thing as "too big"
> > > > >
> > > > > I assume it is "too small" because of some maths overflow.
> > > >
> > > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > >  460                         if (prv->length + PAGE_SIZE >
> > > > max_segment)  <--it max_segment is big, n_pages is zero.
> > >
> > > What does n_pages have to do with max_segment?
> >
> > With the following snippet
> > "
> >         struct ib_umem *region;
> >         region = ib_umem_get(pd->device, start, len, access);
> >
> >         page_size = ib_umem_find_best_pgsz(region,
> >                                            SZ_4K | SZ_2M | SZ_1G,
> >                                            virt);
> > "
>
> Can you please stop posting random bits of code that do nothing to
> answer the question?
>
> > IMHO, you can reproduce this problem in your local host.
>
> Uh, no, you need to communicate effectively or stop posting please.

can you read the following link again

https://patchwork.kernel.org/project/linux-rdma/patch/CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com/#24031643

In this link, I explained it in details.

Since the max_segment is very big, so normally the prv->length +
PAGE_SIZE is less than max_segment, it will loop over and over again,
until n_pages is zero,
then __sg_alloc_table_from_pages exits. As a result,
__sg_alloc_table_from_pages has few chances to call sg_set_page.

This behavior is unlike ib_umem_add_sg_table. In ib_umem_add_sg_table,
"max_seg_sz - sg->length" is more often greater than (len << PAGE_SHIFT),
so sg_set_page is called more times than in __sg_alloc_table_from_pages.

That is what makes the sg lists from the 2 functions
__sg_alloc_table_from_pages and ib_umem_add_sg_table different.

The most important difference is the test "if (prv->length + PAGE_SIZE >
max_segment)" in __sg_alloc_table_from_pages versus "if ((max_seg_sz -
sg->length) >= (len << PAGE_SHIFT))" in ib_umem_add_sg_table.

When max_segment is UINT_MAX, these different tests produce different sg
lists.

Zhu Yanjun

>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-21 12:54                                       ` Zhu Yanjun
@ 2021-03-21 13:03                                         ` Jason Gunthorpe
  2021-03-21 14:38                                           ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-21 13:03 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sun, Mar 21, 2021 at 08:54:31PM +0800, Zhu Yanjun wrote:
> On Sun, Mar 21, 2021 at 8:07 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote:
> > > > > > You are reporting 4k pages, if max_segment is larger than 4k there is
> > > > > > no such thing as "too big"
> > > > > >
> > > > > > I assume it is "too small" because of some maths overflow.
> > > > >
> > > > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > > >  460                         if (prv->length + PAGE_SIZE >
> > > > > max_segment)  <--it max_segment is big, n_pages is zero.
> > > >
> > > > What does n_pages have to do with max_segment?
> > >
> > > With the following snippet
> > > "
> > >         struct ib_umem *region;
> > >         region = ib_umem_get(pd->device, start, len, access);
> > >
> > >         page_size = ib_umem_find_best_pgsz(region,
> > >                                            SZ_4K | SZ_2M | SZ_1G,
> > >                                            virt);
> > > "
> >
> > Can you please stop posting random bits of code that do nothing to
> > answer the question?
> >
> > > IMHO, you can reproduce this problem in your local host.
> >
> > Uh, no, you need to communicate effectively or stop posting please.
> 
> can you read the following link again
> 
> https://patchwork.kernel.org/project/linux-rdma/patch/CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com/#24031643
> 
> In this link, I explained it in details.

No, this is all nonsense

> Since the max_segment is very big, so normally the prv->length +
> PAGE_SIZE is less than max_segment, it will loop over and over again,
> until n_pages is zero,

And each iteration increases

			prv->length += PAGE_SIZE;
			n_pages--;

Where prv is the last sgl entry, so decreasing n_pages cannot also
yield a 4k sgl entry like you are reporting.

You get 4k sgl entries if max_segment is *too small* not too big!

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-21 13:03                                         ` Jason Gunthorpe
@ 2021-03-21 14:38                                           ` Zhu Yanjun
  2021-03-21 15:52                                             ` Jason Gunthorpe
  0 siblings, 1 reply; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-21 14:38 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sun, Mar 21, 2021 at 9:03 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sun, Mar 21, 2021 at 08:54:31PM +0800, Zhu Yanjun wrote:
> > On Sun, Mar 21, 2021 at 8:07 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Sun, Mar 21, 2021 at 04:06:07PM +0800, Zhu Yanjun wrote:
> > > > > > > You are reporting 4k pages, if max_segment is larger than 4k there is
> > > > > > > no such thing as "too big"
> > > > > > >
> > > > > > > I assume it is "too small" because of some maths overflow.
> > > > > >
> > > > > >  459                 while (n_pages && page_to_pfn(pages[0]) == paddr) {
> > > > > >  460                         if (prv->length + PAGE_SIZE >
> > > > > > max_segment)  <--it max_segment is big, n_pages is zero.
> > > > >
> > > > > What does n_pages have to do with max_segment?
> > > >
> > > > With the following snippet
> > > > "
> > > >         struct ib_umem *region;
> > > >         region = ib_umem_get(pd->device, start, len, access);
> > > >
> > > >         page_size = ib_umem_find_best_pgsz(region,
> > > >                                            SZ_4K | SZ_2M | SZ_1G,
> > > >                                            virt);
> > > > "
> > >
> > > Can you please stop posting random bits of code that do nothing to
> > > answer the question?
> > >
> > > > IMHO, you can reproduce this problem in your local host.
> > >
> > > Uh, no, you need to communicate effectively or stop posting please.
> >
> > can you read the following link again
> >
> > https://patchwork.kernel.org/project/linux-rdma/patch/CAD=hENeqTTmpS5V+G646V0QvJFLVSd3Sq11ffQFcDXU-OSsQEg@mail.gmail.com/#24031643
> >
> > In this link, I explained it in details.
>
> No, this is all nonsense
>
> > Since the max_segment is very big, so normally the prv->length +
> > PAGE_SIZE is less than max_segment, it will loop over and over again,
> > until n_pages is zero,
>
> And each iteration increases
>
>                         prv->length += PAGE_SIZE;
>                         n_pages--;
>
> Where prv is the last sgl entry, so decreasing n_pages cannot also
> yield a 4k sgl entry like you are reporting.

No. You suppose n_pages is very big, then prv->length will increase to
reach max_segment.
If n_pages is very small, for example, we suppose n_pages is 1, then
prv->length will
not reach max_segment.

In fact, in my test case, n_pages is very small.

Zhu Yanjun

>
> You get 4k sgl entries if max_segment is *too small* not too big!
>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-21 14:38                                           ` Zhu Yanjun
@ 2021-03-21 15:52                                             ` Jason Gunthorpe
  2021-03-22  5:07                                               ` Zhu Yanjun
  0 siblings, 1 reply; 30+ messages in thread
From: Jason Gunthorpe @ 2021-03-21 15:52 UTC (permalink / raw)
  To: Zhu Yanjun; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

On Sun, Mar 21, 2021 at 10:38:45PM +0800, Zhu Yanjun wrote:

> No. You suppose n_pages is very big, then prv->length will increase to
> reach max_segment.
> If n_pages is very small, for example, we suppose n_pages is 1, then
> prv->length will
> not reach max_segment.
> 
> In fact, in my test case, n_pages is very small.

n_pages is an *input*; it is the number of pages in the VA range. If it
is very small, that is not a problem with this algorithm.

Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list
  2021-03-21 15:52                                             ` Jason Gunthorpe
@ 2021-03-22  5:07                                               ` Zhu Yanjun
  0 siblings, 0 replies; 30+ messages in thread
From: Zhu Yanjun @ 2021-03-22  5:07 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Doug Ledford, RDMA mailing list, maorg

Got it.  Thanks a lot for your help.

Zhu Yanjun

On Sun, Mar 21, 2021 at 11:52 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Sun, Mar 21, 2021 at 10:38:45PM +0800, Zhu Yanjun wrote:
>
> > No. You suppose n_pages is very big, then prv->length will increase to
> > reach max_segment.
> > If n_pages is very small, for example, we suppose n_pages is 1, then
> > prv->length will
> > not reach max_segment.
> >
> > In fact, in my test case, n_pages is very small.
>
> n_pages is an *input*; it is the number of pages in the VA range. If it
> is very small, that is not a problem with this algorithm.
>
> Jason

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-03-22  5:08 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20210307221034.568606-1-yanjun.zhu@intel.com>
2021-03-07 14:28 ` Fwd: [PATCH 1/1] RDMA/umem: add back hugepage sg list Zhu Yanjun
2021-03-07 17:20   ` Leon Romanovsky
2021-03-07 17:22   ` Leon Romanovsky
2021-03-08  2:44     ` Zhu Yanjun
2021-03-08 10:13     ` Zhu Yanjun
2021-03-08 12:16       ` Jason Gunthorpe
2021-03-11 10:41         ` Zhu Yanjun
2021-03-12  0:25           ` Jason Gunthorpe
2021-03-12  8:04             ` Zhu Yanjun
2021-03-12  8:05               ` Zhu Yanjun
2021-03-12 13:02               ` Jason Gunthorpe
2021-03-12 13:42                 ` Zhu Yanjun
2021-03-12 13:49                   ` Zhu Yanjun
2021-03-12 14:01                     ` Jason Gunthorpe
2021-03-13  3:02                       ` Zhu Yanjun
2021-03-19 13:00                         ` Jason Gunthorpe
2021-03-19 13:33                           ` Zhu Yanjun
2021-03-19 13:48                             ` Jason Gunthorpe
2021-03-20  3:38                               ` Zhu Yanjun
2021-03-20  3:49                                 ` Zhu Yanjun
2021-03-20 20:38                                 ` Jason Gunthorpe
2021-03-21  8:06                                   ` Zhu Yanjun
2021-03-21 12:07                                     ` Jason Gunthorpe
2021-03-21 12:54                                       ` Zhu Yanjun
2021-03-21 13:03                                         ` Jason Gunthorpe
2021-03-21 14:38                                           ` Zhu Yanjun
2021-03-21 15:52                                             ` Jason Gunthorpe
2021-03-22  5:07                                               ` Zhu Yanjun
2021-03-08  0:44   ` Jason Gunthorpe
2021-03-08  2:47     ` Zhu Yanjun
