Linux-mm Archive on lore.kernel.org
* [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert.
@ 2020-01-28  2:59 Arjun Roy
  2020-01-28  2:59 ` [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Arjun Roy @ 2020-01-28  2:59 UTC (permalink / raw)
  To: davem, netdev, akpm, linux-mm
  Cc: arjunroy, Eric Dumazet, Soheil Hassas Yeganeh

From: Arjun Roy <arjunroy@google.com>

Add helper methods for vm_insert_page()/insert_page() to prepare for
vm_insert_pages(), which batch-inserts pages to reduce spinlock
operations when inserting multiple consecutive pages into the user
page table.

The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hit the same spinlock multiple
times consecutively.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>

---
 mm/memory.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 7c6e19bdc428..7e23a9275386 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1693,6 +1693,27 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 	return pte_alloc_map_lock(mm, pmd, addr, ptl);
 }
 
+static int validate_page_before_insert(struct page *page)
+{
+	if (PageAnon(page) || PageSlab(page))
+		return -EINVAL;
+	flush_dcache_page(page);
+	return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+			unsigned long addr, struct page *page, pgprot_t prot)
+{
+	if (!pte_none(*pte))
+		return -EBUSY;
+	/* Ok, finally just insert the thing.. */
+	get_page(page);
+	inc_mm_counter_fast(mm, mm_counter_file(page));
+	page_add_file_rmap(page, false);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	return 0;
+}
+
 /*
  * This is the old fallback for page remapping.
  *
@@ -1708,28 +1729,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	retval = -EINVAL;
-	if (PageAnon(page) || PageSlab(page))
+	retval = validate_page_before_insert(page);
+	if (retval)
 		goto out;
 	retval = -ENOMEM;
-	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
 		goto out;
-	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
-
-	/* Ok, finally just insert the thing.. */
-	get_page(page);
-	inc_mm_counter_fast(mm, mm_counter_file(page));
-	page_add_file_rmap(page, false);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
-
-	retval = 0;
-	pte_unmap_unlock(pte, ptl);
-	return retval;
-out_unlock:
+	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
 	pte_unmap_unlock(pte, ptl);
 out:
 	return retval;
-- 
2.25.0.rc1.283.g88dfdc4193-goog



^ permalink raw reply	[flat|nested] 13+ messages in thread
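The payoff of the refactoring above comes in the next patch: one lock round-trip per batch instead of one per page. The idea can be sketched in userspace C with a mock lock that only counts acquisitions (all names here are hypothetical stand-ins, not kernel APIs):

```c
#include <assert.h>

#define BATCH 8

struct mock_lock { long acquisitions; };

static void mock_lock_acquire(struct mock_lock *l) { l->acquisitions++; }
static void mock_lock_release(struct mock_lock *l) { (void)l; }

/* What a vm_insert_page() loop does: one lock round-trip per page. */
static long insert_one_by_one(int npages)
{
	struct mock_lock ptl = { 0 };	/* stand-in for the PTE lock */

	for (int i = 0; i < npages; i++) {
		mock_lock_acquire(&ptl);
		/* ...insert one PTE... */
		mock_lock_release(&ptl);
	}
	return ptl.acquisitions;
}

/* The vm_insert_pages() approach: one round-trip per batch of 8. */
static long insert_batched(int npages)
{
	struct mock_lock ptl = { 0 };

	for (int i = 0; i < npages; i += BATCH) {
		int n = npages - i < BATCH ? npages - i : BATCH;

		mock_lock_acquire(&ptl);
		for (int j = 0; j < n; j++)
			(void)j;	/* ...insert one PTE... */
		mock_lock_release(&ptl);
	}
	return ptl.acquisitions;
}
```

For 32 consecutive pages this is 32 acquisitions versus 4, which is where the atomic-op savings in the cover text come from.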

* [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages().
  2020-01-28  2:59 [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Arjun Roy
@ 2020-01-28  2:59 ` Arjun Roy
  2020-02-13  2:41   ` Andrew Morton
  2020-02-13 21:54   ` Matthew Wilcox
  2020-01-28  2:59 ` [PATCH resend mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy Arjun Roy
  2020-02-13  2:41 ` [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Andrew Morton
  2 siblings, 2 replies; 13+ messages in thread
From: Arjun Roy @ 2020-01-28  2:59 UTC (permalink / raw)
  To: davem, netdev, akpm, linux-mm
  Cc: arjunroy, Eric Dumazet, Soheil Hassas Yeganeh

From: Arjun Roy <arjunroy@google.com>

Add the ability to insert multiple pages at once into a user VM with
fewer PTE spinlock operations.

The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hit the same spinlock multiple
times consecutively.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>

---
 include/linux/mm.h |   2 +
 mm/memory.c        | 111 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 29c928ba6b42..a3ac40fbe8fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2563,6 +2563,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num);
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn);
 int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index 7e23a9275386..1655c5feaf32 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1670,8 +1670,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
-			spinlock_t **ptl)
+static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -1690,6 +1689,16 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 		return NULL;
 
 	VM_BUG_ON(pmd_trans_huge(*pmd));
+	return pmd;
+}
+
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+			spinlock_t **ptl)
+{
+	pmd_t *pmd = walk_to_pmd(mm, addr);
+
+	if (!pmd)
+		return NULL;
 	return pte_alloc_map_lock(mm, pmd, addr, ptl);
 }
 
@@ -1714,6 +1723,15 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
 	return 0;
 }
 
+static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+			unsigned long addr, struct page *page, pgprot_t prot)
+{
+	const int err = validate_page_before_insert(page);
+
+	return err ? err : insert_page_into_pte_locked(
+		mm, pte_offset_map(pmd, addr), addr, page, prot);
+}
+
 /*
  * This is the old fallback for page remapping.
  *
@@ -1742,6 +1760,95 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	return retval;
 }
 
+/* insert_pages() amortizes the cost of spinlock operations
+ * when inserting pages in a loop.
+ */
+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num, pgprot_t prot)
+{
+	pmd_t *pmd = NULL;
+	spinlock_t *pte_lock = NULL;
+	struct mm_struct *const mm = vma->vm_mm;
+	unsigned long curr_page_idx = 0;
+	unsigned long remaining_pages_total = *num;
+	unsigned long pages_to_write_in_pmd;
+	int ret;
+more:
+	ret = -EFAULT;
+	pmd = walk_to_pmd(mm, addr);
+	if (!pmd)
+		goto out;
+
+	pages_to_write_in_pmd = min_t(unsigned long,
+		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
+
+	/* Allocate the PTE if necessary; takes PMD lock once only. */
+	ret = -ENOMEM;
+	if (pte_alloc(mm, pmd, addr))
+		goto out;
+	pte_lock = pte_lockptr(mm, pmd);
+
+	while (pages_to_write_in_pmd) {
+		int pte_idx = 0;
+		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
+
+		spin_lock(pte_lock);
+		for (; pte_idx < batch_size; ++pte_idx) {
+			int err = insert_page_in_batch_locked(mm, pmd,
+				addr, pages[curr_page_idx], prot);
+			if (unlikely(err)) {
+				spin_unlock(pte_lock);
+				ret = err;
+				remaining_pages_total -= pte_idx;
+				goto out;
+			}
+			addr += PAGE_SIZE;
+			++curr_page_idx;
+		}
+		spin_unlock(pte_lock);
+		pages_to_write_in_pmd -= batch_size;
+		remaining_pages_total -= batch_size;
+	}
+	if (remaining_pages_total)
+		goto more;
+	ret = 0;
+out:
+	*num = remaining_pages_total;
+	return ret;
+}
+
+/**
+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
+ * @vma: user vma to map to
+ * @addr: target start user address of these pages
+ * @pages: source kernel pages
+ * @num: in: number of pages to map. out: number of pages that were *not*
+ * mapped. (0 means all pages were successfully mapped).
+ *
+ * Preferred over vm_insert_page() when inserting multiple pages.
+ *
+ * In case of error, we may have mapped a subset of the provided
+ * pages. It is the caller's responsibility to account for this case.
+ *
+ * The same restrictions apply as in vm_insert_page().
+ */
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num)
+{
+	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+	if (addr < vma->vm_start || end_addr >= vma->vm_end)
+		return -EFAULT;
+	if (!(vma->vm_flags & VM_MIXEDMAP)) {
+		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+		BUG_ON(vma->vm_flags & VM_PFNMAP);
+		vma->vm_flags |= VM_MIXEDMAP;
+	}
+	/* Defer page refcount checking till we're about to map that page. */
+	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
 /**
  * vm_insert_page - insert single page into user vma
  * @vma: user vma to map to
-- 
2.25.0.rc1.283.g88dfdc4193-goog



^ permalink raw reply	[flat|nested] 13+ messages in thread
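The in/out `num` parameter documented above means a failing call can still have made partial progress. A hypothetical userspace mock of that calling convention (the kernel-side names are only stand-ins here) shows the accounting a caller must do:

```c
#include <assert.h>

#define PAGE_SIZE 4096UL
#define EBUSY 16

/* Mock of vm_insert_pages(): "maps" pages until it hits a fake
 * already-populated PTE at page index 5, mimicking the -EBUSY path.
 * On entry *num is pages to map; on return it is pages NOT mapped. */
static int mock_vm_insert_pages(unsigned long addr, unsigned long *num)
{
	unsigned long first = addr / PAGE_SIZE;

	for (unsigned long i = 0; i < *num; i++) {
		if (first + i == 5) {
			*num -= i;	/* pages left unmapped */
			return -EBUSY;
		}
	}
	*num = 0;			/* everything mapped */
	return 0;
}

/* Caller pattern: advance the address by the bytes that were actually
 * mapped, whether or not the call failed. */
static unsigned long map_batch(unsigned long *addr, unsigned long to_map,
			       int *err)
{
	unsigned long remaining = to_map;
	unsigned long bytes_mapped;

	*err = mock_vm_insert_pages(*addr, &remaining);
	bytes_mapped = PAGE_SIZE * (to_map - remaining);
	*addr += bytes_mapped;
	return bytes_mapped;
}
```

This is the same shape the tcp zerocopy caller in patch 3 uses when it advances `seq` and the insert address by `bytes_mapped` even on error.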

* [PATCH resend mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy.
  2020-01-28  2:59 [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Arjun Roy
  2020-01-28  2:59 ` [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
@ 2020-01-28  2:59 ` Arjun Roy
  2020-02-13  2:56   ` Andrew Morton
  2020-02-13  2:41 ` [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Andrew Morton
  2 siblings, 1 reply; 13+ messages in thread
From: Arjun Roy @ 2020-01-28  2:59 UTC (permalink / raw)
  To: davem, netdev, akpm, linux-mm
  Cc: arjunroy, Eric Dumazet, Soheil Hassas Yeganeh

From: Arjun Roy <arjunroy@google.com>

Use vm_insert_pages() for tcp receive zerocopy. Spin lock cycles
(as reported by perf) drop from a couple of percentage points
to a fraction of a percent. This results in a roughly 6% increase in
efficiency, measured as zerocopy receive count divided by CPU
utilization.

The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hit the same spinlock multiple
times consecutively.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>

---
 net/ipv4/tcp.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34490d972758..52f96c3ceab3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1861,14 +1861,48 @@ int tcp_mmap(struct file *file, struct socket *sock,
 }
 EXPORT_SYMBOL(tcp_mmap);
 
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+					struct page **pages,
+					unsigned long pages_to_map,
+					unsigned long *insert_addr,
+					u32 *length_with_pending,
+					u32 *seq,
+					struct tcp_zerocopy_receive *zc)
+{
+	unsigned long pages_remaining = pages_to_map;
+	int bytes_mapped;
+	int ret;
+
+	ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
+	bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+	/* Even if vm_insert_pages fails, it may have partially succeeded in
+	 * mapping (some but not all of the pages).
+	 */
+	*seq += bytes_mapped;
+	*insert_addr += bytes_mapped;
+	if (ret) {
+		/* But if vm_insert_pages did fail, we have to unroll some state
+		 * we speculatively touched before.
+		 */
+		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+		*length_with_pending -= bytes_not_mapped;
+		zc->recv_skip_hint += bytes_not_mapped;
+	}
+	return ret;
+}
+
 static int tcp_zerocopy_receive(struct sock *sk,
 				struct tcp_zerocopy_receive *zc)
 {
 	unsigned long address = (unsigned long)zc->address;
 	u32 length = 0, seq, offset, zap_len;
+	#define PAGE_BATCH_SIZE 8
+	struct page *pages[PAGE_BATCH_SIZE];
 	const skb_frag_t *frags = NULL;
 	struct vm_area_struct *vma;
 	struct sk_buff *skb = NULL;
+	unsigned long pg_idx = 0;
+	unsigned long curr_addr;
 	struct tcp_sock *tp;
 	int inq;
 	int ret;
@@ -1901,15 +1935,26 @@ static int tcp_zerocopy_receive(struct sock *sk,
 		zc->recv_skip_hint = zc->length;
 	}
 	ret = 0;
+	curr_addr = address;
 	while (length + PAGE_SIZE <= zc->length) {
 		if (zc->recv_skip_hint < PAGE_SIZE) {
+			/* If we're here, finish the current batch. */
+			if (pg_idx) {
+				ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+								   pg_idx,
+								   &curr_addr,
+								   &length,
+								   &seq, zc);
+				if (ret)
+					goto out;
+				pg_idx = 0;
+			}
 			if (skb) {
 				skb = skb->next;
 				offset = seq - TCP_SKB_CB(skb)->seq;
 			} else {
 				skb = tcp_recv_skb(sk, seq, &offset);
 			}
-
 			zc->recv_skip_hint = skb->len - offset;
 			offset -= skb_headlen(skb);
 			if ((int)offset < 0 || skb_has_frag_list(skb))
@@ -1933,14 +1978,24 @@ static int tcp_zerocopy_receive(struct sock *sk,
 			zc->recv_skip_hint -= remaining;
 			break;
 		}
-		ret = vm_insert_page(vma, address + length,
-				     skb_frag_page(frags));
-		if (ret)
-			break;
+		pages[pg_idx] = skb_frag_page(frags);
+		pg_idx++;
 		length += PAGE_SIZE;
-		seq += PAGE_SIZE;
 		zc->recv_skip_hint -= PAGE_SIZE;
 		frags++;
+		if (pg_idx == PAGE_BATCH_SIZE) {
+			ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+							   &curr_addr, &length,
+							   &seq, zc);
+			if (ret)
+				goto out;
+			pg_idx = 0;
+		}
+	}
+	if (pg_idx) {
+		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+						   &curr_addr, &length, &seq,
+						   zc);
 	}
 out:
 	up_read(&current->mm->mmap_sem);
-- 
2.25.0.rc1.283.g88dfdc4193-goog



^ permalink raw reply	[flat|nested] 13+ messages in thread
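The control flow of the patch above reduces to a simple accumulate-and-flush loop: page pointers collect in a fixed array, a full array triggers one batched insert, and a final flush handles the partial tail. A userspace sketch of just that loop shape (all names are mocks, not the real tcp functions):

```c
#include <assert.h>

#define PAGE_BATCH_SIZE 8

static int flushes;
static int pages_flushed;

/* Stand-in for tcp_zerocopy_vm_insert_batch(). */
static int flush_batch(int *pages, int count)
{
	(void)pages;
	flushes++;
	pages_flushed += count;
	return 0;
}

static int receive_pages(int npages)
{
	int pages[PAGE_BATCH_SIZE];
	int pg_idx = 0;
	int ret = 0;

	for (int i = 0; i < npages; i++) {
		pages[pg_idx++] = i;	/* skb_frag_page() stand-in */
		if (pg_idx == PAGE_BATCH_SIZE) {
			ret = flush_batch(pages, pg_idx);
			if (ret)
				return ret;
			pg_idx = 0;
		}
	}
	if (pg_idx)			/* flush the partial tail */
		ret = flush_batch(pages, pg_idx);
	return ret;
}
```

For 19 pages this does three flushes (8 + 8 + 3) instead of 19 per-page inserts.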

* Re: [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages().
  2020-01-28  2:59 ` [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
@ 2020-02-13  2:41   ` Andrew Morton
  2020-02-13 17:09     ` Arjun Roy
  2020-02-13 21:37     ` Linus Torvalds
  2020-02-13 21:54   ` Matthew Wilcox
  1 sibling, 2 replies; 13+ messages in thread
From: Andrew Morton @ 2020-02-13  2:41 UTC (permalink / raw)
  To: Arjun Roy
  Cc: davem, netdev, linux-mm, arjunroy, Eric Dumazet,
	Soheil Hassas Yeganeh, Linus Torvalds

On Mon, 27 Jan 2020 18:59:57 -0800 Arjun Roy <arjunroy.kdev@gmail.com> wrote:

> Add the ability to insert multiple pages at once to a user VM with
> lower PTE spinlock operations.
> 
> The intention of this patch-set is to reduce atomic ops for
> tcp zerocopy receives, which normally hits the same spinlock multiple
> times consecutively.

Seems sensible, thanks.  Some other vm_insert_page() callers might want
to know about this, but I can't immediately spot any which appear to be
high bandwidth.

Is there much point in keeping the vm_insert_page() implementation
around?  Replace it with

static inline int
vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
	       struct page *page)
{
	return vm_insert_pages(vma, addr, &page, 1);
}

?

Also, vm_insert_page() does

	if (!page_count(page))
		return -EINVAL;

and this was not carried over into vm_insert_pages().  How come?

I don't know what that test does - it was added by Linus in the
original commit a145dd411eb28c83.  It's only been 15 years so I'm sure
he remembers ;)


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert.
  2020-01-28  2:59 [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Arjun Roy
  2020-01-28  2:59 ` [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
  2020-01-28  2:59 ` [PATCH resend mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy Arjun Roy
@ 2020-02-13  2:41 ` Andrew Morton
  2020-02-13 16:52   ` Arjun Roy
  2020-02-13 16:55   ` Arjun Roy
  2 siblings, 2 replies; 13+ messages in thread
From: Andrew Morton @ 2020-02-13  2:41 UTC (permalink / raw)
  To: Arjun Roy
  Cc: davem, netdev, linux-mm, arjunroy, Eric Dumazet, Soheil Hassas Yeganeh

On Mon, 27 Jan 2020 18:59:56 -0800 Arjun Roy <arjunroy.kdev@gmail.com> wrote:

> Add helper methods for vm_insert_page()/insert_page() to prepare for
> vm_insert_pages(), which batch-inserts pages to reduce spinlock
> operations when inserting multiple consecutive pages into the user
> page table.
> 
> The intention of this patch-set is to reduce atomic ops for
> tcp zerocopy receives, which normally hits the same spinlock multiple
> times consecutively.

I tweaked this a bit for the addition of page_has_type() to
insert_page().  Please check.



From: Arjun Roy <arjunroy.kdev@gmail.com>
Subject: mm: Refactor insert_page to prepare for batched-lock insert.

From: Arjun Roy <arjunroy@google.com>

Add helper methods for vm_insert_page()/insert_page() to prepare for
vm_insert_pages(), which batch-inserts pages to reduce spinlock
operations when inserting multiple consecutive pages into the user
page table.

The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hit the same spinlock multiple
times consecutively.

Link: http://lkml.kernel.org/r/20200128025958.43490-1-arjunroy.kdev@gmail.com
Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/memory.c |   39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

--- a/mm/memory.c~mm-refactor-insert_page-to-prepare-for-batched-lock-insert
+++ a/mm/memory.c
@@ -1430,6 +1430,27 @@ pte_t *__get_locked_pte(struct mm_struct
 	return pte_alloc_map_lock(mm, pmd, addr, ptl);
 }
 
+static int validate_page_before_insert(struct page *page)
+{
+	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+		return -EINVAL;
+	flush_dcache_page(page);
+	return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+			unsigned long addr, struct page *page, pgprot_t prot)
+{
+	if (!pte_none(*pte))
+		return -EBUSY;
+	/* Ok, finally just insert the thing.. */
+	get_page(page);
+	inc_mm_counter_fast(mm, mm_counter_file(page));
+	page_add_file_rmap(page, false);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	return 0;
+}
+
 /*
  * This is the old fallback for page remapping.
  *
@@ -1445,26 +1466,14 @@ static int insert_page(struct vm_area_st
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	retval = -EINVAL;
-	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+	retval = validate_page_before_insert(page);
+	if (retval)
 		goto out;
 	retval = -ENOMEM;
-	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
 		goto out;
-	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
-
-	/* Ok, finally just insert the thing.. */
-	get_page(page);
-	inc_mm_counter_fast(mm, mm_counter_file(page));
-	page_add_file_rmap(page, false);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
-
-	retval = 0;
-out_unlock:
+	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
 	pte_unmap_unlock(pte, ptl);
 out:
 	return retval;
_



^ permalink raw reply	[flat|nested] 13+ messages in thread
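The refactoring in the patch above splits insert_page() into a lockless validity check and the work that must run under the PTE lock. The split can be sketched in userspace with mocked page flags and PTEs (hypothetical names; PageAnon(), get_page() and friends are real kernel APIs that the mocks merely stand in for):

```c
#include <assert.h>

#define EINVAL 22
#define EBUSY  16

/* Mocked flag bits standing in for PageAnon()/PageSlab()/
 * page_has_type(). */
#define PG_ANON (1u << 0)
#define PG_SLAB (1u << 1)
#define PG_TYPE (1u << 2)

struct mock_page { unsigned int flags; int refcount; };

/* Lockless precondition check, safe to run before taking the lock. */
static int mock_validate_page_before_insert(const struct mock_page *page)
{
	if (page->flags & (PG_ANON | PG_SLAB | PG_TYPE))
		return -EINVAL;
	/* flush_dcache_page() would go here */
	return 0;
}

/* The part that needs the PTE lock held: check the slot is empty,
 * take a reference, write the entry. */
static int mock_insert_page_into_pte_locked(unsigned long *pte,
					    struct mock_page *page)
{
	if (*pte)			/* !pte_none(*pte): slot taken */
		return -EBUSY;
	page->refcount++;		/* get_page() */
	*pte = 1;			/* set_pte_at() stand-in */
	return 0;
}
```

Keeping the validation outside the locked helper is what lets the batched path in patch 2 hold the lock across several inserts without redoing per-page checks under it.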

* Re: [PATCH resend mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy.
  2020-01-28  2:59 ` [PATCH resend mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy Arjun Roy
@ 2020-02-13  2:56   ` Andrew Morton
  2020-02-17  2:49     ` Arjun Roy
  0 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2020-02-13  2:56 UTC (permalink / raw)
  To: Arjun Roy
  Cc: davem, netdev, linux-mm, arjunroy, Eric Dumazet, Soheil Hassas Yeganeh

On Mon, 27 Jan 2020 18:59:58 -0800 Arjun Roy <arjunroy.kdev@gmail.com> wrote:

> Use vm_insert_pages() for tcp receive zerocopy. Spin lock cycles
> (as reported by perf) drop from a couple of percentage points
> to a fraction of a percent. This results in a roughly 6% increase in
> efficiency, measured roughly as zerocopy receive count divided by CPU
> utilization.
> 
> The intention of this patch-set is to reduce atomic ops for
> tcp zerocopy receives, which normally hits the same spinlock multiple
> times consecutively.

For some reason the patch causes this:

In file included from ./arch/x86/include/asm/atomic.h:5:0,
                 from ./include/linux/atomic.h:7,
                 from ./include/linux/crypto.h:15,
                 from ./include/crypto/hash.h:11,
                 from net/ipv4/tcp.c:246:
net/ipv4/tcp.c: In function ‘do_tcp_getsockopt.isra.29’:
./include/linux/compiler.h:225:31: warning: ‘tp’ may be used uninitialized in this function [-Wmaybe-uninitialized]
  case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
          ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~
net/ipv4/tcp.c:1779:19: note: ‘tp’ was declared here
  struct tcp_sock *tp;
                   ^~

It's a false positive.  gcc-7.2.0

: out:
:        up_read(&current->mm->mmap_sem);
:        if (length) {
:                WRITE_ONCE(tp->copied_seq, seq);

but `length' is zero here.  

This suppresses it:

--- a/net/ipv4/tcp.c~net-zerocopy-use-vm_insert_pages-for-tcp-rcv-zerocopy-fix
+++ a/net/ipv4/tcp.c
@@ -1788,6 +1788,8 @@ static int tcp_zerocopy_receive(struct s
 
 	sock_rps_record_flow(sk);
 
+	tp = tcp_sk(sk);
+
 	down_read(&current->mm->mmap_sem);
 
 	ret = -EINVAL;
@@ -1796,7 +1798,6 @@ static int tcp_zerocopy_receive(struct s
 		goto out;
 	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
 
-	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
 	inq = tcp_inq(sk);
 	zc->length = min_t(u32, zc->length, inq);

and I guess it's zero-cost.


Anyway, I'll sit on this lot for a while, hoping for a davem ack?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert.
  2020-02-13  2:41 ` [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Andrew Morton
@ 2020-02-13 16:52   ` Arjun Roy
  2020-02-13 16:55   ` Arjun Roy
  1 sibling, 0 replies; 13+ messages in thread
From: Arjun Roy @ 2020-02-13 16:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Arjun Roy, davem, netdev, linux-mm, Eric Dumazet, Soheil Hassas Yeganeh

The addition of page_has_type() looks good to me, thanks!

-Arjun

On Wed, Feb 12, 2020 at 6:41 PM Andrew Morton <akpm@linux-foundation.org>
wrote:

> On Mon, 27 Jan 2020 18:59:56 -0800 Arjun Roy <arjunroy.kdev@gmail.com>
> wrote:
>
> > Add helper methods for vm_insert_page()/insert_page() to prepare for
> > vm_insert_pages(), which batch-inserts pages to reduce spinlock
> > operations when inserting multiple consecutive pages into the user
> > page table.
> >
> > The intention of this patch-set is to reduce atomic ops for
> > tcp zerocopy receives, which normally hits the same spinlock multiple
> > times consecutively.
>
> I tweaked this a bit for the addition of page_has_type() to
> insert_page().  Please check.
>
>
>
> From: Arjun Roy <arjunroy.kdev@gmail.com>
> Subject: mm: Refactor insert_page to prepare for batched-lock insert.
>
> From: Arjun Roy <arjunroy@google.com>
>
> Add helper methods for vm_insert_page()/insert_page() to prepare for
> vm_insert_pages(), which batch-inserts pages to reduce spinlock
> operations when inserting multiple consecutive pages into the user
> page table.
>
> The intention of this patch-set is to reduce atomic ops for
> tcp zerocopy receives, which normally hits the same spinlock multiple
> times consecutively.
>
> Link:
> http://lkml.kernel.org/r/20200128025958.43490-1-arjunroy.kdev@gmail.com
> Signed-off-by: Arjun Roy <arjunroy@google.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
>  mm/memory.c |   39 ++++++++++++++++++++++++---------------
>  1 file changed, 24 insertions(+), 15 deletions(-)
>
> ---
> a/mm/memory.c~mm-refactor-insert_page-to-prepare-for-batched-lock-insert
> +++ a/mm/memory.c
> @@ -1430,6 +1430,27 @@ pte_t *__get_locked_pte(struct mm_struct
>         return pte_alloc_map_lock(mm, pmd, addr, ptl);
>  }
>
> +static int validate_page_before_insert(struct page *page)
> +{
> +       if (PageAnon(page) || PageSlab(page) || page_has_type(page))
> +               return -EINVAL;
> +       flush_dcache_page(page);
> +       return 0;
> +}
> +
> +static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
> +                       unsigned long addr, struct page *page, pgprot_t
> prot)
> +{
> +       if (!pte_none(*pte))
> +               return -EBUSY;
> +       /* Ok, finally just insert the thing.. */
> +       get_page(page);
> +       inc_mm_counter_fast(mm, mm_counter_file(page));
> +       page_add_file_rmap(page, false);
> +       set_pte_at(mm, addr, pte, mk_pte(page, prot));
> +       return 0;
> +}
> +
>  /*
>   * This is the old fallback for page remapping.
>   *
> @@ -1445,26 +1466,14 @@ static int insert_page(struct vm_area_st
>         pte_t *pte;
>         spinlock_t *ptl;
>
> -       retval = -EINVAL;
> -       if (PageAnon(page) || PageSlab(page) || page_has_type(page))
> +       retval = validate_page_before_insert(page);
> +       if (retval)
>                 goto out;
>         retval = -ENOMEM;
> -       flush_dcache_page(page);
>         pte = get_locked_pte(mm, addr, &ptl);
>         if (!pte)
>                 goto out;
> -       retval = -EBUSY;
> -       if (!pte_none(*pte))
> -               goto out_unlock;
> -
> -       /* Ok, finally just insert the thing.. */
> -       get_page(page);
> -       inc_mm_counter_fast(mm, mm_counter_file(page));
> -       page_add_file_rmap(page, false);
> -       set_pte_at(mm, addr, pte, mk_pte(page, prot));
> -
> -       retval = 0;
> -out_unlock:
> +       retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
>         pte_unmap_unlock(pte, ptl);
>  out:
>         return retval;
> _
>
>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert.
  2020-02-13  2:41 ` [PATCH resend mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Andrew Morton
  2020-02-13 16:52   ` Arjun Roy
@ 2020-02-13 16:55   ` Arjun Roy
  1 sibling, 0 replies; 13+ messages in thread
From: Arjun Roy @ 2020-02-13 16:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Arjun Roy, davem, netdev, linux-mm, Eric Dumazet, Soheil Hassas Yeganeh

The addition of page_has_type() looks good to me, thanks!

-Arjun


On Wed, Feb 12, 2020 at 6:41 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 27 Jan 2020 18:59:56 -0800 Arjun Roy <arjunroy.kdev@gmail.com> wrote:
>
> > Add helper methods for vm_insert_page()/insert_page() to prepare for
> > vm_insert_pages(), which batch-inserts pages to reduce spinlock
> > operations when inserting multiple consecutive pages into the user
> > page table.
> >
> > The intention of this patch-set is to reduce atomic ops for
> > tcp zerocopy receives, which normally hits the same spinlock multiple
> > times consecutively.
>
> I tweaked this a bit for the addition of page_has_type() to
> insert_page().  Please check.
>
>
>
> From: Arjun Roy <arjunroy.kdev@gmail.com>
> Subject: mm: Refactor insert_page to prepare for batched-lock insert.
>
> From: Arjun Roy <arjunroy@google.com>
>
> Add helper methods for vm_insert_page()/insert_page() to prepare for
> vm_insert_pages(), which batch-inserts pages to reduce spinlock
> operations when inserting multiple consecutive pages into the user
> page table.
>
> The intention of this patch-set is to reduce atomic ops for
> tcp zerocopy receives, which normally hits the same spinlock multiple
> times consecutively.
>
> Link: http://lkml.kernel.org/r/20200128025958.43490-1-arjunroy.kdev@gmail.com
> Signed-off-by: Arjun Roy <arjunroy@google.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
>  mm/memory.c |   39 ++++++++++++++++++++++++---------------
>  1 file changed, 24 insertions(+), 15 deletions(-)
>
> --- a/mm/memory.c~mm-refactor-insert_page-to-prepare-for-batched-lock-insert
> +++ a/mm/memory.c
> @@ -1430,6 +1430,27 @@ pte_t *__get_locked_pte(struct mm_struct
>         return pte_alloc_map_lock(mm, pmd, addr, ptl);
>  }
>
> +static int validate_page_before_insert(struct page *page)
> +{
> +       if (PageAnon(page) || PageSlab(page) || page_has_type(page))
> +               return -EINVAL;
> +       flush_dcache_page(page);
> +       return 0;
> +}
> +
> +static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
> +                       unsigned long addr, struct page *page, pgprot_t prot)
> +{
> +       if (!pte_none(*pte))
> +               return -EBUSY;
> +       /* Ok, finally just insert the thing.. */
> +       get_page(page);
> +       inc_mm_counter_fast(mm, mm_counter_file(page));
> +       page_add_file_rmap(page, false);
> +       set_pte_at(mm, addr, pte, mk_pte(page, prot));
> +       return 0;
> +}
> +
>  /*
>   * This is the old fallback for page remapping.
>   *
> @@ -1445,26 +1466,14 @@ static int insert_page(struct vm_area_st
>         pte_t *pte;
>         spinlock_t *ptl;
>
> -       retval = -EINVAL;
> -       if (PageAnon(page) || PageSlab(page) || page_has_type(page))
> +       retval = validate_page_before_insert(page);
> +       if (retval)
>                 goto out;
>         retval = -ENOMEM;
> -       flush_dcache_page(page);
>         pte = get_locked_pte(mm, addr, &ptl);
>         if (!pte)
>                 goto out;
> -       retval = -EBUSY;
> -       if (!pte_none(*pte))
> -               goto out_unlock;
> -
> -       /* Ok, finally just insert the thing.. */
> -       get_page(page);
> -       inc_mm_counter_fast(mm, mm_counter_file(page));
> -       page_add_file_rmap(page, false);
> -       set_pte_at(mm, addr, pte, mk_pte(page, prot));
> -
> -       retval = 0;
> -out_unlock:
> +       retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
>         pte_unmap_unlock(pte, ptl);
>  out:
>         return retval;
> _
>



* Re: [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages().
  2020-02-13  2:41   ` Andrew Morton
@ 2020-02-13 17:09     ` Arjun Roy
  2020-02-13 21:37     ` Linus Torvalds
  1 sibling, 0 replies; 13+ messages in thread
From: Arjun Roy @ 2020-02-13 17:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Arjun Roy, davem, netdev, linux-mm, Eric Dumazet,
	Soheil Hassas Yeganeh, Linus Torvalds

I think, at least to start, it probably makes sense to keep regular
vm_insert_page() around: smaller stack footprint and fewer branches if
you know you only need one page. I'm not sure whether gcc would err
towards the smaller binary when compiling the wrapper.

Regarding the page_count() check: as far as I can tell it just ensures
that at least *someone* holds a reference to the page before inserting
it. In the zerocopy case we most definitely do, but a careless caller
could pass a bad page argument, and this would guard against that.

Actually, I appear to have fat fingered it - I intended for this check
to be in there but seem to have forgotten (per the comment "/* Defer
page refcount checking till we're about to map that page. */" but with
no actual check). So that check should go inside
insert_page_in_batch_locked(), right before the
validate_page_before_insert() check. I'll send an updated fixup diff
shortly.

-Arjun

On Wed, Feb 12, 2020 at 6:41 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 27 Jan 2020 18:59:57 -0800 Arjun Roy <arjunroy.kdev@gmail.com> wrote:
>
> > Add the ability to insert multiple pages at once to a user VM with
> > lower PTE spinlock operations.
> >
> > The intention of this patch-set is to reduce atomic ops for
> > tcp zerocopy receives, which normally hits the same spinlock multiple
> > times consecutively.
>
> Seems sensible, thanks.  Some other vm_insert_page() callers might want
> to know about this, but I can't immediately spot any which appear to be
> high bandwidth.
>
> Is there much point in keeping the vm_insert_page() implementation
> around?  Replace it with
>
> static inline int
> vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
>                struct page *page)
> {
>         return vm_insert_pages(vma, addr, &page, 1);
> }
>
> ?
>
> Also, vm_insert_page() does
>
>         if (!page_count(page))
>                 return -EINVAL;
>
> and this was not carried over into vm_insert_pages().  How come?
>
> I don't know what that test does - it was added by Linus in the
> original commit a145dd411eb28c83.  It's only been 15 years so I'm sure
> he remembers ;)



* Re: [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages().
  2020-02-13  2:41   ` Andrew Morton
  2020-02-13 17:09     ` Arjun Roy
@ 2020-02-13 21:37     ` Linus Torvalds
  1 sibling, 0 replies; 13+ messages in thread
From: Linus Torvalds @ 2020-02-13 21:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Arjun Roy, David Miller, Netdev, Linux-MM, arjunroy,
	Eric Dumazet, Soheil Hassas Yeganeh

On Wed, Feb 12, 2020 at 6:41 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> Also, vm_insert_page() does
>
>         if (!page_count(page))
>                 return -EINVAL;
>
> and this was not carried over into vm_insert_pages().  How come?

Sounds like that was just a mistake.

> I don't know what that test does - it was added by Linus in the
> original commit a145dd411eb28c83.  It's only been 15 years so I'm sure
> he remembers ;)

Oh, sure.

No, I have absolutely no memory of the details, but I think the commit
message is actually the big hint: the difference between
vm_insert_page() and some of the more random "insert any pfn" cases we
have is exactly that:

    The page you insert needs to be a nice clean kernel allocation, so you
    can't insert arbitrary page mappings with this, but that's not what
    people want.

thing. The comment above it also kind of hints at it.

We *historically* had interfaces to insert random reserved pages (for
IO mappings, but also the zero page etc), but the whole point of that
vm_insert_page() is that it's now an interface for drivers to insert
the pages they maintain into the page tables.

But that also means that we very much didn't allow just random pages
accessed by doing pfn lookups (that might not be in use at all).

Is "page_count()" a great test? No. But it's at least _a_ test of
that. No reserved pages or other magic need apply.

         Linus



* Re: [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages().
  2020-01-28  2:59 ` [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
  2020-02-13  2:41   ` Andrew Morton
@ 2020-02-13 21:54   ` Matthew Wilcox
  2020-02-13 22:06     ` Arjun Roy
  1 sibling, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2020-02-13 21:54 UTC (permalink / raw)
  To: Arjun Roy
  Cc: davem, netdev, akpm, linux-mm, arjunroy, Eric Dumazet,
	Soheil Hassas Yeganeh

On Mon, Jan 27, 2020 at 06:59:57PM -0800, Arjun Roy wrote:
>  int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
> +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
> +			struct page **pages, unsigned long *num);
>  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  			unsigned long pfn);

Sorry I didn't notice these patches earlier.  I'm not thrilled about
the addition of a new vm_insert_* operation; we're moving towards a
vmf_insert_* API.  There are almost no users left of vm_insert_page
(10, at a quick count).  Once they're all gone, we can switch the
underlying primitives over to a vm_fault_t return type and get rid of the
errno-to-vm-fault translation step that currently goes on.

So ... is this called in the fault path?  Do you have a struct vm_fault
around?  Can you handle a vm_fault_t return value instead of an errno?



* Re: [PATCH resend mm,net-next 2/3] mm: Add vm_insert_pages().
  2020-02-13 21:54   ` Matthew Wilcox
@ 2020-02-13 22:06     ` Arjun Roy
  0 siblings, 0 replies; 13+ messages in thread
From: Arjun Roy @ 2020-02-13 22:06 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Arjun Roy, davem, netdev, Andrew Morton, linux-mm, Eric Dumazet,
	Soheil Hassas Yeganeh

On Thu, Feb 13, 2020 at 1:54 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Jan 27, 2020 at 06:59:57PM -0800, Arjun Roy wrote:
> >  int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
> > +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
> > +                     struct page **pages, unsigned long *num);
> >  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> >                       unsigned long pfn);
>
> Sorry I didn't notice these patches earlier.  I'm not thrilled about
> the addition of a new vm_insert_* operation; we're moving towards a
> vmf_insert_* API.  There are almost no users left of vm_insert_page
> (10, at a quick count).  Once they're all gone, we can switch the
> underlying primitives over to a vm_fault_t return type and get rid of the
> errno-to-vm-fault translation step that currently goes on.
>
> So ... is this called in the fault path?  Do you have a struct vm_fault
> around?  Can you handle a vm_fault_t return value instead of an errno?

This is not a page fault, really. This customer of vm_insert_page() is
the TCP receive zerocopy code, which is remapping pages from the NIC
into the userspace process (in lieu of sys_recvmsg()'s copy). See:
tcp_zerocopy_receive() in net/ipv4/tcp.c .

I took a peek at vmf_insert_page(). I think that hides the presence of
EBUSY, which would be a necessary signal for us. If that was exposed I
think vm_fault_t could be fine, *but* I shall defer to Eric for
actually deciding on it.

-Arjun



* Re: [PATCH resend mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy.
  2020-02-13  2:56   ` Andrew Morton
@ 2020-02-17  2:49     ` Arjun Roy
  0 siblings, 0 replies; 13+ messages in thread
From: Arjun Roy @ 2020-02-17  2:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Arjun Roy, davem, netdev, linux-mm, Eric Dumazet, Soheil Hassas Yeganeh

On Wed, Feb 12, 2020 at 6:56 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 27 Jan 2020 18:59:58 -0800 Arjun Roy <arjunroy.kdev@gmail.com> wrote:
>
> > Use vm_insert_pages() for tcp receive zerocopy. Spin lock cycles
> > (as reported by perf) drop from a couple of percentage points
> > to a fraction of a percent. This results in a roughly 6% increase in
> > efficiency, measured roughly as zerocopy receive count divided by CPU
> > utilization.
> >
> > The intention of this patch-set is to reduce atomic ops for
> > tcp zerocopy receives, which normally hits the same spinlock multiple
> > times consecutively.
>
> For some reason the patch causes this:
>
> In file included from ./arch/x86/include/asm/atomic.h:5:0,
>                  from ./include/linux/atomic.h:7,
>                  from ./include/linux/crypto.h:15,
>                  from ./include/crypto/hash.h:11,
>                  from net/ipv4/tcp.c:246:
> net/ipv4/tcp.c: In function ‘do_tcp_getsockopt.isra.29’:
> ./include/linux/compiler.h:225:31: warning: ‘tp’ may be used uninitialized in this function [-Wmaybe-uninitialized]
>   case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
>           ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~
> net/ipv4/tcp.c:1779:19: note: ‘tp’ was declared here
>   struct tcp_sock *tp;
>                    ^~
>
> It's a false positive.  gcc-7.2.0
>
> : out:
> :        up_read(&current->mm->mmap_sem);
> :        if (length) {
> :                WRITE_ONCE(tp->copied_seq, seq);
>
> but `length' is zero here.
>
> This suppresses it:
>
> --- a/net/ipv4/tcp.c~net-zerocopy-use-vm_insert_pages-for-tcp-rcv-zerocopy-fix
> +++ a/net/ipv4/tcp.c
> @@ -1788,6 +1788,8 @@ static int tcp_zerocopy_receive(struct s
>
>         sock_rps_record_flow(sk);
>
> +       tp = tcp_sk(sk);
> +
>         down_read(&current->mm->mmap_sem);
>
>         ret = -EINVAL;
> @@ -1796,7 +1798,6 @@ static int tcp_zerocopy_receive(struct s
>                 goto out;
>         zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
>
> -       tp = tcp_sk(sk);
>         seq = tp->copied_seq;
>         inq = tcp_inq(sk);
>         zc->length = min_t(u32, zc->length, inq);
>
> and I guess it's zero-cost.
>
>
> Anyway, I'll sit on this lot for a while, hoping for a davem ack?

Actually, speaking of the ack on the networking side:

I guess this patch set is a bit weird since it requires some
non-trivial coordination between mm and net-next? Not sure what the
normal approach is in this case.

-Arjun


