KVM Archive on lore.kernel.org
 help / color / Atom feed
From: Daniel Jordan <daniel.m.jordan@oracle.com>
To: linux-mm@kvack.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: aarcange@redhat.com, aaron.lu@intel.com,
	akpm@linux-foundation.org, alex.williamson@redhat.com,
	bsd@redhat.com, daniel.m.jordan@oracle.com,
	darrick.wong@oracle.com, dave.hansen@linux.intel.com,
	jgg@mellanox.com, jwadams@google.com, jiangshanlai@gmail.com,
	mhocko@kernel.org, mike.kravetz@oracle.com,
	Pavel.Tatashin@microsoft.com, prasad.singamsetty@oracle.com,
	rdunlap@infradead.org, steven.sistare@oracle.com,
	tim.c.chen@intel.com, tj@kernel.org, vbabka@suse.cz
Subject: [RFC PATCH v4 13/13] hugetlbfs: parallelize hugetlbfs_fallocate with ktask
Date: Mon,  5 Nov 2018 11:55:58 -0500
Message-ID: <20181105165558.11698-14-daniel.m.jordan@oracle.com> (raw)
In-Reply-To: <20181105165558.11698-1-daniel.m.jordan@oracle.com>

hugetlbfs_fallocate preallocates huge pages to back a file in a
hugetlbfs filesystem.  The time to call this function grows linearly
with size.

ktask performs well with its default thread count of 4; higher thread
counts are given for context only.

Machine: Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz, 288 CPUs, 1T memory
Test:    fallocate(1) a file on a hugetlbfs filesystem

nthread   speedup   size (GiB)   min time (s)   stdev
      1                    200         127.53    2.19
      2     3.09x          200          41.30    2.11
      4     5.72x          200          22.29    0.51
      8     9.45x          200          13.50    2.58
     16     9.74x          200          13.09    1.64

      1                    400         193.09    2.47
      2     2.14x          400          90.31    3.39
      4     3.84x          400          50.32    0.44
      8     5.11x          400          37.75    1.23
     16     6.12x          400          31.54    3.13

The primary bottleneck for better scaling at higher thread counts is
hugetlb_fault_mutex_table[hash].  perf showed L1-dcache-loads increase
with 8 threads and again sharply with 16 threads, and a CPU counter
profile showed that 31% of the L1d misses were on
hugetlb_fault_mutex_table[hash] in the 16-thread case.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 fs/hugetlbfs/inode.c | 114 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 21 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 762028994f47..a73548a96061 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/magic.h>
 #include <linux/migrate.h>
 #include <linux/uio.h>
+#include <linux/ktask.h>
 
 #include <linux/uaccess.h>
 
@@ -104,11 +105,16 @@ static const struct fs_parameter_description hugetlb_fs_parameters = {
 };
 
 #ifdef CONFIG_NUMA
+static inline struct shared_policy *hugetlb_get_shared_policy(
+							struct inode *inode)
+{
+	return &HUGETLBFS_I(inode)->policy;
+}
+
 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
-					struct inode *inode, pgoff_t index)
+				struct shared_policy *policy, pgoff_t index)
 {
-	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
-							index);
+	vma->vm_policy = mpol_shared_policy_lookup(policy, index);
 }
 
 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
@@ -116,8 +122,14 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
 	mpol_cond_put(vma->vm_policy);
 }
 #else
+static inline struct shared_policy *hugetlb_get_shared_policy(
+							struct inode *inode)
+{
+	return NULL;
+}
+
 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
-					struct inode *inode, pgoff_t index)
+				struct shared_policy *policy, pgoff_t index)
 {
 }
 
@@ -576,20 +588,30 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	return 0;
 }
 
+struct hf_args {
+	struct file		*file;
+	struct task_struct	*parent_task;
+	struct mm_struct	*mm;
+	struct shared_policy	*shared_policy;
+	struct hstate		*hstate;
+	struct address_space	*mapping;
+	int			error;
+};
+
+static int hugetlbfs_fallocate_chunk(pgoff_t start, pgoff_t end,
+				     struct hf_args *args);
+
 static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
-	struct address_space *mapping = inode->i_mapping;
 	struct hstate *h = hstate_inode(inode);
-	struct vm_area_struct pseudo_vma;
-	struct mm_struct *mm = current->mm;
 	loff_t hpage_size = huge_page_size(h);
 	unsigned long hpage_shift = huge_page_shift(h);
-	pgoff_t start, index, end;
+	pgoff_t start, end;
+	struct hf_args hf_args;
 	int error;
-	u32 hash;
 
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
@@ -617,16 +639,66 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
+	hf_args.file = file;
+	hf_args.parent_task = current;
+	hf_args.mm = current->mm;
+	hf_args.shared_policy = hugetlb_get_shared_policy(inode);
+	hf_args.hstate = h;
+	hf_args.mapping = inode->i_mapping;
+	hf_args.error = 0;
+
+	if (unlikely(hstate_is_gigantic(h))) {
+		/*
+		 * Use multiple threads in clear_gigantic_page instead of here,
+		 * so just do a 1-threaded hugetlbfs_fallocate_chunk.
+		 */
+		error = hugetlbfs_fallocate_chunk(start, end, &hf_args);
+	} else {
+		DEFINE_KTASK_CTL(ctl, hugetlbfs_fallocate_chunk,
+				 &hf_args, KTASK_PMD_MINCHUNK);
+
+		error = ktask_run((void *)start, end - start, &ctl);
+	}
+
+	if (error != KTASK_RETURN_SUCCESS && hf_args.error != -EINTR)
+		goto out;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+	inode->i_ctime = current_time(inode);
+out:
+	inode_unlock(inode);
+	return error;
+}
+
+static int hugetlbfs_fallocate_chunk(pgoff_t start, pgoff_t end,
+				     struct hf_args *args)
+{
+	struct file		*file		= args->file;
+	struct task_struct	*parent_task	= args->parent_task;
+	struct mm_struct	*mm		= args->mm;
+	struct shared_policy	*shared_policy	= args->shared_policy;
+	struct hstate		*h		= args->hstate;
+	struct address_space	*mapping	= args->mapping;
+	int			error		= 0;
+	pgoff_t			index;
+	struct vm_area_struct	pseudo_vma;
+	loff_t			hpage_size;
+	u32			hash;
+
+	hpage_size = huge_page_size(h);
+
 	/*
 	 * Initialize a pseudo vma as this is required by the huge page
 	 * allocation routines.  If NUMA is configured, use page index
-	 * as input to create an allocation policy.
+	 * as input to create an allocation policy.  Each thread gets its
+	 * own pseudo vma because mempolicies can differ by page.
 	 */
 	vma_init(&pseudo_vma, mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pseudo_vma.vm_file = file;
 
-	for (index = start; index < end; index++) {
+	for (index = start; index < end; ++index) {
 		/*
 		 * This is supposed to be the vaddr where the page is being
 		 * faulted in, but we have no vaddr here.
@@ -641,13 +713,13 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		 * fallocate(2) manpage permits EINTR; we may have been
 		 * interrupted because we are using up too much memory.
 		 */
-		if (signal_pending(current)) {
+		if (signal_pending(parent_task) || signal_pending(current)) {
 			error = -EINTR;
-			break;
+			goto err;
 		}
 
 		/* Set numa allocation policy based on index */
-		hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+		hugetlb_set_vma_policy(&pseudo_vma, shared_policy, index);
 
 		/* addr is the offset within the file (zero based) */
 		addr = index * hpage_size;
@@ -672,7 +744,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		if (IS_ERR(page)) {
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			error = PTR_ERR(page);
-			goto out;
+			goto err;
 		}
 		clear_huge_page(page, addr, pages_per_huge_page(h));
 		__SetPageUptodate(page);
@@ -680,7 +752,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		if (unlikely(error)) {
 			put_page(page);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			goto out;
+			goto err;
 		}
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -693,11 +765,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		put_page(page);
 	}
 
-	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
-		i_size_write(inode, offset + len);
-	inode->i_ctime = current_time(inode);
-out:
-	inode_unlock(inode);
+	return KTASK_RETURN_SUCCESS;
+
+err:
+	args->error = error;
+
 	return error;
 }
 
-- 
2.19.1

  parent reply index

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-05 16:55 [RFC PATCH v4 00/13] ktask: multithread CPU-intensive kernel work Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 01/13] ktask: add documentation Daniel Jordan
2018-11-05 21:19   ` Randy Dunlap
2018-11-06  2:27     ` Daniel Jordan
2018-11-06  8:49   ` Peter Zijlstra
2018-11-06 20:34     ` Daniel Jordan
2018-11-06 20:51       ` Jason Gunthorpe
2018-11-07 10:27         ` Peter Zijlstra
2018-11-07 20:21           ` Daniel Jordan
2018-11-07 10:35       ` Peter Zijlstra
2018-11-07 21:20         ` Daniel Jordan
2018-11-08 17:26   ` Jonathan Corbet
2018-11-08 19:15     ` Daniel Jordan
2018-11-08 19:24       ` Jonathan Corbet
2018-11-27 19:50   ` Pavel Machek
2018-11-28 16:56     ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 02/13] ktask: multithread CPU-intensive kernel work Daniel Jordan
2018-11-05 20:51   ` Randy Dunlap
2018-11-06  2:24     ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 03/13] ktask: add undo support Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 04/13] ktask: run helper threads at MAX_NICE Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 05/13] workqueue, ktask: renice helper threads to prevent starvation Daniel Jordan
2018-11-13 16:34   ` Tejun Heo
2018-11-19 16:45     ` Daniel Jordan
2018-11-20 16:33       ` Tejun Heo
2018-11-20 17:03         ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 06/13] vfio: parallelize vfio_pin_map_dma Daniel Jordan
2018-11-05 21:51   ` Alex Williamson
2018-11-06  2:42     ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 07/13] mm: change locked_vm's type from unsigned long to atomic_long_t Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 08/13] vfio: remove unnecessary mmap_sem writer acquisition around locked_vm Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 09/13] vfio: relieve mmap_sem reader cacheline bouncing by holding it longer Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 10/13] mm: enlarge type of offset argument in mem_map_offset and mem_map_next Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 11/13] mm: parallelize deferred struct page initialization within each node Daniel Jordan
2018-11-10  3:48   ` Elliott, Robert (Persistent Memory)
2018-11-12 16:54     ` Daniel Jordan
2018-11-12 22:15       ` Elliott, Robert (Persistent Memory)
2018-11-19 16:01         ` Daniel Jordan
2018-11-27  0:12           ` Elliott, Robert (Persistent Memory)
2018-11-27 20:23             ` Daniel Jordan
2018-11-19 16:29       ` Daniel Jordan
2018-11-05 16:55 ` [RFC PATCH v4 12/13] mm: parallelize clear_gigantic_page Daniel Jordan
2018-11-05 16:55 ` Daniel Jordan [this message]
2018-11-05 17:29 ` [RFC PATCH v4 00/13] ktask: multithread CPU-intensive kernel work Michal Hocko
2018-11-06  1:29   ` Daniel Jordan
2018-11-06  9:21     ` Michal Hocko
2018-11-07 20:17       ` Daniel Jordan
2018-11-05 18:49 ` Zi Yan
2018-11-06  2:20   ` Daniel Jordan
2018-11-06  2:48     ` Zi Yan
2018-11-06 19:00       ` Daniel Jordan
2018-11-30 19:18 ` Tejun Heo
2018-12-01  0:13   ` Daniel Jordan
2018-12-03 16:16     ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181105165558.11698-14-daniel.m.jordan@oracle.com \
    --to=daniel.m.jordan@oracle.com \
    --cc=Pavel.Tatashin@microsoft.com \
    --cc=aarcange@redhat.com \
    --cc=aaron.lu@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex.williamson@redhat.com \
    --cc=bsd@redhat.com \
    --cc=darrick.wong@oracle.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=jgg@mellanox.com \
    --cc=jiangshanlai@gmail.com \
    --cc=jwadams@google.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mike.kravetz@oracle.com \
    --cc=prasad.singamsetty@oracle.com \
    --cc=rdunlap@infradead.org \
    --cc=steven.sistare@oracle.com \
    --cc=tim.c.chen@intel.com \
    --cc=tj@kernel.org \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

KVM Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/kvm/0 kvm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 kvm kvm/ https://lore.kernel.org/kvm \
		kvm@vger.kernel.org
	public-inbox-index kvm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.kvm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git