linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Fengguang Wu <fengguang.wu@intel.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Linux Memory Management List <linux-mm@kvack.org>,
	Huang Ying <ying.huang@intel.com>,
	Brendan Gregg <bgregg@netflix.com>,
	Fengguang Wu <fengguang.wu@intel.com>
Cc: kvm@vger.kernel.org
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Fan Du <fan.du@intel.com>
Cc: Yao Yuan <yuan.yao@intel.com>
Cc: Peng Dong <dongx.peng@intel.com>
Cc: Liu Jingqi <jingqi.liu@intel.com>
Cc: Dong Eddie <eddie.dong@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Zhang Yi <yi.z.zhang@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Subject: [RFC][PATCH v2 17/21] proc: introduce /proc/PID/idle_pages
Date: Wed, 26 Dec 2018 21:15:03 +0800	[thread overview]
Message-ID: <20181226133352.076749877@intel.com> (raw)
Message-ID: <20181226131503.-TbBlaV8TN-7yG4-tuxEfnT4n-LGSDdZutz8L2cN3WQ@z> (raw)
In-Reply-To: 20181226131446.330864849@intel.com

[-- Attachment #1: 0008-proc-introduce-proc-PID-idle_pages.patch --]
[-- Type: text/plain, Size: 4845 bytes --]

This will be similar to /sys/kernel/mm/page_idle/bitmap documented in
Documentation/admin-guide/mm/idle_page_tracking.rst, however indexed
by process virtual address.

When using the global PFN indexed idle bitmap, we find 2 kind of
overheads:

- to track a task's working set, Brendan Gregg end up writing wss-v1
  for small tasks and wss-v2 for large tasks:

  https://github.com/brendangregg/wss

  That's because VAs may point to random PAs throughout the physical
  address space. So we either query /proc/pid/pagemap first and access
  the lots of random PFNs (with lots of syscalls) in the bitmap, or
  write+read the whole system idle bitmap beforehand.

- page table walking by PFN has much more overheads than to walk a
  page table in its natural order:
  - rmap queries
  - more locking
  - random memory reads/writes

This interface provides a cheap path for the majority non-shared mapping
pages. To walk 1TB memory of 4k active pages, it costs 2s vs 15s system
time to scan the per-task/global idle bitmaps. Which means ~7x speedup.
The gap will be enlarged if consider

- the extra /proc/pid/pagemap walk
- natural page table walks can skip the whole 512 PTEs if PMD is idle

OTOH, the per-task idle bitmap is not suitable in some situations:

- not accurate for shared pages
- don't work with non-mapped file pages
- don't perform well for sparse page tables (pointed out by Huang Ying)

So it's more about complementing the existing global idle bitmap.

CC: Huang Ying <ying.huang@intel.com>
CC: Brendan Gregg <bgregg@netflix.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
 fs/proc/base.c     |    2 +
 fs/proc/internal.h |    1 
 fs/proc/task_mmu.c |   54 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

--- linux.orig/fs/proc/base.c	2018-12-23 20:08:14.228919325 +0800
+++ linux/fs/proc/base.c	2018-12-23 20:08:14.224919327 +0800
@@ -2969,6 +2969,7 @@ static const struct pid_entry tgid_base_
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3357,6 +3358,7 @@ static const struct pid_entry tid_base_s
 	REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
--- linux.orig/fs/proc/internal.h	2018-12-23 20:08:14.228919325 +0800
+++ linux/fs/proc/internal.h	2018-12-23 20:08:14.224919327 +0800
@@ -298,6 +298,7 @@ extern const struct file_operations proc
 extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_mm_idle_operations;
 
 extern unsigned long task_vsize(struct mm_struct *);
 extern unsigned long task_statm(struct mm_struct *,
--- linux.orig/fs/proc/task_mmu.c	2018-12-23 20:08:14.228919325 +0800
+++ linux/fs/proc/task_mmu.c	2018-12-23 20:08:14.224919327 +0800
@@ -1559,6 +1559,60 @@ const struct file_operations proc_pagema
 	.open		= pagemap_open,
 	.release	= pagemap_release,
 };
+
+/* will be filled when kvm_ept_idle module loads */
+struct file_operations proc_ept_idle_operations = {
+};
+EXPORT_SYMBOL_GPL(proc_ept_idle_operations);
+
+static ssize_t mm_idle_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	if (proc_ept_idle_operations.read)
+		return proc_ept_idle_operations.read(file, buf, count, ppos);
+
+	return 0;
+}
+
+
+static int mm_idle_open(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = proc_mem_open(inode, PTRACE_MODE_READ);
+
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+
+	file->private_data = mm;
+
+	if (proc_ept_idle_operations.open)
+		return proc_ept_idle_operations.open(inode, file);
+
+	return 0;
+}
+
+static int mm_idle_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+
+	if (mm) {
+		if (!mm_kvm(mm))
+			flush_tlb_mm(mm);
+		mmdrop(mm);
+	}
+
+	if (proc_ept_idle_operations.release)
+		return proc_ept_idle_operations.release(inode, file);
+
+	return 0;
+}
+
+const struct file_operations proc_mm_idle_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= mm_idle_read,
+	.open		= mm_idle_open,
+	.release	= mm_idle_release,
+};
+
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_NUMA



  parent reply	other threads:[~2018-12-26 13:38 UTC|newest]

Thread overview: 99+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-26 13:14 [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration Fengguang Wu
2018-12-26 13:14 ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 01/21] e820: cheat PMEM as DRAM Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-27  3:41   ` Matthew Wilcox
2018-12-27  4:11     ` Fengguang Wu
2018-12-27  5:13       ` Dan Williams
2018-12-27  5:13         ` Dan Williams
2018-12-27 19:32         ` Yang Shi
2018-12-27 19:32           ` Yang Shi
2018-12-28  3:27           ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 02/21] acpi/numa: memorize NUMA node type from SRAT table Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 03/21] x86/numa_emulation: fix fake NUMA in uniform case Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 04/21] x86/numa_emulation: pass numa node type to fake nodes Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 05/21] mmzone: new pgdat flags for DRAM and PMEM Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 06/21] x86,numa: update numa node type Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 07/21] mm: export node type {pmem|dram} under /sys/bus/node Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 08/21] mm: introduce and export pgdat peer_node Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-27 20:07   ` Christopher Lameter
2018-12-27 20:07     ` Christopher Lameter
2018-12-28  2:31     ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 09/21] mm: avoid duplicate peer target node Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 10/21] mm: build separate zonelist for PMEM and DRAM node Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2019-01-01  9:14   ` Aneesh Kumar K.V
2019-01-01  9:14     ` Aneesh Kumar K.V
2019-01-07  9:57     ` Fengguang Wu
2019-01-07 14:09       ` Aneesh Kumar K.V
2018-12-26 13:14 ` [RFC][PATCH v2 11/21] kvm: allocate page table pages from DRAM Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2019-01-01  9:23   ` Aneesh Kumar K.V
2019-01-01  9:23     ` Aneesh Kumar K.V
2019-01-02  0:59     ` Yuan Yao
2019-01-02 16:47   ` Dave Hansen
2019-01-07 10:21     ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 12/21] x86/pgtable: " Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:14 ` [RFC][PATCH v2 13/21] x86/pgtable: dont check PMD accessed bit Fengguang Wu
2018-12-26 13:14   ` Fengguang Wu
2018-12-26 13:15 ` [RFC][PATCH v2 14/21] kvm: register in mm_struct Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2019-02-02  6:57   ` Peter Xu
2019-02-02 10:50     ` Fengguang Wu
2019-02-04 10:46     ` Paolo Bonzini
2018-12-26 13:15 ` [RFC][PATCH v2 15/21] ept-idle: EPT walk for virtual machine Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2018-12-26 13:15 ` [RFC][PATCH v2 16/21] mm-idle: mm_walk for normal task Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2018-12-26 13:15 ` Fengguang Wu [this message]
2018-12-26 13:15   ` [RFC][PATCH v2 17/21] proc: introduce /proc/PID/idle_pages Fengguang Wu
2018-12-26 13:15 ` [RFC][PATCH v2 18/21] kvm-ept-idle: enable module Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2018-12-26 13:15 ` [RFC][PATCH v2 19/21] mm/migrate.c: add move_pages(MPOL_MF_SW_YOUNG) flag Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2018-12-26 13:15 ` [RFC][PATCH v2 20/21] mm/vmscan.c: migrate anon DRAM pages to PMEM node Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2018-12-26 13:15 ` [RFC][PATCH v2 21/21] mm/vmscan.c: shrink anon list if can migrate to PMEM Fengguang Wu
2018-12-26 13:15   ` Fengguang Wu
2018-12-27 20:31 ` [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration Michal Hocko
2018-12-28  5:08   ` Fengguang Wu
2018-12-28  8:41     ` Michal Hocko
2018-12-28  9:42       ` Fengguang Wu
2018-12-28 12:15         ` Michal Hocko
2018-12-28 13:15           ` Fengguang Wu
2018-12-28 13:15             ` Fengguang Wu
2018-12-28 19:46             ` Michal Hocko
2018-12-28 13:31           ` Fengguang Wu
2018-12-28 18:28             ` Yang Shi
2018-12-28 18:28               ` Yang Shi
2018-12-28 19:52             ` Michal Hocko
2019-01-02 12:21               ` Jonathan Cameron
2019-01-02 12:21                 ` Jonathan Cameron
2019-01-08 14:52                 ` Michal Hocko
2019-01-10 15:53                   ` Jerome Glisse
2019-01-10 15:53                     ` Jerome Glisse
2019-01-10 16:42                     ` Michal Hocko
2019-01-10 17:42                       ` Jerome Glisse
2019-01-10 17:42                         ` Jerome Glisse
2019-01-10 18:26                   ` Jonathan Cameron
2019-01-10 18:26                     ` Jonathan Cameron
2019-01-28 17:42                 ` Jonathan Cameron
2019-01-28 17:42                   ` Jonathan Cameron
2019-01-29  2:00                   ` Fengguang Wu
2019-01-03 10:57               ` Mel Gorman
2019-01-10 16:25               ` Jerome Glisse
2019-01-10 16:25                 ` Jerome Glisse
2019-01-10 16:50                 ` Michal Hocko
2019-01-10 18:02                   ` Jerome Glisse
2019-01-10 18:02                     ` Jerome Glisse
2019-01-02 18:12       ` Dave Hansen
2019-01-08 14:53         ` Michal Hocko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181226133352.076749877@intel.com \
    --to=fengguang.wu@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=bgregg@netflix.com \
    --cc=linux-mm@kvack.org \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).