From: Yuanchu Xie <yuanchu@google.com>
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
"Rafael J . Wysocki" <rafael@kernel.org>,
"Michael S . Tsirkin" <mst@redhat.com>,
David Hildenbrand <david@redhat.com>,
Jason Wang <jasowang@redhat.com>,
Andrew Morton <akpm@linux-foundation.org>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@kernel.org>,
Roman Gushchin <roman.gushchin@linux.dev>,
Shakeel Butt <shakeelb@google.com>,
Muchun Song <muchun.song@linux.dev>, Yu Zhao <yuzhao@google.com>,
Kefeng Wang <wangkefeng.wang@huawei.com>,
Kairui Song <kasong@tencent.com>,
Yosry Ahmed <yosryahmed@google.com>,
Yuanchu Xie <yuanchu@google.com>,
"T . J . Alumbaugh" <talumbau@google.com>
Cc: Wei Xu <weixugc@google.com>, SeongJae Park <sj@kernel.org>,
Sudarshan Rajagopalan <quic_sudaraja@quicinc.com>,
kai.huang@intel.com, hch@lst.de, jon@nutanix.com,
Aneesh Kumar K V <aneesh.kumar@linux.ibm.com>,
Matthew Wilcox <willy@infradead.org>,
Vasily Averin <vasily.averin@linux.dev>,
linux-kernel@vger.kernel.org,
virtualization@lists.linux-foundation.org, linux-mm@kvack.org,
cgroups@vger.kernel.org
Subject: [RFC PATCH v2 5/6] mm: add per-memcg reaccess histogram
Date: Wed, 21 Jun 2023 18:04:53 +0000 [thread overview]
Message-ID: <20230621180454.973862-6-yuanchu@google.com> (raw)
In-Reply-To: <20230621180454.973862-1-yuanchu@google.com>
A reaccess refers to detecting an access on a page via refault
or access bit harvesting after the initial access. Similar to
the working set histogram, the reaccess histogram breaks down
reaccesses into user-defined bins.
Currently it only tracks reaccesses from access bit harvesting,
and the plan is to include refaults in the same histogram
by pulling information from folio->mapping->i_pages shadow entry
for swapped out pages.
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
Signed-off-by: Yuanchu Xie <yuanchu@google.com>
---
include/linux/wsr.h | 9 +++-
mm/memcontrol.c | 89 ++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 6 ++-
mm/wsr.c | 101 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 203 insertions(+), 2 deletions(-)
diff --git a/include/linux/wsr.h b/include/linux/wsr.h
index d45f7cc0672ac..68246734679cd 100644
--- a/include/linux/wsr.h
+++ b/include/linux/wsr.h
@@ -26,11 +26,14 @@ struct ws_bin {
struct wsr {
/* protects bins */
struct mutex bins_lock;
+ /* protects reaccess_bins */
+ struct mutex reaccess_bins_lock;
struct kernfs_node *notifier;
unsigned long timestamp;
unsigned long report_threshold;
unsigned long refresh_threshold;
struct ws_bin bins[MAX_NR_BINS];
+ struct ws_bin reaccess_bins[MAX_NR_BINS];
};
void wsr_register_node(struct node *node);
@@ -48,6 +51,7 @@ ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins);
*/
void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
struct pglist_data *pgdat);
+void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk);
void report_ws(struct pglist_data *pgdat, struct scan_control *sc);
#else
struct ws_bin;
@@ -71,7 +75,10 @@ static inline ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins)
return -EINVAL;
}
static inline void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat)
+{
+}
+static inline void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
}
static inline void report_ws(struct pglist_data *pgdat, struct scan_control *sc)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index edf5bb31bb19c..b901982d659d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6736,6 +6736,56 @@ static ssize_t memory_wsr_intervals_ms_write(struct kernfs_open_file *of,
return err ?: nbytes;
}
+static int memory_reaccess_intervals_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr;
+ struct ws_bin *bin;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wsr->reaccess_bins_lock);
+ seq_printf(m, "N%d=", nid);
+ for (bin = wsr->reaccess_bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u,", jiffies_to_msecs(bin->idle_age));
+ mutex_unlock(&wsr->reaccess_bins_lock);
+
+ seq_printf(m, "%lld ", LLONG_MAX);
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_reaccess_intervals_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ unsigned int nid;
+ int err;
+ struct wsr *wsr;
+ struct ws_bin *bins;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ bins = kzalloc(sizeof(wsr->reaccess_bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ err = memory_wsr_intervals_ms_parse(of, buf, nbytes, &nid, bins);
+ if (err)
+ goto failed;
+
+ wsr = lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wsr->reaccess_bins_lock);
+ memcpy(wsr->reaccess_bins, bins, sizeof(wsr->reaccess_bins));
+ mutex_unlock(&wsr->reaccess_bins_lock);
+failed:
+ kfree(bins);
+ return err ?: nbytes;
+}
+
static int memory_wsr_refresh_ms_show(struct seq_file *m, void *v)
{
int nid;
@@ -6874,6 +6924,34 @@ __poll_t memory_wsr_histogram_poll(struct kernfs_open_file *of,
return DEFAULT_POLLMASK | EPOLLPRI;
return DEFAULT_POLLMASK;
}
+
+static int memory_reaccess_histogram_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wsr *wsr =
+ lruvec_wsr(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ struct ws_bin *bin;
+
+ seq_printf(m, "N%d\n", nid);
+
+ mutex_lock(&wsr->reaccess_bins_lock);
+ wsr_refresh(wsr, memcg, NODE_DATA(nid));
+ for (bin = wsr->reaccess_bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age),
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ seq_printf(m, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wsr->reaccess_bins_lock);
+ }
+
+ return 0;
+}
#endif
static struct cftype memory_files[] = {
@@ -6969,6 +7047,17 @@ static struct cftype memory_files[] = {
.seq_show = memory_wsr_histogram_show,
.poll = memory_wsr_histogram_poll,
},
+ {
+ .name = "reaccess.intervals_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_reaccess_intervals_ms_show,
+ .write = memory_reaccess_intervals_ms_write,
+ },
+ {
+ .name = "reaccess.histogram",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_reaccess_histogram_show,
+ },
#endif
{} /* terminate */
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba254b6e91e19..bc8c026ceef0d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4226,6 +4226,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
mem_cgroup_unlock_pages();
if (walk->batched) {
+ report_reaccess(lruvec, walk);
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(lruvec, walk);
spin_unlock_irq(&lruvec->lru_lock);
@@ -5079,11 +5080,14 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
sc->nr_scanned -= folio_nr_pages(folio);
}
+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ report_reaccess(lruvec, walk);
+
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &list);
- walk = current->reclaim_state->mm_walk;
if (walk && walk->batched)
reset_batch_size(lruvec, walk);
diff --git a/mm/wsr.c b/mm/wsr.c
index cd045ade5e9ba..a63d678e64f8b 100644
--- a/mm/wsr.c
+++ b/mm/wsr.c
@@ -23,8 +23,10 @@ void wsr_init(struct lruvec *lruvec)
struct wsr *wsr = lruvec_wsr(lruvec);
mutex_init(&wsr->bins_lock);
+ mutex_init(&wsr->reaccess_bins_lock);
wsr->bins[0].idle_age = -1;
wsr->notifier = NULL;
+ wsr->reaccess_bins[0].idle_age = -1;
}
void wsr_destroy(struct lruvec *lruvec)
@@ -32,6 +34,7 @@ void wsr_destroy(struct lruvec *lruvec)
struct wsr *wsr = lruvec_wsr(lruvec);
mutex_destroy(&wsr->bins_lock);
+ mutex_destroy(&wsr->reaccess_bins_lock);
memset(wsr, 0, sizeof(*wsr));
}
@@ -172,6 +175,104 @@ void refresh_wsr(struct wsr *wsr, struct mem_cgroup *root,
cond_resched();
} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
}
+
+static void collect_reaccess_locked(struct wsr *wsr,
+ struct lru_gen_struct *lrugen,
+ struct lru_gen_mm_walk *walk)
+{
+ int gen, type, zone;
+ unsigned long curr_timestamp = jiffies;
+ unsigned long max_seq = READ_ONCE(walk->max_seq);
+ unsigned long min_seq[ANON_AND_FILE] = {
+ READ_ONCE(lrugen->min_seq[LRU_GEN_ANON]),
+ READ_ONCE(lrugen->min_seq[LRU_GEN_FILE]),
+ };
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+ struct ws_bin *bin = wsr->reaccess_bins;
+
+ lockdep_assert_held(&wsr->reaccess_bins_lock);
+ /* Skip max_seq because a reaccess moves a page from another seq
+ * to max_seq. We use the negative change in page count from
+ * other seqs to track the number of reaccesses.
+ */
+ for (seq = max_seq - 1; seq + 1 > min_seq[type]; seq--) {
+ long error;
+ int next_gen;
+ unsigned long birth, gen_start;
+ long delta = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ long nr_pages = walk->nr_pages[gen][type][zone];
+
+ if (nr_pages < 0)
+ delta += -nr_pages;
+ }
+
+ birth = READ_ONCE(lrugen->timestamps[gen]);
+ next_gen = lru_gen_from_seq(seq + 1);
+ gen_start = READ_ONCE(lrugen->timestamps[next_gen]);
+
+ /* ensure gen_start is within idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(gen_start + bin->idle_age,
+ curr_timestamp))
+ bin++;
+
+ error = delta;
+ /* gen exceeds the idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(birth + bin->idle_age,
+ curr_timestamp)) {
+ unsigned long proportion =
+ gen_start -
+ (curr_timestamp - bin->idle_age);
+ unsigned long gen_len = gen_start - birth;
+
+ if (!gen_len)
+ break;
+ if (proportion) {
+ unsigned long split_bin =
+ delta / gen_len * proportion;
+ bin->nr_pages[type] += split_bin;
+ error -= split_bin;
+ }
+ gen_start = curr_timestamp - bin->idle_age;
+ bin++;
+ }
+ bin->nr_pages[type] += error;
+ }
+ }
+}
+
+static void collect_reaccess(struct wsr *wsr,
+ struct lru_gen_struct *lrugen,
+ struct lru_gen_mm_walk *walk)
+{
+ if (READ_ONCE(wsr->reaccess_bins->idle_age) == -1)
+ return;
+
+ mutex_lock(&wsr->reaccess_bins_lock);
+ collect_reaccess_locked(wsr, lrugen, walk);
+ mutex_unlock(&wsr->reaccess_bins_lock);
+}
+
+void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ while (memcg) {
+ collect_reaccess(lruvec_wsr(mem_cgroup_lruvec(
+ memcg, lruvec_pgdat(lruvec))),
+ lrugen, walk);
+ memcg = parent_mem_cgroup(memcg);
+ }
+}
+
static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
{
int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
--
2.41.0.162.gfafddb0af9-goog
next prev parent reply other threads:[~2023-06-21 18:17 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-06-21 18:04 [RFC PATCH v2 0/6] mm: working set reporting Yuanchu Xie
2023-06-21 18:04 ` [RFC PATCH v2 1/6] mm: aggregate working set information into histograms Yuanchu Xie
2023-06-21 18:04 ` [RFC PATCH v2 2/6] mm: add working set refresh threshold to rate-limit aggregation Yuanchu Xie
2023-06-21 18:04 ` [RFC PATCH v2 3/6] mm: report working set when under memory pressure Yuanchu Xie
2023-06-21 18:04 ` [RFC PATCH v2 4/6] mm: extend working set reporting to memcgs Yuanchu Xie
2023-06-21 18:04 ` Yuanchu Xie [this message]
2023-06-21 18:04 ` [RFC PATCH v2 6/6] virtio-balloon: Add Working Set reporting Yuanchu Xie
2023-06-21 18:48 ` [RFC PATCH v2 0/6] mm: working set reporting Yu Zhao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230621180454.973862-6-yuanchu@google.com \
--to=yuanchu@google.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@linux.ibm.com \
--cc=cgroups@vger.kernel.org \
--cc=david@redhat.com \
--cc=gregkh@linuxfoundation.org \
--cc=hannes@cmpxchg.org \
--cc=hch@lst.de \
--cc=jasowang@redhat.com \
--cc=jon@nutanix.com \
--cc=kai.huang@intel.com \
--cc=kasong@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=mst@redhat.com \
--cc=muchun.song@linux.dev \
--cc=quic_sudaraja@quicinc.com \
--cc=rafael@kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeelb@google.com \
--cc=sj@kernel.org \
--cc=talumbau@google.com \
--cc=vasily.averin@linux.dev \
--cc=virtualization@lists.linux-foundation.org \
--cc=wangkefeng.wang@huawei.com \
--cc=weixugc@google.com \
--cc=willy@infradead.org \
--cc=yosryahmed@google.com \
--cc=yuzhao@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).