All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gang Li <ligang.bdlg@bytedance.com>
To: akpm@linux-foundation.org
Cc: songmuchun@bytedance.com, hca@linux.ibm.com, gor@linux.ibm.com,
	agordeev@linux.ibm.com, borntraeger@linux.ibm.com,
	svens@linux.ibm.com, ebiederm@xmission.com,
	keescook@chromium.org, viro@zeniv.linux.org.uk,
	rostedt@goodmis.org, mingo@redhat.com, peterz@infradead.org,
	acme@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	namhyung@kernel.org, david@redhat.com, imbrenda@linux.ibm.com,
	apopple@nvidia.com, adobriyan@gmail.com,
	stephen.s.brennan@oracle.com, ohoono.kwon@samsung.com,
	haolee.swjtu@gmail.com, kaleshsingh@google.com,
	zhengqi.arch@bytedance.com, peterx@redhat.com,
	shy828301@gmail.com, surenb@google.com, ccross@google.com,
	vincent.whitchurch@axis.com, tglx@linutronix.de,
	bigeasy@linutronix.de, fenghua.yu@intel.com,
	linux-s390@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-perf-users@vger.kernel.org,
	Gang Li <ligang.bdlg@bytedance.com>
Subject: [PATCH 2/5 v1] mm: add numa_count field for rss_stat
Date: Thu, 12 May 2022 12:46:31 +0800	[thread overview]
Message-ID: <20220512044634.63586-3-ligang.bdlg@bytedance.com> (raw)
In-Reply-To: <20220512044634.63586-1-ligang.bdlg@bytedance.com>

This patch adds a new field `numa_count` to mm_rss_stat and
task_rss_stat.

`numa_count` is an array of `num_possible_nodes()` entries
(`atomic_long_t` in mm_rss_stat, `int` in task_rss_stat).
To reduce memory consumption, each entry only contains the sum of rss
that is needed by `oom_badness`, instead of recording the different
kinds of rss separately.

Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
 include/linux/mm_types_task.h |  6 +++
 kernel/fork.c                 | 70 +++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 3e7da8c7ab95..c1ac2a33b697 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -64,11 +64,17 @@ enum {
 struct task_rss_stat {
 	int events;	/* for synchronization threshold */
 	int count[NR_MM_COUNTERS];
+#ifdef CONFIG_NUMA
+	int *numa_count;
+#endif
 };
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
 struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
+#ifdef CONFIG_NUMA
+	atomic_long_t *numa_count;
+#endif
 };
 
 struct page_frag {
diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..e549e0b30e2b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -141,6 +141,10 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+#if (defined SPLIT_RSS_COUNTING) && (defined CONFIG_NUMA)
+#define SPLIT_RSS_NUMA_COUNTING
+#endif
+
 #ifdef CONFIG_PROVE_RCU
 int lockdep_tasklist_lock_is_held(void)
 {
@@ -765,6 +769,16 @@ static void check_mm(struct mm_struct *mm)
 				 mm, resident_page_types[i], x);
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < num_possible_nodes(); i++) {
+		long x = atomic_long_read(&mm->rss_stat.numa_count[i]);
+
+		if (unlikely(x))
+			pr_alert("BUG: Bad rss-counter state mm:%p node:%d val:%ld\n",
+				 mm, i, x);
+	}
+#endif
+
 	if (mm_pgtables_bytes(mm))
 		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
 				mm_pgtables_bytes(mm));
@@ -777,6 +791,29 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+#ifdef CONFIG_NUMA
+static inline void mm_free_rss_stat(struct mm_struct *mm)
+{
+	kfree(mm->rss_stat.numa_count);
+}
+
+static inline int mm_init_rss_stat(struct mm_struct *mm)
+{
+	memset(&mm->rss_stat.count, 0, sizeof(mm->rss_stat.count));
+	mm->rss_stat.numa_count = kcalloc(num_possible_nodes(), sizeof(atomic_long_t), GFP_KERNEL);
+	if (unlikely(!mm->rss_stat.numa_count))
+		return -ENOMEM;
+	return 0;
+}
+#else
+static inline void mm_free_rss_stat(struct mm_struct *mm) {}
+static inline int mm_init_rss_stat(struct mm_struct *mm)
+{
+	memset(&mm->rss_stat.count, 0, sizeof(mm->rss_stat.count));
+	return 0;
+}
+#endif
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -791,6 +828,7 @@ void __mmdrop(struct mm_struct *mm)
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
 	check_mm(mm);
+	mm_free_rss_stat(mm);
 	put_user_ns(mm->user_ns);
 	free_mm(mm);
 }
@@ -831,12 +869,22 @@ static inline void put_signal_struct(struct signal_struct *sig)
 		free_signal_struct(sig);
 }
 
+#ifdef SPLIT_RSS_NUMA_COUNTING
+void rss_stat_free(struct task_struct *p)
+{
+	kfree(p->rss_stat.numa_count);
+}
+#else
+void rss_stat_free(struct task_struct *p) {}
+#endif
+
 void __put_task_struct(struct task_struct *tsk)
 {
 	WARN_ON(!tsk->exit_state);
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	rss_stat_free(tsk);
 	io_uring_free(tsk);
 	cgroup_free(tsk);
 	task_numa_free(tsk, true);
@@ -963,6 +1011,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
+	int *numa_count __maybe_unused;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -984,9 +1033,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #endif
 	account_kernel_stack(tsk, 1);
 
+#ifdef SPLIT_RSS_NUMA_COUNTING
+	numa_count = kcalloc(num_possible_nodes(), sizeof(int), GFP_KERNEL);
+	if (!numa_count)
+		goto free_stack;
+	tsk->rss_stat.numa_count = numa_count;
+#endif
+
 	err = scs_prepare(tsk, node);
 	if (err)
-		goto free_stack;
+		goto free_rss_stat;
 
 #ifdef CONFIG_SECCOMP
 	/*
@@ -1047,6 +1103,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #endif
 	return tsk;
 
+free_rss_stat:
+#ifdef SPLIT_RSS_NUMA_COUNTING
+	kfree(numa_count);
+#endif
 free_stack:
 	exit_task_stack_account(tsk);
 	free_thread_stack(tsk);
@@ -1117,7 +1177,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	atomic64_set(&mm->pinned_vm, 0);
-	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
 	spin_lock_init(&mm->page_table_lock);
 	spin_lock_init(&mm->arg_lock);
 	mm_init_cpumask(mm);
@@ -1144,6 +1203,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_pgd(mm))
 		goto fail_nopgd;
 
+	if (mm_init_rss_stat(mm))
+		goto fail_nocontext;
+
 	if (init_new_context(p, mm))
 		goto fail_nocontext;
 
@@ -2139,7 +2201,9 @@ static __latent_entropy struct task_struct *copy_process(
 	p->io_uring = NULL;
 #endif
 
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef SPLIT_RSS_NUMA_COUNTING
+	memset(&p->rss_stat, 0, sizeof(p->rss_stat) - sizeof(p->rss_stat.numa_count));
+#else
 	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
 #endif
 
-- 
2.20.1


  parent reply	other threads:[~2022-05-12  4:48 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-12  4:46 [PATCH 0/5 v1] mm, oom: Introduce per numa node oom for CONSTRAINT_MEMORY_POLICY Gang Li
2022-05-12  4:46 ` [PATCH 1/5 v1] mm: add a new parameter `node` to `get/add/inc/dec_mm_counter` Gang Li
2022-05-12  4:46 ` Gang Li [this message]
2022-05-12 16:31   ` [PATCH 2/5 v1] mm: add numa_count field for rss_stat kernel test robot
2022-05-12  4:46 ` [PATCH 3/5 v1] mm: add numa fields for tracepoint rss_stat Gang Li
2022-05-12  4:46 ` [PATCH 4/5 v1] mm: enable per numa node rss_stat count Gang Li
2022-05-17  2:28   ` [mm] c9dc81ef10: BUG:Bad_rss-counter_state_mm:#node:#val kernel test robot
2022-05-17  2:28     ` kernel test robot
2022-05-12  4:46 ` [PATCH 5/5 v1] mm, oom: enable per numa node oom for CONSTRAINT_MEMORY_POLICY Gang Li
2022-05-12 22:31 ` [PATCH 0/5 v1] mm, oom: Introduce " Suren Baghdasaryan
2022-05-16 16:44 ` Michal Hocko
2022-06-15 10:13   ` Gang Li
2022-05-12 21:21 [PATCH 2/5 v1] mm: add numa_count field for rss_stat kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220512044634.63586-3-ligang.bdlg@bytedance.com \
    --to=ligang.bdlg@bytedance.com \
    --cc=acme@kernel.org \
    --cc=adobriyan@gmail.com \
    --cc=agordeev@linux.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=apopple@nvidia.com \
    --cc=bigeasy@linutronix.de \
    --cc=borntraeger@linux.ibm.com \
    --cc=ccross@google.com \
    --cc=david@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=fenghua.yu@intel.com \
    --cc=gor@linux.ibm.com \
    --cc=haolee.swjtu@gmail.com \
    --cc=hca@linux.ibm.com \
    --cc=imbrenda@linux.ibm.com \
    --cc=jolsa@kernel.org \
    --cc=kaleshsingh@google.com \
    --cc=keescook@chromium.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=ohoono.kwon@samsung.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=shy828301@gmail.com \
    --cc=songmuchun@bytedance.com \
    --cc=stephen.s.brennan@oracle.com \
    --cc=surenb@google.com \
    --cc=svens@linux.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=vincent.whitchurch@axis.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=zhengqi.arch@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.