[PATCH] pid: add handling of too many zombie processes

* [PATCH] pid: add handling of too many zombie processes
@ 2023-02-08  9:49 liuq
  2023-02-09  7:14 ` 回复: " huyd12
  0 siblings, 1 reply; 3+ messages in thread
From: liuq @ 2023-02-08  9:49 UTC (permalink / raw)
  To: akpm; +Cc: agruenba, linux-mm, linux-kernel, huyd12, liuq

There is a common situation that a parent process forks many child
processes to execute tasks, but the parent process does not execute
wait/waitpid when the child process exits, resulting in a large
number of child processes becoming zombie processes.

At this time, if the number of processes in the system out of
kernel.pid_max, the new fork syscall will fail, and the system will
not be able to execute any command at this time
(unless an old process exits)

eg:
[root@lq-workstation ~]# ls
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: Resource temporarily unavailable
[root@lq-workstation ~]# reboot
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: retry: Resource temporarily unavailable
-bash: fork: Resource temporarily unavailable

I dealt with this situation in the alloc_pid function,
and found a process with the most zombie subprocesses,
and more than 10(or other reasonable values?) zombie subprocesses,
so I tried to kill this process to release the pid resources.

Signed-off-by: liuq <liuq131@chinatelecom.cn>
---
 include/linux/mm.h |  2 ++
 kernel/pid.c       |  6 +++-
 mm/oom_kill.c      | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f857163ac89..afcff08a3878 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1940,6 +1940,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
  */
 extern void pagefault_out_of_memory(void);
+extern void pid_max_oom_check(struct pid_namespace *ns);
+
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 #define offset_in_thp(page, p)	((unsigned long)(p) & (thp_size(page) - 1))
diff --git a/kernel/pid.c b/kernel/pid.c
index 3fbc5e46b721..1a9a60e19ab6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -237,7 +237,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		idr_preload_end();
 
 		if (nr < 0) {
-			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+			retval = nr;
+			if (nr == -ENOSPC) {
+				retval = -EAGAIN;
+				pid_max_oom_check(tmp);
+			}
 			goto out_free;
 		}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1276e49b31b0..18d05d706f48 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1260,3 +1260,73 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	return -ENOSYS;
 #endif /* CONFIG_MMU */
 }
+
+static void oom_pid_evaluate_task(struct task_struct *p,
+	struct task_struct **max_zombie_task, int *max_zombie_num)
+{
+	struct task_struct *child;
+	int zombie_num = 0;
+
+	list_for_each_entry(child, &p->children, sibling) {
+		if (child->exit_state == EXIT_ZOMBIE)
+			zombie_num++;
+	}
+	if (zombie_num > *max_zombie_num) {
+		*max_zombie_num = zombie_num;
+		*max_zombie_task = p;
+	}
+}
+#define MAX_ZOMBIE_NUM 10
+struct task_struct *pid_max_bad_process(struct pid_namespace *ns)
+{
+	int max_zombie_num = 0;
+	struct task_struct *max_zombie_task = &init_task;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	for_each_process(p)
+		oom_pid_evaluate_task(p, &max_zombie_task, &max_zombie_num);
+	rcu_read_unlock();
+
+	if (max_zombie_num > MAX_ZOMBIE_NUM) {
+		pr_info("process %d has %d zombie child\n",
+			task_pid_nr_ns(max_zombie_task, ns), max_zombie_num);
+		return max_zombie_task;
+	}
+
+	return NULL;
+}
+
+void pid_max_oom_kill_process(struct task_struct *task)
+{
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.memcg = NULL,
+		.gfp_mask = 0,
+		.order = 0,
+	};
+
+	get_task_struct(task);
+	oc.chosen = task;
+
+	if (mem_cgroup_oom_synchronize(true))
+		return;
+
+	if (!mutex_trylock(&oom_lock))
+		return;
+
+	oom_kill_process(&oc, "Out of pid max(oom_kill_allocating_task)");
+	mutex_unlock(&oom_lock);
+}
+
+void pid_max_oom_check(struct pid_namespace *ns)
+{
+	struct task_struct *p;
+
+	p = pid_max_bad_process(ns);
+	if (p) {
+		pr_info("oom_kill process %d\n", task_pid_nr_ns(p, ns));
+		pid_max_oom_kill_process(p);
+	}
+}
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread