From: Chengming Zhou <zhouchengming@bytedance.com>
To: hannes@cmpxchg.org, surenb@google.com, mingo@redhat.com,
peterz@infradead.org, tj@kernel.org, corbet@lwn.net,
akpm@linux-foundation.org, rdunlap@infradead.org
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
songmuchun@bytedance.com, cgroups@vger.kernel.org,
Chengming Zhou <zhouchengming@bytedance.com>
Subject: [PATCH 9/9] sched/psi: add PSI_IRQ to track IRQ/SOFTIRQ pressure
Date: Thu, 21 Jul 2022 12:04:39 +0800 [thread overview]
Message-ID: <20220721040439.2651-10-zhouchengming@bytedance.com> (raw)
In-Reply-To: <20220721040439.2651-1-zhouchengming@bytedance.com>
Now PSI already tracked workload pressure stall information for
CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have
obvious impact on some workload productivity, such as web service
workload.
When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time
from update_rq_clock_task(), in which we can record that delta
to CPU curr task's cgroups as PSI_IRQ_FULL status.
Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in
the current task on the CPU, make nothing productive could run
even if it were runnable, so we only use PSI_IRQ_FULL.
For performance impact consideration, this is enabled by default when
CONFIG_IRQ_TIME_ACCOUNTING, but can be disabled by kernel cmdline
parameter "psi_irq=".
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
.../admin-guide/kernel-parameters.txt | 5 ++
include/linux/psi.h | 1 +
include/linux/psi_types.h | 7 +-
kernel/cgroup/cgroup.c | 27 +++++++
kernel/sched/core.c | 1 +
kernel/sched/psi.c | 76 ++++++++++++++++++-
kernel/sched/stats.h | 13 ++++
7 files changed, 126 insertions(+), 4 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6beef5b8bc36..1067dde299a0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4425,6 +4425,11 @@
Format: <bool>
default: enabled
+ psi_irq= [KNL] Enable or disable IRQ/SOFTIRQ pressure stall
+ information tracking.
+ Format: <bool>
+ default: enabled when CONFIG_IRQ_TIME_ACCOUNTING.
+
psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to
probe for; one of (bare|imps|exps|lifebook|any).
psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports
diff --git a/include/linux/psi.h b/include/linux/psi.h
index aa168a038242..f5cf3e45d5a5 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -14,6 +14,7 @@ struct css_set;
#ifdef CONFIG_PSI
extern struct static_key_false psi_disabled;
+extern struct static_key_true psi_irq_enabled;
extern struct psi_group psi_system;
void psi_init(void);
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index c124f7d186d0..195f123b1cd1 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -47,7 +47,8 @@ enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
- NR_PSI_RESOURCES = 3,
+ PSI_IRQ,
+ NR_PSI_RESOURCES = 4,
};
/*
@@ -63,9 +64,11 @@ enum psi_states {
PSI_MEM_FULL,
PSI_CPU_SOME,
PSI_CPU_FULL,
+ PSI_IRQ_SOME,
+ PSI_IRQ_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
- NR_PSI_STATES = 7,
+ NR_PSI_STATES = 9,
};
enum psi_aggregators {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1424da7ed2c4..cf61df0ac892 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3683,6 +3683,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
@@ -5079,6 +5096,16 @@ static struct cftype cgroup_base_files[] = {
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .flags = CFTYPE_PRESSURE,
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
#endif /* CONFIG_PSI */
{ } /* terminate */
};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5f2d3542b05..08637cfb7ed9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8d76920f47b3..6a0894e28780 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -141,6 +141,7 @@ static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
+DEFINE_STATIC_KEY_TRUE(psi_irq_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
@@ -150,6 +151,12 @@ static bool psi_enable = true;
static bool psi_inner_cgroup __read_mostly = true;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static bool psi_irq_enable = true;
+#else
+static bool psi_irq_enable;
+#endif
+
static int __init setup_psi(char *str)
{
return kstrtobool(str, &psi_enable) == 0;
@@ -162,6 +169,12 @@ static int __init setup_psi_inner_cgroup(char *str)
}
__setup("psi_inner_cgroup=", setup_psi_inner_cgroup);
+static int __init setup_psi_irq(char *str)
+{
+ return kstrtobool(str, &psi_irq_enable) == 0;
+}
+__setup("psi_irq=", setup_psi_irq);
+
/* Running averages - we need to be higher-res than loadavg */
#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
@@ -215,12 +228,16 @@ void __init psi_init(void)
if (!psi_enable) {
static_branch_enable(&psi_disabled);
static_branch_disable(&psi_cgroups_enabled);
+ static_branch_disable(&psi_irq_enabled);
return;
}
if (!cgroup_psi_enabled())
static_branch_disable(&psi_cgroups_enabled);
+ if (!psi_irq_enable)
+ static_branch_disable(&psi_irq_enabled);
+
psi_period = jiffies_to_nsecs(PSI_FREQ);
group_init(&psi_system);
}
@@ -893,6 +910,28 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
}
+void psi_groups_account_irqtime(struct task_struct *task, u32 delta)
+{
+ struct psi_group *group = task_psi_group(task);
+ int cpu = task_cpu(task);
+ u64 now = cpu_clock(cpu);
+ struct psi_group_cpu *groupc;
+
+ for_each_psi_group(group) {
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ write_seqcount_begin(&groupc->seq);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
+ write_seqcount_end(&groupc->seq);
+
+ if (group->poll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_poll_work(group, 1);
+ }
+}
+
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@@ -1069,7 +1108,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2; full++) {
+ for (full = (res == PSI_IRQ); full < 2; full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@@ -1111,9 +1150,12 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
- if (state >= PSI_NONIDLE)
+ if (state >= PSI_NONIDLE || state == PSI_IRQ_SOME)
return ERR_PTR(-EINVAL);
+ if (!static_branch_likely(&psi_irq_enabled) && state == PSI_IRQ_FULL)
+ return ERR_PTR(-EOPNOTSUPP);
+
if (window_us < WINDOW_MIN_US ||
window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
@@ -1395,6 +1437,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
@@ -1402,6 +1471,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index e930b8fa6253..10926cdaccc8 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -111,6 +111,7 @@ void psi_change_groups(struct task_struct *task, int clear, int set);
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
+void psi_groups_account_irqtime(struct task_struct *task, u32 delta);
/*
* PSI tracks state that persists across sleeps, such as iowaits and
@@ -196,6 +197,17 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+ if (!static_branch_likely(&psi_irq_enabled))
+ return;
+
+ if (!task->pid)
+ return;
+
+ psi_groups_account_irqtime(task, delta);
+}
+
#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -203,6 +215,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *curr, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
--
2.36.1
next prev parent reply other threads:[~2022-07-21 4:06 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-21 4:04 [PATCH 0/9] sched/psi: some optimization and extension Chengming Zhou
2022-07-21 4:04 ` [PATCH 1/9] sched/psi: fix periodic aggregation shut off Chengming Zhou
2022-07-25 15:34 ` Johannes Weiner
2022-07-25 15:39 ` Johannes Weiner
2022-07-26 13:28 ` Chengming Zhou
2022-07-21 4:04 ` [PATCH 2/9] sched/psi: optimize task switch inside shared cgroups again Chengming Zhou
2022-07-21 4:04 ` [PATCH 3/9] sched/psi: move private helpers to sched/stats.h Chengming Zhou
2022-07-25 16:39 ` Johannes Weiner
2022-07-21 4:04 ` [PATCH 4/9] sched/psi: don't change task psi_flags when migrate CPU/group Chengming Zhou
2022-07-21 4:04 ` [PATCH 5/9] sched/psi: don't create cgroup PSI files when psi_disabled Chengming Zhou
2022-07-25 16:41 ` Johannes Weiner
2022-07-21 4:04 ` [PATCH 6/9] sched/psi: save percpu memory when !psi_cgroups_enabled Chengming Zhou
2022-07-25 16:47 ` Johannes Weiner
2022-07-21 4:04 ` [PATCH 7/9] sched/psi: cache parent psi_group to speed up groups iterate Chengming Zhou
2022-07-21 4:04 ` [PATCH 8/9] sched/psi: add kernel cmdline parameter psi_inner_cgroup Chengming Zhou
2022-07-25 16:52 ` Johannes Weiner
2022-07-26 13:38 ` [External] " Chengming Zhou
2022-07-26 17:54 ` Tejun Heo
2022-08-03 12:17 ` Chengming Zhou
2022-08-03 17:58 ` Tejun Heo
2022-08-03 19:22 ` Johannes Weiner
2022-08-03 19:48 ` Tejun Heo
2022-08-04 13:51 ` Chengming Zhou
2022-08-04 16:56 ` Johannes Weiner
2022-08-04 2:02 ` Chengming Zhou
2022-07-21 4:04 ` Chengming Zhou [this message]
2022-07-21 10:00 ` [PATCH 9/9] sched/psi: add PSI_IRQ to track IRQ/SOFTIRQ pressure kernel test robot
2022-07-21 22:10 ` kernel test robot
2022-07-22 3:30 ` Abel Wu
2022-07-22 6:13 ` Chengming Zhou
2022-07-22 7:14 ` Abel Wu
2022-07-22 7:33 ` Chengming Zhou
2022-07-25 18:26 ` Johannes Weiner
2022-07-26 13:55 ` [External] " Chengming Zhou
2022-07-27 11:28 ` Chengming Zhou
2022-07-27 13:00 ` Johannes Weiner
2022-07-27 15:09 ` Chengming Zhou
2022-07-27 16:07 ` Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220721040439.2651-10-zhouchengming@bytedance.com \
--to=zhouchengming@bytedance.com \
--cc=akpm@linux-foundation.org \
--cc=cgroups@vger.kernel.org \
--cc=corbet@lwn.net \
--cc=hannes@cmpxchg.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=rdunlap@infradead.org \
--cc=songmuchun@bytedance.com \
--cc=surenb@google.com \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).