From: Konstantin Khebnikov <khlebnikov@yandex-team.ru>
To: linux-mm@kvack.org, cgroups@vger.kernel.org
Cc: Roman Gushchin <klamm@yandex-team.ru>, Jan Kara <jack@suse.cz>,
Dave Chinner <david@fromorbit.com>,
linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>,
linux-fsdevel@vger.kernel.org, koct9i@gmail.com
Subject: [PATCH 4/6] percpu_ratelimit: high-performance ratelimiting counter
Date: Thu, 15 Jan 2015 21:49:15 +0300 [thread overview]
Message-ID: <20150115184915.10450.1814.stgit@buzz> (raw)
In-Reply-To: <20150115180242.10450.92.stgit@buzz>
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Parameters:
period - interval between refills (100ms should be fine)
quota - events refill per period
deadline - interval to utilize unused past quota (1s by default)
latency - maximum injected delay (10s by default)
Quota sums into 'budget' and spreads across cpus.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
include/linux/percpu_ratelimit.h | 45 ++++++++++
lib/Makefile | 1
lib/percpu_ratelimit.c | 168 ++++++++++++++++++++++++++++++++++++++
3 files changed, 214 insertions(+)
create mode 100644 include/linux/percpu_ratelimit.h
create mode 100644 lib/percpu_ratelimit.c
diff --git a/include/linux/percpu_ratelimit.h b/include/linux/percpu_ratelimit.h
new file mode 100644
index 0000000..42c45d4
--- /dev/null
+++ b/include/linux/percpu_ratelimit.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_PERCPU_RATELIMIT_H
+#define _LINUX_PERCPU_RATELIMIT_H
+
+#include <linux/hrtimer.h>
+
+struct percpu_ratelimit {
+ struct hrtimer timer;
+ ktime_t target; /* time of next refill */
+ ktime_t deadline; /* interval to utilize past budget */
+ ktime_t latency; /* maximum injected delay */
+ ktime_t period; /* interval between refills */
+ u64 quota; /* events refill per period */
+ u64 budget; /* amount of available events */
+ u64 total; /* consumed and pre-charged events */
+ raw_spinlock_t lock; /* protect the state */
+ u32 cpu_batch; /* events in per-cpu precharge */
+ u32 __percpu *cpu_budget; /* per-cpu precharge */
+};
+
+static inline bool percpu_ratelimit_blocked(struct percpu_ratelimit *rl)
+{
+ return hrtimer_active(&rl->timer);
+}
+
+static inline ktime_t percpu_ratelimit_target(struct percpu_ratelimit *rl)
+{
+ return rl->target;
+}
+
+static inline int percpu_ratelimit_wait(struct percpu_ratelimit *rl)
+{
+ ktime_t target = rl->target;
+
+ return schedule_hrtimeout_range(&target, ktime_to_ns(rl->period),
+ HRTIMER_MODE_ABS);
+}
+
+int percpu_ratelimit_init(struct percpu_ratelimit *rl, gfp_t gfp);
+void percpu_ratelimit_destroy(struct percpu_ratelimit *rl);
+void percpu_ratelimit_setup(struct percpu_ratelimit *rl, u64 quota, u64 period);
+u64 percpu_ratelimit_quota(struct percpu_ratelimit *rl, u64 period);
+bool percpu_ratelimit_charge(struct percpu_ratelimit *rl, u64 events);
+u64 percpu_ratelimit_sum(struct percpu_ratelimit *rl);
+
+#endif /* _LINUX_PERCPU_RATELIMIT_H */
diff --git a/lib/Makefile b/lib/Makefile
index 3c3b30b..b20ab47 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -21,6 +21,7 @@ lib-$(CONFIG_SMP) += cpumask.o
lib-y += kobject.o klist.o
obj-y += lockref.o
+obj-y += percpu_ratelimit.o
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
diff --git a/lib/percpu_ratelimit.c b/lib/percpu_ratelimit.c
new file mode 100644
index 0000000..8254683
--- /dev/null
+++ b/lib/percpu_ratelimit.c
@@ -0,0 +1,168 @@
+#include <linux/percpu_ratelimit.h>
+
+static void __percpu_ratelimit_setup(struct percpu_ratelimit *rl,
+ u64 period, u64 quota)
+{
+ rl->period = ns_to_ktime(period);
+ rl->quota = quota;
+ rl->total += quota - rl->budget;
+ rl->budget = quota;
+ if (do_div(quota, num_possible_cpus() * 2))
+ quota++;
+ rl->cpu_batch = min_t(u64, UINT_MAX, quota);
+ rl->target = ktime_get();
+}
+
+static enum hrtimer_restart ratelimit_unblock(struct hrtimer *t)
+{
+ struct percpu_ratelimit *rl = container_of(t, struct percpu_ratelimit, timer);
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ ktime_t now = t->base->get_time();
+
+ raw_spin_lock(&rl->lock);
+ if (ktime_after(rl->target, now)) {
+ hrtimer_set_expires_range(t, rl->target, rl->period);
+ ret = HRTIMER_RESTART;
+ }
+ raw_spin_unlock(&rl->lock);
+
+ return ret;
+}
+
+int percpu_ratelimit_init(struct percpu_ratelimit *rl, gfp_t gfp)
+{
+ memset(rl, 0, sizeof(*rl));
+ rl->cpu_budget = alloc_percpu_gfp(typeof(*rl->cpu_budget), gfp);
+ if (!rl->cpu_budget)
+ return -ENOMEM;
+ raw_spin_lock_init(&rl->lock);
+ hrtimer_init(&rl->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ rl->timer.function = ratelimit_unblock;
+ rl->deadline = ns_to_ktime(NSEC_PER_SEC);
+ rl->latency = ns_to_ktime(NSEC_PER_SEC * 10);
+ __percpu_ratelimit_setup(rl, NSEC_PER_SEC, ULLONG_MAX);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(percpu_ratelimit_init);
+
+void percpu_ratelimit_destroy(struct percpu_ratelimit *rl)
+{
+ free_percpu(rl->cpu_budget);
+ hrtimer_cancel(&rl->timer);
+}
+EXPORT_SYMBOL_GPL(percpu_ratelimit_destroy);
+
+static void percpu_ratelimit_drain(void *info)
+{
+ struct percpu_ratelimit *rl = info;
+
+ __this_cpu_write(*rl->cpu_budget, 0);
+}
+
+void percpu_ratelimit_setup(struct percpu_ratelimit *rl, u64 quota, u64 period)
+{
+ unsigned long flags;
+
+ if (!quota || !period) {
+ quota = ULLONG_MAX;
+ period = NSEC_PER_SEC;
+ } else if (period > NSEC_PER_SEC / 10) {
+ u64 quant = div_u64(quota * NSEC_PER_SEC / 10, period);
+
+ if (quant > 20) {
+ quota = quant;
+ period = NSEC_PER_SEC / 10;
+ }
+ }
+
+ raw_spin_lock_irqsave(&rl->lock, flags);
+ __percpu_ratelimit_setup(rl, period, quota);
+ raw_spin_unlock_irqrestore(&rl->lock, flags);
+ on_each_cpu(percpu_ratelimit_drain, rl, 1);
+ hrtimer_cancel(&rl->timer);
+}
+EXPORT_SYMBOL_GPL(percpu_ratelimit_setup);
+
+u64 percpu_ratelimit_quota(struct percpu_ratelimit *rl, u64 period)
+{
+ unsigned long flags;
+ u64 quota;
+
+ raw_spin_lock_irqsave(&rl->lock, flags);
+ if (rl->quota == ULLONG_MAX)
+ quota = 0;
+ else
+ quota = div64_u64(rl->quota * period, ktime_to_ns(rl->period));
+ raw_spin_unlock_irqrestore(&rl->lock, flags);
+
+ return quota;
+}
+EXPORT_SYMBOL_GPL(percpu_ratelimit_quota);
+
+/*
+ * Charges events, returns true if ratelimit is blocked and caller should sleep.
+ */
+bool percpu_ratelimit_charge(struct percpu_ratelimit *rl, u64 events)
+{
+ unsigned long flags;
+ u64 budget, delta;
+ ktime_t now, deadline;
+
+ preempt_disable();
+ budget = __this_cpu_read(*rl->cpu_budget);
+ if (likely(budget >= events)) {
+ __this_cpu_sub(*rl->cpu_budget, events);
+ } else {
+ now = ktime_get();
+ raw_spin_lock_irqsave(&rl->lock, flags);
+ deadline = ktime_sub(now, rl->deadline);
+ if (ktime_after(deadline, rl->target))
+ rl->target = deadline;
+ budget += rl->budget;
+ if (budget >= events + rl->cpu_batch) {
+ budget -= events;
+ } else {
+ delta = events + rl->cpu_batch - budget;
+ if (do_div(delta, rl->quota))
+ delta++;
+ rl->target = ktime_add_ns(rl->target,
+ ktime_to_ns(rl->period) * delta);
+ deadline = ktime_add(now, rl->latency);
+ if (ktime_after(rl->target, deadline))
+ rl->target = deadline;
+ delta *= rl->quota;
+ rl->total += delta;
+ budget += delta - events;
+ }
+ rl->budget = budget - rl->cpu_batch;
+ __this_cpu_write(*rl->cpu_budget, rl->cpu_batch);
+ if (!hrtimer_active(&rl->timer) && ktime_after(rl->target, now))
+ hrtimer_start_range_ns(&rl->timer, rl->target,
+ ktime_to_ns(rl->period),
+ HRTIMER_MODE_ABS);
+ raw_spin_unlock_irqrestore(&rl->lock, flags);
+ }
+ preempt_enable();
+
+ return percpu_ratelimit_blocked(rl);
+}
+EXPORT_SYMBOL_GPL(percpu_ratelimit_charge);
+
+/*
+ * Returns count of consumed events.
+ */
+u64 percpu_ratelimit_sum(struct percpu_ratelimit *rl)
+{
+ unsigned long flags;
+ int cpu;
+ s64 ret;
+
+ raw_spin_lock_irqsave(&rl->lock, flags);
+ ret = rl->total - rl->budget;
+ for_each_online_cpu(cpu)
+ ret -= per_cpu(*rl->cpu_budget, cpu);
+ raw_spin_unlock_irqrestore(&rl->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(percpu_ratelimit_sum);
next prev parent reply other threads:[~2015-01-15 18:49 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-01-15 18:49 [PATCHSET RFC 0/6] memcg: inode-based dirty-set controller Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 1/6] memcg: inode-based dirty and writeback pages accounting Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 2/6] memcg: dirty-set limiting and filtered writeback Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 3/6] memcg: track shared inodes with dirty pages Konstantin Khebnikov
2015-01-15 18:55 ` Tejun Heo
2015-01-15 19:04 ` Konstantin Khlebnikov
2015-01-15 19:08 ` Tejun Heo
2015-01-15 18:49 ` Konstantin Khebnikov [this message]
2015-01-15 18:49 ` [PATCH 5/6] delay-injection: resource management via procrastination Konstantin Khebnikov
2015-01-15 18:49 ` [PATCH 6/6] memcg: filesystem bandwidth controller Konstantin Khebnikov
2015-01-16 9:37 ` [PATCHSET RFC 0/6] memcg: inode-based dirty-set controller Jan Kara
2015-01-16 12:33 ` Konstantin Khlebnikov
2015-01-16 14:25 ` Jan Kara
2015-01-29 1:21 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150115184915.10450.1814.stgit@buzz \
--to=khlebnikov@yandex-team.ru \
--cc=cgroups@vger.kernel.org \
--cc=david@fromorbit.com \
--cc=jack@suse.cz \
--cc=klamm@yandex-team.ru \
--cc=koct9i@gmail.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).