[PATCH] kernel/time: Add hr_sleep syscall, a high-resolution sleep service

From: Marco Faltelli <marco.faltelli@uniroma2.it>
To: tglx@linutronix.de, linux-kernel@vger.kernel.org, x86@kernel.org
Cc: Marco Faltelli <marco.faltelli@uniroma2.it>
Subject: [PATCH] kernel/time: Add hr_sleep syscall, a high-resolution sleep service
Date: Fri, 15 Jan 2021 18:07:33 +0000	[thread overview]
Message-ID: <20210115180733.5663-1-marco.faltelli@uniroma2.it> (raw)

hr_sleep is a new system call engineered for nanosecond time scale
granularities.
With respect to nanosleep, it uses a single value representation
of the sleep period.
hr_sleep achieves 15x improvement for microsecond scale timers
w.r.t. nanosleep: the reason is the use of a CPU register for
passing the sleep period (avoiding cross-ring data move) and
the use of the thread's kernel stack area (avoiding in-kernel
memory allocations).
Further details about hr_sleep and the evaluation compared
to nanosleep can be found in Section 3 of our paper "Metronome:
adaptive and precise intermittent packet retrieval in DPDK"
hr_sleep in this patch has syscall number 442, so you can try it
calling syscall(442, sleep_period)

Signed-off-by: Marco Faltelli <marco.faltelli@uniroma2.it>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 kernel/time/hrtimer.c                  | 61 ++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78672124d28b..27343c016e42 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
 439	common	faccessat2		sys_faccessat2
 440	common	process_madvise		sys_process_madvise
 441	common	epoll_pwait2		sys_epoll_pwait2
+442	common	hr_sleep		sys_hr_sleep
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..422410c60a9f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1988,6 +1988,67 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
 }
 #endif
 
+#ifdef CONFIG_64BIT
+
+
+typedef struct _control_record {
+	struct task_struct *task;
+	int pid;
+	int awake;
+	struct hrtimer hr_timer;
+} control_record;
+
+
+static enum hrtimer_restart hr_sleep_callback(struct hrtimer *timer)
+{
+	control_record *control;
+	struct task_struct *the_task;
+
+	control = (control_record *)container_of(timer, control_record, hr_timer);
+	control->awake = 1;
+	the_task = control->task;
+	wake_up_process(the_task);
+
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * hr_sleep - a high-resolution sleep service for fine-grained timeouts
+ * @nanoseconds:	the requested sleep period in nanoseconds
+ *
+ * Returns:
+ * 0 when the sleep request successfully terminated
+ * -EINVAL if a sleep period < 0 is requested
+ */
+SYSCALL_DEFINE1(hr_sleep, long, nanoseconds)
+{
+	DECLARE_WAIT_QUEUE_HEAD(the_queue);//here we use a private queue
+	control_record *control;
+	ktime_t ktime_interval;
+
+	if (nanoseconds < 0)
+		return -EINVAL;
+
+	if (nanoseconds == 0)
+		return 0;
+
+	ktime_interval = ktime_set(0, nanoseconds);
+	control = (control_record *)((void *) current->stack + sizeof(struct thread_info));
+	hrtimer_init(&(control->hr_timer), CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	control->hr_timer.function = &hr_sleep_callback;
+	control->task = current;
+	control->pid  = control->task->pid; //current->pid is more costly
+	control->awake = 0;
+	hrtimer_start(&(control->hr_timer), ktime_interval, HRTIMER_MODE_REL);
+	wait_event_interruptible(the_queue, control->awake == 1);
+	hrtimer_cancel(&(control->hr_timer));
+
+	return 0;
+
+}
+
+#endif
+
 /*
  * Functions related to boot-time initialization:
  */
-- 
2.25.1