All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org>
To: Ben Segall <bsegall@google.com>, Borislav Petkov <bp@alien8.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	"H . Peter Anvin" <hpa@zytor.com>, Ingo Molnar <mingo@redhat.com>,
	Juri Lelli <juri.lelli@redhat.com>, Mel Gorman <mgorman@suse.de>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Andy Lutomirski <luto@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Sean Christopherson <seanjc@google.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Valentin Schneider <vschneid@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Wanpeng Li <wanpengli@tencent.com>
Cc: "Vineeth Pillai (Google)" <vineeth@bitbyteword.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Joel Fernandes <joel@joelfernandes.org>,
	Suleiman Souhlal <suleiman@google.com>,
	Masami Hiramatsu <mhiramat@kernel.org>,
	himadrics@inria.fr, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, x86@kernel.org
Subject: [RFC PATCH v2 1/5] pvsched: paravirt scheduling framework
Date: Wed,  3 Apr 2024 10:01:12 -0400	[thread overview]
Message-ID: <20240403140116.3002809-2-vineeth@bitbyteword.org> (raw)
In-Reply-To: <20240403140116.3002809-1-vineeth@bitbyteword.org>

Implement a paravirt scheduling framework for linux kernel.

The framework allows for pvsched driver to register to the kernel and
receive callbacks from hypervisor(eg: kvm) for interested vcpu events
like VMENTER, VMEXIT etc.

The framework also allows hypervisor to select a pvsched driver (from
the available list of registered drivers) for each guest.

Also implement a sysctl for listing the available pvsched drivers.

Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 Kconfig                 |   2 +
 include/linux/pvsched.h | 102 +++++++++++++++++++
 kernel/sysctl.c         |  27 +++++
 virt/Makefile           |   2 +-
 virt/pvsched/Kconfig    |  12 +++
 virt/pvsched/Makefile   |   2 +
 virt/pvsched/pvsched.c  | 215 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 361 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/pvsched.h
 create mode 100644 virt/pvsched/Kconfig
 create mode 100644 virt/pvsched/Makefile
 create mode 100644 virt/pvsched/pvsched.c

diff --git a/Kconfig b/Kconfig
index 745bc773f567..4a52eaa21166 100644
--- a/Kconfig
+++ b/Kconfig
@@ -29,4 +29,6 @@ source "lib/Kconfig"
 
 source "lib/Kconfig.debug"
 
+source "virt/pvsched/Kconfig"
+
 source "Documentation/Kconfig"
diff --git a/include/linux/pvsched.h b/include/linux/pvsched.h
new file mode 100644
index 000000000000..59df6b44aacb
--- /dev/null
+++ b/include/linux/pvsched.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Google  */
+
+#ifndef _LINUX_PVSCHED_H
+#define _LINUX_PVSCHED_H 1
+
+/*
+ * List of events for which hypervisor calls back into pvsched driver.
+ * Driver can specify the events it is interested in.
+ */
+enum pvsched_vcpu_events {
+	PVSCHED_VCPU_VMENTER = 0x1,
+	PVSCHED_VCPU_VMEXIT = 0x2,
+	PVSCHED_VCPU_HALT = 0x4,
+	PVSCHED_VCPU_INTR_INJ = 0x8,
+};
+
+#define PVSCHED_NAME_MAX	32
+#define PVSCHED_MAX		8
+#define PVSCHED_DRV_BUF_MAX	(PVSCHED_NAME_MAX * PVSCHED_MAX + PVSCHED_MAX)
+
+/*
+ * pvsched driver callbacks.
+ * TODO: versioning support for better compatibility with the guest
+ *       component implementing this feature.
+ */
+struct pvsched_vcpu_ops {
+	/*
+	 * pvsched_vcpu_register() - Register the vcpu with pvsched driver.
+	 * @pid: pid of the vcpu task.
+	 *
+	 * pvsched driver can store the pid internally and initialize
+	 * itself to prepare for receiving callbacks from thsi vcpu.
+	 */
+	int (*pvsched_vcpu_register)(struct pid *pid);
+
+	/*
+	 * pvsched_vcpu_unregister() - Un-register the vcpu with pvsched driver.
+	 * @pid: pid of the vcpu task.
+	 */
+	void (*pvsched_vcpu_unregister)(struct pid *pid);
+
+	/*
+	 * pvsched_vcpu_notify_event() - Callback for pvsched events
+	 * @addr: Address of the memory region shared with guest
+	 * @pid: pid of the vcpu task.
+	 * @events: bit mask of the events that hypervisor wants to notify.
+	 */
+	void (*pvsched_vcpu_notify_event)(void *addr, struct pid *pid, u32 event);
+
+	char name[PVSCHED_NAME_MAX];
+	struct module *owner;
+	struct list_head list;
+	u32 events;
+	u32 key;
+};
+
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+int pvsched_get_available_drivers(char *buf, size_t maxlen);
+
+int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops);
+void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops);
+
+struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name);
+void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops);
+
+static inline int pvsched_validate_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	/*
+	 * All callbacks are mandatory.
+	 */
+	if (!ops->pvsched_vcpu_register || !ops->pvsched_vcpu_unregister ||
+			!ops->pvsched_vcpu_notify_event)
+		return -EINVAL;
+
+	return 0;
+}
+#else
+static inline void pvsched_get_available_drivers(char *buf, size_t maxlen)
+{
+}
+
+static inline int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	return -ENOTSUPP;
+}
+
+static inline void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+}
+
+static inline struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
+{
+	return NULL;
+}
+
+static inline void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+}
+#endif
+
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 157f7ce2942d..10a18a791b4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
 #include <linux/mount.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/pid.h>
+#include <linux/pvsched.h>
 
 #include "../lib/kstrtox.h"
 
@@ -1615,6 +1616,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
 	return ret;
 }
 
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+static int proc_pvsched_available_drivers(struct ctl_table *ctl,
+						 int write, void *buffer,
+						 size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = PVSCHED_DRV_BUF_MAX, };
+	int ret;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+	if (!tbl.data)
+		return -ENOMEM;
+	pvsched_get_available_drivers(tbl.data, PVSCHED_DRV_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	kfree(tbl.data);
+	return ret;
+}
+#endif
+
 static struct ctl_table kern_table[] = {
 	{
 		.procname	= "panic",
@@ -2033,6 +2052,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= SYSCTL_INT_MAX,
 	},
+#endif
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+	{
+		.procname	= "pvsched_available_drivers",
+		.maxlen		= PVSCHED_DRV_BUF_MAX,
+		.mode		= 0444,
+		.proc_handler   = proc_pvsched_available_drivers,
+	},
 #endif
 	{ }
 };
diff --git a/virt/Makefile b/virt/Makefile
index 1cfea9436af9..9d0f32d775a1 100644
--- a/virt/Makefile
+++ b/virt/Makefile
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y	+= lib/
+obj-y	+= lib/ pvsched/
diff --git a/virt/pvsched/Kconfig b/virt/pvsched/Kconfig
new file mode 100644
index 000000000000..5ca2669060cb
--- /dev/null
+++ b/virt/pvsched/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config PARAVIRT_SCHED_HOST
+	bool "Paravirt scheduling framework in the host kernel"
+	default n
+	help
+	  Paravirtualized scheduling facilitates the exchange of scheduling
+	  related information between the host and guest through shared memory,
+	  enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+	  An illustrative use case involves dynamically boosting the priority of
+	  a vCPU thread when the guest is executing a latency-sensitive workload
+	  on that specific vCPU.
+	  This config enables paravirt scheduling framework in the host kernel.
diff --git a/virt/pvsched/Makefile b/virt/pvsched/Makefile
new file mode 100644
index 000000000000..4ca38e30479b
--- /dev/null
+++ b/virt/pvsched/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_PARAVIRT_SCHED_HOST) += pvsched.o
diff --git a/virt/pvsched/pvsched.c b/virt/pvsched/pvsched.c
new file mode 100644
index 000000000000..610c85cf90d2
--- /dev/null
+++ b/virt/pvsched/pvsched.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google  */
+
+/*
+ *  Paravirt scheduling framework
+ *
+ */
+
+/*
+ * Heavily inspired from tcp congestion avoidance implementation.
+ * (net/ipv4/tcp_cong.c)
+ */
+
+#define pr_fmt(fmt) "PVSCHED: " fmt
+
+#include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/pvsched.h>
+
+static DEFINE_SPINLOCK(pvsched_drv_list_lock);
+static int nr_pvsched_drivers = 0;
+static LIST_HEAD(pvsched_drv_list);
+
+/*
+ * Retrieve pvsched_vcpu_ops given the name.
+ */
+static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_name(char *name)
+{
+	struct pvsched_vcpu_ops *ops;
+
+	list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+		if (strcmp(ops->name, name) == 0)
+			return ops;
+	}
+
+	return NULL;
+}
+
+/*
+ * Retrieve pvsched_vcpu_ops given the hash key.
+ */
+static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_key(u32 key)
+{
+	struct pvsched_vcpu_ops *ops;
+
+	list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+		if (ops->key == key)
+			return ops;
+	}
+
+	return NULL;
+}
+
+/*
+ * pvsched_get_available_drivers() - Copy space separated list of pvsched
+ * driver names.
+ * @buf: buffer to store the list of driver names
+ * @maxlen: size of the buffer
+ *
+ * Return: 0 on success, negative value on error.
+ */
+int pvsched_get_available_drivers(char *buf, size_t maxlen)
+{
+	struct pvsched_vcpu_ops *ops;
+	size_t offs = 0;
+
+	if (!buf)
+		return -EINVAL;
+
+	if (maxlen > PVSCHED_DRV_BUF_MAX)
+		maxlen = PVSCHED_DRV_BUF_MAX;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+		offs += snprintf(buf + offs, maxlen - offs,
+				 "%s%s",
+				 offs == 0 ? "" : " ", ops->name);
+
+		if (WARN_ON_ONCE(offs >= maxlen))
+			break;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pvsched_get_available_drivers);
+
+/*
+ * pvsched_register_vcpu_ops() - Register the driver in the kernel.
+ * @ops: Driver data(callbacks)
+ *
+ * After the registration, driver will be exposed to the hypervisor
+ * for assignment to the guest VMs.
+ *
+ * Return: 0 on success, negative value on error.
+ */
+int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	int ret = 0;
+
+	ops->key = jhash(ops->name, sizeof(ops->name), strlen(ops->name));
+	spin_lock(&pvsched_drv_list_lock);
+	if (nr_pvsched_drivers > PVSCHED_MAX) {
+		ret = -ENOSPC;
+	} if (pvsched_find_vcpu_ops_key(ops->key)) {
+		ret = -EEXIST;
+	} else if (!(ret = pvsched_validate_vcpu_ops(ops))) {
+		list_add_tail_rcu(&ops->list, &pvsched_drv_list);
+		nr_pvsched_drivers++;
+	}
+	spin_unlock(&pvsched_drv_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pvsched_register_vcpu_ops);
+
+/*
+ * pvsched_register_vcpu_ops() - Un-register the driver from the kernel.
+ * @ops: Driver data(callbacks)
+ *
+ * After un-registration, driver will not be visible to hypervisor.
+ */
+void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	spin_lock(&pvsched_drv_list_lock);
+	list_del_rcu(&ops->list);
+	nr_pvsched_drivers--;
+	spin_unlock(&pvsched_drv_list_lock);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(pvsched_unregister_vcpu_ops);
+
+/*
+ * pvsched_get_vcpu_ops: Acquire the driver.
+ * @name: Name of the driver to be acquired.
+ *
+ * Hypervisor can use this API to get the driver structure for
+ * assigning it to guest VMs. This API takes a reference on the
+ * module/bpf program so that driver doesn't vanish under the
+ * hypervisor.
+ *
+ * Return: driver structure if found, else NULL.
+ */
+struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
+{
+	struct pvsched_vcpu_ops *ops;
+
+	if (!name || (strlen(name) >= PVSCHED_NAME_MAX))
+		return NULL;
+
+	rcu_read_lock();
+	ops = pvsched_find_vcpu_ops_name(name);
+	if (!ops)
+		goto out;
+
+	if (unlikely(!bpf_try_module_get(ops, ops->owner))) {
+		ops = NULL;
+		goto out;
+	}
+
+out:
+	rcu_read_unlock();
+	return ops;
+}
+EXPORT_SYMBOL_GPL(pvsched_get_vcpu_ops);
+
+/*
+ * pvsched_put_vcpu_ops: Release the driver.
+ * @name: Name of the driver to be releases.
+ *
+ * Hypervisor can use this API to release the driver.
+ */
+void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	bpf_module_put(ops, ops->owner);
+}
+EXPORT_SYMBOL_GPL(pvsched_put_vcpu_ops);
+
+/*
+ * NOP vm_ops Sample implementation.
+ * This driver doesn't do anything other than registering itself.
+ * Placeholder for adding some default logic when the feature is
+ * complete.
+ */
+static int nop_pvsched_vcpu_register(struct pid *pid)
+{
+	return 0;
+}
+static void nop_pvsched_vcpu_unregister(struct pid *pid)
+{
+}
+static void nop_pvsched_notify_event(void *addr, struct pid *pid, u32 event)
+{
+}
+
+struct pvsched_vcpu_ops nop_vcpu_ops = {
+	.events = PVSCHED_VCPU_VMENTER | PVSCHED_VCPU_VMEXIT | PVSCHED_VCPU_HALT,
+	.pvsched_vcpu_register = nop_pvsched_vcpu_register,
+	.pvsched_vcpu_unregister = nop_pvsched_vcpu_unregister,
+	.pvsched_vcpu_notify_event = nop_pvsched_notify_event,
+	.name = "pvsched_nop",
+	.owner = THIS_MODULE,
+};
+
+static int __init pvsched_init(void)
+{
+	return WARN_ON(pvsched_register_vcpu_ops(&nop_vcpu_ops));
+}
+
+late_initcall(pvsched_init);
-- 
2.40.1


  reply	other threads:[~2024-04-03 14:01 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-03 14:01 [RFC PATCH v2 0/5] Paravirt Scheduling (Dynamic vcpu priority management) Vineeth Pillai (Google)
2024-04-03 14:01 ` Vineeth Pillai (Google) [this message]
2024-04-08 13:57   ` [RFC PATCH v2 1/5] pvsched: paravirt scheduling framework Vineeth Remanan Pillai
2024-04-03 14:01 ` [RFC PATCH v2 2/5] kvm: Implement the paravirt sched framework for kvm Vineeth Pillai (Google)
2024-04-08 13:58   ` Vineeth Remanan Pillai
2024-04-03 14:01 ` [RFC PATCH v2 3/5] kvm: interface for managing pvsched driver for guest VMs Vineeth Pillai (Google)
2024-04-06  5:56   ` kernel test robot
2024-04-08 13:59   ` Vineeth Remanan Pillai
2024-04-03 14:01 ` [RFC PATCH v2 4/5] pvsched: bpf support for pvsched Vineeth Pillai (Google)
2024-04-08 14:00   ` Vineeth Remanan Pillai
2024-04-03 14:01 ` [RFC PATCH v2 5/5] selftests/bpf: sample implementation of a bpf pvsched driver Vineeth Pillai (Google)
2024-04-08 14:01   ` Vineeth Remanan Pillai
2024-04-08 13:54 ` [RFC PATCH v2 0/5] Paravirt Scheduling (Dynamic vcpu priority management) Vineeth Remanan Pillai
2024-05-01 15:29 ` Sean Christopherson
2024-05-02 13:42   ` Vineeth Remanan Pillai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240403140116.3002809-2-vineeth@bitbyteword.org \
    --to=vineeth@bitbyteword.org \
    --cc=bp@alien8.de \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=himadrics@inria.fr \
    --cc=hpa@zytor.com \
    --cc=joel@joelfernandes.org \
    --cc=juri.lelli@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mhiramat@kernel.org \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=seanjc@google.com \
    --cc=suleiman@google.com \
    --cc=tglx@linutronix.de \
    --cc=vincent.guittot@linaro.org \
    --cc=vkuznets@redhat.com \
    --cc=vschneid@redhat.com \
    --cc=wanpengli@tencent.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.