All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V2 1/2] xen: Update dm_op.h from Xen public header
@ 2023-07-20  9:30 Viresh Kumar
  2023-07-20  9:30 ` [PATCH V2 2/2] xen: privcmd: Add support for irqfd Viresh Kumar
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Viresh Kumar @ 2023-07-20  9:30 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Viresh Kumar, Vincent Guittot, Alex Bennée, stratos-dev,
	Erik Schilling, Manos Pitsidianakis, Mathieu Poirier, xen-devel,
	linux-kernel

Update the definitions in dm_op.h from Xen public header.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
V1->V2:
- New commit.

 include/xen/interface/hvm/dm_op.h | 445 ++++++++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

diff --git a/include/xen/interface/hvm/dm_op.h b/include/xen/interface/hvm/dm_op.h
index 08d972f87c7b..bc6948fd1815 100644
--- a/include/xen/interface/hvm/dm_op.h
+++ b/include/xen/interface/hvm/dm_op.h
@@ -6,6 +6,451 @@
 #ifndef __XEN_PUBLIC_HVM_DM_OP_H__
 #define __XEN_PUBLIC_HVM_DM_OP_H__
 
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#ifndef uint64_aligned_t
+#define uint64_aligned_t uint64_t
+#endif
+
+/*
+ * IOREQ Servers
+ *
+ * The interface between an I/O emulator and Xen is called an IOREQ Server.
+ * A domain supports a single 'legacy' IOREQ Server which is instantiated if
+ * parameter...
+ *
+ * HVM_PARAM_IOREQ_PFN is read (to get the gfn containing the synchronous
+ * ioreq structures), or...
+ * HVM_PARAM_BUFIOREQ_PFN is read (to get the gfn containing the buffered
+ * ioreq ring), or...
+ * HVM_PARAM_BUFIOREQ_EVTCHN is read (to get the event channel that Xen uses
+ * to request buffered I/O emulation).
+ *
+ * The following hypercalls facilitate the creation of IOREQ Servers for
+ * 'secondary' emulators which are invoked to implement port I/O, memory, or
+ * PCI config space ranges which they explicitly register.
+ */
+
+typedef uint16_t ioservid_t;
+
+/*
+ * XEN_DMOP_create_ioreq_server: Instantiate a new IOREQ Server for a
+ *                               secondary emulator.
+ *
+ * The <id> handed back is unique for target domain. The valur of
+ * <handle_bufioreq> should be one of HVM_IOREQSRV_BUFIOREQ_* defined in
+ * hvm_op.h. If the value is HVM_IOREQSRV_BUFIOREQ_OFF then  the buffered
+ * ioreq ring will not be allocated and hence all emulation requests to
+ * this server will be synchronous.
+ */
+#define XEN_DMOP_create_ioreq_server 1
+
+struct xen_dm_op_create_ioreq_server {
+    /* IN - should server handle buffered ioreqs */
+    uint8_t handle_bufioreq;
+    uint8_t pad[3];
+    /* OUT - server id */
+    ioservid_t id;
+};
+
+/*
+ * XEN_DMOP_get_ioreq_server_info: Get all the information necessary to
+ *                                 access IOREQ Server <id>.
+ *
+ * If the IOREQ Server is handling buffered emulation requests, the
+ * emulator needs to bind to event channel <bufioreq_port> to listen for
+ * them. (The event channels used for synchronous emulation requests are
+ * specified in the per-CPU ioreq structures).
+ * In addition, if the XENMEM_acquire_resource memory op cannot be used,
+ * the emulator will need to map the synchronous ioreq structures and
+ * buffered ioreq ring (if it exists) from guest memory. If <flags> does
+ * not contain XEN_DMOP_no_gfns then these pages will be made available and
+ * the frame numbers passed back in gfns <ioreq_gfn> and <bufioreq_gfn>
+ * respectively. (If the IOREQ Server is not handling buffered emulation
+ * only <ioreq_gfn> will be valid).
+ *
+ * NOTE: To access the synchronous ioreq structures and buffered ioreq
+ *       ring, it is preferable to use the XENMEM_acquire_resource memory
+ *       op specifying resource type XENMEM_resource_ioreq_server.
+ */
+#define XEN_DMOP_get_ioreq_server_info 2
+
+struct xen_dm_op_get_ioreq_server_info {
+    /* IN - server id */
+    ioservid_t id;
+    /* IN - flags */
+    uint16_t flags;
+
+#define _XEN_DMOP_no_gfns 0
+#define XEN_DMOP_no_gfns (1u << _XEN_DMOP_no_gfns)
+
+    /* OUT - buffered ioreq port */
+    evtchn_port_t bufioreq_port;
+    /* OUT - sync ioreq gfn (see block comment above) */
+    uint64_aligned_t ioreq_gfn;
+    /* OUT - buffered ioreq gfn (see block comment above)*/
+    uint64_aligned_t bufioreq_gfn;
+};
+
+/*
+ * XEN_DMOP_map_io_range_to_ioreq_server: Register an I/O range for
+ *                                        emulation by the client of
+ *                                        IOREQ Server <id>.
+ * XEN_DMOP_unmap_io_range_from_ioreq_server: Deregister an I/O range
+ *                                            previously registered for
+ *                                            emulation by the client of
+ *                                            IOREQ Server <id>.
+ *
+ * There are three types of I/O that can be emulated: port I/O, memory
+ * accesses and PCI config space accesses. The <type> field denotes which
+ * type of range* the <start> and <end> (inclusive) fields are specifying.
+ * PCI config space ranges are specified by segment/bus/device/function
+ * values which should be encoded using the DMOP_PCI_SBDF helper macro
+ * below.
+ *
+ * NOTE: unless an emulation request falls entirely within a range mapped
+ * by a secondary emulator, it will not be passed to that emulator.
+ */
+#define XEN_DMOP_map_io_range_to_ioreq_server 3
+#define XEN_DMOP_unmap_io_range_from_ioreq_server 4
+
+struct xen_dm_op_ioreq_server_range {
+    /* IN - server id */
+    ioservid_t id;
+    uint16_t pad;
+    /* IN - type of range */
+    uint32_t type;
+# define XEN_DMOP_IO_RANGE_PORT   0 /* I/O port range */
+# define XEN_DMOP_IO_RANGE_MEMORY 1 /* MMIO range */
+# define XEN_DMOP_IO_RANGE_PCI    2 /* PCI segment/bus/dev/func range */
+    /* IN - inclusive start and end of range */
+    uint64_aligned_t start, end;
+};
+
+#define XEN_DMOP_PCI_SBDF(s,b,d,f) \
+	((((s) & 0xffff) << 16) |  \
+	 (((b) & 0xff) << 8) |     \
+	 (((d) & 0x1f) << 3) |     \
+	 ((f) & 0x07))
+
+/*
+ * XEN_DMOP_set_ioreq_server_state: Enable or disable the IOREQ Server <id>
+ *
+ * The IOREQ Server will not be passed any emulation requests until it is
+ * in the enabled state.
+ * Note that the contents of the ioreq_gfn and bufioreq_gfn (see
+ * XEN_DMOP_get_ioreq_server_info) are not meaningful until the IOREQ Server
+ * is in the enabled state.
+ */
+#define XEN_DMOP_set_ioreq_server_state 5
+
+struct xen_dm_op_set_ioreq_server_state {
+    /* IN - server id */
+    ioservid_t id;
+    /* IN - enabled? */
+    uint8_t enabled;
+    uint8_t pad;
+};
+
+/*
+ * XEN_DMOP_destroy_ioreq_server: Destroy the IOREQ Server <id>.
+ *
+ * Any registered I/O ranges will be automatically deregistered.
+ */
+#define XEN_DMOP_destroy_ioreq_server 6
+
+struct xen_dm_op_destroy_ioreq_server {
+    /* IN - server id */
+    ioservid_t id;
+    uint16_t pad;
+};
+
+/*
+ * XEN_DMOP_track_dirty_vram: Track modifications to the specified pfn
+ *                            range.
+ *
+ * NOTE: The bitmap passed back to the caller is passed in a
+ *       secondary buffer.
+ */
+#define XEN_DMOP_track_dirty_vram 7
+
+struct xen_dm_op_track_dirty_vram {
+    /* IN - number of pages to be tracked */
+    uint32_t nr;
+    uint32_t pad;
+    /* IN - first pfn to track */
+    uint64_aligned_t first_pfn;
+};
+
+/*
+ * XEN_DMOP_set_pci_intx_level: Set the logical level of one of a domain's
+ *                              PCI INTx pins.
+ */
+#define XEN_DMOP_set_pci_intx_level 8
+
+struct xen_dm_op_set_pci_intx_level {
+    /* IN - PCI INTx identification (domain:bus:device:intx) */
+    uint16_t domain;
+    uint8_t bus, device, intx;
+    /* IN - Level: 0 -> deasserted, 1 -> asserted */
+    uint8_t  level;
+};
+
+/*
+ * XEN_DMOP_set_isa_irq_level: Set the logical level of a one of a domain's
+ *                             ISA IRQ lines.
+ */
+#define XEN_DMOP_set_isa_irq_level 9
+
+struct xen_dm_op_set_isa_irq_level {
+    /* IN - ISA IRQ (0-15) */
+    uint8_t  isa_irq;
+    /* IN - Level: 0 -> deasserted, 1 -> asserted */
+    uint8_t  level;
+};
+
+/*
+ * XEN_DMOP_set_pci_link_route: Map a PCI INTx line to an IRQ line.
+ */
+#define XEN_DMOP_set_pci_link_route 10
+
+struct xen_dm_op_set_pci_link_route {
+    /* PCI INTx line (0-3) */
+    uint8_t  link;
+    /* ISA IRQ (1-15) or 0 -> disable link */
+    uint8_t  isa_irq;
+};
+
+/*
+ * XEN_DMOP_modified_memory: Notify that a set of pages were modified by
+ *                           an emulator.
+ *
+ * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with
+ * @nr_extents entries.
+ *
+ * On error, @nr_extents will contain the index+1 of the extent that
+ * had the error.  It is not defined if or which pages may have been
+ * marked as dirty, in this event.
+ */
+#define XEN_DMOP_modified_memory 11
+
+struct xen_dm_op_modified_memory {
+    /*
+     * IN - Number of extents to be processed
+     * OUT -returns n+1 for failing extent
+     */
+    uint32_t nr_extents;
+    /* IN/OUT - Must be set to 0 */
+    uint32_t opaque;
+};
+
+struct xen_dm_op_modified_memory_extent {
+    /* IN - number of contiguous pages modified */
+    uint32_t nr;
+    uint32_t pad;
+    /* IN - first pfn modified */
+    uint64_aligned_t first_pfn;
+};
+
+/*
+ * XEN_DMOP_set_mem_type: Notify that a region of memory is to be treated
+ *                        in a specific way. (See definition of
+ *                        hvmmem_type_t).
+ *
+ * NOTE: In the event of a continuation (return code -ERESTART), the
+ *       @first_pfn is set to the value of the pfn of the remaining
+ *       region and @nr reduced to the size of the remaining region.
+ */
+#define XEN_DMOP_set_mem_type 12
+
+struct xen_dm_op_set_mem_type {
+    /* IN - number of contiguous pages */
+    uint32_t nr;
+    /* IN - new hvmmem_type_t of region */
+    uint16_t mem_type;
+    uint16_t pad;
+    /* IN - first pfn in region */
+    uint64_aligned_t first_pfn;
+};
+
+/*
+ * XEN_DMOP_inject_event: Inject an event into a VCPU, which will
+ *                        get taken up when it is next scheduled.
+ *
+ * Note that the caller should know enough of the state of the CPU before
+ * injecting, to know what the effect of injecting the event will be.
+ */
+#define XEN_DMOP_inject_event 13
+
+struct xen_dm_op_inject_event {
+    /* IN - index of vCPU */
+    uint32_t vcpuid;
+    /* IN - interrupt vector */
+    uint8_t vector;
+    /* IN - event type (DMOP_EVENT_* ) */
+    uint8_t type;
+/* NB. This enumeration precisely matches hvm.h:X86_EVENTTYPE_* */
+# define XEN_DMOP_EVENT_ext_int    0 /* external interrupt */
+# define XEN_DMOP_EVENT_nmi        2 /* nmi */
+# define XEN_DMOP_EVENT_hw_exc     3 /* hardware exception */
+# define XEN_DMOP_EVENT_sw_int     4 /* software interrupt (CD nn) */
+# define XEN_DMOP_EVENT_pri_sw_exc 5 /* ICEBP (F1) */
+# define XEN_DMOP_EVENT_sw_exc     6 /* INT3 (CC), INTO (CE) */
+    /* IN - instruction length */
+    uint8_t insn_len;
+    uint8_t pad0;
+    /* IN - error code (or ~0 to skip) */
+    uint32_t error_code;
+    uint32_t pad1;
+    /* IN - type-specific extra data (%cr2 for #PF, pending_dbg for #DB) */
+    uint64_aligned_t cr2;
+};
+
+/*
+ * XEN_DMOP_inject_msi: Inject an MSI for an emulated device.
+ */
+#define XEN_DMOP_inject_msi 14
+
+struct xen_dm_op_inject_msi {
+    /* IN - MSI data (lower 32 bits) */
+    uint32_t data;
+    uint32_t pad;
+    /* IN - MSI address (0xfeexxxxx) */
+    uint64_aligned_t addr;
+};
+
+/*
+ * XEN_DMOP_map_mem_type_to_ioreq_server : map or unmap the IOREQ Server <id>
+ *                                      to specific memory type <type>
+ *                                      for specific accesses <flags>
+ *
+ * For now, flags only accept the value of XEN_DMOP_IOREQ_MEM_ACCESS_WRITE,
+ * which means only write operations are to be forwarded to an ioreq server.
+ * Support for the emulation of read operations can be added when an ioreq
+ * server has such requirement in future.
+ */
+#define XEN_DMOP_map_mem_type_to_ioreq_server 15
+
+struct xen_dm_op_map_mem_type_to_ioreq_server {
+    ioservid_t id;      /* IN - ioreq server id */
+    uint16_t type;      /* IN - memory type */
+    uint32_t flags;     /* IN - types of accesses to be forwarded to the
+                           ioreq server. flags with 0 means to unmap the
+                           ioreq server */
+
+#define XEN_DMOP_IOREQ_MEM_ACCESS_READ (1u << 0)
+#define XEN_DMOP_IOREQ_MEM_ACCESS_WRITE (1u << 1)
+
+    uint64_t opaque;    /* IN/OUT - only used for hypercall continuation,
+                           has to be set to zero by the caller */
+};
+
+/*
+ * XEN_DMOP_remote_shutdown : Declare a shutdown for another domain
+ *                            Identical to SCHEDOP_remote_shutdown
+ */
+#define XEN_DMOP_remote_shutdown 16
+
+struct xen_dm_op_remote_shutdown {
+    uint32_t reason;       /* SHUTDOWN_* => enum sched_shutdown_reason */
+                           /* (Other reason values are not blocked) */
+};
+
+/*
+ * XEN_DMOP_relocate_memory : Relocate GFNs for the specified guest.
+ *                            Identical to XENMEM_add_to_physmap with
+ *                            space == XENMAPSPACE_gmfn_range.
+ */
+#define XEN_DMOP_relocate_memory 17
+
+struct xen_dm_op_relocate_memory {
+    /* All fields are IN/OUT, with their OUT state undefined. */
+    /* Number of GFNs to process. */
+    uint32_t size;
+    uint32_t pad;
+    /* Starting GFN to relocate. */
+    uint64_aligned_t src_gfn;
+    /* Starting GFN where GFNs should be relocated. */
+    uint64_aligned_t dst_gfn;
+};
+
+/*
+ * XEN_DMOP_pin_memory_cacheattr : Pin caching type of RAM space.
+ *                                 Identical to XEN_DOMCTL_pin_mem_cacheattr.
+ */
+#define XEN_DMOP_pin_memory_cacheattr 18
+
+struct xen_dm_op_pin_memory_cacheattr {
+    uint64_aligned_t start; /* Start gfn. */
+    uint64_aligned_t end;   /* End gfn. */
+/* Caching types: these happen to be the same as x86 MTRR/PAT type codes. */
+#define XEN_DMOP_MEM_CACHEATTR_UC  0
+#define XEN_DMOP_MEM_CACHEATTR_WC  1
+#define XEN_DMOP_MEM_CACHEATTR_WT  4
+#define XEN_DMOP_MEM_CACHEATTR_WP  5
+#define XEN_DMOP_MEM_CACHEATTR_WB  6
+#define XEN_DMOP_MEM_CACHEATTR_UCM 7
+#define XEN_DMOP_DELETE_MEM_CACHEATTR (~(uint32_t)0)
+    uint32_t type;          /* XEN_DMOP_MEM_CACHEATTR_* */
+    uint32_t pad;
+};
+
+/*
+ * XEN_DMOP_set_irq_level: Set the logical level of a one of a domain's
+ *                         IRQ lines (currently Arm only).
+ * Only SPIs are supported.
+ */
+#define XEN_DMOP_set_irq_level 19
+
+struct xen_dm_op_set_irq_level {
+    uint32_t irq;
+    /* IN - Level: 0 -> deasserted, 1 -> asserted */
+    uint8_t level;
+    uint8_t pad[3];
+};
+
+/*
+ * XEN_DMOP_nr_vcpus: Query the number of vCPUs a domain has.
+ *
+ * This is the number of vcpu objects allocated in Xen for the domain, and is
+ * fixed from creation time.  This bound is applicable to e.g. the vcpuid
+ * parameter of XEN_DMOP_inject_event, or number of struct ioreq objects
+ * mapped via XENMEM_acquire_resource.
+ */
+#define XEN_DMOP_nr_vcpus 20
+
+struct xen_dm_op_nr_vcpus {
+    uint32_t vcpus; /* OUT */
+};
+
+struct xen_dm_op {
+    uint32_t op;
+    uint32_t pad;
+    union {
+        struct xen_dm_op_create_ioreq_server create_ioreq_server;
+        struct xen_dm_op_get_ioreq_server_info get_ioreq_server_info;
+        struct xen_dm_op_ioreq_server_range map_io_range_to_ioreq_server;
+        struct xen_dm_op_ioreq_server_range unmap_io_range_from_ioreq_server;
+        struct xen_dm_op_set_ioreq_server_state set_ioreq_server_state;
+        struct xen_dm_op_destroy_ioreq_server destroy_ioreq_server;
+        struct xen_dm_op_track_dirty_vram track_dirty_vram;
+        struct xen_dm_op_set_pci_intx_level set_pci_intx_level;
+        struct xen_dm_op_set_isa_irq_level set_isa_irq_level;
+        struct xen_dm_op_set_irq_level set_irq_level;
+        struct xen_dm_op_set_pci_link_route set_pci_link_route;
+        struct xen_dm_op_modified_memory modified_memory;
+        struct xen_dm_op_set_mem_type set_mem_type;
+        struct xen_dm_op_inject_event inject_event;
+        struct xen_dm_op_inject_msi inject_msi;
+        struct xen_dm_op_map_mem_type_to_ioreq_server map_mem_type_to_ioreq_server;
+        struct xen_dm_op_remote_shutdown remote_shutdown;
+        struct xen_dm_op_relocate_memory relocate_memory;
+        struct xen_dm_op_pin_memory_cacheattr pin_memory_cacheattr;
+        struct xen_dm_op_nr_vcpus nr_vcpus;
+    } u;
+};
+
 struct xen_dm_op_buf {
 	GUEST_HANDLE(void) h;
 	xen_ulong_t size;
-- 
2.31.1.272.g89b43f80a514


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH V2 2/2] xen: privcmd: Add support for irqfd
  2023-07-20  9:30 [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Viresh Kumar
@ 2023-07-20  9:30 ` Viresh Kumar
  2023-07-21  0:38   ` kernel test robot
  2023-07-23  4:49   ` kernel test robot
  2023-07-21  6:39 ` [PATCH V2.1 " Viresh Kumar
  2023-07-22 15:03 ` [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Oleksandr Tyshchenko
  2 siblings, 2 replies; 7+ messages in thread
From: Viresh Kumar @ 2023-07-20  9:30 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Viresh Kumar, Vincent Guittot, Alex Bennée, stratos-dev,
	Erik Schilling, Manos Pitsidianakis, Mathieu Poirier, xen-devel,
	linux-kernel

Xen provides support for injecting interrupts to the guests via the
HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
device backend implementations, in an inefficient manner currently.

Generally, the Virtio backends are implemented to work with the Eventfd
based mechanism. In order to make such backends work with Xen, another
software layer needs to poll the Eventfds and raise an interrupt to the
guest using the Xen based mechanism. This results in an extra context
switch.

This is not a new problem in Linux though. It is present with other
hypervisors like KVM, etc. as well. The generic solution implemented in
the kernel for them is to provide an IOCTL call to pass the interrupt
details and eventfd, which lets the kernel take care of polling the
eventfd and raising of the interrupt, instead of handling this in user
space (which involves an extra context switch).

This patch adds support to inject a specific interrupt to guest using
the eventfd mechanism, by preventing the extra context switch.

Inspired by existing implementations for KVM, etc..

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
V1->V2:
- Improve error handling.
- Remove the unnecessary usage of list_for_each_entry_safe().
- Restrict the use of XEN_DMOP_set_irq_level to only ARM64.

 drivers/xen/privcmd.c      | 276 ++++++++++++++++++++++++++++++++++++-
 include/uapi/xen/privcmd.h |  14 ++
 2 files changed, 288 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index e2f580e30a86..0debc5482253 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -9,11 +9,16 @@
 
 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
 
+#include <linux/eventfd.h>
+#include <linux/file.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/workqueue.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -833,6 +838,257 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
 	return rc;
 }
 
+/* Irqfd support */
+static struct workqueue_struct *irqfd_cleanup_wq;
+static DEFINE_MUTEX(irqfds_lock);
+static LIST_HEAD(irqfds_list);
+
+struct privcmd_kernel_irqfd {
+	domid_t dom;
+	u8 level;
+	bool error;
+	u32 irq;
+	struct eventfd_ctx *eventfd;
+	struct work_struct shutdown;
+	wait_queue_entry_t wait;
+	struct list_head list;
+	poll_table pt;
+};
+
+static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
+{
+	lockdep_assert_held(&irqfds_lock);
+
+	list_del_init(&kirqfd->list);
+	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
+}
+
+static void irqfd_shutdown(struct work_struct *work)
+{
+	struct privcmd_kernel_irqfd *kirqfd =
+		container_of(work, struct privcmd_kernel_irqfd, shutdown);
+	u64 cnt;
+
+	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
+	eventfd_ctx_put(kirqfd->eventfd);
+	kfree(kirqfd);
+}
+
+static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
+{
+	/* Different architectures support this differently */
+	struct xen_dm_op dm_op = {
+#ifdef CONFIG_ARM64
+		.op = XEN_DMOP_set_irq_level,
+		.u.set_irq_level.irq = kirqfd->irq,
+		.u.set_irq_level.level = kirqfd->level,
+#endif
+	};
+	struct xen_dm_op_buf xbufs = {
+		.size = sizeof(dm_op),
+	};
+	u64 cnt;
+	long rc;
+
+	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
+	set_xen_guest_handle(xbufs.h, &dm_op);
+
+	xen_preemptible_hcall_begin();
+	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);
+	xen_preemptible_hcall_end();
+
+	/* Don't repeat the error message for consecutive failures */
+	if (rc && !kirqfd->error) {
+		pr_err("Failed to configure irq: %d to level: %d for guest domain: %d\n",
+		       kirqfd->irq, kirqfd->level, kirqfd->dom);
+	}
+
+	kirqfd->error = !!rc;
+}
+
+static int
+irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+	struct privcmd_kernel_irqfd *kirqfd =
+		container_of(wait, struct privcmd_kernel_irqfd, wait);
+	__poll_t flags = key_to_poll(key);
+
+	if (flags & EPOLLIN)
+		irqfd_inject(kirqfd);
+
+	if (flags & EPOLLHUP) {
+		mutex_lock(&irqfds_lock);
+		irqfd_deactivate(kirqfd);
+		mutex_unlock(&irqfds_lock);
+	}
+
+	return 0;
+}
+
+static void
+irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct privcmd_kernel_irqfd *kirqfd =
+		container_of(pt, struct privcmd_kernel_irqfd, pt);
+
+	add_wait_queue_priority(wqh, &kirqfd->wait);
+}
+
+static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
+{
+	struct privcmd_kernel_irqfd *kirqfd, *tmp;
+	struct eventfd_ctx *eventfd;
+	__poll_t events;
+	struct fd f;
+	int ret;
+
+	kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
+	if (!kirqfd)
+		return -ENOMEM;
+
+	kirqfd->irq = irqfd->irq;
+	kirqfd->dom = irqfd->dom;
+	kirqfd->level = irqfd->level;
+	INIT_LIST_HEAD(&kirqfd->list);
+	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
+
+	f = fdget(irqfd->fd);
+	if (!f.file) {
+		ret = -EBADF;
+		goto error_kfree;
+	}
+
+	eventfd = eventfd_ctx_fileget(f.file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto error_fd_put;
+	}
+
+	kirqfd->eventfd = eventfd;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via a
+	 * callback whenever someone signals the underlying eventfd.
+	 */
+	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
+	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
+
+	mutex_lock(&irqfds_lock);
+
+	list_for_each_entry(tmp, &irqfds_list, list) {
+		if (kirqfd->eventfd == tmp->eventfd) {
+			ret = -EBUSY;
+			mutex_unlock(&irqfds_lock);
+			goto error_eventfd;
+		}
+	}
+
+	list_add_tail(&kirqfd->list, &irqfds_list);
+	mutex_unlock(&irqfds_lock);
+
+	/*
+	 * Check if there was an event already pending on the eventfd before we
+	 * registered, and trigger it as if we didn't miss it.
+	 */
+	events = vfs_poll(f.file, &kirqfd->pt);
+	if (events & EPOLLIN)
+		irqfd_inject(kirqfd);
+
+	/*
+	 * Do not drop the file until the kirqfd is fully initialized, otherwise
+	 * we might race against the EPOLLHUP.
+	 */
+	fdput(f);
+	return 0;
+
+error_eventfd:
+	eventfd_ctx_put(eventfd);
+
+error_fd_put:
+	fdput(f);
+
+error_kfree:
+	kfree(kirqfd);
+	return ret;
+}
+
+static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
+{
+	struct privcmd_kernel_irqfd *kirqfd;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(irqfd->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&irqfds_lock);
+
+	list_for_each_entry(kirqfd, &irqfds_list, list) {
+		if (kirqfd->eventfd == eventfd) {
+			irqfd_deactivate(kirqfd);
+			break;
+		}
+	}
+
+	mutex_unlock(&irqfds_lock);
+
+	eventfd_ctx_put(eventfd);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed so
+	 * that we guarantee there will not be any more interrupts once this
+	 * deassign function returns.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	return 0;
+}
+
+static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_irqfd irqfd;
+
+	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
+		return -EFAULT;
+
+	/* No other flags should be set */
+	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
+		return -EINVAL;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
+		return -EPERM;
+
+	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
+		return privcmd_irqfd_deassign(&irqfd);
+
+	return privcmd_irqfd_assign(&irqfd);
+}
+
+static int privcmd_irqfd_init(void)
+{
+	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
+	if (!irqfd_cleanup_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void privcmd_irqfd_exit(void)
+{
+	struct privcmd_kernel_irqfd *kirqfd, *tmp;
+
+	mutex_lock(&irqfds_lock);
+
+	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
+		irqfd_deactivate(kirqfd);
+
+	mutex_unlock(&irqfds_lock);
+
+	destroy_workqueue(irqfd_cleanup_wq);
+}
+
 static long privcmd_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long data)
 {
@@ -868,6 +1124,10 @@ static long privcmd_ioctl(struct file *file,
 		ret = privcmd_ioctl_mmap_resource(file, udata);
 		break;
 
+	case IOCTL_PRIVCMD_IRQFD:
+		ret = privcmd_ioctl_irqfd(file, udata);
+		break;
+
 	default:
 		break;
 	}
@@ -992,15 +1252,27 @@ static int __init privcmd_init(void)
 	err = misc_register(&xen_privcmdbuf_dev);
 	if (err != 0) {
 		pr_err("Could not register Xen hypercall-buf device\n");
-		misc_deregister(&privcmd_dev);
-		return err;
+		goto err_privcmdbuf;
+	}
+
+	err = privcmd_irqfd_init();
+	if (err != 0) {
+		pr_err("irqfd init failed\n");
+		goto err_irqfd;
 	}
 
 	return 0;
+
+err_irqfd:
+	misc_deregister(&xen_privcmdbuf_dev);
+err_privcmdbuf:
+	misc_deregister(&privcmd_dev);
+	return err;
 }
 
 static void __exit privcmd_exit(void)
 {
+	privcmd_irqfd_exit();
 	misc_deregister(&privcmd_dev);
 	misc_deregister(&xen_privcmdbuf_dev);
 }
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index d2029556083e..47334bb91a09 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
 	__u64 addr;
 };
 
+/* For privcmd_irqfd::flags */
+#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct privcmd_irqfd {
+	__u32 fd;
+	__u32 flags;
+	__u32 irq;
+	domid_t dom;
+	__u8 level;
+	__u8 pad;
+};
+
 /*
  * @cmd: IOCTL_PRIVCMD_HYPERCALL
  * @arg: &privcmd_hypercall_t
@@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
 	_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
 #define IOCTL_PRIVCMD_MMAP_RESOURCE				\
 	_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
+#define IOCTL_PRIVCMD_IRQFD					\
+	_IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
 
 #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
-- 
2.31.1.272.g89b43f80a514


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH V2 2/2] xen: privcmd: Add support for irqfd
  2023-07-20  9:30 ` [PATCH V2 2/2] xen: privcmd: Add support for irqfd Viresh Kumar
@ 2023-07-21  0:38   ` kernel test robot
  2023-07-21  6:45     ` Viresh Kumar
  2023-07-23  4:49   ` kernel test robot
  1 sibling, 1 reply; 7+ messages in thread
From: kernel test robot @ 2023-07-21  0:38 UTC (permalink / raw)
  To: Viresh Kumar, Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: llvm, oe-kbuild-all, Viresh Kumar, Vincent Guittot,
	Alex Bennée, stratos-dev, Erik Schilling,
	Manos Pitsidianakis, Mathieu Poirier, xen-devel, linux-kernel

Hi Viresh,

kernel test robot noticed the following build errors:

[auto build test ERROR on xen-tip/linux-next]
[also build test ERROR on linus/master v6.5-rc2 next-20230720]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Viresh-Kumar/xen-privcmd-Add-support-for-irqfd/20230720-173905
base:   https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git linux-next
patch link:    https://lore.kernel.org/r/a25d5f01fe9b4624aa12cab77abd001044ea02d5.1689845210.git.viresh.kumar%40linaro.org
patch subject: [PATCH V2 2/2] xen: privcmd: Add support for irqfd
config: arm64-randconfig-r026-20230720 (https://download.01.org/0day-ci/archive/20230721/202307210852.ukq5f98v-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: (https://download.01.org/0day-ci/archive/20230721/202307210852.ukq5f98v-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202307210852.ukq5f98v-lkp@intel.com/

All errors (new ones prefixed by >>):

>> drivers/xen/privcmd.c:961:12: error: call to undeclared function 'eventfd_ctx_fileget'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     961 |         eventfd = eventfd_ctx_fileget(f.file);
         |                   ^
   drivers/xen/privcmd.c:961:12: note: did you mean 'eventfd_ctx_fdget'?
   include/linux/eventfd.h:56:35: note: 'eventfd_ctx_fdget' declared here
      56 | static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
         |                                   ^
>> drivers/xen/privcmd.c:961:10: error: incompatible integer to pointer conversion assigning to 'struct eventfd_ctx *' from 'int' [-Wint-conversion]
     961 |         eventfd = eventfd_ctx_fileget(f.file);
         |                 ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
   2 errors generated.


vim +/eventfd_ctx_fileget +961 drivers/xen/privcmd.c

   936	
   937	static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
   938	{
   939		struct privcmd_kernel_irqfd *kirqfd, *tmp;
   940		struct eventfd_ctx *eventfd;
   941		__poll_t events;
   942		struct fd f;
   943		int ret;
   944	
   945		kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
   946		if (!kirqfd)
   947			return -ENOMEM;
   948	
   949		kirqfd->irq = irqfd->irq;
   950		kirqfd->dom = irqfd->dom;
   951		kirqfd->level = irqfd->level;
   952		INIT_LIST_HEAD(&kirqfd->list);
   953		INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
   954	
   955		f = fdget(irqfd->fd);
   956		if (!f.file) {
   957			ret = -EBADF;
   958			goto error_kfree;
   959		}
   960	
 > 961		eventfd = eventfd_ctx_fileget(f.file);
   962		if (IS_ERR(eventfd)) {
   963			ret = PTR_ERR(eventfd);
   964			goto error_fd_put;
   965		}
   966	
   967		kirqfd->eventfd = eventfd;
   968	
   969		/*
   970		 * Install our own custom wake-up handling so we are notified via a
   971		 * callback whenever someone signals the underlying eventfd.
   972		 */
   973		init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
   974		init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
   975	
   976		mutex_lock(&irqfds_lock);
   977	
   978		list_for_each_entry(tmp, &irqfds_list, list) {
   979			if (kirqfd->eventfd == tmp->eventfd) {
   980				ret = -EBUSY;
   981				mutex_unlock(&irqfds_lock);
   982				goto error_eventfd;
   983			}
   984		}
   985	
   986		list_add_tail(&kirqfd->list, &irqfds_list);
   987		mutex_unlock(&irqfds_lock);
   988	
   989		/*
   990		 * Check if there was an event already pending on the eventfd before we
   991		 * registered, and trigger it as if we didn't miss it.
   992		 */
   993		events = vfs_poll(f.file, &kirqfd->pt);
   994		if (events & EPOLLIN)
   995			irqfd_inject(kirqfd);
   996	
   997		/*
   998		 * Do not drop the file until the kirqfd is fully initialized, otherwise
   999		 * we might race against the EPOLLHUP.
  1000		 */
  1001		fdput(f);
  1002		return 0;
  1003	
  1004	error_eventfd:
  1005		eventfd_ctx_put(eventfd);
  1006	
  1007	error_fd_put:
  1008		fdput(f);
  1009	
  1010	error_kfree:
  1011		kfree(kirqfd);
  1012		return ret;
  1013	}
  1014	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH V2.1 2/2] xen: privcmd: Add support for irqfd
  2023-07-20  9:30 [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Viresh Kumar
  2023-07-20  9:30 ` [PATCH V2 2/2] xen: privcmd: Add support for irqfd Viresh Kumar
@ 2023-07-21  6:39 ` Viresh Kumar
  2023-07-22 15:03 ` [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Oleksandr Tyshchenko
  2 siblings, 0 replies; 7+ messages in thread
From: Viresh Kumar @ 2023-07-21  6:39 UTC (permalink / raw)
  To: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: Viresh Kumar, Vincent Guittot, Alex Bennée, stratos-dev,
	Erik Schilling, Manos Pitsidianakis, Mathieu Poirier, xen-devel,
	linux-kernel

Xen provides support for injecting interrupts to the guests via the
HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
device backend implementations, in an inefficient manner currently.

Generally, the Virtio backends are implemented to work with the Eventfd
based mechanism. In order to make such backends work with Xen, another
software layer needs to poll the Eventfds and raise an interrupt to the
guest using the Xen based mechanism. This results in an extra context
switch.

This is not a new problem in Linux though. It is present with other
hypervisors like KVM, etc. as well. The generic solution implemented in
the kernel for them is to provide an IOCTL call to pass the interrupt
details and eventfd, which lets the kernel take care of polling the
eventfd and raising of the interrupt, instead of handling this in user
space (which involves an extra context switch).

This patch adds support to inject a specific interrupt to guest using
the eventfd mechanism, by preventing the extra context switch.

Inspired by existing implementations for KVM, etc..

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
V2->V2.1
- Select EVENTFD from Kconfig

V1->V2:
- Improve error handling.
- Remove the unnecessary usage of list_for_each_entry_safe().
- Restrict the use of XEN_DMOP_set_irq_level to only ARM64.

 drivers/xen/Kconfig        |   1 +
 drivers/xen/privcmd.c      | 276 ++++++++++++++++++++++++++++++++++++-
 include/uapi/xen/privcmd.h |  14 ++
 3 files changed, 289 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index d5d7c402b651..7967393c55a4 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -261,6 +261,7 @@ config XEN_SCSI_BACKEND
 config XEN_PRIVCMD
 	tristate "Xen hypercall passthrough driver"
 	depends on XEN
+	select EVENTFD
 	default m
 	help
 	  The hypercall passthrough driver allows privileged user programs to
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index e2f580e30a86..0debc5482253 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -9,11 +9,16 @@
 
 #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
 
+#include <linux/eventfd.h>
+#include <linux/file.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/workqueue.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -833,6 +838,257 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
 	return rc;
 }
 
+/* Irqfd support */
+static struct workqueue_struct *irqfd_cleanup_wq;
+static DEFINE_MUTEX(irqfds_lock);
+static LIST_HEAD(irqfds_list);
+
+struct privcmd_kernel_irqfd {
+	domid_t dom;
+	u8 level;
+	bool error;
+	u32 irq;
+	struct eventfd_ctx *eventfd;
+	struct work_struct shutdown;
+	wait_queue_entry_t wait;
+	struct list_head list;
+	poll_table pt;
+};
+
+static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
+{
+	lockdep_assert_held(&irqfds_lock);
+
+	list_del_init(&kirqfd->list);
+	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
+}
+
+static void irqfd_shutdown(struct work_struct *work)
+{
+	struct privcmd_kernel_irqfd *kirqfd =
+		container_of(work, struct privcmd_kernel_irqfd, shutdown);
+	u64 cnt;
+
+	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
+	eventfd_ctx_put(kirqfd->eventfd);
+	kfree(kirqfd);
+}
+
+static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
+{
+	/* Different architectures support this differently */
+	struct xen_dm_op dm_op = {
+#ifdef CONFIG_ARM64
+		.op = XEN_DMOP_set_irq_level,
+		.u.set_irq_level.irq = kirqfd->irq,
+		.u.set_irq_level.level = kirqfd->level,
+#endif
+	};
+	struct xen_dm_op_buf xbufs = {
+		.size = sizeof(dm_op),
+	};
+	u64 cnt;
+	long rc;
+
+	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
+	set_xen_guest_handle(xbufs.h, &dm_op);
+
+	xen_preemptible_hcall_begin();
+	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);
+	xen_preemptible_hcall_end();
+
+	/* Don't repeat the error message for consecutive failures */
+	if (rc && !kirqfd->error) {
+		pr_err("Failed to configure irq: %d to level: %d for guest domain: %d\n",
+		       kirqfd->irq, kirqfd->level, kirqfd->dom);
+	}
+
+	kirqfd->error = !!rc;
+}
+
+static int
+irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+	struct privcmd_kernel_irqfd *kirqfd =
+		container_of(wait, struct privcmd_kernel_irqfd, wait);
+	__poll_t flags = key_to_poll(key);
+
+	if (flags & EPOLLIN)
+		irqfd_inject(kirqfd);
+
+	if (flags & EPOLLHUP) {
+		mutex_lock(&irqfds_lock);
+		irqfd_deactivate(kirqfd);
+		mutex_unlock(&irqfds_lock);
+	}
+
+	return 0;
+}
+
+static void
+irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct privcmd_kernel_irqfd *kirqfd =
+		container_of(pt, struct privcmd_kernel_irqfd, pt);
+
+	add_wait_queue_priority(wqh, &kirqfd->wait);
+}
+
+static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
+{
+	struct privcmd_kernel_irqfd *kirqfd, *tmp;
+	struct eventfd_ctx *eventfd;
+	__poll_t events;
+	struct fd f;
+	int ret;
+
+	kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
+	if (!kirqfd)
+		return -ENOMEM;
+
+	kirqfd->irq = irqfd->irq;
+	kirqfd->dom = irqfd->dom;
+	kirqfd->level = irqfd->level;
+	INIT_LIST_HEAD(&kirqfd->list);
+	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
+
+	f = fdget(irqfd->fd);
+	if (!f.file) {
+		ret = -EBADF;
+		goto error_kfree;
+	}
+
+	eventfd = eventfd_ctx_fileget(f.file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto error_fd_put;
+	}
+
+	kirqfd->eventfd = eventfd;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via a
+	 * callback whenever someone signals the underlying eventfd.
+	 */
+	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
+	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
+
+	mutex_lock(&irqfds_lock);
+
+	list_for_each_entry(tmp, &irqfds_list, list) {
+		if (kirqfd->eventfd == tmp->eventfd) {
+			ret = -EBUSY;
+			mutex_unlock(&irqfds_lock);
+			goto error_eventfd;
+		}
+	}
+
+	list_add_tail(&kirqfd->list, &irqfds_list);
+	mutex_unlock(&irqfds_lock);
+
+	/*
+	 * Check if there was an event already pending on the eventfd before we
+	 * registered, and trigger it as if we didn't miss it.
+	 */
+	events = vfs_poll(f.file, &kirqfd->pt);
+	if (events & EPOLLIN)
+		irqfd_inject(kirqfd);
+
+	/*
+	 * Do not drop the file until the kirqfd is fully initialized, otherwise
+	 * we might race against the EPOLLHUP.
+	 */
+	fdput(f);
+	return 0;
+
+error_eventfd:
+	eventfd_ctx_put(eventfd);
+
+error_fd_put:
+	fdput(f);
+
+error_kfree:
+	kfree(kirqfd);
+	return ret;
+}
+
+static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
+{
+	struct privcmd_kernel_irqfd *kirqfd;
+	struct eventfd_ctx *eventfd;
+
+	eventfd = eventfd_ctx_fdget(irqfd->fd);
+	if (IS_ERR(eventfd))
+		return PTR_ERR(eventfd);
+
+	mutex_lock(&irqfds_lock);
+
+	list_for_each_entry(kirqfd, &irqfds_list, list) {
+		if (kirqfd->eventfd == eventfd) {
+			irqfd_deactivate(kirqfd);
+			break;
+		}
+	}
+
+	mutex_unlock(&irqfds_lock);
+
+	eventfd_ctx_put(eventfd);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed so
+	 * that we guarantee there will not be any more interrupts once this
+	 * deassign function returns.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	return 0;
+}
+
+static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
+{
+	struct privcmd_data *data = file->private_data;
+	struct privcmd_irqfd irqfd;
+
+	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
+		return -EFAULT;
+
+	/* No other flags should be set */
+	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
+		return -EINVAL;
+
+	/* If restriction is in place, check the domid matches */
+	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
+		return -EPERM;
+
+	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
+		return privcmd_irqfd_deassign(&irqfd);
+
+	return privcmd_irqfd_assign(&irqfd);
+}
+
+static int privcmd_irqfd_init(void)
+{
+	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
+	if (!irqfd_cleanup_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void privcmd_irqfd_exit(void)
+{
+	struct privcmd_kernel_irqfd *kirqfd, *tmp;
+
+	mutex_lock(&irqfds_lock);
+
+	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
+		irqfd_deactivate(kirqfd);
+
+	mutex_unlock(&irqfds_lock);
+
+	destroy_workqueue(irqfd_cleanup_wq);
+}
+
 static long privcmd_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long data)
 {
@@ -868,6 +1124,10 @@ static long privcmd_ioctl(struct file *file,
 		ret = privcmd_ioctl_mmap_resource(file, udata);
 		break;
 
+	case IOCTL_PRIVCMD_IRQFD:
+		ret = privcmd_ioctl_irqfd(file, udata);
+		break;
+
 	default:
 		break;
 	}
@@ -992,15 +1252,27 @@ static int __init privcmd_init(void)
 	err = misc_register(&xen_privcmdbuf_dev);
 	if (err != 0) {
 		pr_err("Could not register Xen hypercall-buf device\n");
-		misc_deregister(&privcmd_dev);
-		return err;
+		goto err_privcmdbuf;
+	}
+
+	err = privcmd_irqfd_init();
+	if (err != 0) {
+		pr_err("irqfd init failed\n");
+		goto err_irqfd;
 	}
 
 	return 0;
+
+err_irqfd:
+	misc_deregister(&xen_privcmdbuf_dev);
+err_privcmdbuf:
+	misc_deregister(&privcmd_dev);
+	return err;
 }
 
 static void __exit privcmd_exit(void)
 {
+	privcmd_irqfd_exit();
 	misc_deregister(&privcmd_dev);
 	misc_deregister(&xen_privcmdbuf_dev);
 }
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index d2029556083e..47334bb91a09 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
 	__u64 addr;
 };
 
+/* For privcmd_irqfd::flags */
+#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct privcmd_irqfd {
+	__u32 fd;
+	__u32 flags;
+	__u32 irq;
+	domid_t dom;
+	__u8 level;
+	__u8 pad;
+};
+
 /*
  * @cmd: IOCTL_PRIVCMD_HYPERCALL
  * @arg: &privcmd_hypercall_t
@@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
 	_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
 #define IOCTL_PRIVCMD_MMAP_RESOURCE				\
 	_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
+#define IOCTL_PRIVCMD_IRQFD					\
+	_IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
 
 #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
-- 
2.31.1.272.g89b43f80a514


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH V2 2/2] xen: privcmd: Add support for irqfd
  2023-07-21  0:38   ` kernel test robot
@ 2023-07-21  6:45     ` Viresh Kumar
  0 siblings, 0 replies; 7+ messages in thread
From: Viresh Kumar @ 2023-07-21  6:45 UTC (permalink / raw)
  To: kernel test robot
  Cc: Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko, llvm,
	oe-kbuild-all, Vincent Guittot, Alex Bennée, stratos-dev,
	Erik Schilling, Manos Pitsidianakis, Mathieu Poirier, xen-devel,
	linux-kernel

On 21-07-23, 08:38, kernel test robot wrote:
> Hi Viresh,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on xen-tip/linux-next]
> [also build test ERROR on linus/master v6.5-rc2 next-20230720]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Viresh-Kumar/xen-privcmd-Add-support-for-irqfd/20230720-173905
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git linux-next
> patch link:    https://lore.kernel.org/r/a25d5f01fe9b4624aa12cab77abd001044ea02d5.1689845210.git.viresh.kumar%40linaro.org
> patch subject: [PATCH V2 2/2] xen: privcmd: Add support for irqfd
> config: arm64-randconfig-r026-20230720 (https://download.01.org/0day-ci/archive/20230721/202307210852.ukq5f98v-lkp@intel.com/config)
> compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
> reproduce: (https://download.01.org/0day-ci/archive/20230721/202307210852.ukq5f98v-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202307210852.ukq5f98v-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
> >> drivers/xen/privcmd.c:961:12: error: call to undeclared function 'eventfd_ctx_fileget'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>      961 |         eventfd = eventfd_ctx_fileget(f.file);

Fixed by selecting EVENTFD. Thanks.

-- 
viresh

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V2 1/2] xen: Update dm_op.h from Xen public header
  2023-07-20  9:30 [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Viresh Kumar
  2023-07-20  9:30 ` [PATCH V2 2/2] xen: privcmd: Add support for irqfd Viresh Kumar
  2023-07-21  6:39 ` [PATCH V2.1 " Viresh Kumar
@ 2023-07-22 15:03 ` Oleksandr Tyshchenko
  2 siblings, 0 replies; 7+ messages in thread
From: Oleksandr Tyshchenko @ 2023-07-22 15:03 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Vincent Guittot, Alex Bennée, stratos-dev, Erik Schilling,
	Manos Pitsidianakis, Mathieu Poirier, Juergen Gross, xen-devel,
	linux-kernel, Stefano Stabellini



On 20.07.23 12:30, Viresh Kumar wrote:

Hello Viresh


> Update the definitions in dm_op.h from Xen public header.

I think, it would be good to mention exact Xen version (commit) we are 
based on.

In general patch looks good to me, just a note.

I compared with Xen's public/hvm/dm_op.h and noticed differences. I 
understand, this cannot be 100% verbatim copy, because of headers 
location, emacs magics, GUEST_HANDLE vs XEN_GUEST_HANDLE. The Linux 
header doesn't contain any aliases the Xen header has for each "struct 
xen_dm_op_xxx", for example ...

[snip]


>
> +/*
> + * XEN_DMOP_create_ioreq_server: Instantiate a new IOREQ Server for a
> + *                               secondary emulator.
> + *
> + * The <id> handed back is unique for target domain. The valur of
> + * <handle_bufioreq> should be one of HVM_IOREQSRV_BUFIOREQ_* defined in
> + * hvm_op.h. If the value is HVM_IOREQSRV_BUFIOREQ_OFF then  the buffered
> + * ioreq ring will not be allocated and hence all emulation requests to
> + * this server will be synchronous.
> + */
> +#define XEN_DMOP_create_ioreq_server 1
> +
> +struct xen_dm_op_create_ioreq_server {
> +    /* IN - should server handle buffered ioreqs */
> +    uint8_t handle_bufioreq;
> +    uint8_t pad[3];
> +    /* OUT - server id */
> +    ioservid_t id;
> +};

... this one:

typedef struct xen_dm_op_create_ioreq_server 
xen_dm_op_create_ioreq_server_t;

And "struct xen_dm_op" down the file uses these aliases inside a union.

I assume, we have to diverge here in order to follow a recommendation
to avoid typedef'ing structs at [1], am I сorrect? Or is there another 
reason?

I think, it would be good to mention a reason in the description.

[1] https://www.kernel.org/doc/html/v6.4/process/coding-style.html#typedefs



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH V2 2/2] xen: privcmd: Add support for irqfd
  2023-07-20  9:30 ` [PATCH V2 2/2] xen: privcmd: Add support for irqfd Viresh Kumar
  2023-07-21  0:38   ` kernel test robot
@ 2023-07-23  4:49   ` kernel test robot
  1 sibling, 0 replies; 7+ messages in thread
From: kernel test robot @ 2023-07-23  4:49 UTC (permalink / raw)
  To: Viresh Kumar, Juergen Gross, Stefano Stabellini, Oleksandr Tyshchenko
  Cc: oe-kbuild-all, Viresh Kumar, Vincent Guittot, Alex Bennée,
	stratos-dev, Erik Schilling, Manos Pitsidianakis,
	Mathieu Poirier, xen-devel, linux-kernel

Hi Viresh,

kernel test robot noticed the following build errors:

[auto build test ERROR on xen-tip/linux-next]
[also build test ERROR on linus/master v6.5-rc2 next-20230721]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Viresh-Kumar/xen-privcmd-Add-support-for-irqfd/20230720-173905
base:   https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git linux-next
patch link:    https://lore.kernel.org/r/a25d5f01fe9b4624aa12cab77abd001044ea02d5.1689845210.git.viresh.kumar%40linaro.org
patch subject: [PATCH V2 2/2] xen: privcmd: Add support for irqfd
config: x86_64-randconfig-r091-20230723 (https://download.01.org/0day-ci/archive/20230723/202307231237.JHGxqmdg-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce: (https://download.01.org/0day-ci/archive/20230723/202307231237.JHGxqmdg-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202307231237.JHGxqmdg-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   drivers/xen/privcmd.c: In function 'privcmd_irqfd_assign':
>> drivers/xen/privcmd.c:961:19: error: implicit declaration of function 'eventfd_ctx_fileget'; did you mean 'eventfd_ctx_fdget'? [-Werror=implicit-function-declaration]
     961 |         eventfd = eventfd_ctx_fileget(f.file);
         |                   ^~~~~~~~~~~~~~~~~~~
         |                   eventfd_ctx_fdget
>> drivers/xen/privcmd.c:961:17: warning: assignment to 'struct eventfd_ctx *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     961 |         eventfd = eventfd_ctx_fileget(f.file);
         |                 ^
   cc1: some warnings being treated as errors


vim +961 drivers/xen/privcmd.c

   936	
   937	static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
   938	{
   939		struct privcmd_kernel_irqfd *kirqfd, *tmp;
   940		struct eventfd_ctx *eventfd;
   941		__poll_t events;
   942		struct fd f;
   943		int ret;
   944	
   945		kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
   946		if (!kirqfd)
   947			return -ENOMEM;
   948	
   949		kirqfd->irq = irqfd->irq;
   950		kirqfd->dom = irqfd->dom;
   951		kirqfd->level = irqfd->level;
   952		INIT_LIST_HEAD(&kirqfd->list);
   953		INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
   954	
   955		f = fdget(irqfd->fd);
   956		if (!f.file) {
   957			ret = -EBADF;
   958			goto error_kfree;
   959		}
   960	
 > 961		eventfd = eventfd_ctx_fileget(f.file);
   962		if (IS_ERR(eventfd)) {
   963			ret = PTR_ERR(eventfd);
   964			goto error_fd_put;
   965		}
   966	
   967		kirqfd->eventfd = eventfd;
   968	
   969		/*
   970		 * Install our own custom wake-up handling so we are notified via a
   971		 * callback whenever someone signals the underlying eventfd.
   972		 */
   973		init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
   974		init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
   975	
   976		mutex_lock(&irqfds_lock);
   977	
   978		list_for_each_entry(tmp, &irqfds_list, list) {
   979			if (kirqfd->eventfd == tmp->eventfd) {
   980				ret = -EBUSY;
   981				mutex_unlock(&irqfds_lock);
   982				goto error_eventfd;
   983			}
   984		}
   985	
   986		list_add_tail(&kirqfd->list, &irqfds_list);
   987		mutex_unlock(&irqfds_lock);
   988	
   989		/*
   990		 * Check if there was an event already pending on the eventfd before we
   991		 * registered, and trigger it as if we didn't miss it.
   992		 */
   993		events = vfs_poll(f.file, &kirqfd->pt);
   994		if (events & EPOLLIN)
   995			irqfd_inject(kirqfd);
   996	
   997		/*
   998		 * Do not drop the file until the kirqfd is fully initialized, otherwise
   999		 * we might race against the EPOLLHUP.
  1000		 */
  1001		fdput(f);
  1002		return 0;
  1003	
  1004	error_eventfd:
  1005		eventfd_ctx_put(eventfd);
  1006	
  1007	error_fd_put:
  1008		fdput(f);
  1009	
  1010	error_kfree:
  1011		kfree(kirqfd);
  1012		return ret;
  1013	}
  1014	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-07-23  4:50 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-20  9:30 [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Viresh Kumar
2023-07-20  9:30 ` [PATCH V2 2/2] xen: privcmd: Add support for irqfd Viresh Kumar
2023-07-21  0:38   ` kernel test robot
2023-07-21  6:45     ` Viresh Kumar
2023-07-23  4:49   ` kernel test robot
2023-07-21  6:39 ` [PATCH V2.1 " Viresh Kumar
2023-07-22 15:03 ` [PATCH V2 1/2] xen: Update dm_op.h from Xen public header Oleksandr Tyshchenko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.