All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] ioeventfd: Remove natural sized length limitation
@ 2011-07-06  4:37 Sasha Levin
  2011-07-06  4:37 ` [PATCH 2/5] ioeventfd: Add helper functions for reading and writing Sasha Levin
                   ` (3 more replies)
  0 siblings, 4 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-06  4:37 UTC (permalink / raw)
  To: kvm
  Cc: Sasha Levin, Avi Kivity, Ingo Molnar, Marcelo Tosatti,
	Michael S. Tsirkin, Pekka Enberg

This patch removes the 8 byte length limit on ioeventfds.

The consequences are that any write in the provided region
which is contained entirely by the region and matches datamatch
(if specified) will trigger an event.

Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 Documentation/virtual/kvm/api.txt |    4 ++--
 include/linux/kvm.h               |    4 ++--
 virt/kvm/eventfd.c                |   15 ++-------------
 3 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b251136..7994840 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1334,8 +1334,8 @@ provided event instead of triggering an exit.
 
 struct kvm_ioeventfd {
 	__u64 datamatch;
-	__u64 addr;        /* legal pio/mmio address */
-	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__u64 addr;
+	__u32 len;
 	__s32 fd;
 	__u32 flags;
 	__u8  pad[36];
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9c9ca7c..81cd295 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -398,8 +398,8 @@ enum {
 
 struct kvm_ioeventfd {
 	__u64 datamatch;
-	__u64 addr;        /* legal pio/mmio address */
-	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__u64 addr;
+	__u32 len;
 	__s32 fd;
 	__u32 flags;
 	__u8  pad[36];
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 73358d2..5fb09b4 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -449,8 +449,8 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 {
 	u64 _val;
 
-	if (!(addr == p->addr && len == p->length))
-		/* address-range must be precise for a hit */
+	if (!((addr >= p->addr) && ((addr + len) <= (p->addr + p->length))))
+		/* the entire address-range must be within the ioeventfd */
 		return false;
 
 	if (p->wildcard)
@@ -536,17 +536,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	struct eventfd_ctx       *eventfd;
 	int                       ret;
 
-	/* must be natural-word sized */
-	switch (args->len) {
-	case 1:
-	case 2:
-	case 4:
-	case 8:
-		break;
-	default:
-		return -EINVAL;
-	}
-
 	/* check for range overflow */
 	if (args->addr + args->len < args->addr)
 		return -EINVAL;
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 2/5] ioeventfd: Add helper functions for reading and writing
  2011-07-06  4:37 [PATCH 1/5] ioeventfd: Remove natural sized length limitation Sasha Levin
@ 2011-07-06  4:37 ` Sasha Levin
  2011-07-06  4:37 ` [PATCH 3/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_READ Sasha Levin
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-06  4:37 UTC (permalink / raw)
  To: kvm; +Cc: Sasha Levin

Add get_val() and set_val() to help with getting and setting a natural
sized variable pointed to by a void ptr.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 virt/kvm/eventfd.c |   57 ++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 5fb09b4..a1a3f66 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -444,23 +444,10 @@ ioeventfd_release(struct _ioeventfd *p)
 	kfree(p);
 }
 
-static bool
-ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
+static u64 get_val(const void *val, int len)
 {
 	u64 _val;
 
-	if (!((addr >= p->addr) && ((addr + len) <= (p->addr + p->length))))
-		/* the entire address-range must be within the ioeventfd */
-		return false;
-
-	if (p->wildcard)
-		/* all else equal, wildcard is always a hit */
-		return true;
-
-	/* otherwise, we have to actually compare the data */
-
-	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
-
 	switch (len) {
 	case 1:
 		_val = *(u8 *)val;
@@ -475,8 +462,48 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 		_val = *(u64 *)val;
 		break;
 	default:
-		return false;
+		return 0;
+	}
+
+	return _val;
+}
+
+static void set_val(void *val, int len, u64 new_val)
+{
+	switch (len) {
+	case 1:
+		*(u8 *)val = new_val;
+		break;
+	case 2:
+		*(u16 *)val = new_val;
+		break;
+	case 4:
+		*(u32 *)val = new_val;
+		break;
+	case 8:
+		*(u64 *)val = new_val;
+		break;
 	}
+}
+
+static bool
+ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
+{
+	u64 _val;
+
+	if (!((addr >= p->addr) && ((addr + len) <= (p->addr + p->length))))
+		/* the entire address-range must be within the ioeventfd */
+		return false;
+
+	if (p->wildcard)
+		/* all else equal, wildcard is always a hit */
+		return true;
+
+	/* otherwise, we have to actually compare the data */
+
+	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
+
+	_val = get_val(val, len);
 
 	return _val == p->datamatch ? true : false;
 }
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 3/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_READ
  2011-07-06  4:37 [PATCH 1/5] ioeventfd: Remove natural sized length limitation Sasha Levin
  2011-07-06  4:37 ` [PATCH 2/5] ioeventfd: Add helper functions for reading and writing Sasha Levin
@ 2011-07-06  4:37 ` Sasha Levin
  2011-07-06  4:37 ` [PATCH 4/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_NOWRITE Sasha Levin
  2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
  3 siblings, 0 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-06  4:37 UTC (permalink / raw)
  To: kvm
  Cc: Sasha Levin, Avi Kivity, Ingo Molnar, Marcelo Tosatti,
	Michael S. Tsirkin, Pekka Enberg

The new flag specifies whether reads from the address specified
in the ioeventfd will raise the event like writes do.

The read value will be the one passed in datamatch.

Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 Documentation/virtual/kvm/api.txt |    5 +++++
 include/linux/kvm.h               |    2 ++
 virt/kvm/eventfd.c                |   24 ++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 7994840..8179bc1 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1346,10 +1346,15 @@ The following flags are defined:
 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
 #define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
 
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
+If the read flag is set, the event will be signaled when the specified address
+is being read from. The value actually read by the guest will be the value
+passed in datamatch.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 81cd295..ce4cc89 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -387,12 +387,14 @@ enum {
 	kvm_ioeventfd_flag_nr_datamatch,
 	kvm_ioeventfd_flag_nr_pio,
 	kvm_ioeventfd_flag_nr_deassign,
+	kvm_ioeventfd_flag_nr_read,
 	kvm_ioeventfd_flag_nr_max,
 };
 
 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
 #define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
 
 #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a1a3f66..e0f24bf 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -428,6 +428,7 @@ struct _ioeventfd {
 	u64                  datamatch;
 	struct kvm_io_device dev;
 	bool                 wildcard;
+	bool                 track_reads;
 };
 
 static inline struct _ioeventfd *
@@ -522,6 +523,24 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 	return 0;
 }
 
+/* MMIO/PIO reads trigger an event if the addr/val match */
+static int
+ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
+		void *val)
+{
+	struct _ioeventfd *p = to_ioeventfd(this);
+
+	/* Exit if signaling on reads isn't requested */
+	if (!p->track_reads)
+		return -EOPNOTSUPP;
+
+	if (!ioeventfd_in_range(p, addr, len, val))
+		return -EOPNOTSUPP;
+
+	eventfd_signal(p->eventfd, 1);
+	return 0;
+}
+
 /*
  * This function is called as KVM is completely shutting down.  We do not
  * need to worry about locking just nuke anything we have as quickly as possible
@@ -535,6 +554,7 @@ ioeventfd_destructor(struct kvm_io_device *this)
 }
 
 static const struct kvm_io_device_ops ioeventfd_ops = {
+	.read       = ioeventfd_read,
 	.write      = ioeventfd_write,
 	.destructor = ioeventfd_destructor,
 };
@@ -600,6 +620,10 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 		goto unlock_fail;
 	}
 
+	/* Raise signal on reads from address if requested */
+	if (args->flags & KVM_IOEVENTFD_FLAG_READ)
+		p->track_reads = true;
+
 	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 
 	ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 4/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_NOWRITE
  2011-07-06  4:37 [PATCH 1/5] ioeventfd: Remove natural sized length limitation Sasha Levin
  2011-07-06  4:37 ` [PATCH 2/5] ioeventfd: Add helper functions for reading and writing Sasha Levin
  2011-07-06  4:37 ` [PATCH 3/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_READ Sasha Levin
@ 2011-07-06  4:37 ` Sasha Levin
  2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
  3 siblings, 0 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-06  4:37 UTC (permalink / raw)
  To: kvm
  Cc: Sasha Levin, Avi Kivity, Ingo Molnar, Marcelo Tosatti,
	Michael S. Tsirkin, Pekka Enberg

The new flag specifies whether writes to the address specified
in the ioeventfd shouldn't raise the event.

Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 Documentation/virtual/kvm/api.txt |    5 +++++
 include/linux/kvm.h               |    2 ++
 virt/kvm/eventfd.c                |    9 +++++++++
 3 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 8179bc1..317d86a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1347,6 +1347,7 @@ The following flags are defined:
 #define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
+#define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
 
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
@@ -1355,6 +1356,10 @@ If the read flag is set, the event will be signaled when the specified address
 is being read from. The value actually read by the guest will be the value
 passed in datamatch.
 
+If the nowrite flag is set, the event won't be signaled when the specified address
+is being written to.
+
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ce4cc89..8a12711 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -388,6 +388,7 @@ enum {
 	kvm_ioeventfd_flag_nr_pio,
 	kvm_ioeventfd_flag_nr_deassign,
 	kvm_ioeventfd_flag_nr_read,
+	kvm_ioeventfd_flag_nr_nowrite,
 	kvm_ioeventfd_flag_nr_max,
 };
 
@@ -395,6 +396,7 @@ enum {
 #define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
+#define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
 
 #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index e0f24bf..5f2d203 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -429,6 +429,7 @@ struct _ioeventfd {
 	struct kvm_io_device dev;
 	bool                 wildcard;
 	bool                 track_reads;
+	bool                 track_writes;
 };
 
 static inline struct _ioeventfd *
@@ -516,6 +517,10 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
 
+	/* Exit if signaling on writes isn't requested */
+	if (!p->track_writes)
+		return -EOPNOTSUPP;
+
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
@@ -624,6 +629,10 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	if (args->flags & KVM_IOEVENTFD_FLAG_READ)
 		p->track_reads = true;
 
+	/* Don't raise signal on writes to address if requested */
+	if (!(args->flags & KVM_IOEVENTFD_FLAG_NOWRITE))
+		p->track_writes = true;
+
 	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 
 	ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev);
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06  4:37 [PATCH 1/5] ioeventfd: Remove natural sized length limitation Sasha Levin
                   ` (2 preceding siblings ...)
  2011-07-06  4:37 ` [PATCH 4/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_NOWRITE Sasha Levin
@ 2011-07-06  4:37 ` Sasha Levin
  2011-07-06 11:42   ` Michael S. Tsirkin
                     ` (3 more replies)
  3 siblings, 4 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-06  4:37 UTC (permalink / raw)
  To: kvm
  Cc: Sasha Levin, Avi Kivity, Ingo Molnar, Marcelo Tosatti,
	Michael S. Tsirkin, Pekka Enberg

The new flag allows passing a connected socket instead of an
eventfd to be notified of writes or reads to the specified memory region.

Instead of signaling an event: on write, the value written to the memory
region is written to the pipe.
On read, a notification of the read is sent to the host, and a response
is expected with the value to be 'read'.

Using a socket instead of an eventfd is useful when any value can be
written to the memory region but we're interested in receiving the
actual value instead of just a notification.

A simple example of practical use is the serial port. We are not
interested in an exit every time a char is written to the port, but
we do need to know what was written so we could handle it on the guest.

Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 Documentation/virtual/kvm/api.txt |   18 ++++-
 include/linux/kvm.h               |    9 ++
 virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
 3 files changed, 161 insertions(+), 19 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 317d86a..74f0946 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
 
 This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
 within the guest.  A guest write in the registered address will signal the
-provided event instead of triggering an exit.
+provided event or write to the provided socket instead of triggering an exit.
 
 struct kvm_ioeventfd {
 	__u64 datamatch;
@@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+struct kvm_ioeventfd_data {
+	__u64 data;
+	__u64 addr;
+	__u32 len;
+	__u8  is_write;
+};
+
 The following flags are defined:
 
 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
@@ -1348,6 +1355,7 @@ The following flags are defined:
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
 #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
+#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
 
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
@@ -1359,6 +1367,14 @@ passed in datamatch.
 If the nowrite flag is set, the event won't be signaled when the specified address
 is being written to.
 
+If the socket flag is set, fd is expected to be a connected AF_UNIX
+SOCK_SEQPACKET socket. Once a guest write in the registered address is
+detected - a struct kvm_ioeventfd_data which describes the write will be
+written to the socket.
+On read, struct kvm_ioeventfd_data will be written with 'is_write = 0', and
+would wait for a response with a struct kvm_ioeventfd_data containing the
+value which should be 'read' by the guest.
+
 
 5. The kvm_run structure
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a12711..ff3d808 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -389,6 +389,7 @@ enum {
 	kvm_ioeventfd_flag_nr_deassign,
 	kvm_ioeventfd_flag_nr_read,
 	kvm_ioeventfd_flag_nr_nowrite,
+	kvm_ioeventfd_flag_nr_socket,
 	kvm_ioeventfd_flag_nr_max,
 };
 
@@ -397,6 +398,7 @@ enum {
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
 #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
+#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
 
 #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
 
@@ -409,6 +411,13 @@ struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
+struct kvm_ioeventfd_data {
+	__u64 data;
+	__u64 addr;
+	__u32 len;
+	__u8  is_write;
+};
+
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
 	/* in */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 5f2d203..d1d63b3 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -32,6 +32,7 @@
 #include <linux/eventfd.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/net.h>
 
 #include "iodev.h"
 
@@ -413,10 +414,11 @@ module_exit(irqfd_module_exit);
 
 /*
  * --------------------------------------------------------------------
- * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
+ *            a socket write.
  *
- * userspace can register a PIO/MMIO address with an eventfd for receiving
- * notification when the memory has been touched.
+ * userspace can register a PIO/MMIO address with an eventfd or a
+ * socket for receiving notification when the memory has been touched.
  * --------------------------------------------------------------------
  */
 
@@ -424,7 +426,10 @@ struct _ioeventfd {
 	struct list_head     list;
 	u64                  addr;
 	int                  length;
-	struct eventfd_ctx  *eventfd;
+	union {
+		struct socket       *sock;
+		struct eventfd_ctx  *eventfd;
+	};
 	u64                  datamatch;
 	struct kvm_io_device dev;
 	bool                 wildcard;
@@ -441,7 +446,11 @@ to_ioeventfd(struct kvm_io_device *dev)
 static void
 ioeventfd_release(struct _ioeventfd *p)
 {
-	eventfd_ctx_put(p->eventfd);
+	if (p->eventfd)
+		eventfd_ctx_put(p->eventfd);
+	else
+		sockfd_put(p->sock);
+
 	list_del(&p->list);
 	kfree(p);
 }
@@ -510,12 +519,65 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 	return _val == p->datamatch ? true : false;
 }
 
+static ssize_t socket_write(struct socket *sock, const void *buf, size_t count)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+	struct msghdr msg;
+	struct iovec iov;
+
+	iov = (struct iovec) {
+		.iov_base = (void *)buf,
+		.iov_len  = count,
+	};
+
+	msg = (struct msghdr) {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = sock_sendmsg(sock, &msg, count);
+	set_fs(old_fs);
+
+	return res;
+}
+
+static ssize_t socket_read(struct socket *sock, void *buf, size_t count)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+	struct msghdr msg;
+	struct iovec iov;
+
+	iov = (struct iovec) {
+		.iov_base = (void *)buf,
+		.iov_len  = count,
+	};
+
+	msg = (struct msghdr) {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = sock_recvmsg(sock, &msg, count, 0);
+	set_fs(old_fs);
+
+	return res;
+}
+
 /* MMIO/PIO writes trigger an event if the addr/val match */
 static int
 ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 		const void *val)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
+	struct kvm_ioeventfd_data data;
 
 	/* Exit if signaling on writes isn't requested */
 	if (!p->track_writes)
@@ -524,7 +586,18 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
-	eventfd_signal(p->eventfd, 1);
+	data = (struct kvm_ioeventfd_data) {
+		.data = get_val(val, len),
+		.addr = addr,
+		.len = len,
+		.is_write = 1,
+	};
+
+	if (p->sock)
+		socket_write(p->sock, &data, sizeof(data));
+	else
+		eventfd_signal(p->eventfd, 1);
+
 	return 0;
 }
 
@@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
 		void *val)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
+	struct kvm_ioeventfd_data data;
 
 	/* Exit if signaling on reads isn't requested */
 	if (!p->track_reads)
@@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
-	eventfd_signal(p->eventfd, 1);
+	data = (struct kvm_ioeventfd_data) {
+		.addr = addr,
+		.len = len,
+		.is_write = 0,
+	};
+
+	if (p->sock) {
+		socket_write(p->sock, &data, sizeof(data));
+		socket_read(p->sock, &data, sizeof(data));
+		set_val(val, len, data.data);
+	} else {
+		set_val(val, len, p->datamatch);
+		eventfd_signal(p->eventfd, 1);
+	}
+
 	return 0;
 }
 
@@ -585,7 +673,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
 	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
 	struct _ioeventfd        *p;
-	struct eventfd_ctx       *eventfd;
+	struct eventfd_ctx       *eventfd = NULL;
 	int                       ret;
 
 	/* check for range overflow */
@@ -596,10 +684,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
 		return -EINVAL;
 
-	eventfd = eventfd_ctx_fdget(args->fd);
-	if (IS_ERR(eventfd))
-		return PTR_ERR(eventfd);
-
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p) {
 		ret = -ENOMEM;
@@ -611,6 +695,20 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	p->length  = args->len;
 	p->eventfd = eventfd;
 
+	if (args->flags & KVM_IOEVENTFD_FLAG_SOCKET) {
+		ret = 0;
+		p->sock = sockfd_lookup(args->fd, &ret);
+		if (ret)
+			goto fail;
+	} else {
+		ret = -EINVAL;
+		eventfd = eventfd_ctx_fdget(args->fd);
+		if (IS_ERR(eventfd))
+			goto fail;
+
+		p->eventfd = eventfd;
+	}
+
 	/* The datamatch feature is optional, otherwise this is a wildcard */
 	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
 		p->datamatch = args->datamatch;
@@ -649,8 +747,14 @@ unlock_fail:
 	mutex_unlock(&kvm->slots_lock);
 
 fail:
+	if (eventfd)
+		eventfd_ctx_put(eventfd);
+
+	if (p->sock)
+		sockfd_put(p->sock);
+
+
 	kfree(p);
-	eventfd_ctx_put(eventfd);
 
 	return ret;
 }
@@ -661,12 +765,21 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
 	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
 	struct _ioeventfd        *p, *tmp;
-	struct eventfd_ctx       *eventfd;
+	struct eventfd_ctx       *eventfd = NULL;
+	struct socket            *sock = NULL;
 	int                       ret = -ENOENT;
 
-	eventfd = eventfd_ctx_fdget(args->fd);
-	if (IS_ERR(eventfd))
-		return PTR_ERR(eventfd);
+	if (args->flags & KVM_IOEVENTFD_FLAG_SOCKET) {
+		ret = 0;
+		sock = sockfd_lookup(args->fd, &ret);
+		if (ret)
+			return PTR_ERR(sock);
+	} else {
+		ret = -EINVAL;
+		eventfd = eventfd_ctx_fdget(args->fd);
+		if (IS_ERR(eventfd))
+			return PTR_ERR(eventfd);
+	}
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -674,6 +787,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 
 		if (p->eventfd != eventfd  ||
+		    p->sock != sock        ||
 		    p->addr != args->addr  ||
 		    p->length != args->len ||
 		    p->wildcard != wildcard)
@@ -690,7 +804,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	mutex_unlock(&kvm->slots_lock);
 
-	eventfd_ctx_put(eventfd);
+	if (eventfd)
+		eventfd_ctx_put(eventfd);
+	if (sock)
+		sockfd_put(sock);
 
 	return ret;
 }
-- 
1.7.6


^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
@ 2011-07-06 11:42   ` Michael S. Tsirkin
  2011-07-06 15:01     ` Sasha Levin
  2011-07-06 12:39   ` Avi Kivity
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 62+ messages in thread
From: Michael S. Tsirkin @ 2011-07-06 11:42 UTC (permalink / raw)
  To: Sasha Levin; +Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
> 
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
> 
> Using a socket instead of an eventfd is usefull when any value can be
> written to the memory region but we're interested in recieving the
> actual value instead of just a notification.
> 
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
> 
> Cc: Avi Kivity <avi@redhat.com>
> Cc: Ingo Molnar <mingo@elte.hu>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>  Documentation/virtual/kvm/api.txt |   18 ++++-
>  include/linux/kvm.h               |    9 ++
>  virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
>  3 files changed, 161 insertions(+), 19 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 317d86a..74f0946 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
>  
>  This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
>  within the guest.  A guest write in the registered address will signal the
> -provided event instead of triggering an exit.
> +provided event or write to the provided socket instead of triggering an exit.
>  
>  struct kvm_ioeventfd {
>  	__u64 datamatch;
> @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
>  	__u8  pad[36];
>  };
>  
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};
> +
>  The following flags are defined:
>  
>  #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> @@ -1348,6 +1355,7 @@ The following flags are defined:
>  #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
>  #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
>  #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
> +#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
>  
>  If datamatch flag is set, the event will be signaled only if the written value
>  to the registered address is equal to datamatch in struct kvm_ioeventfd.
> @@ -1359,6 +1367,14 @@ passed in datamatch.
>  If the nowrite flag is set, the event won't be signaled when the specified address
>  is being written to.
>  
> +If the socket flag is set, fd is expected to be a connected AF_UNIX
> +SOCK_SEQPACKET socket.

Let's verify that then?

> +
> +	if (p->sock)
> +		socket_write(p->sock, &data, sizeof(data));
> +	else
> +		eventfd_signal(p->eventfd, 1);
> +
>  	return 0;
>  }

This still loses the data if socket would block and there's a signal.
I think we agreed to use non blocking operations and exit to
userspace in that case?


>  
> @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>  		void *val)
>  {
>  	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>  
>  	/* Exit if signaling on reads isn't requested */
>  	if (!p->track_reads)
> @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>  	if (!ioeventfd_in_range(p, addr, len, val))
>  		return -EOPNOTSUPP;
>  
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 0,
> +	};
> +
> +	if (p->sock) {
> +		socket_write(p->sock, &data, sizeof(data));
> +		socket_read(p->sock, &data, sizeof(data));
> +		set_val(val, len, data.data);

Same here.

> +	} else {
> +		set_val(val, len, p->datamatch);
> +		eventfd_signal(p->eventfd, 1);
> +	}
> +
>  	return 0;
>  }
>


-- 
MST  

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
  2011-07-06 11:42   ` Michael S. Tsirkin
@ 2011-07-06 12:39   ` Avi Kivity
  2011-07-06 12:58     ` Sasha Levin
  2011-07-06 13:00   ` Avi Kivity
  2011-07-20  2:42   ` Anthony Liguori
  3 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-06 12:39 UTC (permalink / raw)
  To: Sasha Levin
  Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin, Pekka Enberg

On 07/06/2011 07:37 AM, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
>
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
>
> Using a socket instead of an eventfd is useful when any value can be
> written to the memory region but we're interested in receiving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
>
>
>
> @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   		void *val)
>   {
>   	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>
>   	/* Exit if signaling on reads isn't requested */
>   	if (!p->track_reads)
> @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   	if (!ioeventfd_in_range(p, addr, len, val))
>   		return -EOPNOTSUPP;
>
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 0,
> +	};
> +
> +	if (p->sock) {
> +		socket_write(p->sock,&data, sizeof(data));
> +		socket_read(p->sock,&data, sizeof(data));
> +		set_val(val, len, data.data);
> +	} else {
> +		set_val(val, len, p->datamatch);
> +		eventfd_signal(p->eventfd, 1);
> +	}
> +
>   	return 0;
>   }

If there are two reads on the same ioeventfd range, then the responses 
can mix up.  Need to make sure we get the right response.

Note that a mutex on the ioeventfd structure is insufficient, since the 
same socket may be used for multiple ranges.

One way out is to require that sockets not be shared among vcpus (there 
can be only one outstanding read per vcpu).  It seems heavy handed though.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06 12:39   ` Avi Kivity
@ 2011-07-06 12:58     ` Sasha Levin
  2011-07-06 13:04       ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-06 12:58 UTC (permalink / raw)
  To: Avi Kivity
  Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin, Pekka Enberg

On Wed, 2011-07-06 at 15:39 +0300, Avi Kivity wrote:
> On 07/06/2011 07:37 AM, Sasha Levin wrote:
> > The new flag allows passing a connected socket instead of an
> > eventfd to be notified of writes or reads to the specified memory region.
> >
> > Instead of signaling an event, On write - the value written to the memory
> > region is written to the pipe.
> > On read - a notification of the read is sent to the host, and a response
> > is expected with the value to be 'read'.
> >
> > Using a socket instead of an eventfd is useful when any value can be
> > written to the memory region but we're interested in receiving the
> > actual value instead of just a notification.
> >
> > A simple example for practical use is the serial port. we are not
> > interested in an exit every time a char is written to the port, but
> > we do need to know what was written so we could handle it on the guest.
> >
> >
> >
> > @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >   		void *val)
> >   {
> >   	struct _ioeventfd *p = to_ioeventfd(this);
> > +	struct kvm_ioeventfd_data data;
> >
> >   	/* Exit if signaling on reads isn't requested */
> >   	if (!p->track_reads)
> > @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >   	if (!ioeventfd_in_range(p, addr, len, val))
> >   		return -EOPNOTSUPP;
> >
> > -	eventfd_signal(p->eventfd, 1);
> > +	data = (struct kvm_ioeventfd_data) {
> > +		.addr = addr,
> > +		.len = len,
> > +		.is_write = 0,
> > +	};
> > +
> > +	if (p->sock) {
> > +		socket_write(p->sock,&data, sizeof(data));
> > +		socket_read(p->sock,&data, sizeof(data));
> > +		set_val(val, len, data.data);
> > +	} else {
> > +		set_val(val, len, p->datamatch);
> > +		eventfd_signal(p->eventfd, 1);
> > +	}
> > +
> >   	return 0;
> >   }
> 
> If there are two reads on the same ioeventfd range, then the responses 
> can mix up.  Need to make sure we get the right response.
> 
> Note that a mutex on the ioeventfd structure is insufficient, since the 
> same socket may be used for multiple ranges.
> 
> One way out is to require that sockets not be shared among vcpus (there 
> can be only one outstanding read per vcpu).  It seems heavy handed though.
> 

What about something as follows:

This requires an addition of a mutex to struct ioeventfd.

1. When adding a new ioeventfd, scan existing ioeventfds (we already do
it anyway) and check whether another ioeventfd is using the socket
already.

2. If the existing ioeventfd doesn't have a mutex assigned, create a new
mutex and assign it to both ioeventfds.

3. If the existing ioeventfd already has a mutex assigned, copy it to
the new ioeventfd.

4. When removing an ioeventfd, do everything the other way around :)

This mutex can be used to lock the write/read pair.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
  2011-07-06 11:42   ` Michael S. Tsirkin
  2011-07-06 12:39   ` Avi Kivity
@ 2011-07-06 13:00   ` Avi Kivity
  2011-07-20  2:42   ` Anthony Liguori
  3 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-06 13:00 UTC (permalink / raw)
  To: Sasha Levin
  Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin, Pekka Enberg

On 07/06/2011 07:37 AM, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
>
> Instead of signaling an event, On write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
>
> Using a socket instead of an eventfd is useful when any value can be
> written to the memory region but we're interested in receiving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
>
> @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};

Please pad, let's say to 32 bytes so it's a nice power of two.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06 12:58     ` Sasha Levin
@ 2011-07-06 13:04       ` Avi Kivity
  0 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-06 13:04 UTC (permalink / raw)
  To: Sasha Levin
  Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin, Pekka Enberg

On 07/06/2011 03:58 PM, Sasha Levin wrote:
> What about something as follows:
>
> This requires an addition of a mutex to struct ioeventfd.
>
> 1. When adding a new ioeventfd, scan existing ioeventfds (we already do
> it anyway) and check whether another ioeventfd is using the socket
> already.
>
> 2. If the existing ioeventfd doesn't have a mutex assigned, create a new
> mutex and assign it to both ioeventfds.

That fails if there is a read already in progress on the old ioeventfd.

Just create or share a mutex every time.  Taking a mutex is cheap enough.

> 3. If the existing ioeventfd already has a mutex assigned, copy it to
> the new ioeventfd.
>
> 4. When removing an ioeventfd, do everything the other way around :)
>
> This mutex can be used to lock the write/read pair.
>

Not very elegant, but reasonable and simple.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06 11:42   ` Michael S. Tsirkin
@ 2011-07-06 15:01     ` Sasha Levin
  2011-07-06 17:58       ` Michael S. Tsirkin
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-06 15:01 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > The new flag allows passing a connected socket instead of an
> > eventfd to be notified of writes or reads to the specified memory region.
> > 
> > Instead of signaling an event, On write - the value written to the memory
> > region is written to the pipe.
> > On read - a notification of the read is sent to the host, and a response
> > is expected with the value to be 'read'.
> > 
> > Using a socket instead of an eventfd is useful when any value can be
> > written to the memory region but we're interested in receiving the
> > actual value instead of just a notification.
> > 
> > A simple example for practical use is the serial port. we are not
> > interested in an exit every time a char is written to the port, but
> > we do need to know what was written so we could handle it on the guest.
> > 
> > Cc: Avi Kivity <avi@redhat.com>
> > Cc: Ingo Molnar <mingo@elte.hu>
> > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Pekka Enberg <penberg@kernel.org>
> > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > ---
> >  Documentation/virtual/kvm/api.txt |   18 ++++-
> >  include/linux/kvm.h               |    9 ++
> >  virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
> >  3 files changed, 161 insertions(+), 19 deletions(-)
> > 
> > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > index 317d86a..74f0946 100644
> > --- a/Documentation/virtual/kvm/api.txt
> > +++ b/Documentation/virtual/kvm/api.txt
> > @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
> >  
> >  This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
> >  within the guest.  A guest write in the registered address will signal the
> > -provided event instead of triggering an exit.
> > +provided event or write to the provided socket instead of triggering an exit.
> >  
> >  struct kvm_ioeventfd {
> >  	__u64 datamatch;
> > @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
> >  	__u8  pad[36];
> >  };
> >  
> > +struct kvm_ioeventfd_data {
> > +	__u64 data;
> > +	__u64 addr;
> > +	__u32 len;
> > +	__u8  is_write;
> > +};
> > +
> >  The following flags are defined:
> >  
> >  #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> > @@ -1348,6 +1355,7 @@ The following flags are defined:
> >  #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
> >  #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
> >  #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
> > +#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
> >  
> >  If datamatch flag is set, the event will be signaled only if the written value
> >  to the registered address is equal to datamatch in struct kvm_ioeventfd.
> > @@ -1359,6 +1367,14 @@ passed in datamatch.
> >  If the nowrite flag is set, the event won't be signaled when the specified address
> >  is being written to.
> >  
> > +If the socket flag is set, fd is expected to be a connected AF_UNIX
> > +SOCK_SEQPACKET socket.
> 
> Let's verify that then?
> 
> > +
> > +	if (p->sock)
> > +		socket_write(p->sock, &data, sizeof(data));
> > +	else
> > +		eventfd_signal(p->eventfd, 1);
> > +
> >  	return 0;
> >  }
> 
> This still loses the data if socket would block and there's a signal.
> I think we agreed to use non blocking operations and exit to
> userspace in that case?
> 
> 
> >  
> > @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  		void *val)
> >  {
> >  	struct _ioeventfd *p = to_ioeventfd(this);
> > +	struct kvm_ioeventfd_data data;
> >  
> >  	/* Exit if signaling on reads isn't requested */
> >  	if (!p->track_reads)
> > @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> >  	if (!ioeventfd_in_range(p, addr, len, val))
> >  		return -EOPNOTSUPP;
> >  
> > -	eventfd_signal(p->eventfd, 1);
> > +	data = (struct kvm_ioeventfd_data) {
> > +		.addr = addr,
> > +		.len = len,
> > +		.is_write = 0,
> > +	};
> > +
> > +	if (p->sock) {
> > +		socket_write(p->sock, &data, sizeof(data));
> > +		socket_read(p->sock, &data, sizeof(data));
> > +		set_val(val, len, data.data);
> 
> Same here.

The socket_read() here I should leave blocking, and spin on it until I
read something - right?

> > +	} else {
> > +		set_val(val, len, p->datamatch);
> > +		eventfd_signal(p->eventfd, 1);
> > +	}
> > +
> >  	return 0;
> >  }
> >
> 
> 


-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06 15:01     ` Sasha Levin
@ 2011-07-06 17:58       ` Michael S. Tsirkin
  2011-07-10  5:34         ` Sasha Levin
  0 siblings, 1 reply; 62+ messages in thread
From: Michael S. Tsirkin @ 2011-07-06 17:58 UTC (permalink / raw)
  To: Sasha Levin; +Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > The new flag allows passing a connected socket instead of an
> > > eventfd to be notified of writes or reads to the specified memory region.
> > > 
> > > Instead of signaling an event, On write - the value written to the memory
> > > region is written to the pipe.
> > > On read - a notification of the read is sent to the host, and a response
> > > is expected with the value to be 'read'.
> > > 
> > > Using a socket instead of an eventfd is useful when any value can be
> > > written to the memory region but we're interested in receiving the
> > > actual value instead of just a notification.
> > > 
> > > A simple example for practical use is the serial port. we are not
> > > interested in an exit every time a char is written to the port, but
> > > we do need to know what was written so we could handle it on the guest.
> > > 
> > > Cc: Avi Kivity <avi@redhat.com>
> > > Cc: Ingo Molnar <mingo@elte.hu>
> > > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > Cc: Pekka Enberg <penberg@kernel.org>
> > > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > > ---
> > >  Documentation/virtual/kvm/api.txt |   18 ++++-
> > >  include/linux/kvm.h               |    9 ++
> > >  virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
> > >  3 files changed, 161 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> > > index 317d86a..74f0946 100644
> > > --- a/Documentation/virtual/kvm/api.txt
> > > +++ b/Documentation/virtual/kvm/api.txt
> > > @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
> > >  
> > >  This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
> > >  within the guest.  A guest write in the registered address will signal the
> > > -provided event instead of triggering an exit.
> > > +provided event or write to the provided socket instead of triggering an exit.
> > >  
> > >  struct kvm_ioeventfd {
> > >  	__u64 datamatch;
> > > @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
> > >  	__u8  pad[36];
> > >  };
> > >  
> > > +struct kvm_ioeventfd_data {
> > > +	__u64 data;
> > > +	__u64 addr;
> > > +	__u32 len;
> > > +	__u8  is_write;
> > > +};
> > > +
> > >  The following flags are defined:
> > >  
> > >  #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> > > @@ -1348,6 +1355,7 @@ The following flags are defined:
> > >  #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
> > >  #define KVM_IOEVENTFD_FLAG_READ      (1 << kvm_ioeventfd_flag_nr_read)
> > >  #define KVM_IOEVENTFD_FLAG_NOWRITE   (1 << kvm_ioeventfd_flag_nr_nowrite)
> > > +#define KVM_IOEVENTFD_FLAG_SOCKET    (1 << kvm_ioeventfd_flag_nr_socket)
> > >  
> > >  If datamatch flag is set, the event will be signaled only if the written value
> > >  to the registered address is equal to datamatch in struct kvm_ioeventfd.
> > > @@ -1359,6 +1367,14 @@ passed in datamatch.
> > >  If the nowrite flag is set, the event won't be signaled when the specified address
> > >  is being written to.
> > >  
> > > +If the socket flag is set, fd is expected to be a connected AF_UNIX
> > > +SOCK_SEQPACKET socket.
> > 
> > Let's verify that then?
> > 
> > > +
> > > +	if (p->sock)
> > > +		socket_write(p->sock, &data, sizeof(data));
> > > +	else
> > > +		eventfd_signal(p->eventfd, 1);
> > > +
> > >  	return 0;
> > >  }
> > 
> > This still loses the data if socket would block and there's a signal.
> > I think we agreed to use non blocking operations and exit to
> > userspace in that case?
> > 
> > 
> > >  
> > > @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> > >  		void *val)
> > >  {
> > >  	struct _ioeventfd *p = to_ioeventfd(this);
> > > +	struct kvm_ioeventfd_data data;
> > >  
> > >  	/* Exit if signaling on reads isn't requested */
> > >  	if (!p->track_reads)
> > > @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
> > >  	if (!ioeventfd_in_range(p, addr, len, val))
> > >  		return -EOPNOTSUPP;
> > >  
> > > -	eventfd_signal(p->eventfd, 1);
> > > +	data = (struct kvm_ioeventfd_data) {
> > > +		.addr = addr,
> > > +		.len = len,
> > > +		.is_write = 0,
> > > +	};
> > > +
> > > +	if (p->sock) {
> > > +		socket_write(p->sock, &data, sizeof(data));
> > > +		socket_read(p->sock, &data, sizeof(data));
> > > +		set_val(val, len, data.data);
> > 
> > Same here.
> 
> The socket_read() here I should leave blocking, and spin on it until I
> read something - right?

I think it's best to exit to userspace.

> > > +	} else {
> > > +		set_val(val, len, p->datamatch);
> > > +		eventfd_signal(p->eventfd, 1);
> > > +	}
> > > +
> > >  	return 0;
> > >  }
> > >
> > 
> > 
> 
> 
> -- 
> 
> Sasha.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06 17:58       ` Michael S. Tsirkin
@ 2011-07-10  5:34         ` Sasha Levin
  2011-07-10  8:05           ` Michael S. Tsirkin
  2011-07-13  7:51           ` Pekka Enberg
  0 siblings, 2 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-10  5:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On Wed, 2011-07-06 at 20:58 +0300, Michael S. Tsirkin wrote:
> On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> > On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > > +	if (p->sock) {
> > > > +		socket_write(p->sock, &data, sizeof(data));
> > > > +		socket_read(p->sock, &data, sizeof(data));
> > > > +		set_val(val, len, data.data);
> > > 
> > > Same here.
> > 
> > The socket_read() here I should leave blocking, and spin on it until I
> > read something - right?
> 
> I think it's best to exit to userspace.

Since sock_recvmsg for AF_UNIX SEQPACKET is interruptible, if we fail
the read here we'll take a regular MMIO exit and will allow the usermode
to deal with the MMIO in a regular way.

I've discussed the issue of usermode possibly having to handle the same
MMIO read request twice with Michael, and the solution proposed was to
add a new type of exit to handle this special case.

After working on that solution a bit I saw it's adding a lot of code and
complexity for this small issue, and I'm now thinking we may be better
off with just handling reads twice in case of a signal just between
socket_write() and socket_read() - once through the socket and once
through a regular MMIO exit.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-10  5:34         ` Sasha Levin
@ 2011-07-10  8:05           ` Michael S. Tsirkin
  2011-07-12 11:23             ` Sasha Levin
  2011-07-13  7:51           ` Pekka Enberg
  1 sibling, 1 reply; 62+ messages in thread
From: Michael S. Tsirkin @ 2011-07-10  8:05 UTC (permalink / raw)
  To: Sasha Levin; +Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On Sun, Jul 10, 2011 at 08:34:43AM +0300, Sasha Levin wrote:
> On Wed, 2011-07-06 at 20:58 +0300, Michael S. Tsirkin wrote:
> > On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> > > On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > > > +	if (p->sock) {
> > > > > +		socket_write(p->sock, &data, sizeof(data));
> > > > > +		socket_read(p->sock, &data, sizeof(data));
> > > > > +		set_val(val, len, data.data);
> > > > 
> > > > Same here.
> > > 
> > > The socket_read() here I should leave blocking, and spin on it until I
> > > read something - right?
> > 
> > I think it's best to exit to userspace.
> 
> Since sock_recvmsg for AF_UNIX SEQPACKET is interruptible, if we fail
> the read here we'll take a regular MMIO exit and will allow the usermode
> to deal with the MMIO in a regular way.
> 
> I've discussed the issue of usermode possibly having to handle the same
> MMIO read request twice with Michael, and the solution proposed was to
> add a new type of exit to handle this special case.
> 
> After working on that solution a bit I saw it's adding a lot of code and
> complexity for this small issue, and I'm now thinking we may be better
> off with just handling reads twice in case of a signal just between
> socket_write() and socket_read() - once through the socket and once
> through a regular MMIO exit.

The problem with this approach would be reads with a side-effect though:
for example, the IRQ status read in virtio clears the status register.

I don't insist on a new type of exit, just pointing out the problem.


> -- 
> 
> Sasha.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-10  8:05           ` Michael S. Tsirkin
@ 2011-07-12 11:23             ` Sasha Levin
  2011-07-12 11:26               ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-12 11:23 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On Sun, 2011-07-10 at 11:05 +0300, Michael S. Tsirkin wrote:
> On Sun, Jul 10, 2011 at 08:34:43AM +0300, Sasha Levin wrote:
> > On Wed, 2011-07-06 at 20:58 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jul 06, 2011 at 06:01:46PM +0300, Sasha Levin wrote:
> > > > On Wed, 2011-07-06 at 14:42 +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Jul 06, 2011 at 07:37:58AM +0300, Sasha Levin wrote:
> > > > > > +	if (p->sock) {
> > > > > > +		socket_write(p->sock, &data, sizeof(data));
> > > > > > +		socket_read(p->sock, &data, sizeof(data));
> > > > > > +		set_val(val, len, data.data);
> > > > > 
> > > > > Same here.
> > > > 
> > > > The socket_read() here I should leave blocking, and spin on it until I
> > > > read something - right?
> > > 
> > > I think it's best to exit to userspace.
> > 
> > Since sock_recvmsg for AF_UNIX SEQPACKET is interruptible, if we fail
> > the read here we'll take a regular MMIO exit and will allow the usermode
> > to deal with the MMIO in a regular way.
> > 
> > I've discussed the issue of usermode possibly having to handle the same
> > MMIO read request twice with Michael, and the solution proposed was to
> > add a new type of exit to handle this special case.
> > 
> > After working on that solution a bit I saw it's adding a lot of code and
> > complexity for this small issue, and I'm now thinking we may be better
> > off with just handling reads twice in case of a signal just between
> > socket_write() and socket_read() - once through the socket and once
> > through a regular MMIO exit.
> 
> The problem with this approach would be reads with a side-effect though:
> for example, the IRQ status read in virtio clears the status register.
> 
> I don't insist on a new type of exit, just pointing out the problem.

I agree with you, I don't have a better solution either.

I don't feel it's worth it adding so much code for read support to
properly work. Can we do this patch series without socket read support
at the moment?


-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-12 11:23             ` Sasha Levin
@ 2011-07-12 11:26               ` Avi Kivity
  2011-07-13  6:37                 ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-12 11:26 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg

On 07/12/2011 02:23 PM, Sasha Levin wrote:
> >
> >  I don't insist on a new type of exit, just pointing out the problem.
>
> I agree with you, I don't have a better solution either.
>
> I don't feel it's worth it adding so much code for read support to
> properly work. Can we do this patch series without socket read support
> at the moment?

No.  As I said before, I don't want a fragmented ABI.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-12 11:26               ` Avi Kivity
@ 2011-07-13  6:37                 ` Pekka Enberg
  2011-07-13  6:45                   ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13  6:37 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

Hi,

On 07/12/2011 02:23 PM, Sasha Levin wrote:
>> >  I don't insist on a new type of exit, just pointing out the problem.
>>
>> I agree with you, I don't have a better solution either.
>>
>> I don't feel it's worth it adding so much code for read support to
>> properly work. Can we do this patch series without socket read support
>> at the moment?

On Tue, Jul 12, 2011 at 2:26 PM, Avi Kivity <avi@redhat.com> wrote:
> No.  As I said before, I don't want a fragmented ABI.

OK, what's the simplest thing we can do here to keep Avi happy and get
the functionality of Sasha's original patch that helps us avoid guest
exits for serial console? I agree with Avi that we don't want
fragmented ABI but it seems to me there's no clear idea how to fix
KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?

                                 Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  6:37                 ` Pekka Enberg
@ 2011-07-13  6:45                   ` Pekka Enberg
  2011-07-13  7:07                     ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13  6:45 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Wed, Jul 13, 2011 at 9:37 AM, Pekka Enberg <penberg@kernel.org> wrote:
> On 07/12/2011 02:23 PM, Sasha Levin wrote:
>>> >  I don't insist on a new type of exit, just pointing out the problem.
>>>
>>> I agree with you, I don't have a better solution either.
>>>
>>> I don't feel it's worth it adding so much code for read support to
>>> properly work. Can we do this patch series without socket read support
>>> at the moment?
>
> On Tue, Jul 12, 2011 at 2:26 PM, Avi Kivity <avi@redhat.com> wrote:
>> No.  As I said before, I don't want a fragmented ABI.
>
> OK, what's the simplest thing we can do here to keep Avi happy and get
> the functionality of Sasha's original patch that helps us avoid guest
> exits for serial console? I agree with Avi that we don't want
> fragmented ABI but it seems to me there's no clear idea how to fix
> KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?

And btw, I didn't follow the discussion closely, but introducing a new
type of exit for a feature that's designed to _avoid exits_ doesn't
seem like a smart thing to do. Is it even possible to support sockets
sanely for this?

                                 Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  6:45                   ` Pekka Enberg
@ 2011-07-13  7:07                     ` Avi Kivity
  2011-07-13  8:02                       ` Pekka Enberg
                                         ` (2 more replies)
  0 siblings, 3 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-13  7:07 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/13/2011 09:45 AM, Pekka Enberg wrote:
> >
> >  OK, what's the simplest thing we can do here to keep Avi happy and get
> >  the functionality of Sasha's original patch that helps us avoid guest
> >  exits for serial console? I agree with Avi that we don't want
> >  fragmented ABI but it seems to me there's no clear idea how to fix
> >  KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?
>
> And btw, I didn't follow the discussion closely, but introducing a new
> type of exit for a feature that's designed to _avoid exits_ doesn't
> seem like a smart thing to do. Is it even possible to support sockets
> sanely for this?

First of all, this feature doesn't avoid exits.  The guest exits to the 
kernel just the same.  It just avoids userspace exits, trading them off 
for remote wakeups or context switches, as the case may be, which are 
both more expensive.

This feature is a win in the following cases:
- the socket is consumed in the kernel (like ioeventfd/irqfd and 
vhost-net, or vfio)
- the I/O triggers a lot of cpu work, _and_ there is a lot of spare cpu 
available on the host, so we can parallelize work.  Note it ends up 
being less efficient, but delivering higher throughput.  That's common 
in benchmarks but not so good for real workloads.
- the socket is consumed in another process (not thread) in which case 
we gain some security
- the writes are such high frequency that we gain something from the 
queueing that happens when we thread the work.  That's where the gain for 
serial console can come from - though it would be better to just use 
virtio-serial instead.

Second, introducing a new type of exit doesn't mean we see more exits.

Third, the new type of exit is probably not needed - we can use the 
existing mmio/pio exits, and document that in certain cases the kernel 
will fall back to these instead of delivering the I/O via the sockets 
(userspace can then try writing itself).

There was a fourth too but I forgot it already.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-10  5:34         ` Sasha Levin
  2011-07-10  8:05           ` Michael S. Tsirkin
@ 2011-07-13  7:51           ` Pekka Enberg
  2011-07-13 10:04             ` Pekka Enberg
  1 sibling, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13  7:51 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti

On Sun, Jul 10, 2011 at 8:34 AM, Sasha Levin <levinsasha928@gmail.com> wrote:
> After working on that solution a bit I saw it's adding a lot of code and
> complexity for this small issue, and I'm now thinking we may be better
> off with just handling reads twice in case of a signal just between
> socket_write() and socket_read() - once through the socket and once
> through a regular MMIO exit.

I don't really understand the issue so can you elaborate where the
complexity comes from? Why can't we just switch to non-blocking read
and return -ENOSUPP if there's signal_pending() after socket_write()?
AFAICT, we can just let callers of kvm_iodevice_write() and
kvm_iodevice_read() deal with exits, no?

                                  Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  7:07                     ` Avi Kivity
@ 2011-07-13  8:02                       ` Pekka Enberg
  2011-07-13 12:57                         ` Avi Kivity
  2011-07-20  2:49                       ` Anthony Liguori
  2011-07-25 12:10                       ` Sasha Levin
  2 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13  8:02 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Wed, Jul 13, 2011 at 10:07 AM, Avi Kivity <avi@redhat.com> wrote:
> - the writes are such high frequency that we gain something from the
> queueing is happens when we thread work.  That's where the gain for serial
> console can come from - though it would be better to just use virtio-serial
> instead.

We'd like to keep 8250 emulation because it's the most robust method
for getting data out of the kernel when there's problems. It's also
compatible with earlyprintk and such.

                                 Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  7:51           ` Pekka Enberg
@ 2011-07-13 10:04             ` Pekka Enberg
  2011-07-13 10:26               ` Sasha Levin
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13 10:04 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti

On Wed, Jul 13, 2011 at 10:51 AM, Pekka Enberg <penberg@kernel.org> wrote:
> On Sun, Jul 10, 2011 at 8:34 AM, Sasha Levin <levinsasha928@gmail.com> wrote:
>> After working on that solution a bit I saw it's adding a lot of code and
>> complexity for this small issue, and I'm now thinking we may be better
>> off with just handling reads twice in case of a signal just between
>> socket_write() and socket_read() - once through the socket and once
>> through a regular MMIO exit.
>
> I don't really understand the issue so can you elaborate where the
> complexity comes from? Why can't we just switch to non-blocking read
> and return -ENOSUPP if there's signal_pending() after socket_write()?
> AFAICT, we can just let callers of kvm_iodevice_write() and
> kvm_iodevice_read() deal with exits, no?

Oh, re-reading Michael's explanation, signal_pending() is irrelevant
here and all we need to do is return -ENOSUPP if either the read or
write fails. What's the part I'm totally missing here?

                                  Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13 10:04             ` Pekka Enberg
@ 2011-07-13 10:26               ` Sasha Levin
  2011-07-13 10:56                 ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-13 10:26 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Michael S. Tsirkin, kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti

On Wed, 2011-07-13 at 13:04 +0300, Pekka Enberg wrote:
> On Wed, Jul 13, 2011 at 10:51 AM, Pekka Enberg <penberg@kernel.org> wrote:
> > On Sun, Jul 10, 2011 at 8:34 AM, Sasha Levin <levinsasha928@gmail.com> wrote:
> >> After working on that solution a bit I saw it's adding a lot of code and
> >> complexity for this small issue, and I'm now thinking we may be better
> >> off with just handling reads twice in case of a signal just between
> >> socket_write() and socket_read() - once through the socket and once
> >> through a regular MMIO exit.
> >
> > I don't really understand the issue so can you elaborate where the
> > complexity comes from? Why can't we just switch to non-blocking read
> > and return -ENOSUPP if there's signal_pending() after socket_write()?
> > AFAICT, we can just let callers of kvm_iodevice_write() and
> > kvm_iodevice_read() deal with exits, no?
> 
> Oh, re-reading Michael's explanation, signal_pending() is irrelevant
> here and all we need to do is return -ENOSUPP if either the read or
> write fails. What's the part I'm totally missing here?

The problem is that if we received a signal during the read notification -
after the write and before receiving the read - we have to go back to
userspace.

This means that userspace will see same read request twice (once in the
socket and once in the MMIO exit).


-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13 10:26               ` Sasha Levin
@ 2011-07-13 10:56                 ` Pekka Enberg
  2011-07-13 11:14                   ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13 10:56 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti

On Wed, Jul 13, 2011 at 1:26 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
> The problem is that if we received a signal during the read notification
> the write and before receiving the read, we have to go back to
> userspace.
>
> This means that userspace will see same read request twice (once in the
> socket and once in the MMIO exit).

So the problem is only in ioeventfd_read() if socket_write() succeeds
but socket_read() fails? If so, can we do the socket_read() somewhere
else in the code which is able to restart it?

AFAICT, both ioeventfd_read() and ioeventfd_write() should -ENOSUPP if
socket_write() fails. If socket_write() succeeds, we should return
-EINTR and teach vcpu_mmio_read() and kernel_pio() to KVM_EXIT_INTR in
those cases. We can just restart the read, no? The only
complication I can see is that ioeventfd_read() needs to keep track if
it did the socket_write() already or not.

                                   Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13 10:56                 ` Pekka Enberg
@ 2011-07-13 11:14                   ` Pekka Enberg
  0 siblings, 0 replies; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13 11:14 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Michael S. Tsirkin, kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti

On Wed, Jul 13, 2011 at 1:56 PM, Pekka Enberg <penberg@kernel.org> wrote:
> On Wed, Jul 13, 2011 at 1:26 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
>> The problem is that if we received a signal during the read notification
>> the write and before receiving the read, we have to go back to
>> userspace.
>>
>> This means that userspace will see same read request twice (once in the
>> socket and once in the MMIO exit).
>
> So the problem is only in ioeventfd_read() if socket_write() succeeds
> but socket_read() fails? If so, can we do the socket_read() somewhere
> else in the code which is able to restart it?
>
> AFAICT, both ioeventfd_read() and ioeventfd_write() should -ENOSUPP if
> socket_write() fails. If socket_write() succeeds, we should return
> -EINTR and teach vcpu_mmio_read() and kernel_pio() to KVM_EXIT_INTR in
> those cases. We'll can just restart the read, no? The only
> complication I can see is that ioeventfd_read() needs to keep track if
> it did the socket_write() already or not.

Maybe struct _ioeventfd could hold a flag that tells ioeventfd_read()
that the write part was already successful?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  8:02                       ` Pekka Enberg
@ 2011-07-13 12:57                         ` Avi Kivity
  2011-07-13 13:00                           ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-13 12:57 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/13/2011 11:02 AM, Pekka Enberg wrote:
> On Wed, Jul 13, 2011 at 10:07 AM, Avi Kivity<avi@redhat.com>  wrote:
> >  - the writes are such high frequency that we gain something from the
> >  queueing is happens when we thread work.  That's where the gain for serial
> >  console can come from - though it would be better to just use virtio-serial
> >  instead.
>
> We'd like to keep 8250 emulation because it's the most robust method
> for getting data out of the kernel when there's problems. It's also
> compatible with earlyprintk and such.

What do you hope to gain from the optimization? 100ms less boot time?

btw, it likely needs read support, if any read can depend on a previous 
write.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13 12:57                         ` Avi Kivity
@ 2011-07-13 13:00                           ` Pekka Enberg
  2011-07-13 13:32                             ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-13 13:00 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity <avi@redhat.com> wrote:
>> We'd like to keep 8250 emulation because it's the most robust method
>> for getting data out of the kernel when there's problems. It's also
>> compatible with earlyprintk and such.
>
> What do you hope to gain from the optimization? 100ms less boot time?

Hmm? I use serial console for logging into the guest and actually
using it all the time.

                                Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13 13:00                           ` Pekka Enberg
@ 2011-07-13 13:32                             ` Avi Kivity
  2011-07-14  7:26                               ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-13 13:32 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/13/2011 04:00 PM, Pekka Enberg wrote:
> On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  We'd like to keep 8250 emulation because it's the most robust method
> >>  for getting data out of the kernel when there's problems. It's also
> >>  compatible with earlyprintk and such.
> >
> >  What do you hope to gain from the optimization? 100ms less boot time?
>
> Hmm? I use serial console for logging into the guest and actually
> using it all the time.

Just how fast do you type?

It's not enough to make something faster, it has to be slow to begin 
with.  The interface is good, but use it where it helps.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13 13:32                             ` Avi Kivity
@ 2011-07-14  7:26                               ` Pekka Enberg
  2011-07-14  8:07                                 ` Sasha Levin
  2011-07-14  8:09                                 ` Avi Kivity
  0 siblings, 2 replies; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14  7:26 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/13/2011 04:00 PM, Pekka Enberg wrote:
>> On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity<avi@redhat.com>  wrote:
>> >>  We'd like to keep 8250 emulation because it's the most robust method
>> >>  for getting data out of the kernel when there's problems. It's also
>> >>  compatible with earlyprintk and such.
>> >
>> >  What do you hope to gain from the optimization? 100ms less boot time?
>>
>> Hmm? I use serial console for logging into the guest and actually
>> using it all the time.

On 7/13/11 4:32 PM, Avi Kivity wrote:
> Just how fast do you type?
>
> It's not enough to make something faster, it has to be slow to being 
> with.  The interface is good, but use it where it helps.

I don't know why you bring up serial console input here. Sasha's
original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:

   A simple example for practical use is the serial port. we are not
   interested in an exit every time a char is written to the port, but
   we do need to know what was written so we could handle it on the
   guest.

So it's talking about the serial console output, not input.

                             Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  7:26                               ` Pekka Enberg
@ 2011-07-14  8:07                                 ` Sasha Levin
  2011-07-14  8:09                                 ` Avi Kivity
  1 sibling, 0 replies; 62+ messages in thread
From: Sasha Levin @ 2011-07-14  8:07 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Avi Kivity, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, 2011-07-14 at 10:26 +0300, Pekka Enberg wrote:
> On 07/13/2011 04:00 PM, Pekka Enberg wrote:
> >> On Wed, Jul 13, 2011 at 3:57 PM, Avi Kivity<avi@redhat.com>  wrote:
> >> >>  We'd like to keep 8250 emulation because it's the most robust method
> >> >>  for getting data out of the kernel when there's problems. It's also
> >> >>  compatible with earlyprintk and such.
> >> >
> >> >  What do you hope to gain from the optimization? 100ms less boot time?
> >>
> >> Hmm? I use serial console for logging into the guest and actually
> >> using it all the time.
> 
> On 7/13/11 4:32 PM, Avi Kivity wrote:
> > Just how fast do you type?
> >
> > It's not enough to make something faster, it has to be slow to being 
> > with.  The interface is good, but use it where it helps.
> 
> I don't know why you bring up serial console input here. Sasha's
> original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
> 
>    A simple example for practical use is the serial port. we are not
>    interested in an exit every time a char is written to the port, but
>    we do need to know what was written so we could handle it on the
>    guest.
> 
> So it's talking about the serial console output, not input.

Yes, I have originally thought of doing the pipefd interface to prevent
the overhead caused by console intensive benchmarks on actual
benchmarks. It's funny how much slowness spinning the cursor or printing
a table incurs.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  7:26                               ` Pekka Enberg
  2011-07-14  8:07                                 ` Sasha Levin
@ 2011-07-14  8:09                                 ` Avi Kivity
  2011-07-14  8:14                                   ` Pekka Enberg
                                                     ` (2 more replies)
  1 sibling, 3 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-14  8:09 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 10:26 AM, Pekka Enberg wrote:
>
> On 7/13/11 4:32 PM, Avi Kivity wrote:
>> Just how fast do you type?
>>
>> It's not enough to make something faster, it has to be slow to being 
>> with.  The interface is good, but use it where it helps.
>
> I don't know why you bring up serial console input here. Sasha's
> original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
>
>   A simple example for practical use is the serial port. we are not
>   interested in an exit every time a char is written to the port, but
>   we do need to know what was written so we could handle it on the
>   guest.
>
> So it's talking about the serial console output, not input.
>

I'm talking about the real world.  Is any of this something that needs 
optimization?

1024 vcpus logging in via the serial console at 10Gbps.  Get real.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:09                                 ` Avi Kivity
@ 2011-07-14  8:14                                   ` Pekka Enberg
  2011-07-14  8:28                                     ` Avi Kivity
  2011-07-14  8:19                                   ` Gleb Natapov
  2011-07-14  8:25                                   ` Michael S. Tsirkin
  2 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14  8:14 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, Jul 14, 2011 at 11:09 AM, Avi Kivity <avi@redhat.com> wrote:
> On 07/14/2011 10:26 AM, Pekka Enberg wrote:
>>
>> On 7/13/11 4:32 PM, Avi Kivity wrote:
>>>
>>> Just how fast do you type?
>>>
>>> It's not enough to make something faster, it has to be slow to being
>>> with.  The interface is good, but use it where it helps.
>>
>> I don't know why you bring up serial console input here. Sasha's
>> original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
>>
>>  A simple example for practical use is the serial port. we are not
>>  interested in an exit every time a char is written to the port, but
>>  we do need to know what was written so we could handle it on the
>>  guest.
>>
>> So it's talking about the serial console output, not input.
>>
>
> I'm talking about the real world.  Is any of this something that needs
> optimization?

Yes. See Sasha's other email. We should have mentioned that in the changelog.

> 1024 vcpus logging in via the serial console at 10Gbps.  Get real.

[...]

Huh, why are you bringing something like that up?

                                Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:09                                 ` Avi Kivity
  2011-07-14  8:14                                   ` Pekka Enberg
@ 2011-07-14  8:19                                   ` Gleb Natapov
  2011-07-14  8:25                                   ` Michael S. Tsirkin
  2 siblings, 0 replies; 62+ messages in thread
From: Gleb Natapov @ 2011-07-14  8:19 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar,
	Marcelo Tosatti

On Thu, Jul 14, 2011 at 11:09:57AM +0300, Avi Kivity wrote:
> On 07/14/2011 10:26 AM, Pekka Enberg wrote:
> >
> >On 7/13/11 4:32 PM, Avi Kivity wrote:
> >>Just how fast do you type?
> >>
> >>It's not enough to make something faster, it has to be slow to
> >>being with.  The interface is good, but use it where it helps.
> >
> >I don't know why you bring up serial console input here. Sasha's
> >original KVM_IOEVENTFD_FLAG_PIPE patch clearly stated:
> >
> >  A simple example for practical use is the serial port. we are not
> >  interested in an exit every time a char is written to the port, but
> >  we do need to know what was written so we could handle it on the
> >  guest.
> >
> >So it's talking about the serial console output, not input.
> >
> 
> I'm talking about the real world.  Is any of this something that
> needs optimization?
> 
> 1024 vcpus logging in via the serial console at 10Gbps.  Get real.
> 
It is nice to have the ability to trace a guest execution from a host
(added bonus if you can trace the guest and the host simultaneously and
have trace output in one nice file), but serial interface is not fit
for that. PV interface is needed. May be share ftrace buffer between
the host and the guest?

--
			Gleb.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:09                                 ` Avi Kivity
  2011-07-14  8:14                                   ` Pekka Enberg
  2011-07-14  8:19                                   ` Gleb Natapov
@ 2011-07-14  8:25                                   ` Michael S. Tsirkin
  2011-07-14  8:29                                     ` Avi Kivity
  2 siblings, 1 reply; 62+ messages in thread
From: Michael S. Tsirkin @ 2011-07-14  8:25 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Pekka Enberg, Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, Jul 14, 2011 at 11:09:57AM +0300, Avi Kivity wrote:
> I'm talking about the real world.  Is any of this something that
> needs optimization?

This work might help enable vhost support for non-MSI guests
(required for linux 2.6.30 and back). It's not enough
for that though: we would also need level interrupt
support in irqfd. That might be doable by using
sockets for irqfd in a similar way.

-- 
MST

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:14                                   ` Pekka Enberg
@ 2011-07-14  8:28                                     ` Avi Kivity
  2011-07-14  8:59                                       ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-14  8:28 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 11:14 AM, Pekka Enberg wrote:
> >
> >  I'm talking about the real world.  Is any of this something that needs
> >  optimization?
>
> Yes. See Sasha's other email. We should have mentioned that in the changelog.

It's completely unrealistic.  Sash and you are the only two people in 
the universe logging into the guest via a serial console and running 
benchmarks.

> >  1024 vcpus logging in via the serial console at 10Gbps.  Get real.
>
> [...]
>
> Huh, why are you bringing something like that up?

Because it's another example of an unrealistic approach on your part.  
No real user is interested in 1024 vcpus, and no real user is interested 
in optimized serial console.  It's good that you are bringing up new 
ideas and new code, but adding code just for its own sake is not a good 
thing.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:25                                   ` Michael S. Tsirkin
@ 2011-07-14  8:29                                     ` Avi Kivity
  0 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-14  8:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Pekka Enberg, Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 11:25 AM, Michael S. Tsirkin wrote:
> On Thu, Jul 14, 2011 at 11:09:57AM +0300, Avi Kivity wrote:
> >  I'm talking about the real world.  Is any of this something that
> >  needs optimization?
>
> This work might help enable vhost support for non-MSI guests
> (required for linux 2.6.30 and back). It's not enough
> for that though: we would also need level interrupt
> support in irqfd. That might be doable by using
> sockets for irqfd in a similar way.

I don't have an issue with socket mmio (it's a great feature once it's 
done well), just with presenting the serial port as motivation.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:28                                     ` Avi Kivity
@ 2011-07-14  8:59                                       ` Pekka Enberg
  2011-07-14  9:48                                         ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14  8:59 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, Jul 14, 2011 at 11:28 AM, Avi Kivity <avi@redhat.com> wrote:
> On 07/14/2011 11:14 AM, Pekka Enberg wrote:
>>
>> >
>> >  I'm talking about the real world.  Is any of this something that needs
>> >  optimization?
>>
>> Yes. See Sasha's other email. We should have mentioned that in the
>> changelog.
>
> It's completely unrealistic.  Sasha and you are the only two people in the
> universe logging into the guest via a serial console and running benchmarks.

Why does that matter? Why should we keep the emulation slow if it's
possible to fix it? It's a fair question to ask if the benefits
outweigh the added complexity but asking us to keep serial emulation
slow because *you* think it's unrealistic is well - unrealistic from
your part!

>> >  1024 vcpus logging in via the serial console at 10Gbps.  Get real.
>>
>> [...]
>>
>> Huh, why are you bringing something like that up?
>
> Because it's another example of an unrealistic approach on your part.  No
> real user is interested in 1024 vcpus, and no real user is interested in
> optimized serial console.  It's good that you are bringing up new ideas and
> new code, but adding code just for its own sake is not a good thing.

*You* brought up 1024 vcpus using serial console! Obviously optimizing
something like that is stupid but we never claimed that we wanted to
do something like that!

As for 1024 vcpus, we already had the discussion where we explained
why we thought it was a good idea not to have such a low hard vcpu
limit for vcpus.

                                   Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  8:59                                       ` Pekka Enberg
@ 2011-07-14  9:48                                         ` Avi Kivity
       [not found]                                           ` <CAOJsxLHSeRuTOoiJssyrELRx-eXok3WinLr_+_G4dB+yHNBKdg@mail.gmail.com>
  2011-07-14 10:30                                           ` Pekka Enberg
  0 siblings, 2 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-14  9:48 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 11:59 AM, Pekka Enberg wrote:
> On Thu, Jul 14, 2011 at 11:28 AM, Avi Kivity<avi@redhat.com>  wrote:
> >  On 07/14/2011 11:14 AM, Pekka Enberg wrote:
> >>
> >>  >
> >>  >    I'm talking about the real world.  Is any of this something that needs
> >>  >    optimization?
> >>
> >>  Yes. See Sasha's other email. We should have mentioned that in the
> >>  changelog.
> >
> >  It's completely unrealistic.  Sash and you are the only two people in the
> >  universe logging into the guest via a serial console and running benchmarks.
>
> Why does that matter? Why should we keep the emulation slow if it's
> possible to fix it?

Fixing things that don't need fixing has a cost.  In work, in risk, and 
in maintainability.  If you can share this cost among other users (which 
is certainly possible with socket mmio), it may still be worth it.  But 
just making something faster is not sufficient, it has to be faster for 
a significant number of users.

> It's a fair question to ask if the benefits
> outweigh the added complexity but asking as to keep serial emulation
> slow because *you* think it's unrealistic is well - unrealistic from
> your part!

Exactly where am I unrealistic?  Do you think there are many users who 
suffer from slow serial emulation?

> >>  >    1024 vcpus logging in via the serial console at 10Gbps.  Get real.
> >>
> >>  [...]
> >>
> >>  Huh, why are you bringing something like that up?
> >
> >  Because it's another example of an unrealistic approach on your part.  No
> >  real user is interested in 1024 vcpus, and no real user is interested in
> >  optimized serial console.  It's good that you are bringing up new ideas and
> >  new code, but adding code just for its own sake is not a good thing.
>
> *You* brought up 1024 vcpus using serial console! Obviously optimizing
> something like that is stupid but we never claimed that we wanted to
> do something like that!

Either of them, independently, is unrealistic.  The example of them 
together was just levantine exaggeration, it wasn't meant to be taken 
literally.

> As for 1024 vcpus, we already had the discussion where we explained
> why we thought it was a good idea not to have such a low hard vcpu
> limit for vcpus.

I can't say I was convinced.  It's pretty simple to patch the kernel if 
you want to engage in such experiments.  We did find something that 
works out (the soft/hard limits), but it's still overkill.

There's a large attitude mismatch between tools/kvm developers and kvm 
developers (or at least me): tools/kvm is growing rapidly, adding 
features and improving stability at a fast pace.  kvm on the other hand 
is mature and a lot more concerned with preserving and improving 
stability than with adding new features.  The fact is, kvm is already 
very feature rich and very performant, so we're at a very different 
place in the performance/features/stability scales.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14  9:48                                         ` Avi Kivity
       [not found]                                           ` <CAOJsxLHSeRuTOoiJssyrELRx-eXok3WinLr_+_G4dB+yHNBKdg@mail.gmail.com>
@ 2011-07-14 10:30                                           ` Pekka Enberg
  2011-07-14 11:54                                             ` Avi Kivity
  1 sibling, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14 10:30 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

Hi Avi,

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
>> Why does that matter? Why should we keep the emulation slow if it's
>> possible to fix it?
>
> Fixing things that don't need fixing has a cost.  In work, in risk, and in
> maintainability.  If you can share this cost among other users (which is
> certainly possible with socket mmio), it may still be worth it.  But just
> making something faster is not sufficient, it has to be faster for a
> significant number of users.

I don't think it needs to be faster for *significant number* of users
but yes, I completely agree that we need to make sure KVM gains more
than the costs are.

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
>> It's a fair question to ask if the benefits
>> outweigh the added complexity but asking as to keep serial emulation
>> slow because *you* think it's unrealistic is well - unrealistic from
>> your part!
>
> Exactly where am I unrealistic?  Do you think there are many users who
> suffer from slow serial emulation?

Again, I don't agree with you that there needs to be many users for this
type of feature. That said, as a maintainer you seem to think that it
is and I'm obviously OK with that.

So if you've been saying 'this is too complex for too little gain' all
this time, I've misunderstood what you've been trying to say. The way
I've read your comments is "optimizing serial console is stupid
because it's a useless feature" which is obviously not true because we
find it useful!

>> *You* brought up 1024 vcpus using serial console! Obviously optimizing
>> something like that is stupid but we never claimed that we wanted to
>> do something like that!
>
> Either of them, independently, is unrealistic.  The example of them together
> was just levantine exaggeration, it wasn't meant to be taken literally.

I obviously don't agree that they're unrealistic independently.

We want to use 8250 emulation instead of virtio-serial because it's
more compatible with kernel debugging mechanisms. Also, it makes
debugging virtio code much easier when we don't need to use virtio to
deliver console output while debugging it. We want to make it fast so
that we don't need to switch over to another console type after early
boot.

What's unreasonable about that?

Reasonably fast 1024 VCPUs would be great for testing kernel
configurations. KVM is not there yet so we suggested that we raise the
hard limit from current 64 VCPUs so that it's easier for people such
as ourselves to improve things. I don't understand why you think
that's unreasonable either!

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
>> As for 1024 vcpus, we already had the discussion where we explained
>> why we thought it was a good idea not to have such a low hard vcpu
>> limit for vcpus.
>
> I can't say I was convinced.  It's pretty simple to patch the kernel if you
> want to engage in such experiments.  We did find something that works out
> (the soft/hard limits), but it's still overkill.

I thought you were convinced that KVM_CAP_MAX_VCPUS was reasonable. I
guess I misunderstood your position then.

On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity <avi@redhat.com> wrote:
> There's a large attitude mismatch between tools/kvm developers and kvm
> developers (or at least me): tools/kvm is growing rapidly, adding features
> and improving stability at a fast pace.  kvm on the other hand is mature and
> a lot more concerned with preserving and improving stability than with
> adding new features.  The fact is, kvm is already very feature rich and very
> performant, so we're at a very different place in the
> performance/features/stability scales.

Yes, we're at different places but we definitely appreciate the
stability and performance of KVM and have no interest in disrupting
that. I don't expect you to merge our patches when you think they're
risky or not worth the added complexity. So there's no attitude
mismatch there.

I simply don't agree with some of your requirements (significant
number of users) or some of the technical decisions (VCPU hard limit
at 64).

                                  Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 10:30                                           ` Pekka Enberg
@ 2011-07-14 11:54                                             ` Avi Kivity
  2011-07-14 12:32                                               ` Sasha Levin
  2011-07-14 12:37                                               ` Pekka Enberg
  0 siblings, 2 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-14 11:54 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> Hi Avi,
>
> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  Why does that matter? Why should we keep the emulation slow if it's
> >>  possible to fix it?
> >
> >  Fixing things that don't need fixing has a cost.  In work, in risk, and in
> >  maintainability.  If you can share this cost among other users (which is
> >  certainly possible with socket mmio), it may still be worth it.  But just
> >  making something faster is not sufficient, it has to be faster for a
> >  significant number of users.
>
> I don't think it needs to be faster for *significant number* of users
> but yes, I completely agree that we need to make sure KVM gains more
> than the costs are.

Significant, for me, means it's measured in a percentage, not as digits 
on various limbs.  2% is a significant amount of users.  5 is not.

> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  It's a fair question to ask if the benefits
> >>  outweigh the added complexity but asking as to keep serial emulation
> >>  slow because *you* think it's unrealistic is well - unrealistic from
> >>  your part!
> >
> >  Exactly where am I unrealistic?  Do you think there are many users who
> >  suffer from slow serial emulation?
>
> Again, I don't with you that there needs to be many users for this
> type of feature. That said, as a maintainer you seem to think that it
> is and I'm obviously OK with that.
>
> So if you've been saying 'this is too complex for too little gain' all
> this time, I've misunderstood what you've been trying to say. The way
> I've read your comments is "optimizing serial console is stupid
> because it's a useless feature" which obviously not true because we
> find it useful!

More or less.  Note "this" here is not socket mmio - but I want to see a 
real use case for socket mmio.

> >>  *You* brought up 1024 vcpus using serial console! Obviously optimizing
> >>  something like that is stupid but we never claimed that we wanted to
> >>  do something like that!
> >
> >  Either of them, independently, is unrealistic.  The example of them together
> >  was just levantine exaggeration, it wasn't meant to be taken literally.
>
> I obviously don't agree that they're unrealistic independently.
>
> We want to use 8250 emulation instead of virtio-serial because it's
> more compatible with kernel debugging mechanisms. Also, it makes
> debugging virtio code much easier when we don't need to use virtio to
> deliver console output while debugging it. We want to make it fast so
> that we don't need to switch over to another console type after early
> boot.
>
> What's unreasonable about that?

Does virtio debugging really need super-fast serial?  Does it need 
serial at all?

> Reasonably fast 1024 VCPUs would be great for testing kernel
> configurations. KVM is not there yet so we suggested that we raise the
> hard limit from current 64 VCPUs so that it's easier for people such
> as ourselves to improve things. I don't understand why you think
> that's unreasonable either!

You will never get reasonably fast 1024 vcpus on your laptop.  As soon 
as your vcpus start doing useful work, they will thrash.  The guest 
kernel expects reasonable latency on cross-cpu operations, and kvm won't 
be able to provide it with such overcommit.  The PLE stuff attempts to 
mitigate some of the problem, but it's not going to work for such huge 
overcommit.

Every contended spin_lock() or IPI will turn into a huge spin in the 
worst case or a context switch in the best case.  Performance will tank 
unless you're running some shared-nothing process-per-vcpu workload in 
the guest.

The only way to get reasonable 1024 vcpu performance is to run it on a 
1024 cpu host.  People who have such machines are usually interested in 
realistic workloads, and that means much smaller guests.  If you do want 
to run 1024-on-1024, there is a lot of work in getting NUMA to function 
correctly; what we have now is not sufficient for large machines with 
large NUMA factors.

> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  As for 1024 vcpus, we already had the discussion where we explained
> >>  why we thought it was a good idea not to have such a low hard vcpu
> >>  limit for vcpus.
> >
> >  I can't say I was convinced.  It's pretty simple to patch the kernel if you
> >  want to engage in such experiments.  We did find something that works out
> >  (the soft/hard limits), but it's still overkill.
>
> I thought you were convinced that KVM_CAP_MAX_VCPUS was reasonable. I
> guess I misunderstood your position then.

It's "okay", but no more.  If I were Linus I'd say it's scalability 
masturbation.

> On Thu, Jul 14, 2011 at 12:48 PM, Avi Kivity<avi@redhat.com>  wrote:
> >  There's a large attitude mismatch between tools/kvm developers and kvm
> >  developers (or at least me): tools/kvm is growing rapidly, adding features
> >  and improving stability at a fast pace.  kvm on the other hand is mature and
> >  a lot more concerned with preserving and improving stability than with
> >  adding new features.  The fact is, kvm is already very feature rich and very
> >  performant, so we're at a very different place in the
> >  performance/features/stability scales.
>
> Yes, we're at different places but we definitely appreciate the
> stability and performance of KVM and have no interest in disrupting
> that. I don't expect you to merge our patches when you think they're
> risky or not worth the added complexity. So there's no attitude
> mismatch there.
>
> I simply don't agree with some of your requirements (significant
> number of users) or some of the technical decisions (VCPU hard limit
> at 64).

It's great to have a disagreement without descending into ugly 
flamewars, I appreciate that.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 11:54                                             ` Avi Kivity
@ 2011-07-14 12:32                                               ` Sasha Levin
  2011-07-14 12:46                                                 ` Avi Kivity
  2011-07-14 12:37                                               ` Pekka Enberg
  1 sibling, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-14 12:32 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, 2011-07-14 at 14:54 +0300, Avi Kivity wrote:
> On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> > We want to use 8250 emulation instead of virtio-serial because it's
> > more compatible with kernel debugging mechanisms. Also, it makes
> > debugging virtio code much easier when we don't need to use virtio to
> > deliver console output while debugging it. We want to make it fast so
> > that we don't need to switch over to another console type after early
> > boot.
> >
> > What's unreasonable about that?
> 
> Does virtio debugging really need super-fast serial?  Does it need 
> serial at all?
> 

Does it need super-fast serial? No, although it's nice. Does it need
serial at all? Definitely.

It's not just virtio which can fail running on virtio-console, it's also
the threadpool, the eventfd mechanism and even the PCI management
module. You can't really debug it if you can't depend on your debugging
mechanism to properly work.

So far, serial is the simplest, most effective, and never-failing method
we had for working on guests, I don't see how we can work without it at
the moment.


> > Reasonably fast 1024 VCPUs would be great for testing kernel
> > configurations. KVM is not there yet so we suggested that we raise the
> > hard limit from current 64 VCPUs so that it's easier for people such
> > as ourselves to improve things. I don't understand why you think
> > that's unreasonable either!
> 
> You will never get reasonably fast 1024 vcpus on your laptop.  As soon 
> as your vcpus start doing useful work, they will thrash.  The guest 
> kernel expects reasonable latency on cross-cpu operations, and kvm won't 
> be able to provide it with such overcommit.  The PLE stuff attempts to 
> mitigate some of the problem, but it's not going to work for such huge 
> overcommit.
> 

I agree here that the performance even with 256 vcpus would be terrible
and no 'real' users would be doing that until the infrastructure could
provide reasonable performance.

The two uses I see for it are:

1. Stressing out the usermode code. One of the reasons qemu can't
properly do 64 vcpus now is not just due to the KVM kernel code, it's
also due to qemu itself. We're trying to avoid doing the same
with /tools/kvm.

2. Preventing future scalability problems. Currently we can't do 1024
vcpus because it breaks coalesced MMIO - which is IMO not a valid reason
for not scaling up to 1024 vcpus (and by scaling I mean running without
errors, without regards to performance).

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 11:54                                             ` Avi Kivity
  2011-07-14 12:32                                               ` Sasha Levin
@ 2011-07-14 12:37                                               ` Pekka Enberg
  2011-07-14 12:48                                                 ` Avi Kivity
  1 sibling, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14 12:37 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

Hi Avi,

On Thu, Jul 14, 2011 at 2:54 PM, Avi Kivity <avi@redhat.com> wrote:
>> I don't think it needs to be faster for *significant number* of users
>> but yes, I completely agree that we need to make sure KVM gains more
>> than the costs are.
>
> Significant, for me, means it's measured in a percentage, not as digits on
> various limbs.  2% is a significant amount of users.  5 is not.

Just to make my position clear: I think it's enough that there's one
user that benefits as long as complexity isn't too high and I think
x86/Voyager is a pretty good example of that. So while you can argue
*for* complexity if there are enough users, when there's only few
users, it's really about whether a feature adds significant complexity
or not.

>> We want to use 8250 emulation instead of virtio-serial because it's
>> more compatible with kernel debugging mechanisms. Also, it makes
>> debugging virtio code much easier when we don't need to use virtio to
>> deliver console output while debugging it. We want to make it fast so
>> that we don't need to switch over to another console type after early
>> boot.
>>
>> What's unreasonable about that?
>
> Does virtio debugging really need super-fast serial?  Does it need serial at
> all?

Text mode guests should be super-fast, not virtio debugging. We want
to use 8250 instead of virtio serial or virtio console (which we
support, btw) because it's more compatible with Linux. Debugging
virtio with 8250 has turned out to be useful in the past.

>> I thought you were convinced that KVM_CAP_MAX_VCPUS was reasonable. I
>> guess I misunderstood your position then.
>
> It's "okay", but no more.  If I were Linus I'd say it's scalability
> masturbation.

That's definitely not the intent and I suppose we have different
opinions on what's 'reasonably fast'.

                                Pekka

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 12:32                                               ` Sasha Levin
@ 2011-07-14 12:46                                                 ` Avi Kivity
  2011-07-14 13:00                                                   ` Sasha Levin
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-14 12:46 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 03:32 PM, Sasha Levin wrote:
> On Thu, 2011-07-14 at 14:54 +0300, Avi Kivity wrote:
> >  On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> >  >  We want to use 8250 emulation instead of virtio-serial because it's
> >  >  more compatible with kernel debugging mechanisms. Also, it makes
> >  >  debugging virtio code much easier when we don't need to use virtio to
> >  >  deliver console output while debugging it. We want to make it fast so
> >  >  that we don't need to switch over to another console type after early
> >  >  boot.
> >  >
> >  >  What's unreasonable about that?
> >
> >  Does virtio debugging really need super-fast serial?  Does it need
> >  serial at all?
> >
>
> Does it need super-fast serial? No, although it's nice. Does it need
> serial at all? Definitely.

Why?  virtio is mature.  It's not some early boot thing which fails and 
kills the guest.  Even if you get an oops, usually the guest is still alive.

> It's not just virtio which can fail running on virtio-console, it's also
> the threadpool, the eventfd mechanism and even the PCI management
> module. You can't really debug it if you can't depend on your debugging
> mechanism to properly work.

Wait, those are guest things, not host things.

> So far, serial is the simplest, most effective, and never-failing method
> we had for working on guests, I don't see how we can work without it at
> the moment.

I really can't remember the last time I used the serial console for the 
guest.  In the early early days, sure, but now?

> I agree here that the performance even with 256 vcpus would be terrible
> and no 'real' users would be doing that until the infrastructure could
> provide reasonable performance.
>
> The two uses I see for it are:
>
> 1. Stressing out the usermode code. One of the reasons qemu can't
> properly do 64 vcpus now is not just due to the KVM kernel code, it's
> also due to qemu itself. We're trying to avoid doing the same
> with /tools/kvm.

It won't help without a 1024 cpu host.  As soon as you put a real 
workload on the guest, it will thrash and any scaling issue in qemu or 
tools/kvm will be drowned in the noise.

> 2. Preventing future scalability problems. Currently we can't do 1024
> vcpus because it breaks coalesced MMIO - which is IMO not a valid reason
> for not scaling up to 1024 vcpus (and by scaling I mean running without
> errors, without regards to performance).

That's not what scaling means (not to say that it wouldn't be nice to 
fix coalesced mmio).

btw, why are you so eager to run 1024 vcpu guests? usually, if you have 
a need for such large systems, you're really performance sensitive.  
It's not a good case for virtualization.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 12:37                                               ` Pekka Enberg
@ 2011-07-14 12:48                                                 ` Avi Kivity
  2011-07-14 12:52                                                   ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-14 12:48 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 03:37 PM, Pekka Enberg wrote:
> Hi Avi,
>
> On Thu, Jul 14, 2011 at 2:54 PM, Avi Kivity<avi@redhat.com>  wrote:
> >>  I don't think it needs to be faster for *significant number* of users
> >>  but yes, I completely agree that we need to make sure KVM gains more
> >>  than the costs are.
> >
> >  Significant, for me, means it's measured in a percentage, not as digits on
> >  various limbs.  2% is a significant amount of users.  5 is not.
>
> Just to make my position clear: I think it's enough that there's one
> user that benefits as long as complexity isn't too high and I think
> x86/Voyager is a pretty good example of that. So while you can argue
> *for* complexity if there are enough users, when there's only few
> users, it's really about whether a feature adds significant complexity
> or not.

Everything adds complexity.  And I think x86/voyager was a mistake.

> >>  We want to use 8250 emulation instead of virtio-serial because it's
> >>  more compatible with kernel debugging mechanisms. Also, it makes
> >>  debugging virtio code much easier when we don't need to use virtio to
> >>  deliver console output while debugging it. We want to make it fast so
> >>  that we don't need to switch over to another console type after early
> >>  boot.
> >>
> >>  What's unreasonable about that?
> >
> >  Does virtio debugging really need super-fast serial?  Does it need serial at
> >  all?
>
> Text mode guests should be super-fast, not virtio debugging. We want
> to use 8250 instead of virtio serial or virtio console (which we
> support, btw) because it's more compatible with Linux. Debugging
> virtio with 8250 has turned out to be useful in the past.

Use virtio-console when you're in production (it will be much much 
faster than socket-mmio 8250), and 8250 when debugging.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 12:48                                                 ` Avi Kivity
@ 2011-07-14 12:52                                                   ` Pekka Enberg
  2011-07-14 12:54                                                     ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14 12:52 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, 2011-07-14 at 15:48 +0300, Avi Kivity wrote:
> Use virtio-console when you're in production (it will be much much 
> faster than socket-mmio 8250), and 8250 when debugging.

This is exactly what we try to avoid. We want to use same code for
production and debugging when possible. And btw, when I have my kernel
hacker hat on, production is debugging so it's important that that's
fast too.

			Pekka


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 12:52                                                   ` Pekka Enberg
@ 2011-07-14 12:54                                                     ` Avi Kivity
  0 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-14 12:54 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 03:52 PM, Pekka Enberg wrote:
> On Thu, 2011-07-14 at 15:48 +0300, Avi Kivity wrote:
> >  Use virtio-console when you're in production (it will be much much
> >  faster than socket-mmio 8250), and 8250 when debugging.
>
> This is exactly what we try to avoid. We want to use same code for
> production and debugging when possible. And btw, when I have my kernel
> hacker hat on, production is debugging so it's important that that's
> fast too.

But again, socket-mmio 8250 will never match virtio-console.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 12:46                                                 ` Avi Kivity
@ 2011-07-14 13:00                                                   ` Sasha Levin
  2011-07-14 13:05                                                     ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-14 13:00 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, 2011-07-14 at 15:46 +0300, Avi Kivity wrote:
> On 07/14/2011 03:32 PM, Sasha Levin wrote:
> > On Thu, 2011-07-14 at 14:54 +0300, Avi Kivity wrote:
> > >  On 07/14/2011 01:30 PM, Pekka Enberg wrote:
> > >  >  We want to use 8250 emulation instead of virtio-serial because it's
> > >  >  more compatible with kernel debugging mechanisms. Also, it makes
> > >  >  debugging virtio code much easier when we don't need to use virtio to
> > >  >  deliver console output while debugging it. We want to make it fast so
> > >  >  that we don't need to switch over to another console type after early
> > >  >  boot.
> > >  >
> > >  >  What's unreasonable about that?
> > >
> > >  Does virtio debugging really need super-fast serial?  Does it need
> > >  serial at all?
> > >
> >
> > Does it need super-fast serial? No, although it's nice. Does it need
> > serial at all? Definitely.
> 
> Why?  virtio is mature.  It's not some early boot thing which fails and 
> kills the guest.  Even if you get an oops, usually the guest is still alive.

virtio is mature, /tools/kvm isn't :)

> 
> > It's not just virtio which can fail running on virtio-console, it's also
> > the threadpool, the eventfd mechanism and even the PCI management
> > module. You can't really debug it if you can't depend on your debugging
> > mechanism to properly work.
> 
> Wait, those are guest things, not host things.

Yes, as you said in the previous mail, both KVM and virtio are very
stable. /tools/kvm was the one who was being debugged most of the time.


> > So far, serial is the simplest, most effective, and never-failing method
> > we had for working on guests, I don't see how we can work without it at
> > the moment.
> 
> I really can't remember the last time I used the serial console for the 
> guest.  In the early early days, sure, but now?
> 

I don't know, if it works fine why not use it when you need simple
serial connection?

It's also useful for kernel hackers who break early boot things :)

> > I agree here that the performance even with 256 vcpus would be terrible
> > and no 'real' users would be doing that until the infrastructure could
> > provide reasonable performance.
> >
> > The two uses I see for it are:
> >
> > 1. Stressing out the usermode code. One of the reasons qemu can't
> > properly do 64 vcpus now is not just due to the KVM kernel code, it's
> > also due to qemu itself. We're trying to avoid doing the same
> > with /tools/kvm.
> 
> It won't help without a 1024 cpu host.  As soon as you put a real 
> workload on the guest, it will thrash and any scaling issue in qemu or 
> tools/kvm will be drowned in the noise.
> 
> > 2. Preventing future scalability problems. Currently we can't do 1024
> > vcpus because it breaks coalesced MMIO - which is IMO not a valid reason
> > for not scaling up to 1024 vcpus (and by scaling I mean running without
> > errors, without regards to performance).
> 
> That's not what scaling means (not to say that it wouldn't be nice to 
> fix coalesced mmio).
> 
> btw, why are you so eager to run 1024 vcpu guests? usually, if you have 
> a need for such large systems, you're really performance sensitive.  
> It's not a good case for virtualization.
> 
> 

I may have gone too far with 1024, I have only tested it on 254 vcpus so
far - I'll change that in my patch.

It's also not just a KVM issue. Take for example the RCU issue which we
were able to detect with /tools/kvm just by trying more than 30 vcpus
and noticing that RCU was broken with a recent kernel.

Testing the kernel on guests with large amount of vcpus or virtual
memory might prove beneficial not only for KVM itself.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 13:00                                                   ` Sasha Levin
@ 2011-07-14 13:05                                                     ` Avi Kivity
  2011-07-14 13:17                                                       ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-14 13:05 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 04:00 PM, Sasha Levin wrote:
> >
> >  Why?  virtio is mature.  It's not some early boot thing which fails and
> >  kills the guest.  Even if you get an oops, usually the guest is still alive.
>
> virtio is mature, /tools/kvm isn't :)
> >
> >  >  It's not just virtio which can fail running on virtio-console, it's also
> >  >  the threadpool, the eventfd mechanism and even the PCI management
> >  >  module. You can't really debug it if you can't depend on your debugging
> >  >  mechanism to properly work.
> >
> >  Wait, those are guest things, not host things.
>
> Yes, as you said in the previous mail, both KVM and virtio are very
> stable. /tools/kvm was the one who was being debugged most of the time.

I still don't follow.  The guest oopses? dmesg | less.  An issue with 
tools/kvm? gdb -p `pgrep kvm`.

> >  >  So far, serial is the simplest, most effective, and never-failing method
> >  >  we had for working on guests, I don't see how we can work without it at
> >  >  the moment.
> >
> >  I really can't remember the last time I used the serial console for the
> >  guest.  In the early early days, sure, but now?
> >
>
> I don't know, if it works fine why not use it when you need simple
> serial connection?
>
> It's also useful for kernel hackers who break early boot things :)

I'm not advocating removing it!  I'm just questioning the need for 
optimization.

> >  That's not what scaling means (not to say that it wouldn't be nice to
> >  fix coalesced mmio).
> >
> >  btw, why are you so eager to run 1024 vcpu guests? usually, if you have
> >  a need for such large systems, you're really performance sensitive.
> >  It's not a good case for virtualization.
> >
> >
>
> I may have went too far with 1024, I have only tested it on 254 vcpus so
> far - I'll change that in my patch.
>
> It's also not just a KVM issue. Take for example the RCU issue which we
> were able to detect with /tools/kvm just by trying more than 30 vcpus
> and noticing that RCU was broken with a recent kernel.
>
> Testing the kernel on guests with large amount of vcpus or virtual
> memory might prove beneficial not only for KVM itself.

Non-performance testing of really large guests is a valid use case, I agree.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 13:05                                                     ` Avi Kivity
@ 2011-07-14 13:17                                                       ` Pekka Enberg
  2011-07-14 13:23                                                         ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-14 13:17 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Thu, 2011-07-14 at 16:05 +0300, Avi Kivity wrote:
> On 07/14/2011 04:00 PM, Sasha Levin wrote:
> > >
> > >  Why?  virtio is mature.  It's not some early boot thing which fails and
> > >  kills the guest.  Even if you get an oops, usually the guest is still alive.
> >
> > virtio is mature, /tools/kvm isn't :)
> > >
> > >  >  It's not just virtio which can fail running on virtio-console, it's also
> > >  >  the threadpool, the eventfd mechanism and even the PCI management
> > >  >  module. You can't really debug it if you can't depend on your debugging
> > >  >  mechanism to properly work.
> > >
> > >  Wait, those are guest things, not host things.
> >
> > Yes, as you said in the previous mail, both KVM and virtio are very
> > stable. /tools/kvm was the one who was being debugged most of the time.
> 
> I still don't follow.  The guest oopses? dmesg | less.  An issue with 
> tools/kvm? gdb -p `pgrep kvm`.

When I was debugging tools/kvm virtio code, I used to 'instrument' the
guest kernel with printk() calls which helped a lot.

Also, a bug in tools/kvm can manifest in many interesting ways in the
guest kernel during boot, for example. You can't do dmesg then and gdb
won't save you. I think you've lived too long in the stable KVM and Qemu
land to remember how important reliable printk() is for development.

			Pekka


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 13:17                                                       ` Pekka Enberg
@ 2011-07-14 13:23                                                         ` Avi Kivity
  2011-07-20  2:52                                                           ` Anthony Liguori
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-14 13:23 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/14/2011 04:17 PM, Pekka Enberg wrote:
> >
> >  I still don't follow.  The guest oopses? dmesg | less.  An issue with
> >  tools/kvm? gdb -p `pgrep kvm`.
>
> When I was debugging tools/kvm virtio code, I used to 'instrument' the
> guest kernel with printk() calls which helped a lot.
>

Sure, but do you really need it spewing out the serial port all the time?

> Also, a bug in tools/kvm can manifest in many interesting ways in the
> guest kernel during boot, for example. You can't do dmesg then and gdb
> won't save you. I think you've lived too long in the stable KVM and Qemu
> land to remember how important reliable printk() is for development.

I guess.  Also I've switched to trace_printk() since it's much nicer 
(and integrates with other ftrace features).

And again, I'm not against tools/kvm optimizing serial.  I just want 
better justification for socket-mmio.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
                     ` (2 preceding siblings ...)
  2011-07-06 13:00   ` Avi Kivity
@ 2011-07-20  2:42   ` Anthony Liguori
  2011-07-20  8:19     ` Avi Kivity
  3 siblings, 1 reply; 62+ messages in thread
From: Anthony Liguori @ 2011-07-20  2:42 UTC (permalink / raw)
  To: Sasha Levin
  Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti,
	Michael S. Tsirkin, Pekka Enberg

On 07/05/2011 11:37 PM, Sasha Levin wrote:
> The new flag allows passing a connected socket instead of an
> eventfd to be notified of writes or reads to the specified memory region.
>
> Instead of signaling an event, on write - the value written to the memory
> region is written to the pipe.
> On read - a notification of the read is sent to the host, and a response
> is expected with the value to be 'read'.
>
> Using a socket instead of an eventfd is useful when any value can be
> written to the memory region but we're interested in receiving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the host.

I suspect a normal uart is a bad use case for this.

The THR can only hold a single value, a guest is not supposed to write 
to the THR again until the THRE flag in the LSR is set which indicates 
that the THR is empty.

In fact, when THRE is raised, the device emulation should raise an 
interrupt and that's what should trigger the guest to write another 
value to the THR.

So in practice, I don't see how it would work without relying on some 
specific behavior of the current Linux uart guest code.

But that said, I think this is an interesting mechanism.  I'd be curious 
how well it worked for VGA planar access which is what the current 
coalesced I/O is most useful for.

Regards,

Anthony Liguori

>
> Cc: Avi Kivity<avi@redhat.com>
> Cc: Ingo Molnar<mingo@elte.hu>
> Cc: Marcelo Tosatti<mtosatti@redhat.com>
> Cc: Michael S. Tsirkin<mst@redhat.com>
> Cc: Pekka Enberg<penberg@kernel.org>
> Signed-off-by: Sasha Levin<levinsasha928@gmail.com>
> ---
>   Documentation/virtual/kvm/api.txt |   18 ++++-
>   include/linux/kvm.h               |    9 ++
>   virt/kvm/eventfd.c                |  153 ++++++++++++++++++++++++++++++++-----
>   3 files changed, 161 insertions(+), 19 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 317d86a..74f0946 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
>
>   This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
>   within the guest.  A guest write in the registered address will signal the
> -provided event instead of triggering an exit.
> +provided event or write to the provided socket instead of triggering an exit.
>
>   struct kvm_ioeventfd {
>   	__u64 datamatch;
> @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};
> +
>   The following flags are defined:
>
>   #define KVM_IOEVENTFD_FLAG_DATAMATCH (1<<  kvm_ioeventfd_flag_nr_datamatch)
> @@ -1348,6 +1355,7 @@ The following flags are defined:
>   #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1<<  kvm_ioeventfd_flag_nr_deassign)
>   #define KVM_IOEVENTFD_FLAG_READ      (1<<  kvm_ioeventfd_flag_nr_read)
>   #define KVM_IOEVENTFD_FLAG_NOWRITE   (1<<  kvm_ioeventfd_flag_nr_nowrite)
> +#define KVM_IOEVENTFD_FLAG_SOCKET    (1<<  kvm_ioeventfd_flag_nr_socket)
>
>   If datamatch flag is set, the event will be signaled only if the written value
>   to the registered address is equal to datamatch in struct kvm_ioeventfd.
> @@ -1359,6 +1367,14 @@ passed in datamatch.
>   If the nowrite flag is set, the event won't be signaled when the specified address
>   is being written to.
>
> +If the socket flag is set, fd is expected to be a connected AF_UNIX
> +SOCK_SEQPACKET socket. Once a guest write in the registered address is
> +detected - a struct kvm_ioeventfd_data which describes the write will be
> +written to the socket.
> +On read, struct kvm_ioeventfd_data will be written with 'is_write = 0', and
> +would wait for a response with a struct kvm_ioeventfd_data containing the
> +value which should be 'read' by the guest.
> +
>
>   5. The kvm_run structure
>
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 8a12711..ff3d808 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -389,6 +389,7 @@ enum {
>   	kvm_ioeventfd_flag_nr_deassign,
>   	kvm_ioeventfd_flag_nr_read,
>   	kvm_ioeventfd_flag_nr_nowrite,
> +	kvm_ioeventfd_flag_nr_socket,
>   	kvm_ioeventfd_flag_nr_max,
>   };
>
> @@ -397,6 +398,7 @@ enum {
>   #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1<<  kvm_ioeventfd_flag_nr_deassign)
>   #define KVM_IOEVENTFD_FLAG_READ      (1<<  kvm_ioeventfd_flag_nr_read)
>   #define KVM_IOEVENTFD_FLAG_NOWRITE   (1<<  kvm_ioeventfd_flag_nr_nowrite)
> +#define KVM_IOEVENTFD_FLAG_SOCKET    (1<<  kvm_ioeventfd_flag_nr_socket)
>
>   #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1<<  kvm_ioeventfd_flag_nr_max) - 1)
>
> @@ -409,6 +411,13 @@ struct kvm_ioeventfd {
>   	__u8  pad[36];
>   };
>
> +struct kvm_ioeventfd_data {
> +	__u64 data;
> +	__u64 addr;
> +	__u32 len;
> +	__u8  is_write;
> +};
> +
>   /* for KVM_ENABLE_CAP */
>   struct kvm_enable_cap {
>   	/* in */
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 5f2d203..d1d63b3 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -32,6 +32,7 @@
>   #include<linux/eventfd.h>
>   #include<linux/kernel.h>
>   #include<linux/slab.h>
> +#include<linux/net.h>
>
>   #include "iodev.h"
>
> @@ -413,10 +414,11 @@ module_exit(irqfd_module_exit);
>
>   /*
>    * --------------------------------------------------------------------
> - * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
> + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
> + *            a socket write.
>    *
> - * userspace can register a PIO/MMIO address with an eventfd for receiving
> - * notification when the memory has been touched.
> + * userspace can register a PIO/MMIO address with an eventfd or a
> + * socket for receiving notification when the memory has been touched.
>    * --------------------------------------------------------------------
>    */
>
> @@ -424,7 +426,10 @@ struct _ioeventfd {
>   	struct list_head     list;
>   	u64                  addr;
>   	int                  length;
> -	struct eventfd_ctx  *eventfd;
> +	union {
> +		struct socket       *sock;
> +		struct eventfd_ctx  *eventfd;
> +	};
>   	u64                  datamatch;
>   	struct kvm_io_device dev;
>   	bool                 wildcard;
> @@ -441,7 +446,11 @@ to_ioeventfd(struct kvm_io_device *dev)
>   static void
>   ioeventfd_release(struct _ioeventfd *p)
>   {
> -	eventfd_ctx_put(p->eventfd);
> +	if (p->eventfd)
> +		eventfd_ctx_put(p->eventfd);
> +	else
> +		sockfd_put(p->sock);
> +
>   	list_del(&p->list);
>   	kfree(p);
>   }
> @@ -510,12 +519,65 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
>   	return _val == p->datamatch ? true : false;
>   }
>
> +static ssize_t socket_write(struct socket *sock, const void *buf, size_t count)
> +{
> +	mm_segment_t old_fs;
> +	ssize_t res;
> +	struct msghdr msg;
> +	struct iovec iov;
> +
> +	iov = (struct iovec) {
> +		.iov_base = (void *)buf,
> +		.iov_len  = count,
> +	};
> +
> +	msg = (struct msghdr) {
> +		.msg_iov =&iov,
> +		.msg_iovlen = 1,
> +	};
> +
> +	old_fs = get_fs();
> +	set_fs(get_ds());
> +	/* The cast to a user pointer is valid due to the set_fs() */
> +	res = sock_sendmsg(sock,&msg, count);
> +	set_fs(old_fs);
> +
> +	return res;
> +}
> +
> +static ssize_t socket_read(struct socket *sock, void *buf, size_t count)
> +{
> +	mm_segment_t old_fs;
> +	ssize_t res;
> +	struct msghdr msg;
> +	struct iovec iov;
> +
> +	iov = (struct iovec) {
> +		.iov_base = (void *)buf,
> +		.iov_len  = count,
> +	};
> +
> +	msg = (struct msghdr) {
> +		.msg_iov =&iov,
> +		.msg_iovlen = 1,
> +	};
> +
> +	old_fs = get_fs();
> +	set_fs(get_ds());
> +	/* The cast to a user pointer is valid due to the set_fs() */
> +	res = sock_recvmsg(sock,&msg, count, 0);
> +	set_fs(old_fs);
> +
> +	return res;
> +}
> +
>   /* MMIO/PIO writes trigger an event if the addr/val match */
>   static int
>   ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
>   		const void *val)
>   {
>   	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>
>   	/* Exit if signaling on writes isn't requested */
>   	if (!p->track_writes)
> @@ -524,7 +586,18 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
>   	if (!ioeventfd_in_range(p, addr, len, val))
>   		return -EOPNOTSUPP;
>
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.data = get_val(val, len),
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 1,
> +	};
> +
> +	if (p->sock)
> +		socket_write(p->sock,&data, sizeof(data));
> +	else
> +		eventfd_signal(p->eventfd, 1);
> +
>   	return 0;
>   }
>
> @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   		void *val)
>   {
>   	struct _ioeventfd *p = to_ioeventfd(this);
> +	struct kvm_ioeventfd_data data;
>
>   	/* Exit if signaling on reads isn't requested */
>   	if (!p->track_reads)
> @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
>   	if (!ioeventfd_in_range(p, addr, len, val))
>   		return -EOPNOTSUPP;
>
> -	eventfd_signal(p->eventfd, 1);
> +	data = (struct kvm_ioeventfd_data) {
> +		.addr = addr,
> +		.len = len,
> +		.is_write = 0,
> +	};
> +
> +	if (p->sock) {
> +		socket_write(p->sock,&data, sizeof(data));
> +		socket_read(p->sock,&data, sizeof(data));
> +		set_val(val, len, data.data);
> +	} else {
> +		set_val(val, len, p->datamatch);
> +		eventfd_signal(p->eventfd, 1);
> +	}
> +
>   	return 0;
>   }
>
> @@ -585,7 +673,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	int                       pio = args->flags&  KVM_IOEVENTFD_FLAG_PIO;
>   	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
>   	struct _ioeventfd        *p;
> -	struct eventfd_ctx       *eventfd;
> +	struct eventfd_ctx       *eventfd = NULL;
>   	int                       ret;
>
>   	/* check for range overflow */
> @@ -596,10 +684,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	if (args->flags&  ~KVM_IOEVENTFD_VALID_FLAG_MASK)
>   		return -EINVAL;
>
> -	eventfd = eventfd_ctx_fdget(args->fd);
> -	if (IS_ERR(eventfd))
> -		return PTR_ERR(eventfd);
> -
>   	p = kzalloc(sizeof(*p), GFP_KERNEL);
>   	if (!p) {
>   		ret = -ENOMEM;
> @@ -611,6 +695,20 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	p->length  = args->len;
>   	p->eventfd = eventfd;
>
> +	if (args->flags&  KVM_IOEVENTFD_FLAG_SOCKET) {
> +		ret = 0;
> +		p->sock = sockfd_lookup(args->fd,&ret);
> +		if (ret)
> +			goto fail;
> +	} else {
> +		ret = -EINVAL;
> +		eventfd = eventfd_ctx_fdget(args->fd);
> +		if (IS_ERR(eventfd))
> +			goto fail;
> +
> +		p->eventfd = eventfd;
> +	}
> +
>   	/* The datamatch feature is optional, otherwise this is a wildcard */
>   	if (args->flags&  KVM_IOEVENTFD_FLAG_DATAMATCH)
>   		p->datamatch = args->datamatch;
> @@ -649,8 +747,14 @@ unlock_fail:
>   	mutex_unlock(&kvm->slots_lock);
>
>   fail:
> +	if (eventfd)
> +		eventfd_ctx_put(eventfd);
> +
> +	if (p->sock)
> +		sockfd_put(p->sock);
> +
> +
>   	kfree(p);
> -	eventfd_ctx_put(eventfd);
>
>   	return ret;
>   }
> @@ -661,12 +765,21 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   	int                       pio = args->flags&  KVM_IOEVENTFD_FLAG_PIO;
>   	enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
>   	struct _ioeventfd        *p, *tmp;
> -	struct eventfd_ctx       *eventfd;
> +	struct eventfd_ctx       *eventfd = NULL;
> +	struct socket            *sock = NULL;
>   	int                       ret = -ENOENT;
>
> -	eventfd = eventfd_ctx_fdget(args->fd);
> -	if (IS_ERR(eventfd))
> -		return PTR_ERR(eventfd);
> +	if (args->flags&  KVM_IOEVENTFD_FLAG_SOCKET) {
> +		ret = 0;
> +		sock = sockfd_lookup(args->fd,&ret);
> +		if (ret)
> +			return PTR_ERR(sock);
> +	} else {
> +		ret = -EINVAL;
> +		eventfd = eventfd_ctx_fdget(args->fd);
> +		if (IS_ERR(eventfd))
> +			return PTR_ERR(eventfd);
> +	}
>
>   	mutex_lock(&kvm->slots_lock);
>
> @@ -674,6 +787,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>   		bool wildcard = !(args->flags&  KVM_IOEVENTFD_FLAG_DATAMATCH);
>
>   		if (p->eventfd != eventfd  ||
> +		    p->sock != sock        ||
>   		    p->addr != args->addr  ||
>   		    p->length != args->len ||
>   		    p->wildcard != wildcard)
> @@ -690,7 +804,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>
>   	mutex_unlock(&kvm->slots_lock);
>
> -	eventfd_ctx_put(eventfd);
> +	if (eventfd)
> +		eventfd_ctx_put(eventfd);
> +	if (sock)
> +		sockfd_put(sock);
>
>   	return ret;
>   }


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  7:07                     ` Avi Kivity
  2011-07-13  8:02                       ` Pekka Enberg
@ 2011-07-20  2:49                       ` Anthony Liguori
  2011-07-20  9:44                         ` Pekka Enberg
  2011-07-25 12:10                       ` Sasha Levin
  2 siblings, 1 reply; 62+ messages in thread
From: Anthony Liguori @ 2011-07-20  2:49 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar,
	Marcelo Tosatti

On 07/13/2011 02:07 AM, Avi Kivity wrote:
> On 07/13/2011 09:45 AM, Pekka Enberg wrote:
>> >
>> > OK, what's the simplest thing we can do here to keep Avi happy and get
>> > the functionality of Sasha's original patch that helps us avoid guest
>> > exits for serial console? I agree with Avi that we don't want
>> > fragmented ABI but it seems to me there's no clear idea how to fix
>> > KVM_IOEVENTFD_FLAG_SOCKET corner cases, right?
>>
>> And btw, I didn't follow the discussion closely, but introducing a new
>> type of exit for a feature that's designed to _avoid exits_ doesn't
>> seem like a smart thing to do. Is it even possible to support sockets
>> sanely for this?
>
> First of all, this feature doesn't avoid exits.

I don't think it's possible to do correct UART emulation and avoid an exit.

And yeah, looking at your serial emulation, you're not properly updating 
the LSR register.  The fact that this works with Linux is just luck. 
You will encounter guests that poll LSR.THRE in order to write more data 
to the THR and you will no longer be able to treat THR as having no side 
effects.

Worse yet, if the Linux driver ever changes in the future in a way 
that's fully compliant to the UART spec, it'll break your emulation.

If you're not going to emulate a UART properly, why not just use a 
paravirtual serial device that you can keep simple?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-14 13:23                                                         ` Avi Kivity
@ 2011-07-20  2:52                                                           ` Anthony Liguori
  2011-07-20  6:16                                                             ` Sasha Levin
  0 siblings, 1 reply; 62+ messages in thread
From: Anthony Liguori @ 2011-07-20  2:52 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar,
	Marcelo Tosatti

On 07/14/2011 08:23 AM, Avi Kivity wrote:
> On 07/14/2011 04:17 PM, Pekka Enberg wrote:
>> >
>> > I still don't follow. The guest oopses? dmesg | less. An issue with
>> > tools/kvm? gdb -p `pgrep kvm`.
>>
>> When I was debugging tools/kvm virtio code, I used to 'instrument' the
>> guest kernel with printk() calls which helped a lot.
>>
>
> Sure, but do you really need it spewing out the serial port all the time?
>
>> Also, a bug in tools/kvm can manifest in many interesting ways in the
>> guest kernel during boot, for example. You can't do dmesg then and gdb
>> won't save you. I think you've lived too long in the stable KVM and Qemu
>> land to remember how important reliable printk() is for development.
>
> I guess. Also I've switched to trace_printk() since it's much nicer (and
> integrates with other ftrace features).
>
> And again, I'm not against tools/kvm optimizing serial. I just want
> better justification for socket-mmio.

Just to be really thorough, the optimization is incorrect for UART 
emulation.  Maybe for a simple PIO based console where there were no 
guest visible side effects to a character write, this would be okay, but 
that is not how a UART works.

Just implement virtio-serial :-)  You can move data as fast as you'd 
like through it.

And if virtio-serial is too complicated, make a simpler version that 
doesn't have such crazy semantics.

Regards,

Anthony Liguori

>


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-20  2:52                                                           ` Anthony Liguori
@ 2011-07-20  6:16                                                             ` Sasha Levin
  2011-07-20  9:42                                                               ` Pekka Enberg
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-20  6:16 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Avi Kivity, Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar,
	Marcelo Tosatti

On Tue, 2011-07-19 at 21:52 -0500, Anthony Liguori wrote:
> On 07/14/2011 08:23 AM, Avi Kivity wrote:
> > On 07/14/2011 04:17 PM, Pekka Enberg wrote:
> >> >
> >> > I still don't follow. The guest oopses? dmesg | less. An issue with
> >> > tools/kvm? gdb -p `pgrep kvm`.
> >>
> >> When I was debugging tools/kvm virtio code, I used to 'instrument' the
> >> guest kernel with printk() calls which helped a lot.
> >>
> >
> > Sure, but do you really need it spewing out the serial port all the time?
> >
> >> Also, a bug in tools/kvm can manifest in many interesting ways in the
> >> guest kernel during boot, for example. You can't do dmesg then and gdb
> >> won't save you. I think you've lived too long in the stable KVM and Qemu
> >> land to remember how important reliable printk() is for development.
> >
> > I guess. Also I've switched to trace_printk() since it's much nicer (and
> > integrates with other ftrace features).
> >
> > And again, I'm not against tools/kvm optimizing serial. I just want
> > better justification for socket-mmio.
> 
> Just to be really thorough, the optimization is incorrect for UART 
> emulation.  Maybe for a simple PIO based console where there were no 
> guest visible side effects to a character write, this would be okay, but 
> that is not how a UART works.

To be honest, I haven't checked if the implementation we have is correct
- I've just checked that it would benefit from the implementation of
socket ioeventfds.

I'll have to look at it again based on your review and probably fix it
to work correctly :)

> 
> Just implement virtio-serial :-)  You can move data as fast as you'd 
> like through it.
> 
> And if virtio-serial is too complicated, make a simpler version that 
> doesn't have such crazy semantics.

We do have virtio-serial support in /tools/kvm, you can try it using
'--console virtio'.

Currently I prefer using simple serial most of the time since it's the
most basic and simplest method of getting output from a guest, which is
quite useful when developing the core of /tools/kvm.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-20  2:42   ` Anthony Liguori
@ 2011-07-20  8:19     ` Avi Kivity
  0 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-20  8:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti,
	Michael S. Tsirkin, Pekka Enberg

On 07/20/2011 05:42 AM, Anthony Liguori wrote:
>
> I suspect a normal uart is a bad use case for this.
>
> The THR can only hold a single value, a guest is not supposed to write 
> to the THR again until the THRE flag in the LSR is set which indicates 
> that the THR is empty.
>
> In fact, when THRE is raised, the device emulation should raise an 
> interrupt and that's what should trigger the guest to write another 
> value to the THR.
>
> So in practice, I don't see how it would work without relying on some 
> specific behavior of the current Linux uart guest code.
>
> But that said, I think this is an interesting mechanism.  I'd be 
> curious how well it worked for VGA planar access which is what the 
> current coalesced I/O is most useful for.

Probably the interesting use case is to forward an entire BAR to a 
separate process or thread, perhaps implemented in the kernel.  However 
synchronization will be an issue - a PCI read is supposed to flush all 
pending PCI writes, and this is hard to enforce when the socket 
consumers are out of process.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-20  6:16                                                             ` Sasha Levin
@ 2011-07-20  9:42                                                               ` Pekka Enberg
  0 siblings, 0 replies; 62+ messages in thread
From: Pekka Enberg @ 2011-07-20  9:42 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Anthony Liguori, Avi Kivity, Michael S. Tsirkin, kvm,
	Ingo Molnar, Marcelo Tosatti

On Wed, 2011-07-20 at 09:16 +0300, Sasha Levin wrote:
> > Just implement virtio-serial :-)  You can move data as fast as you'd 
> > like through it.
> > 
> > And if virtio-serial is too complicated, make a simpler version that 
> > doesn't have such crazy semantics.
> 
> We do have virtio-serial support in /tools/kvm, you can try it using
> '--console virtio'.

That's virtio-console. I thought virtio-serial was a separate driver.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-20  2:49                       ` Anthony Liguori
@ 2011-07-20  9:44                         ` Pekka Enberg
  2011-07-20 21:10                           ` Anthony Liguori
  0 siblings, 1 reply; 62+ messages in thread
From: Pekka Enberg @ 2011-07-20  9:44 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Avi Kivity, Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar,
	Marcelo Tosatti

On Tue, 2011-07-19 at 21:49 -0500, Anthony Liguori wrote:
> If you're not going to emulate a UART properly, why not just use a 
> paravirtual serial device that you can keep simple?

We want to emulate it correctly to make it robust. The problems you
pointed out were caused by me being under heavy dose of ignorance when I
wrote the emulation code. ;-)

			Pekka


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-20  9:44                         ` Pekka Enberg
@ 2011-07-20 21:10                           ` Anthony Liguori
  0 siblings, 0 replies; 62+ messages in thread
From: Anthony Liguori @ 2011-07-20 21:10 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Avi Kivity, Sasha Levin, Michael S. Tsirkin, kvm, Ingo Molnar,
	Marcelo Tosatti

On 07/20/2011 04:44 AM, Pekka Enberg wrote:
> On Tue, 2011-07-19 at 21:49 -0500, Anthony Liguori wrote:
>> If you're not going to emulate a UART properly, why not just use a
>> paravirtual serial device that you can keep simple?
>
> We want to emulate it correctly to make it robust. The problems you
> pointed out were caused by me being under heavy dose of ignorance when I
> wrote the emulation code. ;-)

A UART has a fixed frequency associated with it and the divider latch 
will select the actual frequency based on a division of the fixed frequency.

There is no way to send a byte over the serial line faster than the 
fixed frequency.  The UART is so simple that it cannot DMA and it does 
not have very much memory on it for buffering so this limitation is 
visible to the guest.  This means the driver has to be very involved in 
the flow control of the device.

There's no way around this.  You simply can't do 10gb/s over a serial 
line and still have something that looks and acts like a UART.

BTW, it used to be we were pretty loose with how fast we'd move data 
through the serial port in QEMU and Linux would actually throw warnings 
if it saw more data moving through the serial port than is theoretically 
possible on bare metal.  Those warnings were removed at some point but 
I'm sure if you dug up an old kernel and ran it, you would run into them 
with your current emulation.

Regards,

Anthony Liguori

>
> 			Pekka
>


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-13  7:07                     ` Avi Kivity
  2011-07-13  8:02                       ` Pekka Enberg
  2011-07-20  2:49                       ` Anthony Liguori
@ 2011-07-25 12:10                       ` Sasha Levin
  2011-07-25 12:16                         ` Avi Kivity
  2 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-25 12:10 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Wed, 2011-07-13 at 10:07 +0300, Avi Kivity wrote:
> Second, introducing a new type of exit doesn't mean we see more exits.
> 
> Third, the new type of exit is probably not needed - we can use the 
> existing mmio/pio exits, and document that in certain cases the kernel 
> will fall back to these instead of delivering the I/O via the sockets 
> (userspace can then try writing itself).

Just waking this up since I want to send a new version and just want to
cover some things before that.

The problem with the original implementation was that if we receive a
signal while we wait for the host to provide a value to be read, we must
abort the operation and exit to do the signal.

What this caused was that read operations with side effects would break
(for example, when reading a byte would change the value in that byte).

The original plan was to notify the host that we ignored his answer via
the socket, and it should provide the response again via regular MMIO
exit, but I couldn't find a good way to pass it through the MMIO exit.
Also, This would complicate this operation on the host quite a bit.

What I did instead was to assume that if the socket write notifying the
host of a read operation went through ok, we can block on the socket
read request.

Does it sound ok? I know it's not what was originally planned, but to me
it looked like the most efficient approach.


-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-25 12:10                       ` Sasha Levin
@ 2011-07-25 12:16                         ` Avi Kivity
  2011-07-25 12:26                           ` Sasha Levin
  0 siblings, 1 reply; 62+ messages in thread
From: Avi Kivity @ 2011-07-25 12:16 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/25/2011 03:10 PM, Sasha Levin wrote:
> On Wed, 2011-07-13 at 10:07 +0300, Avi Kivity wrote:
> >  Second, introducing a new type of exit doesn't mean we see more exits.
> >
> >  Third, the new type of exit is probably not needed - we can use the
> >  existing mmio/pio exits, and document that in certain cases the kernel
> >  will fall back to these instead of delivering the I/O via the sockets
> >  (userspace can then try writing itself).
>
> Just waking this up since I want to send a new version and just want to
> cover some things before that.
>
> The problem with the original implementation was that if we receive a
> signal while we wait for the host to provide a value to be read, we must
> abort the operation and exit to do the signal.
>
> What this caused was that read operations with side effects would break
> (for example, when reading a byte would change the value in that byte).
>
> The original plan was to notify the host that we ignored his answer via
> the socket, and it should provide the response again via regular MMIO
> exit, but I couldn't find a good way to pass it through the MMIO exit.
> Also, This would complicate this operation on the host quite a bit.
>
> What I did instead was to assume that if the socket write notifying the
> host of a read operation went through ok, we can block on the socket
> read request.
>
> Does it sound ok? I know it's not what was originally planned, but to me
> it looked like the most efficient approach.

You can't block when a signal is pending.  You can block, however, after 
you've exited with -EINTR and re-entered.

We need to document that if a vcpu exited with -EINTR, then any socket 
memory transactions need to be flushed before the vcpu's state can be 
considered stable (for live migration).  In fact it's true for any kind 
of exit.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-25 12:16                         ` Avi Kivity
@ 2011-07-25 12:26                           ` Sasha Levin
  2011-07-25 13:04                             ` Avi Kivity
  0 siblings, 1 reply; 62+ messages in thread
From: Sasha Levin @ 2011-07-25 12:26 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On Mon, 2011-07-25 at 15:16 +0300, Avi Kivity wrote:
> On 07/25/2011 03:10 PM, Sasha Levin wrote:
> > On Wed, 2011-07-13 at 10:07 +0300, Avi Kivity wrote:
> > >  Second, introducing a new type of exit doesn't mean we see more exits.
> > >
> > >  Third, the new type of exit is probably not needed - we can use the
> > >  existing mmio/pio exits, and document that in certain cases the kernel
> > >  will fall back to these instead of delivering the I/O via the sockets
> > >  (userspace can then try writing itself).
> >
> > Just waking this up since I want to send a new version and just want to
> > cover some things before that.
> >
> > The problem with the original implementation was that if we receive a
> > signal while we wait for the host to provide a value to be read, we must
> > abort the operation and exit to handle the signal.
> >
> > What this caused was that read operations with side effects would break
> > (for example, when reading a byte would change the value in that byte).
> >
> > The original plan was to notify the host that we ignored his answer via
> > the socket, and it should provide the response again via regular MMIO
> > exit, but I couldn't find a good way to pass it through the MMIO exit.
> > Also, this would complicate this operation on the host quite a bit.
> >
> > What I did instead was to assume that if the socket write notifying the
> > host of a read operation went through ok, we can block on the socket
> > read request.
> >
> > Does it sound ok? I know it's not what was originally planned, but to me
> > it looked like the most efficient approach.
> 
> You can't block when a signal is pending.  You can block, however, after 
> you've exited with -EINTR and re-entered.
> 

What would happen with the MMIO then? I need to provide an answer before
I leave the read/write functions to exit with -EINTR, no?

> We need to document that if a vcpu exited with -EINTR, then any socket 
> memory transactions need to be flushed before the vcpu's state can be 
> considered stable (for live migration).  In fact it's true for any kind 
> of exit.
> 

-- 

Sasha.


^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET
  2011-07-25 12:26                           ` Sasha Levin
@ 2011-07-25 13:04                             ` Avi Kivity
  0 siblings, 0 replies; 62+ messages in thread
From: Avi Kivity @ 2011-07-25 13:04 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Pekka Enberg, Michael S. Tsirkin, kvm, Ingo Molnar, Marcelo Tosatti

On 07/25/2011 03:26 PM, Sasha Levin wrote:
> >
> >  You can't block when a signal is pending.  You can block, however, after
> >  you've exited with -EINTR and re-entered.
> >
>
> What would happen with the MMIO then? I need to provide an answer before
> I leave the read/write functions to exit with -EINTR, no?

If you exit, no result is needed until you reenter.  You just have to 
remember that on the next KVM_RUN, instead of running the vcpu, you have 
to talk to the socket (either write or read, depending on where the 
signal got you).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 62+ messages in thread

end of thread, other threads:[~2011-07-25 13:04 UTC | newest]

Thread overview: 62+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-06  4:37 [PATCH 1/5] ioeventfd: Remove natural sized length limitation Sasha Levin
2011-07-06  4:37 ` [PATCH 2/5] ioeventfd: Add helper functions for reading and writing Sasha Levin
2011-07-06  4:37 ` [PATCH 3/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_READ Sasha Levin
2011-07-06  4:37 ` [PATCH 4/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_NOWRITE Sasha Levin
2011-07-06  4:37 ` [PATCH 5/5] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_SOCKET Sasha Levin
2011-07-06 11:42   ` Michael S. Tsirkin
2011-07-06 15:01     ` Sasha Levin
2011-07-06 17:58       ` Michael S. Tsirkin
2011-07-10  5:34         ` Sasha Levin
2011-07-10  8:05           ` Michael S. Tsirkin
2011-07-12 11:23             ` Sasha Levin
2011-07-12 11:26               ` Avi Kivity
2011-07-13  6:37                 ` Pekka Enberg
2011-07-13  6:45                   ` Pekka Enberg
2011-07-13  7:07                     ` Avi Kivity
2011-07-13  8:02                       ` Pekka Enberg
2011-07-13 12:57                         ` Avi Kivity
2011-07-13 13:00                           ` Pekka Enberg
2011-07-13 13:32                             ` Avi Kivity
2011-07-14  7:26                               ` Pekka Enberg
2011-07-14  8:07                                 ` Sasha Levin
2011-07-14  8:09                                 ` Avi Kivity
2011-07-14  8:14                                   ` Pekka Enberg
2011-07-14  8:28                                     ` Avi Kivity
2011-07-14  8:59                                       ` Pekka Enberg
2011-07-14  9:48                                         ` Avi Kivity
     [not found]                                           ` <CAOJsxLHSeRuTOoiJssyrELRx-eXok3WinLr_+_G4dB+yHNBKdg@mail.gmail.com>
2011-07-14 10:30                                           ` Pekka Enberg
2011-07-14 11:54                                             ` Avi Kivity
2011-07-14 12:32                                               ` Sasha Levin
2011-07-14 12:46                                                 ` Avi Kivity
2011-07-14 13:00                                                   ` Sasha Levin
2011-07-14 13:05                                                     ` Avi Kivity
2011-07-14 13:17                                                       ` Pekka Enberg
2011-07-14 13:23                                                         ` Avi Kivity
2011-07-20  2:52                                                           ` Anthony Liguori
2011-07-20  6:16                                                             ` Sasha Levin
2011-07-20  9:42                                                               ` Pekka Enberg
2011-07-14 12:37                                               ` Pekka Enberg
2011-07-14 12:48                                                 ` Avi Kivity
2011-07-14 12:52                                                   ` Pekka Enberg
2011-07-14 12:54                                                     ` Avi Kivity
2011-07-14  8:19                                   ` Gleb Natapov
2011-07-14  8:25                                   ` Michael S. Tsirkin
2011-07-14  8:29                                     ` Avi Kivity
2011-07-20  2:49                       ` Anthony Liguori
2011-07-20  9:44                         ` Pekka Enberg
2011-07-20 21:10                           ` Anthony Liguori
2011-07-25 12:10                       ` Sasha Levin
2011-07-25 12:16                         ` Avi Kivity
2011-07-25 12:26                           ` Sasha Levin
2011-07-25 13:04                             ` Avi Kivity
2011-07-13  7:51           ` Pekka Enberg
2011-07-13 10:04             ` Pekka Enberg
2011-07-13 10:26               ` Sasha Levin
2011-07-13 10:56                 ` Pekka Enberg
2011-07-13 11:14                   ` Pekka Enberg
2011-07-06 12:39   ` Avi Kivity
2011-07-06 12:58     ` Sasha Levin
2011-07-06 13:04       ` Avi Kivity
2011-07-06 13:00   ` Avi Kivity
2011-07-20  2:42   ` Anthony Liguori
2011-07-20  8:19     ` Avi Kivity

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.