* [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
@ 2015-11-09 14:33 Anton Ivanov
  2015-11-09 15:03 ` Anton Ivanov
  0 siblings, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-09 14:33 UTC (permalink / raw)
  To: user-mode-linux-devel; +Cc: Anton Ivanov

Epoll based interrupt controller.

IMPROVES: IO loop performance - no per-fd lookups, allowing for a
15% IO speedup in a minimal config, going to 100s of % with many
devices - an O(N^2) lookup is now replaced by an O(log N) one.

ADDS: True Write IRQ functionality

OBSOLETES: The need to call reactivate_fd() in any driver which
has only read IRQ semantics. Write IRQs work, but will need to
be updated to use this fully.

Potentially (with a change in API) will allow both edge and level
IRQ semantics.

Pre-requisite for using packet mmap and multipacket read/write
which do not get along with poll() very well.

Signed-off-by: Anton Ivanov <aivanov@brocade.com>
---
 arch/um/drivers/line.c            |   5 +-
 arch/um/drivers/mconsole_kern.c   |   2 -
 arch/um/drivers/net_kern.c        |   1 -
 arch/um/drivers/port_kern.c       |   1 -
 arch/um/drivers/random.c          |   1 -
 arch/um/drivers/ubd_kern.c        |   1 -
 arch/um/include/shared/irq_user.h |  24 ++-
 arch/um/include/shared/os.h       |  14 +-
 arch/um/kernel/irq.c              | 412 ++++++++++++++++++++++----------------
 arch/um/os-Linux/irq.c            | 150 ++++++--------
 10 files changed, 329 insertions(+), 282 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 6208702..84384c8 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -283,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
 	if (err)
 		return err;
 	if (output)
-		err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
+		err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
 				     line_write_interrupt, IRQF_SHARED,
 				     driver->write_irq_name, data);
 	return err;
@@ -666,8 +667,6 @@ static irqreturn_t winch_interrupt(int irq, void *data)
 		tty_kref_put(tty);
 	}
  out:
-	if (winch->fd != -1)
-		reactivate_fd(winch->fd, WINCH_IRQ);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 29880c9..5e8881c 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -95,7 +95,6 @@ static irqreturn_t mconsole_interrupt(int irq, void *dev_id)
 	}
 	if (!list_empty(&mc_requests))
 		schedule_work(&mconsole_work);
-	reactivate_fd(fd, MCONSOLE_IRQ);
 	return IRQ_HANDLED;
 }
 
@@ -243,7 +242,6 @@ void mconsole_stop(struct mc_request *req)
 		(*req->cmd->handler)(req);
 	}
 	os_set_fd_block(req->originating_fd, 0);
-	reactivate_fd(req->originating_fd, MCONSOLE_IRQ);
 	mconsole_reply(req, "", 0, 0);
 }
 
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index f70dd54..82ea3a2 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -137,7 +137,6 @@ static irqreturn_t uml_net_interrupt(int irq, void *dev_id)
 		schedule_work(&lp->work);
 		goto out;
 	}
-	reactivate_fd(lp->fd, UM_ETH_IRQ);
 
 out:
 	spin_unlock(&lp->lock);
diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index 40ca5cc..b0e9ff3 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -137,7 +137,6 @@ static void port_work_proc(struct work_struct *unused)
 		if (!port->has_connection)
 			continue;
 
-		reactivate_fd(port->fd, ACCEPT_IRQ);
 		while (port_accept(port))
 			;
 		port->has_connection = 0;
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index dd16c90..a392828 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -72,7 +72,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
 				return ret ? : -EAGAIN;
 
 			atomic_inc(&host_sleep_count);
-			reactivate_fd(random_fd, RANDOM_IRQ);
 			add_sigio_fd(random_fd);
 
 			add_wait_queue(&host_read_wait, &wait);
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index e8ab93c..731982c 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -466,7 +466,6 @@ static void ubd_handler(void)
 		blk_end_request(req->req, 0, req->length);
 		kfree(req);
 	}
-	reactivate_fd(thread_fd, UBD_IRQ);
 
 	list_for_each_safe(list, next_ele, &restart){
 		ubd = container_of(list, struct ubd, restart);
diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h
index df56330..0eca64c 100644
--- a/arch/um/include/shared/irq_user.h
+++ b/arch/um/include/shared/irq_user.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -9,16 +10,23 @@
 #include <sysdep/ptrace.h>
 
 struct irq_fd {
-	struct irq_fd *next;
-	void *id;
-	int fd;
-	int type;
-	int irq;
-	int events;
-	int current_events;
+        void *id;
+        int irq;
+        int events;
+};
+
+
+#define IRQ_READ  0
+#define IRQ_WRITE 1 
+#define IRQ_NONE 2
+#define MAX_IRQ_TYPE (IRQ_NONE + 1)
+
+struct irq_entry {
+        struct irq_entry *next;
+        int fd;
+	struct irq_fd * irq_array[MAX_IRQ_TYPE + 1];
 };
 
-enum { IRQ_READ, IRQ_WRITE };
 
 struct siginfo;
 extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 21d704b..c449839 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk})
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
  * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
@@ -284,15 +285,18 @@ extern void halt_skas(void);
 extern void reboot_skas(void);
 
 /* irq.c */
-extern int os_waiting_for_events(struct irq_fd *active_fds);
-extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
+
+extern int os_setup_epoll(int maxevents);
+extern int os_waiting_for_events_epoll(void *kernel_events, int maxevents);
+extern int os_add_epoll_fd (int events, int fd, void * data);
+extern int os_mod_epoll_fd (int events, int fd, void * data);
+extern int os_del_epoll_fd (int fd);
+
 extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
 		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
 extern void os_free_irq_later(struct irq_fd *active_fds,
 		int irq, void *dev_id);
-extern int os_get_pollfd(int i);
-extern void os_set_pollfd(int i, int fd);
-extern void os_set_ioignore(void);
+extern void os_close_epoll(void);
 
 /* sigio.c */
 extern int add_sigio_fd(int fd);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 23cb935..ff3069b 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -1,4 +1,7 @@
 /*
+ * Copyright (C) 2015 Brocade Communications Ltd
+ *	Author: Anton Ivanov aivanov@{brocade.com,kot-begemot.co.uk}
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
@@ -18,6 +21,61 @@
 #include <os.h>
 
 /*
+*	We are on the "kernel side" so we cannot pick up sys/epoll.h,
+*	so we lift the applicable key definitions out of it.
+*/
+
+
+enum EPOLL_EVENTS
+  {
+	EPOLLIN = 0x001,
+#define EPOLLIN EPOLLIN
+	EPOLLPRI = 0x002,
+#define EPOLLPRI EPOLLPRI
+	EPOLLOUT = 0x004,
+#define EPOLLOUT EPOLLOUT
+	EPOLLRDNORM = 0x040,
+#define EPOLLRDNORM EPOLLRDNORM
+	EPOLLRDBAND = 0x080,
+#define EPOLLRDBAND EPOLLRDBAND
+	EPOLLWRNORM = 0x100,
+#define EPOLLWRNORM EPOLLWRNORM
+	EPOLLWRBAND = 0x200,
+#define EPOLLWRBAND EPOLLWRBAND
+	EPOLLMSG = 0x400,
+#define EPOLLMSG EPOLLMSG
+	EPOLLERR = 0x008,
+#define EPOLLERR EPOLLERR
+	EPOLLHUP = 0x010,
+#define EPOLLHUP EPOLLHUP
+	EPOLLRDHUP = 0x2000,
+#define EPOLLRDHUP EPOLLRDHUP
+	EPOLLONESHOT = (1 << 30),
+#define EPOLLONESHOT EPOLLONESHOT
+	EPOLLET = (1 << 31)
+#define EPOLLET EPOLLET
+  };
+
+
+typedef union epoll_data
+{
+	void *ptr;
+	int fd;
+	uint32_t u32;
+	uint64_t u64;
+} epoll_data_t;
+
+struct epoll_event
+{
+	uint32_t events;	/* Epoll events */
+	epoll_data_t data;	/* User data variable */
+} __attribute__ ((__packed__));
+
+#define MAX_EPOLL_EVENTS 16
+
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+
+/*
  * This list is accessed under irq_lock, except in sigio_handler,
  * where it is safe from being modified.  IRQ handlers won't change it -
  * if an IRQ source has vanished, it will be freed by free_irqs just
@@ -25,44 +83,92 @@
  * list of irqs to free, with its own locking, coming back here to
  * remove list elements, taking the irq_lock to do so.
  */
-static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
+static struct irq_entry *active_fds = NULL;
 
 extern void free_irqs(void);
 
+
+static DEFINE_SPINLOCK(irq_lock);
+
+
+/*
+ * Principles of Operation:
+ * Each epoll event carries a pointer back to an irq_entry, which in
+ * turn holds the irq_fd entries for read, write and none and their
+ * matching event masks.
+ * This removes the need to look up "who talked".
+ * We no longer need to enable/disable any polls while we process them;
+ * epoll takes care of that. The exception to this (for now) is
+ * character devices, because of their own internal buffering, which
+ * needs to be updated to leverage the new write IRQ semantics.
+ * We can now support both read and write IRQs and have separate IRQs
+ * for read and write ops.
+ */
+
+
 void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
 	struct irq_fd *irq_fd;
-	int n;
+	struct irq_entry *irq_entry;
+	unsigned long flags;
+
+	int n, i, j;
 
 	while (1) {
-		n = os_waiting_for_events(active_fds);
-		if (n <= 0) {
-			if (n == -EINTR)
-				continue;
-			else break;
-		}
 
-		for (irq_fd = active_fds; irq_fd != NULL;
-		     irq_fd = irq_fd->next) {
-			if (irq_fd->current_events != 0) {
-				irq_fd->current_events = 0;
-				do_IRQ(irq_fd->irq, regs);
-			}
+		spin_lock_irqsave(&irq_lock, flags);
+
+		n = os_waiting_for_events_epoll(
+			&epoll_events, MAX_EPOLL_EVENTS
+		);
+
+
+ 		if (n <= 0) {
+			if (n == -EINTR) { continue; }
+			else { break; }
 		}
+
+
+		for (i = 0; i < n ; i++) {
+			/* start from the data ptr, walk the tree branch */
+			irq_entry = (struct irq_entry *) epoll_events[i].data.ptr;
+			for (j = 0; j < MAX_IRQ_TYPE ; j ++ ) {
+				irq_fd = irq_entry->irq_array[j];
+				if (irq_fd != NULL) {
+					if (epoll_events[i].events & irq_fd->events) {
+						do_IRQ(irq_fd->irq, regs);
+					}
+				}
+ 			}
+ 		}
+		spin_unlock_irqrestore(&irq_lock, flags);
 	}
 
 	free_irqs();
 }
 
-static DEFINE_SPINLOCK(irq_lock);
+static int update_events(struct irq_entry * irq_entry)
+{
+	int i;
+	int events = 0;
+	struct irq_fd * irq_fd;
+	for (i = 0; i < MAX_IRQ_TYPE ; i ++ ) {
+		irq_fd = irq_entry->irq_array[i];
+		if (irq_fd != NULL) {
+			events = irq_fd->events | events;
+		}
+	}
+	/* os_add_epoll will call os_mod_epoll if this already exists */
+	return os_add_epoll_fd(events, irq_entry->fd, irq_entry);
+}
+
 
 static int activate_fd(int irq, int fd, int type, void *dev_id)
 {
-	struct pollfd *tmp_pfd;
-	struct irq_fd *new_fd, *irq_fd;
+	struct irq_fd *new_fd;
+	struct irq_entry * irq_entry;
 	unsigned long flags;
-	int events, err, n;
+	int  i, err, events;
 
 	err = os_set_fd_async(fd);
 	if (err < 0)
@@ -74,186 +180,152 @@ static int activate_fd(int irq, int fd, int type, void *dev_id)
 		goto out;
 
 	if (type == IRQ_READ)
-		events = UM_POLLIN | UM_POLLPRI;
-	else events = UM_POLLOUT;
-	*new_fd = ((struct irq_fd) { .next  		= NULL,
-				     .id 		= dev_id,
-				     .fd 		= fd,
-				     .type 		= type,
-				     .irq 		= irq,
-				     .events 		= events,
-				     .current_events 	= 0 } );
-
-	err = -EBUSY;
-	spin_lock_irqsave(&irq_lock, flags);
-	for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
-		if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
-			printk(KERN_ERR "Registering fd %d twice\n", fd);
-			printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
-			printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
-			       dev_id);
-			goto out_unlock;
-		}
-	}
-
+		events |= EPOLLIN | EPOLLPRI;
 	if (type == IRQ_WRITE)
-		fd = -1;
+		events |= EPOLLOUT;
 
-	tmp_pfd = NULL;
-	n = 0;
+	*new_fd = ((struct irq_fd) {
+		.id 		= dev_id,
+		.irq 		= irq,
+		.events 	= events
+	});
 
-	while (1) {
-		n = os_create_pollfd(fd, events, tmp_pfd, n);
-		if (n == 0)
-			break;
+	err = -EBUSY;
 
-		/*
-		 * n > 0
-		 * It means we couldn't put new pollfd to current pollfds
-		 * and tmp_fds is NULL or too small for new pollfds array.
-		 * Needed size is equal to n as minimum.
-		 *
-		 * Here we have to drop the lock in order to call
-		 * kmalloc, which might sleep.
-		 * If something else came in and changed the pollfds array
-		 * so we will not be able to put new pollfd struct to pollfds
-		 * then we free the buffer tmp_fds and try again.
-		 */
-		spin_unlock_irqrestore(&irq_lock, flags);
-		kfree(tmp_pfd);
+	spin_lock_irqsave(&irq_lock, flags);
 
-		tmp_pfd = kmalloc(n, GFP_KERNEL);
-		if (tmp_pfd == NULL)
-			goto out_kfree;
+	for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+		if (irq_entry->fd == fd) break;
+	}
 
-		spin_lock_irqsave(&irq_lock, flags);
+	if (irq_entry == NULL) {
+		irq_entry = kmalloc(sizeof(struct irq_entry), GFP_KERNEL);
+		if (irq_entry == NULL) {
+			printk(KERN_ERR
+				"Failed to allocate new IRQ entry\n");
+			kfree(new_fd);
+			goto out;
+		}
+		irq_entry->fd = fd;
+		for (i = 0; i < MAX_IRQ_TYPE; i++) {
+			irq_entry->irq_array[i] = NULL;
+		}
+		irq_entry->next = active_fds;
+		active_fds = irq_entry;
 	}
 
-	*last_irq_ptr = new_fd;
-	last_irq_ptr = &new_fd->next;
+	if (irq_entry->irq_array[type] != NULL) {
+		printk(KERN_ERR
+			"Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
+			irq, fd, type, dev_id
+		);
+		goto out_unlock;
+	} else {
+		irq_entry->irq_array[type] = new_fd;
+	}
 
+	update_events(irq_entry);
+	
 	spin_unlock_irqrestore(&irq_lock, flags);
 
-	/*
-	 * This calls activate_fd, so it has to be outside the critical
-	 * section.
-	 */
-	maybe_sigio_broken(fd, (type == IRQ_READ));
+	maybe_sigio_broken(fd, (type != IRQ_NONE));
 
 	return 0;
 
  out_unlock:
 	spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
 	kfree(new_fd);
  out:
 	return err;
 }
 
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
-{
-	unsigned long flags;
 
-	spin_lock_irqsave(&irq_lock, flags);
-	os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
-	spin_unlock_irqrestore(&irq_lock, flags);
-}
-
-struct irq_and_dev {
-	int irq;
-	void *dev;
-};
-
-static int same_irq_and_dev(struct irq_fd *irq, void *d)
+static void do_free_by_irq_and_dev(
+	struct irq_entry* irq_entry,
+	unsigned int irq,
+	void * dev
+)
 {
-	struct irq_and_dev *data = d;
-
-	return ((irq->irq == data->irq) && (irq->id == data->dev));
+	int i;
+	struct irq_fd * to_free;
+	for (i = 0; i < MAX_IRQ_TYPE ; i ++ ) {
+		if (irq_entry->irq_array[i] != NULL) {
+			if (
+				(irq_entry->irq_array[i]->irq == irq) &&
+				(irq_entry->irq_array[i]->id == dev)
+			) {
+				to_free = irq_entry->irq_array[i];
+				irq_entry->irq_array[i] = NULL;
+				update_events(irq_entry);
+				kfree(to_free);
+			}
+		}
+	}
 }
 
-static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
+void free_irq_by_fd(int fd)
 {
-	struct irq_and_dev data = ((struct irq_and_dev) { .irq  = irq,
-							  .dev  = dev });
 
-	free_irq_by_cb(same_irq_and_dev, &data);
-}
+	struct irq_entry *irq_entry, *prev = NULL;
+	unsigned long flags;
+	int i;
 
-static int same_fd(struct irq_fd *irq, void *fd)
-{
-	return (irq->fd == *((int *)fd));
+	spin_lock_irqsave(&irq_lock, flags);
+	for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+		if (irq_entry->fd == fd) {
+			os_del_epoll_fd(fd);   /* ignore err, just do it */
+			for (i = 0; i < MAX_IRQ_TYPE ; i++) {
+				if (irq_entry->irq_array[i] != NULL) {
+					kfree(irq_entry->irq_array[i]);
+				}
+			}
+			if (prev == NULL) {
+				active_fds = irq_entry->next;
+			} else {
+				prev->next = irq_entry->next;
+			}
+			kfree(irq_entry);
+		} else {
+			prev = irq_entry;
+		}
+	}
+	spin_unlock_irqrestore(&irq_lock, flags);
+	
 }
 
-void free_irq_by_fd(int fd)
-{
-	free_irq_by_cb(same_fd, &fd);
-}
 
-/* Must be called with irq_lock held */
-static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
+static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
 {
-	struct irq_fd *irq;
-	int i = 0;
-	int fdi;
-
-	for (irq = active_fds; irq != NULL; irq = irq->next) {
-		if ((irq->fd == fd) && (irq->irq == irqnum))
-			break;
-		i++;
-	}
-	if (irq == NULL) {
-		printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
-		       fd);
-		goto out;
-	}
-	fdi = os_get_pollfd(i);
-	if ((fdi != -1) && (fdi != fd)) {
-		printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds "
-		       "and pollfds, fd %d vs %d, need %d\n", irq->fd,
-		       fdi, fd);
-		irq = NULL;
-		goto out;
-	}
-	*index_out = i;
- out:
-	return irq;
-}
 
-void reactivate_fd(int fd, int irqnum)
-{
-	struct irq_fd *irq;
+	struct irq_entry *irq_entry;
 	unsigned long flags;
-	int i;
 
 	spin_lock_irqsave(&irq_lock, flags);
-	irq = find_irq_by_fd(fd, irqnum, &i);
-	if (irq == NULL) {
-		spin_unlock_irqrestore(&irq_lock, flags);
-		return;
+	for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+		do_free_by_irq_and_dev(irq_entry, irq, dev);
 	}
-	os_set_pollfd(i, irq->fd);
 	spin_unlock_irqrestore(&irq_lock, flags);
-
-	add_sigio_fd(fd);
+	
 }
 
-void deactivate_fd(int fd, int irqnum)
+
+void reactivate_fd(int fd, int irqnum)
 {
-	struct irq_fd *irq;
+	struct irq_entry *irq_entry;
 	unsigned long flags;
-	int i;
-
 	spin_lock_irqsave(&irq_lock, flags);
-	irq = find_irq_by_fd(fd, irqnum, &i);
-	if (irq == NULL) {
-		spin_unlock_irqrestore(&irq_lock, flags);
-		return;
+	for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+		if (irq_entry->fd == fd) {
+			update_events(irq_entry);
+		}
 	}
-
-	os_set_pollfd(i, -1);
 	spin_unlock_irqrestore(&irq_lock, flags);
+	
+}
 
-	ignore_sigio_fd(fd);
+void deactivate_fd(int fd, int irqnum)
+{
+	os_del_epoll_fd(fd);   /* ignore err, just do it */
 }
 EXPORT_SYMBOL(deactivate_fd);
 
@@ -265,17 +337,17 @@ EXPORT_SYMBOL(deactivate_fd);
  */
 int deactivate_all_fds(void)
 {
-	struct irq_fd *irq;
+	struct irq_entry * irq_entry;
 	int err;
 
-	for (irq = active_fds; irq != NULL; irq = irq->next) {
-		err = os_clear_fd_async(irq->fd);
-		if (err)
-			return err;
+	for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+		os_del_epoll_fd(irq_entry->fd);   /* ignore err, just do it */
+		err = os_clear_fd_async(irq_entry->fd);
+		if (err) {
+			printk(KERN_ERR "Clear FD async failed with %d", err);
+		}
 	}
-	/* If there is a signal already queued, after unblocking ignore it */
-	os_set_ioignore();
-
+	os_close_epoll();
 	return 0;
 }
 
@@ -308,13 +380,13 @@ int um_request_irq(unsigned int irq, int fd, int type,
 {
 	int err;
 
-	if (fd != -1) {
+	err = request_irq(irq, handler, irqflags, devname, dev_id);
+
+	if ((!err) && (fd != -1)) {
 		err = activate_fd(irq, fd, type, dev_id);
-		if (err)
-			return err;
 	}
 
-	return request_irq(irq, handler, irqflags, devname, dev_id);
+	return err;
 }
 
 EXPORT_SYMBOL(um_request_irq);
@@ -352,9 +424,9 @@ void __init init_IRQ(void)
 	int i;
 
 	irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
-
-	for (i = 1; i < NR_IRQS; i++)
+	for (i = 1; i < NR_IRQS - 1 ; i++)
 		irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
+	os_setup_epoll(MAX_EPOLL_EVENTS);
 }
 
 /*
@@ -382,11 +454,11 @@ void __init init_IRQ(void)
  * thread_info.
  *
  * There are three cases -
- *     The first interrupt on the stack - sets up the thread_info and
+ *	 The first interrupt on the stack - sets up the thread_info and
  * handles the interrupt
- *     A nested interrupt interrupting the copying of the thread_info -
+ *	 A nested interrupt interrupting the copying of the thread_info -
  * can't handle the interrupt, as the stack is in an unknown state
- *     A nested interrupt not interrupting the copying of the
+ *	 A nested interrupt not interrupting the copying of the
  * thread_info - doesn't do any setup, just handles the interrupt
  *
  * The first job is to figure out whether we interrupted stack setup.
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index b9afb74..81b135a 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -6,6 +7,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <poll.h>
+#include <sys/epoll.h>
 #include <signal.h>
 #include <string.h>
 #include <irq_user.h>
@@ -16,120 +18,88 @@
  * Locked by irq_lock in arch/um/kernel/irq.c.  Changed by os_create_pollfd
  * and os_free_irq_by_cb, which are called under irq_lock.
  */
-static struct pollfd *pollfds = NULL;
-static int pollfds_num = 0;
-static int pollfds_size = 0;
 
-int os_waiting_for_events(struct irq_fd *active_fds)
+/* epoll support */
+
+
+static int epollfd = -1;
+
+int os_setup_epoll(int maxevents) {
+	epollfd = epoll_create(maxevents);
+	return epollfd;
+}
+
+int os_waiting_for_events_epoll(void *kernel_events, int maxevents)
 {
-	struct irq_fd *irq_fd;
-	int i, n, err;
+	int n, err;
 
-	n = poll(pollfds, pollfds_num, 0);
+	n = epoll_wait(epollfd,
+		(struct epoll_event *) kernel_events, maxevents, 0);
 	if (n < 0) {
 		err = -errno;
 		if (errno != EINTR)
-			printk(UM_KERN_ERR "os_waiting_for_events:"
-			       " poll returned %d, errno = %d\n", n, errno);
+			printk(
+				UM_KERN_ERR "os_waiting_for_events:"
+				" poll returned %d, error = %s\n", n,
+				strerror(errno)
+			);
 		return err;
 	}
 
-	if (n == 0)
-		return 0;
+	return n;
+}
 
-	irq_fd = active_fds;
+int os_add_epoll_fd (int events, int fd, void * data) {
+	struct epoll_event event;
+	int result;
 
-	for (i = 0; i < pollfds_num; i++) {
-		if (pollfds[i].revents != 0) {
-			irq_fd->current_events = pollfds[i].revents;
-			pollfds[i].fd = -1;
-		}
-		irq_fd = irq_fd->next;
+	event.data.ptr = data;
+	event.events = events | EPOLLET;
+	result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
+	if ((result) && (errno == EEXIST)) {
+		result = os_mod_epoll_fd (events, fd, data);
 	}
-	return n;
+	if (result) {
+		printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
+	}
+	return result;
 }
 
-int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
-{
-	if (pollfds_num == pollfds_size) {
-		if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
-			/* return min size needed for new pollfds area */
-			return (pollfds_size + 1) * sizeof(pollfds[0]);
-		}
-
-		if (pollfds != NULL) {
-			memcpy(tmp_pfd, pollfds,
-			       sizeof(pollfds[0]) * pollfds_size);
-			/* remove old pollfds */
-			kfree(pollfds);
-		}
-		pollfds = tmp_pfd;
-		pollfds_size++;
-	} else
-		kfree(tmp_pfd);	/* remove not used tmp_pfd */
-
-	pollfds[pollfds_num] = ((struct pollfd) { .fd		= fd,
-						  .events	= events,
-						  .revents	= 0 });
-	pollfds_num++;
-
-	return 0;
+int os_mod_epoll_fd (int events, int fd, void * data) {
+	struct epoll_event event;
+	int result;
+	event.data.ptr = data;
+	event.events = events;
+	result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
+	if (result) {
+		printk("epollctl mod err fd %d, %s\n", fd, strerror(errno));
+	}
+	return result;
 }
 
-void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
-		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
-{
-	struct irq_fd **prev;
-	int i = 0;
-
-	prev = &active_fds;
-	while (*prev != NULL) {
-		if ((*test)(*prev, arg)) {
-			struct irq_fd *old_fd = *prev;
-			if ((pollfds[i].fd != -1) &&
-			    (pollfds[i].fd != (*prev)->fd)) {
-				printk(UM_KERN_ERR "os_free_irq_by_cb - "
-				       "mismatch between active_fds and "
-				       "pollfds, fd %d vs %d\n",
-				       (*prev)->fd, pollfds[i].fd);
-				goto out;
-			}
-
-			pollfds_num--;
-
-			/*
-			 * This moves the *whole* array after pollfds[i]
-			 * (though it doesn't spot as such)!
-			 */
-			memmove(&pollfds[i], &pollfds[i + 1],
-			       (pollfds_num - i) * sizeof(pollfds[0]));
-			if (*last_irq_ptr2 == &old_fd->next)
-				*last_irq_ptr2 = prev;
-
-			*prev = (*prev)->next;
-			if (old_fd->type == IRQ_WRITE)
-				ignore_sigio_fd(old_fd->fd);
-			kfree(old_fd);
-			continue;
-		}
-		prev = &(*prev)->next;
-		i++;
+int os_del_epoll_fd (int fd) {
+	struct epoll_event event;
+	int result;
+	result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
+	if (result) {
+		printk("epollctl del err %s\n", strerror(errno));
 	}
- out:
-	return;
+	return result;
 }
 
-int os_get_pollfd(int i)
+void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
+		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
 {
-	return pollfds[i].fd;
+	printk("Someone invoking obsolete deactivate_by_CB!!!\n");
+	return;
 }
 
-void os_set_pollfd(int i, int fd)
+void os_set_ioignore(void)
 {
-	pollfds[i].fd = fd;
+	signal(SIGIO, SIG_IGN);
 }
 
-void os_set_ioignore(void)
+void os_close_epoll(void)
 {
-	signal(SIGIO, SIG_IGN);
+	os_close_file(epollfd);
 }
-- 
2.1.4



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-09 14:33 [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0 Anton Ivanov
@ 2015-11-09 15:03 ` Anton Ivanov
  2015-11-10 18:42   ` Anton Ivanov
  2015-11-11 20:46   ` Thomas Meyer
  0 siblings, 2 replies; 14+ messages in thread
From: Anton Ivanov @ 2015-11-09 15:03 UTC (permalink / raw)
  To: user-mode-linux-devel

It throws a couple of harmless "epoll del fd" warnings on reboot, which
result from the fact that disable_fd/enable_fd are not removed in the
terminal/line code.

These are harmless and will go away once the term/line code gets support 
for real write IRQs in addition to read at some point in the future.

I have fixed the file descriptor leak in the reboot case.

A.

On 09/11/15 14:33, Anton Ivanov wrote:
> Epoll based interrupt controller.
>
> IMPROVES: IO loop performance - no per fd lookups, allowing for
> 15% IO speedup in minimal config going to 100s of % with many
> devices - a N^N lookup is now replaced by a log(N)
>
> ADDS: True Write IRQ functionality
>
> OBSOLETES: The need to call reactivate_fd() in any driver which
> has only read IRQ semantics. Write IRQs work, but will need to
> be updated to use this fully.
>
> Potentially (with a change in API) will allow both edge and level
> IRQ semantics.
>
> Pre-requisite for using packet mmap and multipacket read/write
> which do not get along with poll() very well.
>
> Signed-off-by: Anton Ivanov <aivanov@brocade.com>



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-09 15:03 ` Anton Ivanov
@ 2015-11-10 18:42   ` Anton Ivanov
  2015-11-10 20:24     ` Richard Weinberger
  2015-11-11 20:46   ` Thomas Meyer
  1 sibling, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-10 18:42 UTC (permalink / raw)
  To: user-mode-linux-devel

Thomas Meyer found a couple of issues which crept up when porting v1 to
the newer kernels.

It also crashes with his config - I am analyzing the differences between
that and mine to find the culprit.

Once I have fixed both I will resubmit v3. So for now this is just 
"subject for discussion", do not use yet.

A

On 09/11/15 15:03, Anton Ivanov wrote:
> It throws a couple of harmless "epoll del fd" warnings on reboot which 
> result the fact that disable_fd/enable_fd are not removed in the 
> terminal/line code.
>
> These are harmless and will go away once the term/line code gets 
> support for real write IRQs in addition to read at some point in the 
> future.
>
> I have fixed the file descriptor leak in the reboot case.
>
> A.
>
> On 09/11/15 14:33, Anton Ivanov wrote:
>> Epoll based interrupt controller.
>>
>> IMPROVES: IO loop performance - no per fd lookups, allowing for
>> 15% IO speedup in minimal config going to 100s of % with many
>> devices - a N^N lookup is now replaced by a log(N)
>>
>> ADDS: True Write IRQ functionality
>>
>> OBSOLETES: The need to call reactivate_fd() in any driver which
>> has only read IRQ semantics. Write IRQs work, but will need to
>> be updated to use this fully.
>>
>> Potentially (with a change in API) will allow both edge and level
>> IRQ semantics.
>>
>> Pre-requisite for using packet mmap and multipacket read/write
>> which do not get along with poll() very well.
>>
>> Signed-off-by: Anton Ivanov <aivanov@brocade.com>
>



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-10 18:42   ` Anton Ivanov
@ 2015-11-10 20:24     ` Richard Weinberger
  0 siblings, 0 replies; 14+ messages in thread
From: Richard Weinberger @ 2015-11-10 20:24 UTC (permalink / raw)
  To: Anton Ivanov; +Cc: user-mode-linux-devel

On Tue, Nov 10, 2015 at 7:42 PM, Anton Ivanov
<anton.ivanov@kot-begemot.co.uk> wrote:
> Thomas Mayer found a couple of issues which crept up when porting v1 to
> the newer kernels.
>
> It also crashes with his config - I am analyzing the differences between
> that and mine to find the culrpit.
>
> Once I have fixed both I will resubmit v3. So for now this is just
> "subject for discussion", do not use yet.

There is no need to hurry. :-)
This is 4.5 material.

-- 
Thanks,
//richard


* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-09 15:03 ` Anton Ivanov
  2015-11-10 18:42   ` Anton Ivanov
@ 2015-11-11 20:46   ` Thomas Meyer
  2015-11-11 21:05     ` Richard Weinberger
  1 sibling, 1 reply; 14+ messages in thread
From: Thomas Meyer @ 2015-11-11 20:46 UTC (permalink / raw)
  To: user-mode-linux-devel

Am Montag, den 09.11.2015, 15:03 +0000 schrieb Anton Ivanov:
> It throws a couple of harmless "epoll del fd" warnings on reboot
> which 
> result the fact that disable_fd/enable_fd are not removed in the 
> terminal/line code.
> 
> These are harmless and will go away once the term/line code gets
> support 
> for real write IRQs in addition to read at some point in the future.
> 
> I have fixed the file descriptor leak in the reboot case.

Hi,

now with the list on copy!

Richard: can you make some sense out of these stack traces? I can
provide the config if you want!

I see a lot of bugs of type "BUG: spinlock recursion on CPU#0" with
this patch:

I did look over your patch and found two errors in the irq_lock
spinlock handling:

http://m3y3r.dyndns.org:9100/gerrit/#/c/2/1..2/arch/um/kernel/irq.c

But it still seems to miss something as above bugs still occurs, but
now the system boots up a bit more at least.

Example:
 [  225.570000] BUG: spinlock lockup suspected on CPU#0, chmod/516
 [  225.570000]  lock: irq_lock+0x0/0x18, .magic: dead4ead, .owner:
 [  225.570000] CPU: 0 PID: 516 Comm: chmod Not tainted 4.3.0-00010-
 g96b3e9d #100
 [  225.570000] Stack:
 [  225.570000]  69323490 6008bbd2 00000000 6b0fd580
 [  225.570000]  6083c440 600d191a 693234a0 603b8701
 [  225.570000]  693234e0 60085773 00000000 00000000
 [  225.570000] Call Trace:
 [  225.570000]  [<600d191a>] ? printk+0x0/0x94
 [  225.570000]  [<60047cc0>] ? __delay+0x0/0x30
 [  225.570000]  [<6002d84b>] show_stack+0xdb/0x1a0
 [  225.570000]  [<6008bbd2>] ? dump_stack_print_info+0xd2/0xf0
 [  225.570000]  [<600d191a>] ? printk+0x0/0x94
 [  225.570000]  [<603b8701>] dump_stack+0x2a/0x39
 [  225.570000]  [<60085773>] spin_dump+0xa3/0xd0
 [  225.570000]  [<600859ba>] do_raw_spin_lock+0x18a/0x1a0
 [  225.570000]  [<6063e443>] _raw_spin_lock_irqsave+0x33/0x40
 [  225.570000]  [<6002b7a0>] ? do_IRQ+0x0/0x60
 [  225.570000]  [<6002b836>] sigio_handler+0x36/0x120
 [  225.570000]  [<60043db1>] sig_handler_common+0x71/0xe0
 [  225.570000]  [<6063e566>] ? _raw_spin_unlock_irqrestore+0x26/0x30
 [  225.570000]  [<603d2a2b>] ? debug_check_no_obj_freed+0x28b/0x2a0
 [  225.570000]  [<6011e8bc>] ? cache_free_debugcheck+0x17c/0x470
 [  225.570000]  [<6011f520>] ? kfree+0x0/0x170
 [  225.570000]  [<60398377>] ? blk_update_bidi_request+0x27/0x90
 [  225.570000]  [<6003c41e>] ? ubd_intr+0x4e/0x140
 [  225.570000]  [<6011f629>] ? kfree+0x109/0x170
 [  225.570000]  [<60040d10>] ? os_read_file+0x0/0x30
 [  225.570000]  [<6011f520>] ? kfree+0x0/0x170
 [  225.570000]  [<60040d10>] ? os_read_file+0x0/0x30
 [  225.570000]  [<6011f520>] ? kfree+0x0/0x170
 [  225.570000]  [<60398790>] ? blk_end_request+0x0/0x20
 [  225.570000]  [<60043f60>] ? get_signals+0x0/0x10
 [  225.570000]  [<60040d35>] ? os_read_file+0x25/0x30
 [  225.570000]  [<6003c435>] ? ubd_intr+0x65/0x140
 [  225.570000]  [<60043d40>] ? sig_handler_common+0x0/0xe0
 [  225.570000]  [<600438c0>] ? timer_real_alarm_handler+0x0/0x50
 [  225.570000]  [<60043d1c>] unblock_signals+0x8c/0xb0
 [  225.570000]  [<60052c2c>] __do_softirq+0x9c/0x2a0
 [  225.570000]  [<6008c63f>] ? handle_irq_event+0x5f/0x80
 [  225.570000]  [<6005313c>] irq_exit+0xac/0xb0
 [  225.570000]  [<6008bcd3>] ? generic_handle_irq+0x33/0x40
 [  225.570000]  [<6002b7e7>] do_IRQ+0x47/0x60
 [  225.570000]  [<6002b7a0>] ? do_IRQ+0x0/0x60
 [  225.570000]  [<6002b8da>] sigio_handler+0xda/0x120
 [  225.570000]  [<60043db1>] sig_handler_common+0x71/0xe0
 [  225.570000]  [<6004691a>] ? unmap+0x4a/0x50
 [  225.570000]  [<60074ff6>] ? __might_sleep+0x66/0xf0
 [  225.570000]  [<600dcc90>] ? get_page_from_freelist+0x0/0xa80
 [  225.570000]  [<600f2440>] ? next_zones_zonelist+0x0/0x40
 [  225.570000]  [<600dda67>] ? __alloc_pages_nodemask+0x177/0xb10
 [  225.570000]  [<600ffa96>] ? do_set_pte+0xa6/0xc0
 [  225.570000]  [<600d2781>] ? unlock_page+0x31/0xa0
 [  225.570000]  [<600d3d64>] ? filemap_map_pages+0x324/0x340
 [  225.570000]  [<60046ce2>] ? wait_stub_done+0x62/0x140
 [  225.570000]  [<600462d0>] ? run_syscall_stub+0xf0/0x350
 [  225.570000]  [<60043f70>] ? set_signals+0x0/0x50
 [  225.570000]  [<600468ca>] ? map+0x4a/0x50
 [  225.570000]  [<6002e724>] ? flush_tlb_page+0x134/0x240
 [  225.570000]  [<601011b0>] ? handle_mm_fault+0x0/0x1620
 [  225.570000]  [<6002ed35>] ? handle_page_fault+0x265/0x350
 [  225.570000]  [<60043d40>] ? sig_handler_common+0x0/0xe0
 [  225.570000]  [<600438c0>] ? timer_real_alarm_handler+0x0/0x50
 [  225.570000]  [<60043d1c>] unblock_signals+0x8c/0xb0
 [  225.570000]  [<6063e53c>] _raw_spin_unlock_irq+0x1c/0x20
 [  225.570000]  [<60074999>] finish_task_switch+0x59/0xe0
 [  225.570000]  [<6004a88b>] ? arch_switch_to+0x2b/0x30
 [  225.570000]  [<60639141>] __schedule+0x1e1/0x5b0
 [  225.570000]  [<60638f60>] ? __schedule+0x0/0x5b0
 [  225.570000]  [<60639544>] schedule+0x34/0x90
 [  225.570000]  [<6002f426>] ? segv_handler+0x46/0x90
 [  225.570000]  [<6002c374>] interrupt_end+0xa4/0xb0
 [  225.570000]  [<60047373>] userspace+0x253/0x4e0
 [  225.570000]  [<60030303>] ? do_op_one_page+0x153/0x240
 [  225.570000]  [<600429af>] ? save_registers+0x1f/0x40
 [  225.570000]  [<6004a806>] ? arch_prctl+0x1f6/0x220
 [  225.570000]  [<6002c055>] fork_handler+0x85/0x90
 [  225.570000]

with kind regards
thomas



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-11 20:46   ` Thomas Meyer
@ 2015-11-11 21:05     ` Richard Weinberger
  2015-11-11 21:39       ` Anton Ivanov
  2015-11-12 12:29       ` Anton Ivanov
  0 siblings, 2 replies; 14+ messages in thread
From: Richard Weinberger @ 2015-11-11 21:05 UTC (permalink / raw)
  To: Thomas Meyer; +Cc: user-mode-linux-devel

On Wed, Nov 11, 2015 at 9:46 PM, Thomas Meyer <thomas@m3y3r.de> wrote:
> Am Montag, den 09.11.2015, 15:03 +0000 schrieb Anton Ivanov:
>> It throws a couple of harmless "epoll del fd" warnings on reboot
>> which
>> result the fact that disable_fd/enable_fd are not removed in the
>> terminal/line code.
>>
>> These are harmless and will go away once the term/line code gets
>> support
>> for real write IRQs in addition to read at some point in the future.
>>
>> I have fixed the file descriptor leak in the reboot case.
>
> Hi,
>
> now with the list on copy!
>
> Richard: can you make some sense out of these stack traces? I can
> provide the config if you want!
>
> I see a lot of bugs of type "BUG: spinlock recursion on CPU#0" with
> this patch:
>
> I did look over your patch and found two errors in the irq_lock
> spinlock handling:
>
> http://m3y3r.dyndns.org:9100/gerrit/#/c/2/1..2/arch/um/kernel/irq.c
>
> But it still seems to miss something as above bugs still occurs, but
> now the system boots up a bit more at least.
>
> Example:
>  [  225.570000] BUG: spinlock lockup suspected on CPU#0, chmod/516
>  [  225.570000]  lock: irq_lock+0x0/0x18, .magic: dead4ead, .owner:

Hmmm, UML is UP and does not support PREEMPT, so all spinlocks
should be a no-op.
Do you have lock debugging enabled?

In this case I'd start gdb and inspect the memory. Maybe a stack corruption.

-- 
Thanks,
//richard


* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-11 21:05     ` Richard Weinberger
@ 2015-11-11 21:39       ` Anton Ivanov
  2015-11-11 21:49         ` stian
  2015-11-12 12:29       ` Anton Ivanov
  1 sibling, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-11 21:39 UTC (permalink / raw)
  To: user-mode-linux-devel

I do not see it with my config. I have cleaned up a few things and can 
send an updated version.

At the same time it is 100% reproducible using the config I got from 
Thomas. So this is somehow config dependent.

I have compared the configs and the biggest difference I can see is that 
Thomas has most of the debug options enabled.

Everything else is the same or not relevant (nat, etc - module options).

In any case - I cannot make sense of the traces - they show nested 
invocation of the sigio handler and nested interrupt invocation. If 
there is no stack corruption there should be no way to get that - the 
enable/disable and hard handler code in signal.c ensures that.

I will probably add some "manual" debug code to check if the nested 
invocation happens with the debug options off.

A.


On 11/11/15 21:05, Richard Weinberger wrote:
> On Wed, Nov 11, 2015 at 9:46 PM, Thomas Meyer <thomas@m3y3r.de> wrote:
>> Am Montag, den 09.11.2015, 15:03 +0000 schrieb Anton Ivanov:
>>> It throws a couple of harmless "epoll del fd" warnings on reboot
>>> which
>>> result the fact that disable_fd/enable_fd are not removed in the
>>> terminal/line code.
>>>
>>> These are harmless and will go away once the term/line code gets
>>> support
>>> for real write IRQs in addition to read at some point in the future.
>>>
>>> I have fixed the file descriptor leak in the reboot case.
>> Hi,
>>
>> now with the list on copy!
>>
>> Richard: can you make some sense out of these stack traces? I can
>> provide the config if you want!
>>
>> I see a lot of bugs of type "BUG: spinlock recursion on CPU#0" with
>> this patch:
>>
>> I did look over your patch and found two errors in the irq_lock
>> spinlock handling:
>>
>> http://m3y3r.dyndns.org:9100/gerrit/#/c/2/1..2/arch/um/kernel/irq.c
>>
>> But it still seems to miss something as above bugs still occurs, but
>> now the system boots up a bit more at least.
>>
>> Example:
>>   [  225.570000] BUG: spinlock lockup suspected on CPU#0, chmod/516
>>   [  225.570000]  lock: irq_lock+0x0/0x18, .magic: dead4ead, .owner:
> Hmmm, UML is UP and does not support PREEMPT, so all spinlocks
> should be a no-op.
> Do you have lock debugging enabled?
>
> I this case I'd start gdb and inspect the memory. Maybe a stack corruption.
>



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-11 21:39       ` Anton Ivanov
@ 2015-11-11 21:49         ` stian
  2015-11-11 23:25           ` Anton Ivanov
  0 siblings, 1 reply; 14+ messages in thread
From: stian @ 2015-11-11 21:49 UTC (permalink / raw)
  To: user-mode-linux-devel

> I will probably add some "manual" debug code to check if the nested
> invocation happens with the debug options off.

Might be timing dependent and/or an issue with nested blocking/unblocking
of signals.
Or debug printing when it should not happen.

Just my two cents

Stian Skjelstad



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-11 21:49         ` stian
@ 2015-11-11 23:25           ` Anton Ivanov
  0 siblings, 0 replies; 14+ messages in thread
From: Anton Ivanov @ 2015-11-11 23:25 UTC (permalink / raw)
  To: user-mode-linux-devel

There is for some reason IRQ/sigio handler reentrancy.

Both the debug "by hand" I stuck into irq.c and Thomas's traces confirm it.

I have no clue how this is possible as the code in signal.c should guard 
against that.

I will sleep on it and try to tackle this tomorrow - no idea what is 
going on, but this is what is happening here.

A.

On 11/11/15 21:49, stian@nixia.no wrote:
>> I will probably add some "manual" debug code to check if the nested
>> invocation happens with the debug options off.
> Might be timing dependent and/or issue with nested blocking/unblocking
> of signals.
> Or debug printing when it should not.
>
> Just my two cents
>
> Stian Skjelstad
>
>



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-11 21:05     ` Richard Weinberger
  2015-11-11 21:39       ` Anton Ivanov
@ 2015-11-12 12:29       ` Anton Ivanov
  2015-11-12 15:23         ` Anton Ivanov
  1 sibling, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-12 12:29 UTC (permalink / raw)
  To: user-mode-linux-devel

On 11/11/15 21:05, Richard Weinberger wrote:
> On Wed, Nov 11, 2015 at 9:46 PM, Thomas Meyer <thomas@m3y3r.de> wrote:
>> Am Montag, den 09.11.2015, 15:03 +0000 schrieb Anton Ivanov:
>>> It throws a couple of harmless "epoll del fd" warnings on reboot
>>> which
>>> result the fact that disable_fd/enable_fd are not removed in the
>>> terminal/line code.
>>>
>>> These are harmless and will go away once the term/line code gets
>>> support
>>> for real write IRQs in addition to read at some point in the future.
>>>
>>> I have fixed the file descriptor leak in the reboot case.
>> Hi,
>>
>> now with the list on copy!
>>
>> Richard: can you make some sense out of these stack traces? I can
>> provide the config if you want!
>>
>> I see a lot of bugs of type "BUG: spinlock recursion on CPU#0" with
>> this patch:
>>
>> I did look over your patch and found two errors in the irq_lock
>> spinlock handling:
>>
>> http://m3y3r.dyndns.org:9100/gerrit/#/c/2/1..2/arch/um/kernel/irq.c
>>
>> But it still seems to miss something as above bugs still occurs, but
>> now the system boots up a bit more at least.
>>
>> Example:
>>   [  225.570000] BUG: spinlock lockup suspected on CPU#0, chmod/516
>>   [  225.570000]  lock: irq_lock+0x0/0x18, .magic: dead4ead, .owner:
> Hmmm, UML is UP and does not support PREEMPT, so all spinlocks
> should be a no-op.

In that case, if I understand correctly what is going on, there are a
couple of places - free_irqs(), activate_fd and the sigio handler
itself - where it should be a mutex, not a spinlock. It is there to
ensure that the structures cannot be used in an interrupt context while
they are being modified.

If the spinlock is a NOP it fails to perform this duty. The code should
also be different - it should return on a failed try_lock so it does not
deadlock, which makes spin_lock_irqsave the wrong locking primitive as it
does not offer that behaviour.

That is an issue both with this patch and with the original poll based
controller - there, free_irq, add_fd and reactivate_fd can all
theoretically produce a race if you are adding/removing devices under
high IO load.
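
For illustration, on a UP, !PREEMPT, non-debug build the locking side of
spin_lock_irqsave() reduces to roughly the following (a simplified sketch
in the spirit of include/linux/spinlock_api_up.h, with made-up macro
names, not the actual source) - the only thing that protects anything
here is the interrupt disable:

#define up_lock_irqsave(lock, flags)			\
	do { local_irq_save(flags); (void)(lock); } while (0)

#define up_unlock_irqrestore(lock, flags)		\
	do { local_irq_restore(flags); (void)(lock); } while (0)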

A.

> Do you have lock debugging enabled?
>
> I this case I'd start gdb and inspect the memory. Maybe a stack corruption.
>


* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-12 12:29       ` Anton Ivanov
@ 2015-11-12 15:23         ` Anton Ivanov
  2015-11-12 16:03           ` Anton Ivanov
  0 siblings, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-12 15:23 UTC (permalink / raw)
  To: user-mode-linux-devel

[snip]

>> Hmmm, UML is UP and does not support PREEMPT, so all spinlocks
>> should be a no-op.
> In that case, if I understand correctly what is going on, there are a
> couple of places - the free_irqs(), activate_fd and the sigio handler
> itself, where it should not be a mutex, not a spinlock. It is there to
> ensure that you cannot use it in an interrupt context while it is being
> modified.
>
> If spinlock is a NOP it fails to perform this duty. The code should also
> be different - it should return on try_lock so it does not deadlock so
> spinlock_irqsave is the wrong locking primitive as it does not have this
> functionality.
>
> That is an issue both with this patch and with the original poll based
> controller - there free_irq, add_fd, reactivate_fd can all theoretically
> produce a race if you are adding/removing devices while under high IO load.

We, however, cannot use a mutex here as this is interrupt context.

I tried with spin_trylock and finally got the correct behaviour. It 
throws an occasional warning here and there while inserting/removing 
devices, but works correctly with either config. No more BUGs.

A bare (not try) spinlock with UP/PREEMPT set as they are in UML
actually does not guard anything effectively - it is a NOP. The try form
is an exception - if you look at spinlock.h it is actually "viable" even
on UP. It will, however, throw a warning that it is activated in an
inappropriate context if it hits an existing lock.

In theory - the code in signal.c should guard against nested interrupt 
invocation. I am still struggling to understand why it fails to work in 
practice.

This also leaves open the question of how to add/remove interrupts. If
the spinlock does not actually guard the irq data structures, modifying
them in a safe manner becomes a very interesting exercise. I have it
working with the try form, but that throws the occasional warning.
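
For the record, the try form guard is along these lines (an illustrative
sketch only - the dispatch body is elided and the real code walks the
epoll events as in the patch):

void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
	/*
	 * If the irq structures are being modified, bail out instead of
	 * spinning; the events are picked up on a later invocation.
	 */
	if (!spin_trylock(&irq_lock))
		return;

	/* ... fetch the epoll events and dispatch do_IRQ() as in the patch ... */

	spin_unlock(&irq_lock);
}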

I am going to clean it up and re-submit so we have a "working version" 
which people can comment on.

A.

>
> A.
>
>> Do you have lock debugging enabled?
>>
>> I this case I'd start gdb and inspect the memory. Maybe a stack corruption.
>>



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-12 15:23         ` Anton Ivanov
@ 2015-11-12 16:03           ` Anton Ivanov
  2015-11-16  8:09             ` Anton Ivanov
  0 siblings, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-12 16:03 UTC (permalink / raw)
  To: user-mode-linux-devel

On 12/11/15 15:23, Anton Ivanov wrote:
> [snip]
>
>>> Hmmm, UML is UP and does not support PREEMPT, so all spinlocks
>>> should be a no-op.
>> In that case, if I understand correctly what is going on, there are a
>> couple of places - the free_irqs(), activate_fd and the sigio handler
>> itself, where it should not be a mutex, not a spinlock. It is there to
>> ensure that you cannot use it in an interrupt context while it is being
>> modified.
>>
>> If spinlock is a NOP it fails to perform this duty. The code should also
>> be different - it should return on try_lock so it does not deadlock so
>> spinlock_irqsave is the wrong locking primitive as it does not have this
>> functionality.
>>
>> That is an issue both with this patch and with the original poll based
>> controller - there free_irq, add_fd, reactivate_fd can all theoretically
>> produce a race if you are adding/removing devices while under high IO load.
> We, however cannot use mutex here as it is interrupt.
>
> I tried with spin_trylock and finally got the correct behaviour. It
> throws an occasional warning here and there while inserting/removing
> devices, but works correctly with either config. No more BUGs.
>
> A bare (not try) spinlock with UP/PREEMPT set as they are in UML
> actually does not guard anything effectively - it is a NOP. The try form
> is an exemption - if you look at spinlock.h it is actually "viable" even
> on UP. It will however throw a warning that it is activated in an
> inappropriate context if it hits an existing lock.
>
> In theory - the code in signal.c should guard against nested interrupt
> invocation. I am still struggling to understand why it fails to work in
> practice.
>
> This also leaves open the question on how to add/remove interrupts. If
> the spinlock does not actually guard the irq data structures properly
> modifying them in a safe manner becomes a very interesting exercise. I
> have it working with the try form, but that throws the occasional warning.
>
> I am going to clean it up and re-submit so we have a "working version"
> which people can comment on.

Putting an extra guard around the signal handler in signal.c which 
prevents recursive invocation solves it completely.
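
In outline the guard is along these lines (an illustrative sketch - the
flag name and the exact placement inside signal.c are not the final
code):

static int in_sigio_handler;

static void guarded_sigio(int sig, struct siginfo *si, struct uml_pt_regs *regs)
{
	if (in_sigio_handler)
		return;		/* drop the nested invocation */
	in_sigio_handler = 1;
	sig_handler_common(sig, si, regs);
	in_sigio_handler = 0;
}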

I am going to clean it up, see if we need a similar guard around the 
timer interrupt and re-submit tomorrow morning.

>
> A.
>
>> A.
>>
>>> Do you have lock debugging enabled?
>>>
>>> I this case I'd start gdb and inspect the memory. Maybe a stack corruption.
>>>


* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-12 16:03           ` Anton Ivanov
@ 2015-11-16  8:09             ` Anton Ivanov
  2015-11-18  8:33               ` Anton Ivanov
  0 siblings, 1 reply; 14+ messages in thread
From: Anton Ivanov @ 2015-11-16  8:09 UTC (permalink / raw)
  To: user-mode-linux-devel

On 12/11/15 16:03, Anton Ivanov wrote:
> On 12/11/15 15:23, Anton Ivanov wrote:
>> [snip]
>>
>>>> Hmmm, UML is UP and does not support PREEMPT, so all spinlocks
>>>> should be a no-op.
>>> In that case, if I understand correctly what is going on, there are a
>>> couple of places - the free_irqs(), activate_fd and the sigio handler
>>> itself, where it should not be a mutex, not a spinlock. It is there to
>>> ensure that you cannot use it in an interrupt context while it is being
>>> modified.
>>>
>>> If spinlock is a NOP it fails to perform this duty. The code should also
>>> be different - it should return on try_lock so it does not deadlock so
>>> spinlock_irqsave is the wrong locking primitive as it does not have this
>>> functionality.
>>>
>>> That is an issue both with this patch and with the original poll based
>>> controller - there free_irq, add_fd, reactivate_fd can all theoretically
>>> produce a race if you are adding/removing devices while under high IO load.
>> We, however cannot use mutex here as it is interrupt.
>>
>> I tried with spin_trylock and finally got the correct behaviour. It
>> throws an occasional warning here and there while inserting/removing
>> devices, but works correctly with either config. No more BUGs.
>>
>> A bare (not try) spinlock with UP/PREEMPT set as they are in UML
>> actually does not guard anything effectively - it is a NOP. The try form
>> is an exemption - if you look at spinlock.h it is actually "viable" even
>> on UP. It will however throw a warning that it is activated in an
>> inappropriate context if it hits an existing lock.
>>
>> In theory - the code in signal.c should guard against nested interrupt
>> invocation. I am still struggling to understand why it fails to work in
>> practice.
>>
>> This also leaves open the question on how to add/remove interrupts. If
>> the spinlock does not actually guard the irq data structures properly
>> modifying them in a safe manner becomes a very interesting exercise. I
>> have it working with the try form, but that throws the occasional warning.
>>
>> I am going to clean it up and re-submit so we have a "working version"
>> which people can comment on.
> Putting an extra guard around the signal handler in signal.c which
> prevents recursive invocation solves it completely.
>
> I am going to clean it up, see if we need a similar guard around the
> timer interrupt and re-submit tomorrow morning.

That worked. So in principle the proposed IRQ semantics are sound.

However, in practice forcing the re-invocation of IRQs and returning up
the IRQ chain resulted in a significant performance drop - from 15%
above the original to 15% below the original. By using a guard we also
lose the possibility of having edge triggers in the future, so overall
this is a no-go.

I am going to "sleep on it" and come back to it later in the week to 
figure out why we do not see recursive invocation with the original and 
see it with the epoll one.

Once I get to the bottom of that, I will resubmit a fixed version.

A.

>
>> A.
>>
>>> A.
>>>
>>>> Do you have lock debugging enabled?
>>>>
>>>> I this case I'd start gdb and inspect the memory. Maybe a stack corruption.
>>>>



* Re: [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0
  2015-11-16  8:09             ` Anton Ivanov
@ 2015-11-18  8:33               ` Anton Ivanov
  0 siblings, 0 replies; 14+ messages in thread
From: Anton Ivanov @ 2015-11-18  8:33 UTC (permalink / raw)
  To: user-mode-linux-devel

Hi all,

I have looked through this and I still have not figured it out 
completely. Moving from the old poll controller to the new epoll one 
triggers deep recursive invocations of the interrupt handlers. I still 
do not understand why this happens, so I will park it for now and come
back to it next week.

I did, however, find a number of small issues. Namely, various patches 
and fixes over the years have used calls to block/unblock signals and 
block/unblock irqs in a few places where these can create a recursion 
race (even with the old controller).

If I understand os-Linux/irq.c correctly, block/unblock in UML does not
block or unblock the signals. It blocks/unblocks the processing of them,
and unblock can (and will) result in the immediate processing of any
pending signals. So in most places that should not be block/unblock
(and, respectively, local_irq_enable/local_irq_disable which invoke
those). It should be save+block and restore. Otherwise you recurse by
invoking the IRQ handler the moment you call unblock_signals().
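
To illustrate the difference (a sketch only - in UML
local_irq_disable()/local_irq_enable() map onto
block_signals()/unblock_signals(), and
local_irq_save()/local_irq_restore() onto save+block and restore):

static void risky(void)
{
	local_irq_disable();
	/* ... modify IRQ bookkeeping ... */
	local_irq_enable();	/* unconditionally unblocks: any pending
				 * SIGIO is processed right here, even if
				 * the caller already had signals blocked */
}

static void safe(void)
{
	unsigned long flags;

	local_irq_save(flags);
	/* ... modify IRQ bookkeeping ... */
	local_irq_restore(flags);	/* goes back to the caller's state */
}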

Additionally, if you want just to wait and be interrupted by a signal,
you do not need to enable/disable IRQs - signals are always received at
present. If I understand the situation correctly, irq on/off only
changes whether they are processed or not.

I am going to roll my tree back to the timer patch now and go through 
the ones I found so far one by one and submit them separately. Once that 
is out of the way we can look again at the epoll controller patch. It 
has potential, but it makes all gremlins come out of the woodwork so we 
might as well get the gremlins out of the way first.

[snip]

A.


end of thread, other threads:[~2015-11-18  8:33 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-09 14:33 [uml-devel] [PATCH v2] EPOLL Interrupt Controller V2.0 Anton Ivanov
2015-11-09 15:03 ` Anton Ivanov
2015-11-10 18:42   ` Anton Ivanov
2015-11-10 20:24     ` Richard Weinberger
2015-11-11 20:46   ` Thomas Meyer
2015-11-11 21:05     ` Richard Weinberger
2015-11-11 21:39       ` Anton Ivanov
2015-11-11 21:49         ` stian
2015-11-11 23:25           ` Anton Ivanov
2015-11-12 12:29       ` Anton Ivanov
2015-11-12 15:23         ` Anton Ivanov
2015-11-12 16:03           ` Anton Ivanov
2015-11-16  8:09             ` Anton Ivanov
2015-11-18  8:33               ` Anton Ivanov
