LKML Archive on lore.kernel.org
 help / color / Atom feed
From: Roman Penyaev <rpenyaev@suse.de>
To: unlisted-recipients:; (no To-header on input)
Cc: Roman Penyaev <rpenyaev@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Davidlohr Bueso <dbueso@suse.de>, Jason Baron <jbaron@akamai.com>,
	Al Viro <viro@zeniv.linux.org.uk>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrea Parri <andrea.parri@amarulasolutions.com>,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [RFC PATCH 05/15] epoll: introduce user header structure and user index for polling from userspace
Date: Wed,  9 Jan 2019 17:40:15 +0100
Message-ID: <20190109164025.24554-6-rpenyaev@suse.de> (raw)
In-Reply-To: <20190109164025.24554-1-rpenyaev@suse.de>

This one introduces main user structures: user header and user index.
Header describes current state of epoll, head and tail of the index
ring, epoll items at the end of the structure.

Index table is a ring, which is controlled by head and tail from the
user header.  Ring consists of u32 indeces, pointing to items in header,
which have been ready for polling.

Userspace has to call epoll_create1(EPOLL_USERPOLL) in order to start
using polling from user side.

Signed-off-by: Roman Penyaev <rpenyaev@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrea Parri <andrea.parri@amarulasolutions.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 fs/eventpoll.c                 | 107 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/eventpoll.h |   3 +-
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2cc183e86a29..9ec682b6488f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -178,6 +178,42 @@ struct epitem {
 	struct epoll_event event;
 };
 
+#define EPOLL_USER_HEADER_SIZE  128
+#define EPOLL_USER_HEADER_MAGIC 0xeb01eb01
+
+enum {
+	EPOLL_USER_POLL_INACTIVE = 0, /* user poll disactivated */
+	EPOLL_USER_POLL_ACTIVE   = 1, /* user can continue busy polling */
+
+	/*
+	 * Always keep some slots ahead to be able to consume new events
+	 * from many threads, i.e. if N threads consume ring from userspace,
+	 * we have to keep N free slots ahead to avoid ring overlap.
+	 *
+	 * Probably this number should be reported to userspace in header.
+	 */
+	EPOLL_USER_EXTRA_INDEX_NR = 16 /* how many extra indeces keep in ring */
+};
+
+struct user_epitem {
+	unsigned int ready_events;
+	struct epoll_event event;
+};
+
+struct user_header {
+	unsigned int magic;          /* epoll user header magic */
+	unsigned int state;          /* epoll ring state */
+	unsigned int header_length;  /* length of the header + items */
+	unsigned int index_length;   /* length of the index ring */
+	unsigned int max_items_nr;   /* max num of items slots */
+	unsigned int max_index_nr;   /* max num of items indeces, always pow2 */
+	unsigned int head;           /* updated by userland */
+	unsigned int tail;           /* updated by kernel */
+	unsigned int padding[24];    /* Header size is 128 bytes */
+
+	struct user_epitem items[];
+};
+
 /*
  * This structure is stored inside the "private_data" member of the file
  * structure and represents the main data structure for the eventpoll
@@ -222,6 +258,36 @@ struct eventpoll {
 
 	struct file *file;
 
+	/* User header with array of items */
+	struct user_header *user_header;
+
+	/* User index, which acts as a ring of coming events */
+	unsigned int *user_index;
+
+	/* Actual length of user header, always aligned on page */
+	unsigned int header_length;
+
+	/* Actual length of user index, always aligned on page */
+	unsigned int index_length;
+
+	/* Number of event items */
+	unsigned int items_nr;
+
+	/* Items bitmap, is used to get a free bit for new registered epi */
+	unsigned long *items_bm;
+
+	/* Removed items bitmap, is used to postpone bit put */
+	unsigned long *removed_items_bm;
+
+	/* Length of both items bitmaps, always aligned on page */
+	unsigned int items_bm_length;
+
+	/*
+	 * Where events are routed: to kernel lists or to user ring.
+	 * Always false for epfd created without EPOLL_USERPOLL.
+	 */
+	bool events_to_uring;
+
 	/* used to optimize loop detection check */
 	int visited;
 	struct list_head visited_list_link;
@@ -876,6 +942,10 @@ static void ep_free(struct eventpoll *ep)
 	mutex_destroy(&ep->mtx);
 	free_uid(ep->user);
 	wakeup_source_unregister(ep->ws);
+	vfree(ep->user_header);
+	vfree(ep->user_index);
+	vfree(ep->items_bm);
+	vfree(ep->removed_items_bm);
 	kfree(ep);
 }
 
@@ -1028,7 +1098,7 @@ void eventpoll_release_file(struct file *file)
 	mutex_unlock(&epmutex);
 }
 
-static int ep_alloc(struct eventpoll **pep)
+static int ep_alloc(struct eventpoll **pep, int flags)
 {
 	int error;
 	struct user_struct *user;
@@ -1040,6 +1110,31 @@ static int ep_alloc(struct eventpoll **pep)
 	if (unlikely(!ep))
 		goto free_uid;
 
+	if (flags & EPOLL_USERPOLL) {
+		ep->user_header = vmalloc_user(PAGE_SIZE);
+		ep->user_index = vmalloc_user(PAGE_SIZE);
+		ep->items_bm = vzalloc(PAGE_SIZE);
+		ep->removed_items_bm = vzalloc(PAGE_SIZE);
+		ep->events_to_uring = true;
+		if (!ep->user_header || !ep->user_index)
+			goto free_ep;
+		if (!ep->items_bm || !ep->removed_items_bm)
+			goto free_ep;
+
+		ep->header_length   = PAGE_SIZE;
+		ep->index_length    = PAGE_SIZE;
+		ep->items_bm_length = PAGE_SIZE;
+
+		*ep->user_header = (typeof(*ep->user_header)) {
+			.magic         = EPOLL_USER_HEADER_MAGIC,
+			.state         = EPOLL_USER_POLL_ACTIVE,
+			.header_length = ep->header_length,
+			.index_length  = ep->index_length,
+			.max_items_nr  = ep_max_items_nr(ep),
+			.max_index_nr  = ep_max_index_nr(ep),
+		};
+	}
+
 	mutex_init(&ep->mtx);
 	rwlock_init(&ep->lock);
 	init_waitqueue_head(&ep->wq);
@@ -1053,6 +1148,12 @@ static int ep_alloc(struct eventpoll **pep)
 
 	return 0;
 
+free_ep:
+	vfree(ep->user_header);
+	vfree(ep->user_index);
+	vfree(ep->items_bm);
+	vfree(ep->removed_items_bm);
+	kfree(ep);
 free_uid:
 	free_uid(user);
 	return error;
@@ -2066,12 +2167,12 @@ static int do_epoll_create(int flags)
 	/* Check the EPOLL_* constant for consistency.  */
 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
 
-	if (flags & ~EPOLL_CLOEXEC)
+	if (flags & ~(EPOLL_CLOEXEC | EPOLL_USERPOLL))
 		return -EINVAL;
 	/*
 	 * Create the internal data structure ("struct eventpoll").
 	 */
-	error = ep_alloc(&ep);
+	error = ep_alloc(&ep, flags);
 	if (error < 0)
 		return error;
 	/*
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 39dfc29f0f52..b0a565f6c6c3 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -20,7 +20,8 @@
 #include <linux/types.h>
 
 /* Flags for epoll_create1.  */
-#define EPOLL_CLOEXEC O_CLOEXEC
+#define EPOLL_CLOEXEC  O_CLOEXEC
+#define EPOLL_USERPOLL 1
 
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1
-- 
2.19.1


  parent reply index

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-09 16:40 [RFC 00/15] epoll: support pollable epoll " Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 01/15] mm/vmalloc: add new 'alignment' field for vm_struct structure Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 02/15] mm/vmalloc: move common logic from __vmalloc_area_node to a separate func Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 03/15] mm/vmalloc: introduce new vrealloc() call and its subsidiary reach analog Roman Penyaev
2019-01-09 16:50   ` Matthew Wilcox
2019-01-10 10:08     ` Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 04/15] epoll: move private helpers from a header to the source Roman Penyaev
2019-01-09 16:40 ` Roman Penyaev [this message]
2019-01-09 16:40 ` [RFC PATCH 06/15] epoll: introduce various of helpers for user structure lengths calculations Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 07/15] epoll: extend epitem struct with new members for polling from userspace Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 08/15] epoll: some sanity flags checks for epoll syscalls for polled epfd " Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 09/15] epoll: introduce stand-alone helpers for polling " Roman Penyaev
2019-01-09 17:29   ` Linus Torvalds
2019-01-10 10:03     ` Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 10/15] epoll: support polling from userspace for ep_insert() Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 11/15] epoll: offload polling to a work in case of epfd polled from userspace Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 12/15] epoll: support polling from userspace for ep_remove() Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 13/15] epoll: support polling from userspace for ep_modify() Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 14/15] epoll: support polling from userspace for ep_poll() Roman Penyaev
2019-01-09 16:40 ` [RFC PATCH 15/15] epoll: support mapping for epfd when polled from userspace Roman Penyaev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190109164025.24554-6-rpenyaev@suse.de \
    --to=rpenyaev@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=andrea.parri@amarulasolutions.com \
    --cc=dbueso@suse.de \
    --cc=jbaron@akamai.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git
	git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git
	git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git
	git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org
	public-inbox-index lkml

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git