From: Roman Penyaev <rpenyaev@suse.de> To: unlisted-recipients:; (no To-header on input) Cc: Roman Penyaev <rpenyaev@suse.de>, Andrew Morton <akpm@linux-foundation.org>, Davidlohr Bueso <dbueso@suse.de>, Jason Baron <jbaron@akamai.com>, Al Viro <viro@zeniv.linux.org.uk>, "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>, Linus Torvalds <torvalds@linux-foundation.org>, Andrea Parri <andrea.parri@amarulasolutions.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [RFC PATCH 09/15] epoll: introduce stand-alone helpers for polling from userspace Date: Wed, 9 Jan 2019 17:40:19 +0100 Message-ID: <20190109164025.24554-10-rpenyaev@suse.de> (raw) In-Reply-To: <20190109164025.24554-1-rpenyaev@suse.de> ep_vrealloc*(): reallocates user header, user index or bitmap memory. ep_get_bit(): gets a free bit from the bitmap; if no free bit is found, the bitmap is expanded by PAGE_SIZE. ep_expand_user_is_required(): helper which returns true if expansion of any of the memory chunks is required. ep_shrink_user_is_required(): helper which returns true if shrinking of any of the memory chunks is required (the ep_shrunk_*_length() helpers return the new size). ep_expand_user_*(): expands user header or user index. ep_shrink_user_*(): shrinks user header, user index or bitmaps. In case of shrink there is an important procedure of moving sparse bits from the end to the beginning of the bitmap, in order to free pages at the end. ep_route_events_to_*(): routes events to klists or to uring. Should be called under the write lock, when all events are stopped. ep_free_user_item(): marks the item inside the user header as freed, i.e. atomically exchanges ready_events with 0. Also puts the item bit, or postpones that until the user enters the kernel. ep_add_event_to_uring(): adds a new event to the user ring. First marks the user item as ready and, if the item was observed as not ready, fills in the user index. ep_transfer_events_and_shrink_uring(): shrinks if needed and transfers events from klists to uring under the write lock. 
Signed-off-by: Roman Penyaev <rpenyaev@suse.de> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: Jason Baron <jbaron@akamai.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrea Parri <andrea.parri@amarulasolutions.com> Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- fs/eventpoll.c | 420 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 420 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index bdaec59a847e..36c451c26681 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -929,6 +929,238 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +static int ep_vrealloc(void **pptr, unsigned int size) +{ + void *old = *pptr, *new; + + new = vrealloc(old, size); + if (unlikely(!new)) + return -ENOMEM; + if (unlikely(new == old)) + return 0; + + *pptr = new; + vfree(old); + + return 0; +} + +static int ep_vrealloc_bm(struct eventpoll *ep, unsigned int bm_len) +{ + unsigned long *bm, *removed_bm; + + /* Reallocate all at once */ + bm = vrealloc(ep->items_bm, bm_len); + removed_bm = vrealloc(ep->removed_items_bm, bm_len); + + if (unlikely(!bm || !removed_bm)) { + vfree(bm); + vfree(removed_bm); + + return -ENOMEM; + } + ep->items_bm = bm; + ep->removed_items_bm = removed_bm; + ep->items_bm_length = bm_len; + + return 0; +} + +static int ep_get_bit(struct eventpoll *ep) +{ + unsigned int max_nr; + int bit, start_bit; + bool was_set; + + lockdep_assert_held(&ep->mtx); + + start_bit = 0; +again: + max_nr = ep_max_items_bm_nr(ep); + bit = find_next_zero_bit(ep->items_bm, max_nr, start_bit); + if (bit >= max_nr) { + unsigned int bm_len; + int rc; + + start_bit = max_nr; + bm_len = ep->items_bm_length + PAGE_SIZE; + + rc = ep_vrealloc_bm(ep, bm_len); + if (unlikely(rc)) + return rc; + + goto again; + } + + was_set = test_and_set_bit(bit, ep->items_bm); + 
WARN_ON(was_set); + + return bit; +} + +static inline bool ep_expand_user_items_is_required(struct eventpoll *ep) +{ + return (ep->items_nr >= ep_max_items_nr(ep)); +} + +static inline bool ep_expand_user_index_is_required(struct eventpoll *ep) +{ + return (ep->items_nr + EPOLL_USER_EXTRA_INDEX_NR) + >= ep_max_index_nr(ep); +} + +static inline bool ep_expand_user_is_required(struct eventpoll *ep) +{ + return ep_expand_user_items_is_required(ep) || + ep_expand_user_index_is_required(ep); +} + +static inline unsigned int ep_shrunk_user_index_length(struct eventpoll *ep) +{ + unsigned int len, nr; + + nr = ep->items_nr + EPOLL_USER_EXTRA_INDEX_NR; + len = PAGE_ALIGN(to_index_length(nr) + (PAGE_SIZE >> 1)); + if (len < ep->index_length) + return len; + + return 0; +} + +static inline unsigned int ep_shrunk_user_items_length(struct eventpoll *ep) +{ + unsigned int len; + + len = PAGE_ALIGN(to_items_length(ep->items_nr) + (PAGE_SIZE >> 1)); + if (len < ep->header_length) + return len; + + return 0; +} + +static inline unsigned int ep_shrunk_items_bm_length(struct eventpoll *ep) +{ + unsigned int len; + + len = PAGE_ALIGN(to_items_bm_length(ep->items_nr) + (PAGE_SIZE >> 1)); + if (len < ep->items_bm_length) + return len; + + return 0; +} + +static inline bool ep_shrink_user_is_required(struct eventpoll *ep) +{ + return ep_shrunk_user_items_length(ep) != 0 || + ep_shrunk_user_index_length(ep) != 0 || + ep_shrunk_items_bm_length(ep) != 0; +} + +static inline void ep_route_events_to_klists(struct eventpoll *ep) +{ + WARN_ON(!ep_polled_by_user(ep)); + ep->events_to_uring = false; + ep->user_header->state = EPOLL_USER_POLL_INACTIVE; + /* Make sure userspace sees INACTIVE state ASAP */ + smp_wmb(); +} + +static inline void ep_route_events_to_uring(struct eventpoll *ep) +{ + WARN_ON(!ep_polled_by_user(ep)); + ep->events_to_uring = true; + /* Commit all previous writes to user header */ + smp_wmb(); + ep->user_header->state = EPOLL_USER_POLL_ACTIVE; +} + +static inline bool 
ep_events_routed_to_klists(struct eventpoll *ep) +{ + return !ep->events_to_uring; +} + +static inline bool ep_events_routed_to_uring(struct eventpoll *ep) +{ + return ep->events_to_uring; +} + +static inline bool ep_free_user_item(struct epitem *epi) +{ + struct eventpoll *ep = epi->ep; + struct user_epitem *uitem; + + bool events_to_klist = false; + + lockdep_assert_held(&ep->mtx); + + ep->items_nr--; + + uitem = &ep->user_header->items[epi->bit]; + + /* Firstly drop item events passed from userland */ + memset(&uitem->event, 0, sizeof(uitem->event)); + + /* + * If event is not signaled yet and has been already consumed by + * userspace it is safe to reuse the bit immediately, i.e. just + * put it. If userspace has not been yet consumed this event + * we set the bit in removed bitmap in order to put it later. + */ + if (xchg(&uitem->ready_events, 0)) { + set_bit(epi->bit, ep->removed_items_bm); + events_to_klist = true; + } else { + /* + * Should not be reordered with memset above, thus unlock + * semantics. + */ + clear_bit_unlock(epi->bit, ep->items_bm); + events_to_klist = ep_shrink_user_is_required(ep); + } + + return events_to_klist; +} + +static bool ep_add_event_to_uring(struct epitem *epi, __poll_t pollflags) +{ + struct eventpoll *ep = epi->ep; + struct user_epitem *uitem; + bool added = false; + + if (WARN_ON(!pollflags)) + return false; + + uitem = &ep->user_header->items[epi->bit]; + if (!__atomic_fetch_or(&uitem->ready_events, pollflags, + __ATOMIC_ACQUIRE)) { + unsigned int i, *item_idx, index_mask; + + /* + * Item was not ready before, thus we have to insert + * new index to the ring. 
+ */ + + index_mask = ep_max_index_nr(ep) - 1; + i = __atomic_fetch_add(&ep->user_header->tail, 1, + __ATOMIC_ACQUIRE); + item_idx = &ep->user_index[i & index_mask]; + + /* Signal with a bit, which is > 0 */ + *item_idx = epi->bit + 1; + + /* + * Want index update be flushed from CPU write buffer and + * immediately visible on userspace side to avoid long busy + * loops. + */ + smp_wmb(); + + added = true; + } + + return added; +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -1695,6 +1927,44 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) wakeup_source_unregister(ws); } +static int ep_expand_user_items(struct eventpoll *ep) +{ + unsigned int len; + int rc; + + if (!ep_expand_user_items_is_required(ep)) + /* Expanding is not needed */ + return 0; + + len = ep->header_length + PAGE_SIZE; + rc = ep_vrealloc((void **)&ep->user_header, len); + if (unlikely(rc)) + return rc; + + ep->header_length = len; + + return 0; +} + +static int ep_expand_user_index(struct eventpoll *ep) +{ + unsigned int len; + int rc; + + if (!ep_expand_user_index_is_required(ep)) + /* Expanding is not needed */ + return 0; + + len = ep->index_length + PAGE_SIZE; + rc = ep_vrealloc((void **)&ep->user_index, len); + if (unlikely(rc)) + return rc; + + ep->index_length = len; + + return 0; +} + /* * Must be called with "mtx" held. 
*/ @@ -2010,6 +2280,156 @@ static inline struct timespec64 ep_set_mstimeout(long ms) return timespec64_add_safe(now, ts); } +static int ep_shrink_user_index(struct eventpoll *ep) +{ + unsigned int len; + int rc; + + len = ep_shrunk_user_index_length(ep); + if (!len) + /* Shrinking is not needed */ + return 0; + + rc = ep_vrealloc((void **)&ep->user_index, len); + if (unlikely(rc)) + return rc; + + ep->index_length = len; + + return 0; +} + +static int ep_shrink_user_items_and_bm(struct eventpoll *ep) +{ + unsigned int header_len, bm_len; + unsigned int bit, last_bit = UINT_MAX; + int rc; + + struct rb_node *rbp; + struct epitem *epi; + + lockdep_assert_held(&ep->mtx); + + header_len = ep_shrunk_user_items_length(ep); + bm_len = ep_shrunk_items_bm_length(ep); + if (!header_len && !bm_len) + /* Shrinking is not needed */ + return 0; + + /* + * Find left most last bit + */ + if (header_len) + last_bit = to_items_nr(header_len); + if (bm_len) + last_bit = min(last_bit, to_items_bm_nr(header_len)); + + if (WARN_ON(last_bit <= ep->items_nr)) + return -EINVAL; + + /* + * Find bits from the right and move them to the left in order to + * free space on the right. + * + * This is not nice, because O(n), but frankly this operation should + * be quite rare. If not - let's switch to idr or something similar + * (but that obviously will consume more memory). + * + */ + bit = 0; + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + + if (epi->bit >= last_bit) { + /* Find first available bit from left */ + bit = find_next_zero_bit(ep->items_bm, last_bit, bit); + if (WARN_ON(bit >= last_bit)) + return -EINVAL; + + /* Clear old bit from right */ + clear_bit(epi->bit, ep->items_bm); + + /* + * Set item bit and advance an iterator for the + * following find_next_zero_bit() call. 
+ */ + epi->bit = bit++; + } + } + + /* + * Reallocate memory and commit lengths + */ + if (header_len) { + rc = ep_vrealloc((void **)&ep->user_header, header_len); + if (unlikely(rc)) + return rc; + + ep->header_length = header_len; + } + if (bm_len) { + rc = ep_vrealloc_bm(ep, bm_len); + if (unlikely(rc)) + return rc; + } + + return 0; +} + +static inline void ep_put_postponed_user_items_bits(struct eventpoll *ep) +{ + size_t sz, i; + + lockdep_assert_held(&ep->mtx); + + sz = ep->items_bm_length >> ilog2(sizeof(ep->items_bm[0])); + for (i = 0; i < sz; i++) { + ep->items_bm[i] &= ~(ep->removed_items_bm[i]); + ep->removed_items_bm[i] = 0ul; + } +} + +static int ep_transfer_events_and_shrink_uring(struct eventpoll *ep) +{ + struct epitem *epi, *tmp; + int rc = 0; + + mutex_lock(&ep->mtx); + if (ep_events_routed_to_uring(ep)) + /* A bit late */ + goto unlock; + + /* Here at this point we are sure uring is empty */ + ep_put_postponed_user_items_bits(ep); + + rc = ep_shrink_user_index(ep); + if (unlikely(rc)) + goto unlock; + + rc = ep_shrink_user_items_and_bm(ep); + if (unlikely(rc)) + goto unlock; + + /* Commit lengths to userspace, but state is not yet ACTIVE */ + ep->user_header->index_length = ep->index_length; + ep->user_header->header_length = ep->header_length; + + write_lock_irq(&ep->lock); + /* Atomically transfer events from klists to uring */ + list_for_each_entry_safe(epi, tmp, &ep->rdllist, rdllink) { + ep_add_event_to_uring(epi, epi->ready_events); + list_del_init(&epi->rdllink); + epi->ready_events = 0; + } + ep_route_events_to_uring(ep); + write_unlock_irq(&ep->lock); + +unlock: + mutex_unlock(&ep->mtx); + + return rc; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller supplied * event buffer. -- 2.19.1
next prev parent reply index Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top 2019-01-09 16:40 [RFC 00/15] epoll: support pollable epoll " Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 01/15] mm/vmalloc: add new 'alignment' field for vm_struct structure Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 02/15] mm/vmalloc: move common logic from __vmalloc_area_node to a separate func Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 03/15] mm/vmalloc: introduce new vrealloc() call and its subsidiary reach analog Roman Penyaev 2019-01-09 16:50 ` Matthew Wilcox 2019-01-10 10:08 ` Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 04/15] epoll: move private helpers from a header to the source Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 05/15] epoll: introduce user header structure and user index for polling from userspace Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 06/15] epoll: introduce various of helpers for user structure lengths calculations Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 07/15] epoll: extend epitem struct with new members for polling from userspace Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 08/15] epoll: some sanity flags checks for epoll syscalls for polled epfd " Roman Penyaev 2019-01-09 16:40 ` Roman Penyaev [this message] 2019-01-09 17:29 ` [RFC PATCH 09/15] epoll: introduce stand-alone helpers for polling " Linus Torvalds 2019-01-10 10:03 ` Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 10/15] epoll: support polling from userspace for ep_insert() Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 11/15] epoll: offload polling to a work in case of epfd polled from userspace Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 12/15] epoll: support polling from userspace for ep_remove() Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 13/15] epoll: support polling from userspace for ep_modify() Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 14/15] epoll: support polling from userspace for ep_poll() Roman Penyaev 2019-01-09 16:40 ` [RFC PATCH 15/15] epoll: support 
mapping for epfd when polled from userspace Roman Penyaev
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20190109164025.24554-10-rpenyaev@suse.de \ --to=rpenyaev@suse.de \ --cc=akpm@linux-foundation.org \ --cc=andrea.parri@amarulasolutions.com \ --cc=dbueso@suse.de \ --cc=jbaron@akamai.com \ --cc=linux-fsdevel@vger.kernel.org \ --cc=linux-kernel@vger.kernel.org \ --cc=paulmck@linux.vnet.ibm.com \ --cc=torvalds@linux-foundation.org \ --cc=viro@zeniv.linux.org.uk \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
LKML Archive on lore.kernel.org Archives are clonable: git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git git clone --mirror https://lore.kernel.org/lkml/7 lkml/git/7.git git clone --mirror https://lore.kernel.org/lkml/8 lkml/git/8.git git clone --mirror https://lore.kernel.org/lkml/9 lkml/git/9.git # If you have public-inbox 1.1+ installed, you may # initialize and index your mirror using the following commands: public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \ linux-kernel@vger.kernel.org public-inbox-index lkml Example config snippet for mirrors Newsgroup available over NNTP: nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel AGPL code for this site: git clone https://public-inbox.org/public-inbox.git