All of lore.kernel.org
 help / color / mirror / Atom feed
* userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions
@ 2016-08-11 13:51 Evgeny Yakovlev
  2016-08-11 17:17 ` Andrea Arcangeli
  0 siblings, 1 reply; 3+ messages in thread
From: Evgeny Yakovlev @ 2016-08-11 13:51 UTC (permalink / raw)
  To: linux-mm; +Cc: aarcange

We're experimenting with the userfaultfd write-protect implementation in
Andrea's tree, and it looks like there is a problem if we combine
MODE_MISSING and MODE_WP in one region.

You can find a test case below together with detailed problem 
description. Please take a look, maybe we're doing something wrong?

Will be happy to provide any additional info if needed.

/*
  * This testcase reproduces a problem with the userfaultfd writeprotect
  * feature on
  * http://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git, HEAD a22d71c
  * gcc ufdtest.c -std=gnu99 -lpthread -o ufdtest
  *
  * 1. Allocate a private RW region and register it with
  *    MODE_MISSING | MODE_WP.
  * 2. Start a UFD worker thread and begin writing to memory from the main
  *    thread.
  *
  * Expected behavior:
  * Receive pagefaults with UFFD_PAGEFAULT_FLAG_WRITE set and handle them
  * with zeropage.
  *
  * Actual behavior:
  * We receive two pagefaults for each page:
  *
  * 1. The first fault is the expected one, with UFFD_PAGEFAULT_FLAG_WRITE set,
  * which we resolve with zeropage.
  *
  * 2. A second fault immediately follows the first one with the same address
  * and has UFFD_PAGEFAULT_FLAG_WRITE | UFFD_PAGEFAULT_FLAG_WP set.
  * If we ignore this second fault then the main thread never wakes up.
  * If we try to resolve it with !WP then the main thread receives SIGBUS.
  *
  * If we register that region with only MODE_MISSING _or_ MODE_WP then we
  * get no problems, i.e. only missing faults or WP faults are seen.
  */

/* Feature-test macros only take effect if defined before the first libc header. */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <errno.h>
#include <assert.h>

#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/user.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <asm/types.h>
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>
#include <pthread.h>

#if !(defined(__linux__) && defined(__NR_userfaultfd))
#   error Need userfaultfd
#endif

/*
 * Print a printf-style message followed by a newline to stderr, then abort.
 *
 * The expansion deliberately has no trailing semicolon, so the macro behaves
 * like a single statement and "if (x) DIE(...); else ..." parses correctly.
 * abort() is used instead of assert(0) so the process still terminates when
 * the file is built with NDEBUG.
 */
#define DIE(fmt, ...) do { \
     fprintf(stderr, fmt, ##__VA_ARGS__); \
     fprintf(stderr, "\n"); \
     abort(); \
} while (0)

/*
 * Print a printf-style message to stdout, prefixed with the name of the
 * calling function and followed by a newline.
 *
 * The expansion deliberately has no trailing semicolon, so the macro behaves
 * like a single statement and can be used in an unbraced if/else.
 */
#define DPRINTF(fmt, ...) do { \
     printf("%s: " fmt, __func__, ##__VA_ARGS__); \
     printf("\n"); \
} while (0)

/* userfaultfd file descriptor shared by all helpers; -1 until main() assigns it. */
static int g_ufd = -1;

/*
 * Perform the UFFDIO_API handshake on g_ufd and verify that the ioctl bitmask
 * it returns advertises both UFFDIO_REGISTER and UFFDIO_UNREGISTER.
 *
 * Dies via DIE() if the handshake ioctl fails or a required ioctl is missing.
 * Returns true otherwise; there is no path that returns false.
 */
static bool ufd_version_check(void)
{
     const uint64_t required = ((__u64)1 << _UFFDIO_REGISTER) |
                               ((__u64)1 << _UFFDIO_UNREGISTER);
     struct uffdio_api handshake;

     handshake.api = UFFD_API;
     handshake.features = 0;

     if (ioctl(g_ufd, UFFDIO_API, &handshake) != 0) {
         DIE("UFFDIO_API failed: %s", strerror(errno));
     }

     /* Every bit in `required` must be present in the reported ioctls. */
     if ((handshake.ioctls & required) != required) {
         DIE("Missing features: %llx", ~handshake.ioctls & required);
     }

     return true;
}

/*
 * Resolve a fault by issuing UFFDIO_ZEROPAGE on g_ufd for one page.
 *
 * page: start address of the range handed to the ioctl; the range length is
 *       getpagesize() bytes.
 *
 * Dies via DIE() if the ioctl fails, including the errno text so the cause of
 * the failure is visible.
 */
static void ufd_zeropage(__u64 page)
{
     struct uffdio_zeropage zero_struct;
     zero_struct.range.start = page;
     zero_struct.range.len = getpagesize();
     zero_struct.mode = 0;

     if (ioctl(g_ufd, UFFDIO_ZEROPAGE, &zero_struct)) {
         DIE("zeropage ioctl failed: %s", strerror(errno));
     }
}

/*
 * Issue UFFDIO_WRITEPROTECT on g_ufd for the PAGE_SIZE-long range starting
 * at `page`.
 *
 * page:     start address of the range.
 * readonly: when true the request carries UFFDIO_WRITEPROTECT_MODE_WP;
 *           when false the mode is 0.
 *
 * Dies via DIE() with the errno text if the ioctl fails.
 */
static void ufd_writeprotect(__u64 page, bool readonly)
{
     struct uffdio_writeprotect request;

     request.range.start = page;
     request.range.len = PAGE_SIZE;
     request.mode = readonly ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

     if (ioctl(g_ufd, UFFDIO_WRITEPROTECT, &request) != 0) {
         DIE("ioctl failed: %s", strerror(errno));
     }
}

/*
 * Fault-handling thread: loops forever reading uffd_msg events from g_ufd
 * and resolving each page fault.
 *
 * arg: unused.
 *
 * For every UFFD_EVENT_PAGEFAULT message:
 *   - a fault with both the WRITE and WP flags set is answered by clearing
 *     write protection on the page (ufd_writeprotect(page, false));
 *   - any other fault (read fault, or write fault without WP) is answered
 *     with ufd_zeropage().
 *
 * Dies via DIE() on read errors other than EAGAIN, on short reads and on any
 * event type other than UFFD_EVENT_PAGEFAULT. The loop has no exit, so the
 * function never returns normally.
 */
static void* ufd_worker(void* arg)
{
     (void)arg;

     while(1) {
         DPRINTF("Reading from ufd");

         struct uffd_msg msg;
         ssize_t ret = read(g_ufd, &msg, sizeof(msg));
         if (ret < 0) {
             /* errno is only meaningful when read() actually failed. */
             if (errno == EAGAIN) {
                 continue;
             }
             DIE("Failed to read full message: %s", strerror(errno));
         }
         if ((size_t)ret != sizeof(msg)) {
             DIE("Read %zd bytes, expected %zu", ret, sizeof(msg));
         }

         if (msg.event != UFFD_EVENT_PAGEFAULT) {
             DIE("unexpected event 0x%x", msg.event);
         }

         /* Round the faulting address down to its page boundary. */
         __u64 page = msg.arg.pagefault.address & ~(PAGE_SIZE - 1ull);
         DPRINTF("Pagefault @ 0x%llx, flags 0x%llx",
                 page, msg.arg.pagefault.flags);

         bool is_write_fault =
             (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) != 0;
         bool is_wp_fault =
             (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) != 0;

         if (is_write_fault && is_wp_fault) {
             DPRINTF("unexpected WP fault on 0x%llx", page);

             // If you remove this main thread will sleep forever
             ufd_writeprotect(page, false);
         } else {
             ufd_zeropage(page);
             DPRINTF("0x%llx zeropaged", page);
         }
     }

     DIE("Unreachable");
     return NULL;
}

/*
 * Reproducer entry point.
 *
 * 1. Creates the userfaultfd and performs the API handshake.
 * 2. Maps 1 GiB of private anonymous RW memory and registers it with
 *    UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP.
 * 3. Checks that the range supports the WAKE, ZEROPAGE and WRITEPROTECT ioctls.
 * 4. Starts ufd_worker and writes one byte into every page of the region so
 *    that each page faults once.
 * 5. Cancels and joins the worker, then returns EXIT_SUCCESS.
 *
 * Any failing step terminates the process via DIE().
 */
int main(void)
{
     g_ufd = (int)syscall(__NR_userfaultfd, O_CLOEXEC);
     if (g_ufd < 0) {
         DIE("userfaultfd not available: %s", strerror(errno));
     }

     if (!ufd_version_check()) {
         DIE("UFFDIO_API not supported");
     }

     size_t len = 1024 * 1024 * 1024;
     void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (mem == MAP_FAILED) {
         DIE("mmap failed: %s", strerror(errno));
     }

     struct uffdio_register reg_struct;
     reg_struct.range.start = (uintptr_t)mem;
     reg_struct.range.len = len;
     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP;

     if (ioctl(g_ufd, UFFDIO_REGISTER, &reg_struct)) {
         DIE("userfault register: %s", strerror(errno));
     }

     uint64_t feature_mask = 1ull << _UFFDIO_WAKE |
                             1ull << _UFFDIO_ZEROPAGE |
                             1ull << _UFFDIO_WRITEPROTECT;
     if ((reg_struct.ioctls & feature_mask) != feature_mask) {
         DIE("Missing range features: %llx",
             (unsigned long long)(~reg_struct.ioctls & feature_mask));
     }

     DPRINTF("Registered range %p:%zu", mem, len);
     /* ioctls is a 64-bit mask, so it must not be printed with %x. */
     DPRINTF("UFD features: 0x%llx", (unsigned long long)reg_struct.ioctls);

     pthread_t worker;
     if (0 != pthread_create(&worker, NULL, ufd_worker, NULL)) {
         DIE("Failed to start ufd worker thread");
     }

     /* Touch the first byte of every page so each page faults. */
     volatile uint8_t* pdata = (uint8_t*)mem;
     for (size_t off = 0; off < len; off += PAGE_SIZE) {
         pdata[off] = (uint8_t)rand();
     }

     DPRINTF("done!");

     /*
      * The worker loops forever, so a bare pthread_join() would never return.
      * Cancel it first; its read() call is a cancellation point.
      */
     if (0 != pthread_cancel(worker)) {
         DIE("Failed to cancel ufd worker thread");
     }
     pthread_join(worker, NULL);
     return EXIT_SUCCESS;
}



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions
  2016-08-11 13:51 userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions Evgeny Yakovlev
@ 2016-08-11 17:17 ` Andrea Arcangeli
  2016-08-12 13:43   ` Evgeny Yakovlev
  0 siblings, 1 reply; 3+ messages in thread
From: Andrea Arcangeli @ 2016-08-11 17:17 UTC (permalink / raw)
  To: Evgeny Yakovlev; +Cc: linux-mm, Mike Rapoport

Hello Evgeny,

On Thu, Aug 11, 2016 at 04:51:30PM +0300, Evgeny Yakovlev wrote:
>   * 1. First fault is expected UFFD_PAGEFAULT_FLAG_WRITE set which we 
> resolve
>   * with zeropage

What if you resolve it with bzero(4096);UFFDIO_COPY? Does the problem
go away?

If the zeropage is mapped by UFFDIO_ZEROPAGE, there's no way to turn
that into a writable zeropage ever again because
userfaultfd_writeprotect is basically a no-vma-mangling mmap_sem-read
mprotect and it can't trigger faults. Instead a fault in do_wp_page is
required to get rid of the zeropage and copy it off.

If the problem goes away if you s/UFFDIO_ZEROPAGE/bzero(4096);
UFFDIO_COPY/ as I would expect, there would be two ways to solve it:

1) forbid UFFDIO_ZEROPAGE and not return the UFFDIO_ZEROPAGE ioctl in
   uffdio_register.ioctls, if UFFDIO_REGISTER is called with
   uffdio_register.mode = ...WP|..MISSING so userland is aware it
   can't use that.

2) teach UFFDIO_WRITEPROTECT not just to mangle pagetables but also
   trigger a write fault on any zeropage if it's called with
   uffdio_writeprotect.mode without UFFDIO_WRITEPROTECT_MODE_WP being
   set. This will require a bit more work to fix.

The latter would increase performance if not all zeropages need to be
turned writable.

Feedback welcome on what solution would you prefer.

Thanks,
Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions
  2016-08-11 17:17 ` Andrea Arcangeli
@ 2016-08-12 13:43   ` Evgeny Yakovlev
  0 siblings, 0 replies; 3+ messages in thread
From: Evgeny Yakovlev @ 2016-08-12 13:43 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-mm, Mike Rapoport


Hello Andrea,

On 11.08.2016 20:17, Andrea Arcangeli wrote:
> Hello Evgeny,
>
> On Thu, Aug 11, 2016 at 04:51:30PM +0300, Evgeny Yakovlev wrote:
>>    * 1. First fault is expected UFFD_PAGEFAULT_FLAG_WRITE set which we
>> resolve
>>    * with zeropage
> What if you resolve it with bzero(4096);UFFDIO_COPY? Does the problem
> go away?

Yes, I don't see the additional WP fault now, only the expected missing write fault.

> If the zeropage is mapped by UFFDIO_ZEROPAGE, there's no way to turn
> that into a writable zeropage ever again because
> userfaultfd_writeprotect is basically a no-vma-mangling mmap_sem-read
> mprotect and it can't trigger faults. Instead a fault in do_wp_page is
> required to get rid of the zeropage and copy it off.

Maybe I am missing something, but why do we then get WP faults on that
page right after we UFFDIO_ZEROPAGE it? We never call writeprotect on the
zeropaged page and still get a WP fault on it, which we can't resolve
properly.

> If the problem goes away if you s/UFFDIO_ZEROPAGE/bzero(4096);
> UFFDIO_COPY/ as I would expect, there would be two ways to solve it:
>
> 1) forbid UFFDIO_ZEROPAGE and not return the UFFDIO_ZEROPAGE ioctl in
>     uffdio_register.ioctls, if UFFDIO_REGISTER is called with
>     uffdio_register.mode = ...WP|..MISSING so userland is aware it
>     can't use that.
>
> 2) teach UFFDIO_WRITEPROTECT not just to mangle pagetables but also
>     trigger a write fault on any zeropage if it's called with
>     uffdio_writeprotect.mode without UFFDIO_WRITEPROTECT_MODE_WP being
>     set. This will require a bit more work to fix.
>
> The latter would increase performance if not all zeropages needs to be
> turned writable.
>
> Feedback welcome on what solution would you prefer.

Our use case is as follows. We have a huge region, most of which needs
to be writable. Most of the time we just gradually resolve missing
faults as they appear. We only enable write protection on a few selected,
already-present pages so that we can track attempted page modifications
for a short period of time. We register the initial region as MISSING | WP
so that we don't have to register a new page-sized region each time we
need to write-protect a single page inside the region.

>
> Thanks,
> Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2016-08-12 13:43 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-11 13:51 userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions Evgeny Yakovlev
2016-08-11 17:17 ` Andrea Arcangeli
2016-08-12 13:43   ` Evgeny Yakovlev

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.