* userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions
@ 2016-08-11 13:51 Evgeny Yakovlev
2016-08-11 17:17 ` Andrea Arcangeli
0 siblings, 1 reply; 3+ messages in thread
From: Evgeny Yakovlev @ 2016-08-11 13:51 UTC (permalink / raw)
To: linux-mm; +Cc: aarcange
We're experimenting with userfaultfd write protect implementation on
Andrea's tree and it looks like there is a problem if we combine
MODE_MISSING and MODE_WP in one region.
You can find a test case below together with detailed problem
description. Please take a look, maybe we're doing something wrong?
Will be happy to provide any additional info if needed.
/*
* This testcase reproduces a problem with userfaultfd writeprotect
feature on
* http://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git, HEAD
a22d71c
* gcc ufdtest.c -std=gnu99 -lpthread -o ufdtest
*
* 1. Allocate a private RW region and register it with MODE_MISSING |
MODE_WP.
* 2. Fork a UFD thread and begin writing to memory from main thread.
*
* Expected behavior:
* Recv pagefaults with UFFD_PAGEFAULT_FLAG_WRITE set, handle them with
zeropage
*
* Actual behavior:
* We recv two pagefaults for each page:
*
* 1. First fault is expected UFFD_PAGEFAULT_FLAG_WRITE set which we
resolve
* with zeropage
*
* 2. Second fault immediately follows the first one with the same address
* and has UFFD_PAGEFAULT_FLAG_WRITE | UFFD_PAGEFAULT_FLAG_WP set.
* If we ignore this second fault then main thread never wakes up
* If we try to resolve it with !WP then main thread receives SIGBUS.
*
* If we register that region only with MODE_MISSING _or_ MODE_WP then
we get
* no problems, i.e. only missing faults or WP faults are seen.
*/
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/user.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <asm/types.h>
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#if !(defined(__linux__) && defined(__NR_userfaultfd))
# error Need userfaultfd
#endif
/*
 * Print a printf-style message plus a newline to stderr and terminate.
 *
 * The expansion deliberately has no trailing semicolon so that
 * `if (x) DIE("..."); else ...` parses as a single statement.
 * assert(0) is kept for its file/line diagnostic; abort() follows so the
 * process still terminates when asserts are compiled out with NDEBUG.
 */
#define DIE(fmt, ...) do { \
    fprintf(stderr, fmt, ##__VA_ARGS__); \
    fprintf(stderr, "\n"); \
    assert(0); \
    abort(); \
} while (0)
/*
 * Print a printf-style message to stdout, prefixed with the name of the
 * calling function and followed by a newline.
 *
 * The expansion deliberately has no trailing semicolon so that the macro
 * behaves like a single statement inside unbraced if/else.
 */
#define DPRINTF(fmt, ...) do { \
    printf("%s: " fmt, __func__, ##__VA_ARGS__); \
    printf("\n"); \
} while (0)
/* userfaultfd file descriptor used by every helper below; -1 until main() opens it. */
static int g_ufd = -1;
/*
 * Perform the UFFDIO_API handshake on g_ufd and verify that the kernel
 * advertises both the REGISTER and UNREGISTER ioctls.
 *
 * Any failure is reported through DIE(); when the function returns at all
 * it returns true.
 */
static bool ufd_version_check(void)
{
    /* Bitmask of the ioctls this program cannot work without. */
    const uint64_t required = (__u64)1 << _UFFDIO_REGISTER |
                              (__u64)1 << _UFFDIO_UNREGISTER;
    struct uffdio_api handshake;

    handshake.features = 0;
    handshake.api = UFFD_API;

    if (ioctl(g_ufd, UFFDIO_API, &handshake) != 0) {
        DIE("UFFDIO_API failed: %s", strerror(errno));
    }

    if ((handshake.ioctls & required) != required) {
        DIE("Missing features: %llx", ~handshake.ioctls & required);
    }

    return true;
}
/*
 * Resolve a fault by asking the kernel to map a zero page at `page`.
 *
 * page: start address of the range handed to UFFDIO_ZEROPAGE; the range
 *       length is one system page (getpagesize()).
 *
 * Terminates through DIE() if the ioctl fails, reporting the address and
 * the errno text like the other ioctl failure paths in this file.
 */
static void ufd_zeropage(__u64 page)
{
    struct uffdio_zeropage zero_struct;

    zero_struct.range.start = page;
    zero_struct.range.len = getpagesize();
    zero_struct.mode = 0;

    if (ioctl(g_ufd, UFFDIO_ZEROPAGE, &zero_struct)) {
        DIE("zeropage ioctl failed @ 0x%llx: %s",
            (unsigned long long)page, strerror(errno));
    }
}
/*
 * Issue UFFDIO_WRITEPROTECT for the PAGE_SIZE-long range starting at `page`.
 *
 * page:     start address of the range.
 * readonly: true  -> request mode UFFDIO_WRITEPROTECT_MODE_WP,
 *           false -> request mode 0.
 *
 * Terminates through DIE() if the ioctl fails.
 */
static void ufd_writeprotect(__u64 page, bool readonly)
{
    struct uffdio_writeprotect request;

    request.mode = readonly ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
    request.range.len = PAGE_SIZE;
    request.range.start = page;

    if (ioctl(g_ufd, UFFDIO_WRITEPROTECT, &request) != 0) {
        DIE("ioctl failed: %s", strerror(errno));
    }
}
/*
 * Fault-handling thread: reads userfaultfd messages from g_ufd forever and
 * resolves each page fault.
 *
 * arg: unused (pthread start-routine signature).
 *
 * Handling policy:
 *   - a fault carrying both UFFD_PAGEFAULT_FLAG_WRITE and
 *     UFFD_PAGEFAULT_FLAG_WP is logged as unexpected and answered by
 *     clearing write protection on the page;
 *   - every other fault is answered with a zero page.
 *
 * Any read error other than EAGAIN/EINTR, a short read, or a non-pagefault
 * event terminates the program through DIE(). The loop has no exit path.
 */
static void* ufd_worker(void* arg)
{
    (void)arg;

    while (1) {
        struct uffd_msg msg;

        DPRINTF("Reading from ufd");
        ssize_t ret = read(g_ufd, &msg, sizeof(msg));
        if (ret < 0) {
            /* errno is only meaningful when read() actually failed. */
            if (errno == EAGAIN || errno == EINTR) {
                continue;
            }
            DIE("Failed to read full message: %s", strerror(errno));
        }
        if ((size_t)ret != sizeof(msg)) {
            DIE("Read %zd bytes, expected %zu", ret, sizeof(msg));
        }

        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            DIE("unexpected event 0x%x", msg.event);
        }

        /* Round the faulting address down to its page boundary. */
        __u64 page = msg.arg.pagefault.address & ~(PAGE_SIZE - 1ull);
        DPRINTF("Pagefault @ 0x%llx, flags 0x%llx",
                page, msg.arg.pagefault.flags);

        bool is_write_fault =
            (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) != 0;
        bool is_wp_fault =
            (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) != 0;

        if (is_write_fault && is_wp_fault) {
            DPRINTF("unexpected WP fault on 0x%llx", page);
            // If you remove this main thread will sleep forever
            ufd_writeprotect(page, false);
        } else {
            ufd_zeropage(page);
            DPRINTF("0x%llx zeropaged", page);
        }
    }

    /* Not reached: the loop above never breaks. */
    DIE("Unreachable");
    return NULL;
}
/*
 * Reproducer entry point.
 *
 *  1. Open a userfaultfd and run the UFFDIO_API handshake.
 *  2. mmap a 1 GiB private anonymous RW region and register it with
 *     UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP.
 *  3. Check that the range supports the WAKE, ZEROPAGE and WRITEPROTECT
 *     ioctls.
 *  4. Start the fault-handling thread and write one byte to every page of
 *     the region from this thread.
 *
 * Every setup failure terminates the program through DIE().
 */
int main(void)
{
    g_ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (g_ufd < 0) {
        DIE("userfaultfd not available: %s", strerror(errno));
    }
    if (!ufd_version_check()) {
        DIE("UFFDIO_API not supported");
    }

    size_t len = 1024 * 1024 * 1024;
    void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED) {
        DIE("mmap failed: %s", strerror(errno));
    }

    struct uffdio_register reg_struct;
    reg_struct.range.start = (uintptr_t)mem;
    reg_struct.range.len = len;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING |
                      UFFDIO_REGISTER_MODE_WP;
    if (ioctl(g_ufd, UFFDIO_REGISTER, &reg_struct)) {
        DIE("userfault register: %s", strerror(errno));
    }

    uint64_t feature_mask = 1ull << _UFFDIO_WAKE |
                            1ull << _UFFDIO_ZEROPAGE |
                            1ull << _UFFDIO_WRITEPROTECT;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        DIE("Missing range features: %llx",
            (unsigned long long)(~reg_struct.ioctls & feature_mask));
    }
    DPRINTF("Registered range %p:%zu", mem, len);
    /* ioctls is a 64-bit mask, so it must be printed with a 64-bit format. */
    DPRINTF("UFD features: 0x%llx", (unsigned long long)reg_struct.ioctls);

    pthread_t worker;
    if (0 != pthread_create(&worker, NULL, ufd_worker, NULL)) {
        DIE("Failed to start ufd worker thread");
    }

    /* Write one byte into every page so that each page takes a fault. */
    volatile uint8_t* pdata = (uint8_t*)mem;
    for (size_t off = 0; off < len; off += PAGE_SIZE) {
        pdata[off] = (uint8_t)rand();
    }
    DPRINTF("done!");

    /* The worker loops forever, so this join blocks until the process is killed. */
    pthread_join(worker, NULL);
    return EXIT_SUCCESS;
}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions
2016-08-11 13:51 userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions Evgeny Yakovlev
@ 2016-08-11 17:17 ` Andrea Arcangeli
2016-08-12 13:43 ` Evgeny Yakovlev
0 siblings, 1 reply; 3+ messages in thread
From: Andrea Arcangeli @ 2016-08-11 17:17 UTC (permalink / raw)
To: Evgeny Yakovlev; +Cc: linux-mm, Mike Rapoport
Hello Evgeny,
On Thu, Aug 11, 2016 at 04:51:30PM +0300, Evgeny Yakovlev wrote:
> * 1. First fault is expected UFFD_PAGEFAULT_FLAG_WRITE set which we
> resolve
> * with zeropage
What if you resolve it with bzero(4096);UFFDIO_COPY? Does the problem
go away?
If the zeropage is mapped by UFFDIO_ZEROPAGE, there's no way to turn
that into a writable zeropage ever again because
userfaultfd_writeprotect is basically a no-vma-mangling mmap_sem-read
mprotect and it can't trigger faults. Instead a fault in do_wp_page is
required to get rid of the zeropage and copy it off.
If the problem goes away if you s/UFFDIO_ZEROPAGE/bzero(4096);
UFFDIO_COPY/ as I would expect, there would be two ways to solve it:
1) forbid UFFDIO_ZEROPAGE and not return the UFFDIO_ZEROPAGE ioctl in
uffdio_register.ioctls, if UFFDIO_REGISTER is called with
uffdio_register.mode = ...WP|..MISSING so userland is aware it
can't use that.
2) teach UFFDIO_WRITEPROTECT not just to mangle pagetables but also
trigger a write fault on any zeropage if it's called with
uffdio_writeprotect.mode without UFFDIO_WRITEPROTECT_MODE_WP being
set. This will require a bit more work to fix.
The latter would increase performance if not all zeropages needs to be
turned writable.
Feedback welcome on what solution would you prefer.
Thanks,
Andrea
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions
2016-08-11 17:17 ` Andrea Arcangeli
@ 2016-08-12 13:43 ` Evgeny Yakovlev
0 siblings, 0 replies; 3+ messages in thread
From: Evgeny Yakovlev @ 2016-08-12 13:43 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: linux-mm, Mike Rapoport
Hello Andrea,
On 11.08.2016 20:17, Andrea Arcangeli wrote:
> Hello Evgeny,
>
> On Thu, Aug 11, 2016 at 04:51:30PM +0300, Evgeny Yakovlev wrote:
>> * 1. First fault is expected UFFD_PAGEFAULT_FLAG_WRITE set which we
>> resolve
>> * with zeropage
> What if you resolve it with bzero(4096);UFFDIO_COPY? Does the problem
> go away?
Yes, i don't see additional WP fault now, only expected missing write fault.
> If the zeropage is mapped by UFFDIO_ZEROPAGE, there's no way to turn
> that into a writable zeropage ever again because
> userfaultfd_writeprotect is basically a no-vma-mangling mmap_sem-read
> mprotect and it can't trigger faults. Instead a fault in do_wp_page is
> required to get rid of the zeropage and copy it off.
Maybe i am missing something but why do we then get WP faults on that
page right after we UFFDIO_ZEROPAGE it? We never call writeprotect on
zeropaged page and still get a WP fault on it which we can't resolve
properly.
> If the problem goes away if you s/UFFDIO_ZEROPAGE/bzero(4096);
> UFFDIO_COPY/ as I would expect, there would be two ways to solve it:
>
> 1) forbid UFFDIO_ZEROPAGE and not return the UFFDIO_ZEROPAGE ioctl in
> uffdio_register.ioctls, if UFFDIO_REGISTER is called with
> uffdio_register.mode = ...WP|..MISSING so userland is aware it
> can't use that.
>
> 2) teach UFFDIO_WRITEPROTECT not just to mangle pagetables but also
> trigger a write fault on any zeropage if it's called with
> uffdio_writeprotect.mode without UFFDIO_WRITEPROTECT_MODE_WP being
> set. This will require a bit more work to fix.
>
> The latter would increase performance if not all zeropages needs to be
> turned writable.
>
> Feedback welcome on what solution would you prefer.
Our use case is as follows. We have a huge region and most of it we need
to be writable. Most of the time we just gradually resolve missing
faults as they appear. We only enable write protection on some selective
already present pages to have a way to track attempted page modification
for a short period of time. We register initial region as MISSING | WP
so that we don't have to register a new page-sized region each time we
need to write-protect a single page inside a region.
>
> Thanks,
> Andrea
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2016-08-12 13:43 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-11 13:51 userfaultfd: unexpected behavior with MODE_MISSING | MODE_WP regions Evgeny Yakovlev
2016-08-11 17:17 ` Andrea Arcangeli
2016-08-12 13:43 ` Evgeny Yakovlev
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.