All of lore.kernel.org
 help / color / mirror / Atom feed
* mmap dio write failure
@ 2017-01-20  4:40 Xiong Zhou
       [not found] ` <20170120044007.kwevjo7nawwolagy-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Xiong Zhou @ 2017-01-20  4:40 UTC (permalink / raw)
  To: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w; +Cc: eguan-H+wXaHxf7aLQT0dZR+AlfA

Hi,

At first, I am not sure whether this is an issue.

mmap a file in a DAX mountpoint, open another file
in a non-DAX mountpoint with O_DIRECT, write the
mapped area to the other file.

This write succeeds on a pmem ramdisk (e.g. memmap=2G!20G).
This write fails (Bad address) on nvdimm pmem devices.
This write fails (Bad address) on a brd-based ramdisk.

If we skip the O_DIRECT flag, all tests pass.

If we write from DAX to DAX, all tests pass.
If we write from non-DAX to DAX, all tests pass.

Kernel version: Linus tree commit 44b4b46.

I have tested kernels back to v4.6 on nvdimm devices, with
the same results on all of them. I do remember that this test
passed on nvdimms back in May 2016, and I have some
notes on that. However, a lot has changed since then: the test
scripts, the kernel code, even the nvdimm and machine
firmware.

Thanks,
Xiong

sh-4.2# cat tbad.sh
#!/bin/bash
# tbad.sh <dev>
#
# Builds t_mmap_dio.c, makes an XFS filesystem on <dev>, mounts it with
# -o dax on /tbdmnt, creates a 256MiB source file on it and a 256MiB
# destination file in the current directory, then runs the test program
# over the first 16MiB and prints PASS or FAIL.
#
# Any failure while preparing the environment aborts with status 1, so a
# broken setup is not reported as a test FAIL.
[ -z "$1" ] && exit 1
DEV="$1"
MNT=/tbdmnt

# Stop if the build fails rather than running a stale ./a.out.
cc t_mmap_dio.c || exit 1
mkdir -p "$MNT" || exit 1
wipefs -af "$DEV"
mkfs.xfs -fq "$DEV" || exit 1
mount -o dax "$DEV" "$MNT" || exit 1
# The filesystem is mounted from here on; unmount on every exit path.
trap 'umount "$MNT"' EXIT

xfs_io -f -c "w -W 0 268435456" "$MNT/ts" > /dev/null || exit 1
xfs_io -f -c "w -W 0 268435456" ./td > /dev/null || exit 1

if ./a.out "$MNT/ts" ./td 16777216 "$DEV" ; then
	echo PASS
else
	echo FAIL
fi

sh-4.2# cat t_mmap_dio.c 
/*
 * This programme was originally written by
 *     Jeff Moyer <jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
 */
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <libaio.h>
#include <errno.h>
#include <sys/time.h>

/*
 * Print the expected command line on stderr and terminate.
 *
 * prog: program name substituted into the usage line.
 *
 * Does not return; the process exits with status 1.
 */
void usage(char *prog)
{
	const int status = 1;

	fprintf(stderr, "usage: %s <src file> <dest file> <size> <msg>\n", prog);
	exit(status);
}

/*
 * Report a failed step on stderr and terminate.
 *
 * op:  label of the step that failed
 * len: length value printed alongside the label
 * s:   free-form tag appended to the message
 *
 * The message has the form "<op>(<strerror(errno)>) len <len> <s>".
 * Does not return; the process exits with status 1.
 */
void err_exit(char *op, unsigned long len, char *s)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s(%s) len %lu %s\n", op, reason, len, s);
	exit(1);
}

/*
 * Move data between a memory-mapped source file and a destination file
 * opened with O_DIRECT, in both directions.
 *
 * Command line: <src file> <dest file> <size> [msg]
 *   argv[1]  file mapped MAP_SHARED, read/write, for <size> bytes
 *   argv[2]  file opened with O_RDWR|O_DIRECT
 *   argv[3]  transfer length in bytes (decimal)
 *   argv[4]  tag echoed in error messages; "" when omitted
 *
 * Steps: write() the mapping to the destination, seek back to offset 0,
 * read() the destination back into the mapping, then msync, munmap and
 * close both descriptors.  Exits 0 when every step succeeds; the first
 * failing step is reported through err_exit(), which exits 1.
 */
int main(int argc, char **argv)
{
	int fd, fd2;
	ssize_t ret;
	char *map;
	char *msg;
	unsigned long len;

	if (argc < 4)
		usage(basename(argv[0]));

	/* <msg> only decorates error output; never index past argc for it. */
	msg = (argc > 4) ? argv[4] : "";

	/* strtoul() sets errno only on failure, so clear any stale value. */
	errno = 0;
	len = strtoul(argv[3], NULL, 10);
	if (errno == ERANGE)
		err_exit("strtoul", 0, msg);

	/* Open source file and mmap */
	fd = open(argv[1], O_RDWR, 0644);
	if (fd < 0)
		err_exit("open s", len, msg);

	map = (char *)mmap(NULL, len,
		PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err_exit("mmap", len, msg);

	/* Open dest file with O_DIRECT */
	fd2 = open(argv[2], O_RDWR|O_DIRECT, 0644);
	if (fd2 < 0)
		err_exit("open d", len, msg);

	/*
	 * First, test storing to dest file from source mapping.
	 * The result is kept in a ssize_t so large lengths are not
	 * truncated before the comparison.
	 */
	ret = write(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("write", len, msg);

	if (lseek(fd2, 0, SEEK_SET) == (off_t)-1)
		err_exit("lseek", len, msg);

	/* Next, test reading from dest file into source mapping */
	ret = read(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("read", len, msg);

	if (msync(map, len, MS_SYNC) < 0)
		err_exit("msync", len, msg);

	if (munmap(map, len) < 0)
		err_exit("munmap", len, msg);

	if (close(fd) < 0)
		err_exit("close fd", len, msg);

	if (close(fd2) < 0)
		err_exit("close fd2", len, msg);

	exit(0);
}

sh-4.2# ndctl list -N
[
  {
    "dev":"namespace3.0",
    "mode":"raw",
    "size":8589934592,
    "blockdev":"pmem3"
  },
  {
    "dev":"namespace2.0",
    "mode":"raw",
    "size":8589934592,
    "blockdev":"pmem2"
  },
  {
    "dev":"namespace1.0",
    "mode":"memory",
    "size":2147483648,
    "blockdev":"pmem1"
  },
  {
    "dev":"namespace0.0",
    "mode":"memory",
    "size":2147483648,
    "blockdev":"pmem0"
  }
]

sh-4.2# modinfo brd
filename:       /lib/modules/4.10.0-rc4-master-44b4b46+/kernel/drivers/block/brd.ko
alias:          rd
alias:          block-major-1-*
license:        GPL
srcversion:     25AABF2EF57F6A37AFFEBA6
depends:        
intree:         Y
vermagic:       4.10.0-rc4-master-44b4b46+ SMP mod_unload modversions 
parm:           rd_nr:Maximum number of brd devices (int)
parm:           rd_size:Size of each RAM disk in kbytes. (ulong)
parm:           max_part:Num Minors to reserve between devices (int)

sh-4.2# uname -r
4.10.0-rc4-master-44b4b46+

sh-4.2# bash tbad.sh /dev/pmem0
/dev/pmem0: 4 bytes were erased at offset 0x00000000 (xfs): 58 46 53 42
PASS

sh-4.2# bash tbad.sh /dev/pmem2
/dev/pmem2: 4 bytes were erased at offset 0x00000000 (xfs): 58 46 53 42
write(Bad address) len 16777216 /dev/pmem2
FAIL

sh-4.2# bash tbad.sh /dev/ram0
/dev/ram0: 4 bytes were erased at offset 0x00000000 (xfs): 58 46 53 42
write(Bad address) len 16777216 /dev/ram0
FAIL

sh-4.2# df .
Filesystem                              1K-blocks     Used Available Use% Mounted on
/dev/mapper/rhxxxxxxxxxxxxxxxxx-01-root  52399104 43658792   8740312  84% /
sh-4.2# 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: mmap dio write failure
       [not found] ` <20170120044007.kwevjo7nawwolagy-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
@ 2017-01-20  6:04   ` Dan Williams
  2017-02-07 10:01     ` Xiong Zhou
  2017-02-08  3:51   ` read failure (was Re: mmap dio write failure) Xiong Zhou
  1 sibling, 1 reply; 12+ messages in thread
From: Dan Williams @ 2017-01-20  6:04 UTC (permalink / raw)
  To: Xiong Zhou; +Cc: linux-nvdimm, Eryu Guan

On Thu, Jan 19, 2017 at 8:40 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> Hi,
>
> At first, I am not sure whether this is an issue.
>
> mmap a file in a DAX mountpoint, open another file
> in a non-DAX mountpoint with O_DIRECT, write the
> mapped area to the other file.
>
> This write Success on pmem ramdisk(memmap=2G!20G like)
> This write Fail(Bad address) on nvdimm pmem devices.
> This write Fail(Bad address) on brd based ramdisk.
>
> If we skip the O_DIRECT flag, all tests pass.
>
> If we write from DAX to DAX, all tests pass.
> If we write from non-DAX to DAX, all tests pass.
>
> Kernel version: Linus tree commit 44b4b46.
>
> I have checked back to v4.6 testing on nvdimm devices,
> all the same results. I do remember that this test
> passed on nvdimms back to May 2016 and i have some
> notes for that. However things changed a lot, test
> scripts, kernel code, even the nvdimm and machine
> firmweare.
>

This is expected and is the difference between a namespace in "raw"
mode and a namespace in "memory" mode. You can check your namespace's
mode with "ndctl list" (ndctl is packaged in Fedora).

The reason why memmap=ss!nn namespaces work by default is that we
assume they are relatively small and can afford to allocate struct
page in system memory. We don't make the same assumption with
NFIT-defined namespaces. They might be so large that trying to
allocate struct page for them could consume all of system memory. So
you have to convert them into "memory" mode and make a decision at the
time as to whether you want to use a portion of the pmem capacity as
struct page storage, or to go ahead and allocate struct page from
system memory.  By default ndctl will opt to reserve space from pmem
with a command like:

    ndctl create-namespace --reconfig=namespace0.0 --mode=memory --force

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: mmap dio write failure
  2017-01-20  6:04   ` Dan Williams
@ 2017-02-07 10:01     ` Xiong Zhou
  0 siblings, 0 replies; 12+ messages in thread
From: Xiong Zhou @ 2017-02-07 10:01 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm, Eryu Guan

On Thu, Jan 19, 2017 at 10:04:01PM -0800, Dan Williams wrote:
> On Thu, Jan 19, 2017 at 8:40 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > Hi,
> >
> > At first, I am not sure whether this is an issue.
> >
> > mmap a file in a DAX mountpoint, open another file
> > in a non-DAX mountpoint with O_DIRECT, write the
> > mapped area to the other file.
> >
> > This write Success on pmem ramdisk(memmap=2G!20G like)
> > This write Fail(Bad address) on nvdimm pmem devices.
> > This write Fail(Bad address) on brd based ramdisk.
> >
> > If we skip the O_DIRECT flag, all tests pass.
> >
> > If we write from DAX to DAX, all tests pass.
> > If we write from non-DAX to DAX, all tests pass.
> >
> > Kernel version: Linus tree commit 44b4b46.
> >
> > I have checked back to v4.6 testing on nvdimm devices,
> > all the same results. I do remember that this test
> > passed on nvdimms back to May 2016 and i have some
> > notes for that. However things changed a lot, test
> > scripts, kernel code, even the nvdimm and machine
> > firmweare.
> >
> 
> This is expected and is the difference between a namespace in "raw"
> mode and a namespace in "memory" mode. You can check your namespace's
> mode with "ndctl list" (ndctl is packaged in Fedora).
> 
> The reason why memmap=ss!nn namespaces work by default is that we
> assume they are relatively small and can afford to allocate struct
> page in system memory. We don't make the same assumption with
> NFIT-defined namespaces. They might be so large that trying to
> allocate struct page for them could consume all of system memory. So
> you have to convert them into "memory" mode and make a decision at the
> time as to whether you want to use a portion of the pmem capacity as
> struct page storage, or to go ahead and allocate struct page from
> system memory.  By default ndctl will opt to reserve space from pmem
> with a command like:
> 
>     ndctl create-namespace --reconfig=namespace0.0 --mode=memory --force

Thanks for the info!

Changing mode does work for the test.

Is that write failure (Bad address) expected even with CONFIG_NVDIMM_PFN=y?

Refer to Documentation/filesystems/dax.txt,
"
Calling get_user_pages() on a range of user memory that has been mmaped
from a DAX file will fail when there are no 'struct page' to describe
those pages.  This problem has been addressed in some device drivers
by adding optional struct page support for pages under the control of
the driver (see CONFIG_NVDIMM_PFN in drivers/nvdimm for an example of
how to do this). In the non struct page cases O_DIRECT reads/writes to
those memory ranges from a non-DAX file will fail (note that O_DIRECT
reads/writes _of a DAX file_ do work, it is the memory that is being
accessed that is key here).  Other things that will not work in the
non struct page case include RDMA, sendfile() and splice().
"

And why did the brd-based ramdisk fail the same way?  It's RAM after all :)


Thanks,
Xiong

^ permalink raw reply	[flat|nested] 12+ messages in thread

* read failure (was Re: mmap dio write failure)
       [not found] ` <20170120044007.kwevjo7nawwolagy-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  2017-01-20  6:04   ` Dan Williams
@ 2017-02-08  3:51   ` Xiong Zhou
       [not found]     ` <20170208035105.lvewz5ce7xbu5zud-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  1 sibling, 1 reply; 12+ messages in thread
From: Xiong Zhou @ 2017-02-08  3:51 UTC (permalink / raw)
  To: linux-nvdimm-y27Ovi1pjclAfugRpC6u6w; +Cc: eguan-H+wXaHxf7aLQT0dZR+AlfA

[-- Attachment #1: Type: text/plain, Size: 1060 bytes --]

On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> Hi,
> 
> At first, I am not sure whether this is an issue.
> 
> mmap a file in a DAX mountpoint, open another file
> in a non-DAX mountpoint with O_DIRECT, write the
> mapped area to the other file.
> 
> This write Success on pmem ramdisk(memmap=2G!20G like)
> This write Fail(Bad address) on nvdimm pmem devices.
> This write Fail(Bad address) on brd based ramdisk.
> 
> If we skip the O_DIRECT flag, all tests pass.
> 
> If we write from DAX to DAX, all tests pass.
> If we write from non-DAX to DAX, all tests pass.
> 
snip..

After switching to falloc instead of pwrite when initializing the test files
(Thanks Ross! :)
the write call returned success; however, the following
read back into the mmaped area FAILED the same way:

return (Bad address) on raw-mode nvdimm device;
return (Success)     on memory-mode nvdimm device;
return (Bad address) on brd based ramdisk.

Also, this only happens with the O_DIRECT flag on.

This smells like an issue to me, still looking into why
read can't get that page..

Thanks,
Xiong

[-- Attachment #2: t_mmap_dio.c --]
[-- Type: text/plain, Size: 2290 bytes --]

/*
 * This programme was originally written by
 *     Jeff Moyer <jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
 *
 * Copyright (C) 2016, Red Hat, Inc.
 */
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <libaio.h>
#include <errno.h>
#include <sys/time.h>

/*
 * Print the expected command line on stderr and terminate.
 *
 * prog: program name substituted into the usage line.
 *
 * Does not return; the process exits with status 1.
 */
void usage(char *prog)
{
	const int status = 1;

	fprintf(stderr, "usage: %s <src file> <dest file> <size> <msg>\n", prog);
	exit(status);
}

/*
 * Report a failed step on stderr and terminate.
 *
 * op:  label of the step that failed
 * len: length value printed alongside the label
 * s:   free-form tag appended to the message
 *
 * The message has the form "<op>(<strerror(errno)>) len <len> <s>".
 * Does not return; the process exits with status 1.
 */
void err_exit(char *op, unsigned long len, char *s)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s(%s) len %lu %s\n", op, reason, len, s);
	exit(1);
}

/*
 * Move data between a memory-mapped source file and a destination file,
 * in both directions.
 *
 * Command line: [-b] <src file> <dest file> <size> [msg]
 *   -b          open the destination without O_DIRECT (buffered I/O)
 *   <src file>  file mapped MAP_SHARED, read/write, for <size> bytes
 *   <dest file> file opened O_RDWR, plus O_DIRECT unless -b is given
 *   <size>      transfer length in bytes (decimal)
 *   [msg]       tag echoed in error messages; "" when omitted
 *
 * Steps: write() the mapping to the destination, fsync, seek back to
 * offset 0, read() the destination back into the mapping, then msync,
 * munmap and close both descriptors.  Exits 0 when every step succeeds;
 * the first failing step is reported through err_exit(), which exits 1.
 */
int main(int argc, char **argv)
{
	int fd, fd2, opt, dio = 1;
	int oflags = O_RDWR;
	ssize_t ret;
	char *map;
	char *msg;
	char *sfile;
	char *dfile;
	unsigned long len;

	/* getopt() returns int; anything other than -b is a usage error. */
	while ((opt = getopt(argc, argv, "b")) != -1) {
		if (opt != 'b')
			usage(basename(argv[0]));
		dio = 0;
	}

	/*
	 * Count operands only after option parsing: -b shifts optind, so a
	 * check made on argc alone would let argv[optind + 2] run off the
	 * end of the argument vector.
	 */
	if (argc - optind < 3)
		usage(basename(argv[0]));

	sfile = argv[optind];
	dfile = argv[optind + 1];
	/* <msg> only decorates error output; never index past argc for it. */
	msg = (argc - optind > 3) ? argv[optind + 3] : "";

	/* strtoul() sets errno only on failure, so clear any stale value. */
	errno = 0;
	len = strtoul(argv[optind + 2], NULL, 10);
	if (errno == ERANGE)
		err_exit("strtoul", 0, msg);

	/* Open source file and mmap */
	fd = open(sfile, O_RDWR, 0644);
	if (fd < 0)
		err_exit("open src", len, msg);

	map = (char *)mmap(NULL, len,
		PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err_exit("mmap", len, msg);

	/* Open dest file, with O_DIRECT unless -b asked for buffered I/O */
	if (dio)
		oflags |= O_DIRECT;
	fd2 = open(dfile, oflags, 0644);
	if (fd2 < 0)
		err_exit("open dest", len, msg);

	/*
	 * First, test storing to dest file from source mapping.
	 * The result is kept in a ssize_t so large lengths are not
	 * truncated before the comparison.
	 */
	ret = write(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("write", len, msg);

	if (fsync(fd2) != 0)
		err_exit("fsync", len, msg);

	if (lseek(fd2, 0, SEEK_SET) == (off_t)-1)
		err_exit("lseek", len, msg);

	/* Next, test reading from dest file into source mapping */
	ret = read(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("read", len, msg);

	if (msync(map, len, MS_SYNC) < 0)
		err_exit("msync", len, msg);

	if (munmap(map, len) < 0)
		err_exit("munmap", len, msg);

	if (close(fd) < 0)
		err_exit("close fd", len, msg);

	if (close(fd2) < 0)
		err_exit("close fd2", len, msg);

	exit(0);
}

[-- Attachment #3: Type: text/plain, Size: 178 bytes --]

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]     ` <20170208035105.lvewz5ce7xbu5zud-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
@ 2017-02-08  4:10       ` Dan Williams
       [not found]         ` <CAPcyv4iF2xH=9FHDMqb8OYMFO45GkiEEKf1y73Q7scGQSJniag-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Williams @ 2017-02-08  4:10 UTC (permalink / raw)
  To: Xiong Zhou; +Cc: linux-nvdimm, Eryu Guan

On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
>> Hi,
>>
>> At first, I am not sure whether this is an issue.
>>
>> mmap a file in a DAX mountpoint, open another file
>> in a non-DAX mountpoint with O_DIRECT, write the
>> mapped area to the other file.
>>
>> This write Success on pmem ramdisk(memmap=2G!20G like)
>> This write Fail(Bad address) on nvdimm pmem devices.
>> This write Fail(Bad address) on brd based ramdisk.
>>
>> If we skip the O_DIRECT flag, all tests pass.
>>
>> If we write from DAX to DAX, all tests pass.
>> If we write from non-DAX to DAX, all tests pass.
>>
> snip..
>
> To falloc instead of pwrite while initiating test files,
> ( Thanks Ross! :)
> the write call returned success, however the following
> read back to mmaped area FAILED the same way:
>
> return (Bad address) on raw-mode nvdimm device;
> return (Success)     on memory-mode nvdimm device;
> return (Bad address) on brd based ramdisk.
>
> Also, this only happends with O_DIRECT flag on.
>
> This smells like an issue to me, still looking into why
> read can't get that page..
>

Why does it smell like an issue? Any path that calls get_user_pages()
on a DAX mmap range will fail without "memory" mode. I'm of the
opinion that we should disable DAX altogether for raw-mode.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]         ` <CAPcyv4iF2xH=9FHDMqb8OYMFO45GkiEEKf1y73Q7scGQSJniag-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-02-08  4:49           ` Xiong Zhou
       [not found]             ` <20170208044959.rq3ofmsjth2oeu2u-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Xiong Zhou @ 2017-02-08  4:49 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm, Eryu Guan

On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> >> Hi,
> >>
> >> At first, I am not sure whether this is an issue.
> >>
> >> mmap a file in a DAX mountpoint, open another file
> >> in a non-DAX mountpoint with O_DIRECT, write the
> >> mapped area to the other file.
> >>
> >> This write Success on pmem ramdisk(memmap=2G!20G like)
> >> This write Fail(Bad address) on nvdimm pmem devices.
> >> This write Fail(Bad address) on brd based ramdisk.
> >>
> >> If we skip the O_DIRECT flag, all tests pass.
> >>
> >> If we write from DAX to DAX, all tests pass.
> >> If we write from non-DAX to DAX, all tests pass.
> >>
> > snip..
> >
> > To falloc instead of pwrite while initiating test files,
> > ( Thanks Ross! :)
> > the write call returned success, however the following
> > read back to mmaped area FAILED the same way:
> >
> > return (Bad address) on raw-mode nvdimm device;
> > return (Success)     on memory-mode nvdimm device;
> > return (Bad address) on brd based ramdisk.
> >
> > Also, this only happends with O_DIRECT flag on.
> >
> > This smells like an issue to me, still looking into why
> > read can't get that page..
> >
> 
> Why does it smell like an issue? Any path that calls get_user_pages()

Because the write call gets its page and succeeds, while the read back fails.
__get_user_pages on the same address passes the first time, then fails.

Thanks,
Xiong

> and a DAX mmap range will fail without "memory" mode. I'm of the
> opinion that we should disable DAX altogether for raw-mode.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]             ` <20170208044959.rq3ofmsjth2oeu2u-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
@ 2017-02-08  5:05               ` Dan Williams
       [not found]                 ` <CAPcyv4iCz9SqJkoKYr-RWxV4FmqPB1Wfkx_A_jE-S5Unpo-5Pw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Williams @ 2017-02-08  5:05 UTC (permalink / raw)
  To: Xiong Zhou; +Cc: linux-nvdimm, Eryu Guan

On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
>> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
>> >> Hi,
>> >>
>> >> At first, I am not sure whether this is an issue.
>> >>
>> >> mmap a file in a DAX mountpoint, open another file
>> >> in a non-DAX mountpoint with O_DIRECT, write the
>> >> mapped area to the other file.
>> >>
>> >> This write Success on pmem ramdisk(memmap=2G!20G like)
>> >> This write Fail(Bad address) on nvdimm pmem devices.
>> >> This write Fail(Bad address) on brd based ramdisk.
>> >>
>> >> If we skip the O_DIRECT flag, all tests pass.
>> >>
>> >> If we write from DAX to DAX, all tests pass.
>> >> If we write from non-DAX to DAX, all tests pass.
>> >>
>> > snip..
>> >
>> > To falloc instead of pwrite while initiating test files,
>> > ( Thanks Ross! :)
>> > the write call returned success, however the following
>> > read back to mmaped area FAILED the same way:
>> >
>> > return (Bad address) on raw-mode nvdimm device;
>> > return (Success)     on memory-mode nvdimm device;
>> > return (Bad address) on brd based ramdisk.
>> >
>> > Also, this only happends with O_DIRECT flag on.
>> >
>> > This smells like an issue to me, still looking into why
>> > read can't get that page..
>> >
>>
>> Why does it smell like an issue? Any path that calls get_user_pages()
>
> Because the write call gets its page and succeeds, while read back fails.
> __get_user_pages on the same address first pass, then fail.

Ok, I might have misread your description. Can you tell me the exact
reproduction steps so I can give it a try?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]                 ` <CAPcyv4iCz9SqJkoKYr-RWxV4FmqPB1Wfkx_A_jE-S5Unpo-5Pw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-02-08  6:56                   ` Xiong Zhou
       [not found]                     ` <20170208065651.ab74cytferq4ha57-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Xiong Zhou @ 2017-02-08  6:56 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm, Eryu Guan

On Tue, Feb 07, 2017 at 09:05:21PM -0800, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
> >> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> >> >> Hi,
> >> >>
> >> >> At first, I am not sure whether this is an issue.
> >> >>
> >> >> mmap a file in a DAX mountpoint, open another file
> >> >> in a non-DAX mountpoint with O_DIRECT, write the
> >> >> mapped area to the other file.
> >> >>
> >> >> This write Success on pmem ramdisk(memmap=2G!20G like)
> >> >> This write Fail(Bad address) on nvdimm pmem devices.
> >> >> This write Fail(Bad address) on brd based ramdisk.
> >> >>
> >> >> If we skip the O_DIRECT flag, all tests pass.
> >> >>
> >> >> If we write from DAX to DAX, all tests pass.
> >> >> If we write from non-DAX to DAX, all tests pass.
> >> >>
> >> > snip..
> >> >
> >> > To falloc instead of pwrite while initiating test files,
> >> > ( Thanks Ross! :)
> >> > the write call returned success, however the following
> >> > read back to mmaped area FAILED the same way:
> >> >
> >> > return (Bad address) on raw-mode nvdimm device;
> >> > return (Success)     on memory-mode nvdimm device;
> >> > return (Bad address) on brd based ramdisk.
> >> >
> >> > Also, this only happends with O_DIRECT flag on.
> >> >
> >> > This smells like an issue to me, still looking into why
> >> > read can't get that page..
> >> >
> >>
> >> Why does it smell like an issue? Any path that calls get_user_pages()
> >
> > Because the write call gets its page and succeeds, while read back fails.
> > __get_user_pages on the same address first pass, then fail.
> 
> Ok, I might have misread your description. Can you tell me the exact
> reproduction steps so I can give it a try?

Reproducer attached.

You need root to run this, assuming your pmem device is /dev/pmem0.

Steps:
  tar zxf mmap_dio_dax.tar.gz 
  cd mmap_dio_dax
  sh test.sh /dev/pmem0

Thanks for your time!

----- my log -------------
sh-4.2# uname -r
4.10.0-rc7-master-f7d6040+
sh-4.2# whoami 
root
sh-4.2# pwd
/root
sh-4.2# tar zxf mmap_dio_dax.tar.gz 
sh-4.2# cd mmap_dio_dax
sh-4.2# sh test.sh /dev/pmem0
dio PASS
buffered IO PASS
sh-4.2# sh test.sh /dev/pmem2
read(Bad address) len 16777216 /dev/pmem2
dio FAIL
buffered IO PASS
sh-4.2# modprobe brd rd_size=$((1*1024*1024))
sh-4.2# sh test.sh /dev/ram0
read(Bad address) len 16777216 /dev/ram0
dio FAIL
buffered IO PASS
sh-4.2# 
sh-4.2# ndctl list
[
  {
    "dev":"namespace1.0",
    "mode":"memory",
    "size":8453619712,
    "uuid":"0013265e-06ff-4397-b62a-3078c1346cbc",
    "blockdev":"pmem1"
  },
  {
    "dev":"namespace3.0",
    "mode":"raw",
    "size":8589934592,
    "blockdev":"pmem3"
  },
  {
    "dev":"namespace0.0",
    "mode":"memory",
    "size":8453619712,
    "uuid":"c31e0719-00b3-4ffd-848c-659a74350ae5",
    "blockdev":"pmem0"
  },
  {
    "dev":"namespace2.0",
    "mode":"raw",
    "size":8589934592,
    "blockdev":"pmem2"
  }
]
sh-4.2#

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]                     ` <20170208065651.ab74cytferq4ha57-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
@ 2017-02-08  7:09                       ` Xiong Zhou
       [not found]                         ` <20170208070907.m6cwwz47hrow5vui-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Xiong Zhou @ 2017-02-08  7:09 UTC (permalink / raw)
  To: Xiong Zhou; +Cc: linux-nvdimm, Eryu Guan

On Wed, Feb 08, 2017 at 02:56:51PM +0800, Xiong Zhou wrote:
> On Tue, Feb 07, 2017 at 09:05:21PM -0800, Dan Williams wrote:
> > On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > > On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
> > >> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > >> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> > >> >> Hi,
> > >> >>
> > >> >> At first, I am not sure whether this is an issue.
> > >> >>
> > >> >> mmap a file in a DAX mountpoint, open another file
> > >> >> in a non-DAX mountpoint with O_DIRECT, write the
> > >> >> mapped area to the other file.
> > >> >>
> > >> >> This write Success on pmem ramdisk(memmap=2G!20G like)
> > >> >> This write Fail(Bad address) on nvdimm pmem devices.
> > >> >> This write Fail(Bad address) on brd based ramdisk.
> > >> >>
> > >> >> If we skip the O_DIRECT flag, all tests pass.
> > >> >>
> > >> >> If we write from DAX to DAX, all tests pass.
> > >> >> If we write from non-DAX to DAX, all tests pass.
> > >> >>
> > >> > snip..
> > >> >
> > >> > To falloc instead of pwrite while initiating test files,
> > >> > ( Thanks Ross! :)
> > >> > the write call returned success, however the following
> > >> > read back to mmaped area FAILED the same way:
> > >> >
> > >> > return (Bad address) on raw-mode nvdimm device;
> > >> > return (Success)     on memory-mode nvdimm device;
> > >> > return (Bad address) on brd based ramdisk.
> > >> >
> > >> > Also, this only happends with O_DIRECT flag on.
> > >> >
> > >> > This smells like an issue to me, still looking into why
> > >> > read can't get that page..
> > >> >
> > >>
> > >> Why does it smell like an issue? Any path that calls get_user_pages()
> > >
> > > Because the write call gets its page and succeeds, while read back fails.
> > > __get_user_pages on the same address first pass, then fail.
> > 
> > Ok, I might have misread your description. Can you tell me the exact
> > reproduction steps so I can give it a try?
> 
> Reproducer attached.
> 

----- test.sh -----------------------------------------------------
#!/bin/bash
# test.sh <dev>
#
# Builds t_mmap_dio.c, makes an XFS filesystem on <dev>, mounts it with
# -o dax on /tbdmnt, then runs the test program twice over the first
# 16MiB of a pair of 256MiB preallocated files ($MNT/ts and /root/td):
# once with the default O_DIRECT destination ("dio") and once with -b
# ("buffered IO").  Each run prints PASS or FAIL.
#
# Any failure while preparing the environment aborts with status 1, so a
# broken setup is not reported as a test FAIL.
#
# Alternatives kept from the original script for reference:
#   mkfs.xfs -fq -d su=2m,sw=1 "$DEV"
#   xfs_io -f -c "w 0 268435456" <file>    (write instead of falloc)
[ -z "$1" ] && { echo "$0 <dev>"; exit 1; }

DEV="$1"
MNT=/tbdmnt

# Stop if the build fails rather than running a stale ./a.out.
cc t_mmap_dio.c || exit 1
mkdir -p "$MNT" || exit 1
wipefs -af "$DEV" > /dev/null
mkfs.xfs -fq "$DEV" || exit 1
mount -o dax "$DEV" "$MNT" || exit 1
# The filesystem is mounted from here on; unmount on every exit path.
trap 'umount "$MNT"' EXIT

xfs_io -f -c "falloc 0 268435456" "$MNT/ts" > /dev/null || exit 1
xfs_io -f -c "falloc 0 268435456" /root/td > /dev/null || exit 1
if ./a.out "$MNT/ts" /root/td 16777216 "$DEV" ; then
	echo dio PASS
else
	echo dio FAIL
fi

# Start the buffered run from freshly preallocated files.
rm -f "$MNT/ts" /root/td
xfs_io -f -c "falloc 0 268435456" "$MNT/ts" > /dev/null || exit 1
xfs_io -f -c "falloc 0 268435456" /root/td > /dev/null || exit 1

if ./a.out -b "$MNT/ts" /root/td 16777216 "$DEV" ; then
	echo buffered IO PASS
else
	echo buffered IO FAIL
fi

-------------------------------------------------------------------



---- t_mmap_dio.c ------------------------------------------------------------
/*
 * This programme was originally written by
 *     Jeff Moyer <jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
 *
 * Copyright (C) 2016, Red Hat, Inc.
 */
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <libaio.h>
#include <errno.h>
#include <sys/time.h>

/*
 * Print the expected command line on stderr and terminate.
 *
 * prog: program name substituted into the usage line.
 *
 * Does not return; the process exits with status 1.
 */
void usage(char *prog)
{
	const int status = 1;

	fprintf(stderr, "usage: %s <src file> <dest file> <size> <msg>\n", prog);
	exit(status);
}

/*
 * Report a failed step on stderr and terminate.
 *
 * op:  label of the step that failed
 * len: length value printed alongside the label
 * s:   free-form tag appended to the message
 *
 * The message has the form "<op>(<strerror(errno)>) len <len> <s>".
 * Does not return; the process exits with status 1.
 */
void err_exit(char *op, unsigned long len, char *s)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s(%s) len %lu %s\n", op, reason, len, s);
	exit(1);
}

/*
 * Move data between a memory-mapped source file and a destination file,
 * in both directions.
 *
 * Command line: [-b] <src file> <dest file> <size> [msg]
 *   -b          open the destination without O_DIRECT (buffered I/O)
 *   <src file>  file mapped MAP_SHARED, read/write, for <size> bytes
 *   <dest file> file opened O_RDWR, plus O_DIRECT unless -b is given
 *   <size>      transfer length in bytes (decimal)
 *   [msg]       tag echoed in error messages; "" when omitted
 *
 * Steps: write() the mapping to the destination, fsync, seek back to
 * offset 0, read() the destination back into the mapping, then msync,
 * munmap and close both descriptors.  Exits 0 when every step succeeds;
 * the first failing step is reported through err_exit(), which exits 1.
 */
int main(int argc, char **argv)
{
	int fd, fd2, opt, dio = 1;
	int oflags = O_RDWR;
	ssize_t ret;
	char *map;
	char *msg;
	char *sfile;
	char *dfile;
	unsigned long len;

	/* getopt() returns int; anything other than -b is a usage error. */
	while ((opt = getopt(argc, argv, "b")) != -1) {
		if (opt != 'b')
			usage(basename(argv[0]));
		dio = 0;
	}

	/*
	 * Count operands only after option parsing: -b shifts optind, so a
	 * check made on argc alone would let argv[optind + 2] run off the
	 * end of the argument vector.
	 */
	if (argc - optind < 3)
		usage(basename(argv[0]));

	sfile = argv[optind];
	dfile = argv[optind + 1];
	/* <msg> only decorates error output; never index past argc for it. */
	msg = (argc - optind > 3) ? argv[optind + 3] : "";

	/* strtoul() sets errno only on failure, so clear any stale value. */
	errno = 0;
	len = strtoul(argv[optind + 2], NULL, 10);
	if (errno == ERANGE)
		err_exit("strtoul", 0, msg);

	/* Open source file and mmap */
	fd = open(sfile, O_RDWR, 0644);
	if (fd < 0)
		err_exit("open src", len, msg);

	map = (char *)mmap(NULL, len,
		PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err_exit("mmap", len, msg);

	/* Open dest file, with O_DIRECT unless -b asked for buffered I/O */
	if (dio)
		oflags |= O_DIRECT;
	fd2 = open(dfile, oflags, 0644);
	if (fd2 < 0)
		err_exit("open dest", len, msg);

	/*
	 * First, test storing to dest file from source mapping.
	 * The result is kept in a ssize_t so large lengths are not
	 * truncated before the comparison.
	 */
	ret = write(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("write", len, msg);

	if (fsync(fd2) != 0)
		err_exit("fsync", len, msg);

	if (lseek(fd2, 0, SEEK_SET) == (off_t)-1)
		err_exit("lseek", len, msg);

	/* Next, test reading from dest file into source mapping */
	ret = read(fd2, map, len);
	if (ret < 0 || (unsigned long)ret != len)
		err_exit("read", len, msg);

	if (msync(map, len, MS_SYNC) < 0)
		err_exit("msync", len, msg);

	if (munmap(map, len) < 0)
		err_exit("munmap", len, msg);

	if (close(fd) < 0)
		err_exit("close fd", len, msg);

	if (close(fd2) < 0)
		err_exit("close fd2", len, msg);

	exit(0);
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]                         ` <20170208070907.m6cwwz47hrow5vui-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
@ 2017-02-08  7:22                           ` Xiong Zhou
       [not found]                             ` <20170208072253.erl4esloglvq2zet-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Xiong Zhou @ 2017-02-08  7:22 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm, Eryu Guan

On Wed, Feb 08, 2017 at 03:09:07PM +0800, Xiong Zhou wrote:
> On Wed, Feb 08, 2017 at 02:56:51PM +0800, Xiong Zhou wrote:
> > On Tue, Feb 07, 2017 at 09:05:21PM -0800, Dan Williams wrote:
> > > On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > > > On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
> > > >> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > > >> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> > > >> >> Hi,
> > > >> >>
> > > >> >> At first, I am not sure whether this is an issue.
> > > >> >>
> > > >> >> mmap a file in a DAX mountpoint, open another file
> > > >> >> in a non-DAX mountpoint with O_DIRECT, write the
> > > >> >> mapped area to the other file.
> > > >> >>
> > > >> >> This write Success on pmem ramdisk(memmap=2G!20G like)
> > > >> >> This write Fail(Bad address) on nvdimm pmem devices.
> > > >> >> This write Fail(Bad address) on brd based ramdisk.
> > > >> >>
> > > >> >> If we skip the O_DIRECT flag, all tests pass.
> > > >> >>
> > > >> >> If we write from DAX to DAX, all tests pass.
> > > >> >> If we write from non-DAX to DAX, all tests pass.
> > > >> >>
> > > >> > snip..
> > > >> >
> > > >> > To falloc instead of pwrite while initiating test files,
> > > >> > ( Thanks Ross! :)
> > > >> > the write call returned success, however the following
> > > >> > read back to mmaped area FAILED the same way:
> > > >> >
> > > >> > return (Bad address) on raw-mode nvdimm device;
> > > >> > return (Success)     on memory-mode nvdimm device;
> > > >> > return (Bad address) on brd based ramdisk.
> > > >> >
> > > >> > Also, this only happends with O_DIRECT flag on.
> > > >> >
> > > >> > This smells like an issue to me, still looking into why
> > > >> > read can't get that page..
> > > >> >
> > > >>
> > > >> Why does it smell like an issue? Any path that calls get_user_pages()
> > > >
> > > > Because the write call gets its page and succeeds, while read back fails.
> > > > __get_user_pages on the same address first pass, then fail.
> > > 
> > > Ok, I might have misread your description. Can you tell me the exact
> > > reproduction steps so I can give it a try?
> > 
> > Reproducer attached.
> > 

Attachment issue..

You need root to run this, assuming your pmem device is /dev/pmem0.

Steps:
  sh test.sh /dev/pmem0

Thanks for your time!

----- test.sh --------------------------------------
#!/bin/bash
# Reproducer driver for t_mmap_dio.c.
#
# Usage: sh test.sh <dev>
#
# Builds t_mmap_dio.c from the current directory, makes an XFS filesystem on
# <dev>, mounts it with -o dax on $MNT, then runs ./a.out twice (default mode
# and with -b) and prints "<label> PASS" or "<label> FAIL" for each run.
# Exits non-zero if setup fails or if either run fails.
#
# WARNING: <dev> is wiped and reformatted.
[ -z "$1" ] && { echo "$0 <dev>" >&2; exit 1; }

DEV="$1"
MNT=/tbdmnt
SRC="$MNT/ts"		# file a.out mmaps, under $MNT
DST=/root/td		# file a.out writes to and reads from, outside $MNT
FSIZE=268435456		# size of each preallocated test file
IOLEN=16777216		# length handed to a.out
rc=0

# Print a diagnostic on stderr and abort the script.
die() {
	echo "$0: $*" >&2
	exit 1
}

# (Re)create both test files with falloc.
# Alternative setup kept from the original script (write instead of falloc):
#xfs_io -f -c "w 0 268435456" $MNT/ts > /dev/null && \
#xfs_io -f -c "w 0 268435456" /root/td > /dev/null
prep_files() {
	rm -f "$SRC" "$DST"
	xfs_io -f -c "falloc 0 $FSIZE" "$SRC" > /dev/null &&
	xfs_io -f -c "falloc 0 $FSIZE" "$DST" > /dev/null
}

# run_case <label> [a.out options...]
# Runs a.out on the test files and reports the result under <label>.
run_case() {
	label="$1"
	shift
	if ./a.out "$@" "$SRC" "$DST" "$IOLEN" "$DEV" ; then
		echo "$label PASS"
	else
		echo "$label FAIL"
		rc=1
	fi
}

# Setup: stop on any failure so the test never runs against an unmounted
# $MNT and reports a misleading result.
cc t_mmap_dio.c || die "cannot build t_mmap_dio.c"
mkdir -p "$MNT" || die "cannot create $MNT"
wipefs -af "$DEV" > /dev/null
#mkfs.xfs -fq -d su=2m,sw=1 $DEV && \
mkfs.xfs -fq "$DEV" || die "mkfs.xfs failed on $DEV"
mount -o dax "$DEV" "$MNT" || die "cannot mount $DEV on $MNT with -o dax"
# From here on, always unmount, whichever way the script exits.
trap 'umount "$MNT"' EXIT

prep_files || die "cannot create test files"
run_case dio

prep_files || die "cannot create test files"
run_case "buffered IO" -b

exit "$rc"

--------------------------------------------------------


----- t_mmap_dio.c ----------------------------------
/*
 * This programme was originally written by
 *     Jeff Moyer <jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
 *
 * Copyright (C) 2016, Red Hat, Inc.
 */
#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <libaio.h>
#include <errno.h>
#include <sys/time.h>

/*
 * Print the command-line synopsis on stderr and exit with status 1.
 *
 * prog: program name shown in the synopsis.
 *
 * The synopsis lists the optional -b flag that main() parses with getopt().
 */
void usage(char *prog)
{
	fprintf(stderr,
		"usage: %s [-b] <src file> <dest file> <size> <msg>\n",
		prog);
	exit(1);
}

/*
 * Report a failed step on stderr and exit with status 1.
 *
 * op:  label of the step that failed
 * len: length value to include in the message
 * s:   caller-supplied tag appended to the message
 *
 * The message also carries strerror() of the errno value current at the
 * time of the call.
 */
void err_exit(char *op, unsigned long len, char *s)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s(%s) len %lu %s\n", op, reason, len, s);
	exit(1);
}

/*
 * Copy <size> bytes from an mmap()ed source file into a destination file
 * with write(), then read() them back from the destination into the mapping.
 *
 * Command line: [-b] <src file> <dest file> <size> <msg>
 *   -b     open the destination without O_DIRECT (buffered I/O)
 *   <msg>  free-form tag that is echoed in every error message
 *
 * Exits 0 when every step succeeds; any failing step is reported through
 * err_exit(), which exits 1.
 */
int main(int argc, char **argv)
{
	int fd, fd2, rc, opt, dio = 1;
	int oflags = O_RDWR;
	ssize_t nbytes;
	char *map;
	char *msg;
	char *sfile;
	char *dfile;
	char *end;
	unsigned long len;

	/* getopt() returns int; only -b is a valid option. */
	while ((opt = getopt(argc, argv, "b")) != -1) {
		if (opt != 'b')
			usage(basename(argv[0]));
		dio = 0;
	}

	/* Four positional arguments must remain once the options are consumed. */
	if (argc - optind < 4)
		usage(basename(argv[0]));

	sfile = argv[optind];
	dfile = argv[optind + 1];
	msg = argv[optind + 3];

	/* strtoul() only sets errno on failure, so clear it before the call. */
	errno = 0;
	len = strtoul(argv[optind + 2], &end, 10);
	if (errno == ERANGE)
		err_exit("strtoul", 0, msg);
	if (end == argv[optind + 2] || *end != '\0') {
		errno = EINVAL;
		err_exit("strtoul", 0, msg);
	}

	/* Open source file and mmap*/
	fd = open(sfile, O_RDWR, 0644);
	if (fd < 0)
		err_exit("open src", len, msg);

	map = (char *)mmap(NULL, len,
		PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err_exit("mmap", len, msg);

	/* Open dest file, with O_DIRECT unless -b was given */
	if (dio == 1)
		oflags |= O_DIRECT;
	fd2 = open(dfile, oflags, 0644);
	if (fd2 < 0)
		err_exit("open dest", len, msg);

	/*
	 * First, test storing to dest file from source mapping.
	 * errno is cleared so that a short transfer (which does not set
	 * errno) is not reported with a stale error string.
	 */
	errno = 0;
	nbytes = write(fd2, map, len);
	if (nbytes < 0 || (unsigned long)nbytes != len)
		err_exit("write", len, msg);

	rc = fsync(fd2);
	if (rc != 0)
		err_exit("fsync", len, msg);

	if (lseek(fd2, 0, SEEK_SET) == (off_t)-1)
		err_exit("lseek", len, msg);

	/* Next, test reading from dest file into source mapping */
	errno = 0;
	nbytes = read(fd2, map, len);
	if (nbytes < 0 || (unsigned long)nbytes != len)
		err_exit("read", len, msg);
	rc = msync(map, len, MS_SYNC);
	if (rc < 0)
		err_exit("msync", len, msg);

	rc = munmap(map, len);
	if (rc < 0)
		err_exit("munmap", len, msg);

	rc = close(fd);
	if (rc < 0)
		err_exit("close fd", len, msg);

	rc = close(fd2);
	if (rc < 0)
		err_exit("close fd2", len, msg);

	exit(0);
}

----------------------------------------------
----- my log -------------
sh-4.2# uname -r
4.10.0-rc7-master-f7d6040+
sh-4.2# whoami 
root
sh-4.2# pwd
/root
sh-4.2# sh test.sh /dev/pmem0
dio PASS
buffered IO PASS
sh-4.2# sh test.sh /dev/pmem2
read(Bad address) len 16777216 /dev/pmem2
dio FAIL
buffered IO PASS
sh-4.2# modprobe brd rd_size=$((1*1024*1024))
sh-4.2# sh test.sh /dev/ram0
read(Bad address) len 16777216 /dev/ram0
dio FAIL
buffered IO PASS
sh-4.2# 
sh-4.2# ndctl list
[
  {
    "dev":"namespace1.0",
    "mode":"memory",
    "size":8453619712,
    "uuid":"0013265e-06ff-4397-b62a-3078c1346cbc",
    "blockdev":"pmem1"
  },
  {
    "dev":"namespace3.0",
    "mode":"raw",
    "size":8589934592,
    "blockdev":"pmem3"
  },
  {
    "dev":"namespace0.0",
    "mode":"memory",
    "size":8453619712,
    "uuid":"c31e0719-00b3-4ffd-848c-659a74350ae5",
    "blockdev":"pmem0"
  },
  {
    "dev":"namespace2.0",
    "mode":"raw",
    "size":8589934592,
    "blockdev":"pmem2"
  }
]
sh-4.2#
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw@public.gmane.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]                             ` <20170208072253.erl4esloglvq2zet-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
@ 2017-02-22 23:05                               ` Dan Williams
       [not found]                                 ` <CAPcyv4gKtLp07ta6PmWKrFJa9RZOPhShFmBcXiRS2izturw4QQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Dan Williams @ 2017-02-22 23:05 UTC (permalink / raw)
  To: Xiong Zhou; +Cc: linux-nvdimm, Eryu Guan

On Tue, Feb 7, 2017 at 11:22 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> On Wed, Feb 08, 2017 at 03:09:07PM +0800, Xiong Zhou wrote:
>> On Wed, Feb 08, 2017 at 02:56:51PM +0800, Xiong Zhou wrote:
>> > On Tue, Feb 07, 2017 at 09:05:21PM -0800, Dan Williams wrote:
>> > > On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> > > > On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
>> > > >> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> > > >> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
>> > > >> >> Hi,
>> > > >> >>
>> > > >> >> At first, I am not sure whether this is an issue.
>> > > >> >>
>> > > >> >> mmap a file in a DAX mountpoint, open another file
>> > > >> >> in a non-DAX mountpoint with O_DIRECT, write the
>> > > >> >> mapped area to the other file.
>> > > >> >>
>> > > >> >> This write Success on pmem ramdisk(memmap=2G!20G like)
>> > > >> >> This write Fail(Bad address) on nvdimm pmem devices.
>> > > >> >> This write Fail(Bad address) on brd based ramdisk.
>> > > >> >>
>> > > >> >> If we skip the O_DIRECT flag, all tests pass.
>> > > >> >>
>> > > >> >> If we write from DAX to DAX, all tests pass.
>> > > >> >> If we write from non-DAX to DAX, all tests pass.
>> > > >> >>
>> > > >> > snip..
>> > > >> >
>> > > >> > To falloc instead of pwrite while initiating test files,
>> > > >> > ( Thanks Ross! :)
>> > > >> > the write call returned success, however the following
>> > > >> > read back to mmaped area FAILED the same way:
>> > > >> >
>> > > >> > return (Bad address) on raw-mode nvdimm device;
>> > > >> > return (Success)     on memory-mode nvdimm device;
>> > > >> > return (Bad address) on brd based ramdisk.
>> > > >> >
>> > > >> > Also, this only happends with O_DIRECT flag on.
>> > > >> >
>> > > >> > This smells like an issue to me, still looking into why
>> > > >> > read can't get that page..
>> > > >> >
>> > > >>
>> > > >> Why does it smell like an issue? Any path that calls get_user_pages()
>> > > >
>> > > > Because the write call gets its page and succeeds, while read back fails.
>> > > > __get_user_pages on the same address first pass, then fail.
>> > >
>> > > Ok, I might have misread your description. Can you tell me the exact
>> > > reproduction steps so I can give it a try?
>> >
>> > Reproducer attached.
>> >
>
> Attachment issue..
>
> You need root to run this, assuming your pmem device is /dev/pmem0.
>
> Steps:
>   sh test.sh /dev/pmem0
>
> Thanks for your time!
>
> ----- test.sh --------------------------------------
> #!/bin/bash
> [ -z "$1" ] && { echo "$0 <dev>"; exit 1; }
>
> DEV="$1"
> MNT=/tbdmnt
> cc t_mmap_dio.c
> mkdir -p $MNT
> wipefs -af $DEV > /dev/null
> #mkfs.xfs -fq -d su=2m,sw=1 $DEV && \
> mkfs.xfs -fq $DEV && \
> mount -o dax $DEV $MNT && \
> #xfs_io -f -c "w 0 268435456" $MNT/ts > /dev/null && \
> #xfs_io -f -c "w 0 268435456" /root/td > /dev/null
> xfs_io -f -c "falloc 0 268435456" $MNT/ts > /dev/null && \
> xfs_io -f -c "falloc 0 268435456" /root/td > /dev/null
> if ./a.out $MNT/ts /root/td 16777216 "$DEV" ; then
>         echo dio PASS
> else
>         echo dio FAIL
> fi
>
> rm -f $MNT/ts /root/td
> xfs_io -f -c "falloc 0 268435456" $MNT/ts > /dev/null
> xfs_io -f -c "falloc 0 268435456" /root/td > /dev/null
>
> if ./a.out -b $MNT/ts /root/td 16777216 "$DEV" ; then
>         echo buffered IO PASS
> else
>         echo buffered IO FAIL
> fi
> umount $MNT
>
> --------------------------------------------------------
>
>
> ----- t_mmap_dio.c ----------------------------------
> /*
>  * This programme was originally written by
>  *     Jeff Moyer <jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>  *
>  * Copyright (C) 2016, Red Hat, Inc.
>  */
> #define _GNU_SOURCE 1
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <sys/mman.h>
> #include <libaio.h>
> #include <errno.h>
> #include <sys/time.h>
>
> void usage(char *prog)
> {
>         fprintf(stderr,
>                 "usage: %s <src file> <dest file> <size> <msg>\n",
>                 prog);
>         exit(1);
> }
>
> void err_exit(char *op, unsigned long len, char *s)
> {
>         fprintf(stderr, "%s(%s) len %lu %s\n",
>                 op, strerror(errno), len, s);
>         exit(1);
> }
>
> int main(int argc, char **argv)
> {
>         int fd, fd2, ret, dio = 1;
>         char *map;
>         char *msg;
>         char *sfile;
>         char *dfile;
>         unsigned long len, opt;
>
>         if (argc < 4)
>                 usage(basename(argv[0]));
>
>         while ((opt = getopt(argc, argv, "b")) != -1)
>                 dio = 0;
>
>         sfile = argv[optind];
>         dfile = argv[optind + 1];
>         msg = argv[optind + 3];
>         len = strtoul(argv[optind + 2], NULL, 10);
>         if (errno == ERANGE)
>                 err_exit("strtoul", 0, msg);
>
>         /* Open source file and mmap*/
>         fd = open(sfile, O_RDWR, 0644);
>         if (fd < 0)
>                 err_exit("open src", len, msg);
>
>         map = (char *)mmap(NULL, len,
>                 PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
>         if (map == MAP_FAILED)
>                 err_exit("mmap", len, msg);
>
>         if (dio == 1) {
>                 /* Open dest file with O_DIRECT */
>                 fd2 = open(dfile, O_RDWR|O_DIRECT, 0644);
>                 if (fd2 < 0)
>                         err_exit("open dest", len, msg);
>         } else {
>                 /* Open dest file without O_DIRECT */
>                 fd2 = open(dfile, O_RDWR, 0644);
>                 if (fd2 < 0)
>                         err_exit("open dest", len, msg);
>         }
>
>         /* First, test storing to dest file from source mapping */
>         ret = write(fd2, map, len);
>         if (ret != len)
>                 err_exit("write", len, msg);
>
>         ret = fsync(fd2);
>         if (ret != 0)
>                 err_exit("fsync", len, msg);
>
>         ret = (int)lseek(fd2, 0, SEEK_SET);
>         if (ret == -1)
>                 err_exit("lseek", len, msg);
>
>         /* Next, test reading from dest file into source mapping */
>         ret = read(fd2, map, len);
>         if (ret != len)
>                 err_exit("read", len, msg);
>         ret = msync(map, len, MS_SYNC);
>         if (ret < 0)
>                 err_exit("msync", len, msg);
>
>         ret = munmap(map, len);
>         if (ret < 0)
>                 err_exit("munmap", len, msg);
>
>         ret = close(fd);
>         if (ret < 0)
>                 err_exit("clsoe fd", len, msg);
>
>         ret = close(fd2);
>         if (ret < 0)
>                 err_exit("close fd2", len, msg);
>
>         exit(0);
> }
>
> ----------------------------------------------
> ----- my log -------------

Thanks for the reproducer!

> sh-4.2# uname -r
> 4.10.0-rc7-master-f7d6040+
> sh-4.2# whoami
> root
> sh-4.2# pwd
> /root
> sh-4.2# sh test.sh /dev/pmem0
> dio PASS
> buffered IO PASS
> sh-4.2# sh test.sh /dev/pmem2
> read(Bad address) len 16777216 /dev/pmem2
> dio FAIL
> buffered IO PASS

This is expected. In the raw case we can't do the direct-I/O access to
read() into the buffer since there's no page. The reason the write()
from the buffer succeeds is because the extent is unwritten, so the
filesystem uses the zero page.

This is why the:

xfs_io -f -c 'w 0 268435456' /tbdmnt/ts

...setup fails at the write(), while the:

xfs_io -f -c 'falloc 0 268435456' /tbdmnt/ts

...setup fails later at the read() when the test switches from hitting
the zero page to trying to look up a "dax" page.

> sh-4.2# modprobe brd rd_size=$((1*1024*1024))
> sh-4.2# sh test.sh /dev/ram0
> read(Bad address) len 16777216 /dev/ram0
> dio FAIL

This fails because dax on /dev/ramX does not support direct-I/O. The
write() works for the same "zero-page" reason above, but the read()
fails because the pte entry for the mapping is marked pte_special()
and we don't have a ->find_special_page() in the vm_ops to go from pte
back to the page that the brd driver is using.  I don't think this is
a problem worth solving since brd is more of a test vehicle than a
production driver.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: read failure (was Re: mmap dio write failure)
       [not found]                                 ` <CAPcyv4gKtLp07ta6PmWKrFJa9RZOPhShFmBcXiRS2izturw4QQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-02-23  9:53                                   ` Xiong Zhou
  0 siblings, 0 replies; 12+ messages in thread
From: Xiong Zhou @ 2017-02-23  9:53 UTC (permalink / raw)
  To: Dan Williams; +Cc: linux-nvdimm, Eryu Guan

On Wed, Feb 22, 2017 at 03:05:29PM -0800, Dan Williams wrote:
> On Tue, Feb 7, 2017 at 11:22 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> > On Wed, Feb 08, 2017 at 03:09:07PM +0800, Xiong Zhou wrote:
> >> On Wed, Feb 08, 2017 at 02:56:51PM +0800, Xiong Zhou wrote:
> >> > On Tue, Feb 07, 2017 at 09:05:21PM -0800, Dan Williams wrote:
> >> > > On Tue, Feb 7, 2017 at 8:49 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >> > > > On Tue, Feb 07, 2017 at 08:10:14PM -0800, Dan Williams wrote:
> >> > > >> On Tue, Feb 7, 2017 at 7:51 PM, Xiong Zhou <xzhou-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> >> > > >> > On Fri, Jan 20, 2017 at 12:40:07PM +0800, Xiong Zhou wrote:
> >> > > >> >> Hi,
> >> > > >> >>
> >> > > >> >> At first, I am not sure whether this is an issue.
> >> > > >> >>
> >> > > >> >> mmap a file in a DAX mountpoint, open another file
> >> > > >> >> in a non-DAX mountpoint with O_DIRECT, write the
> >> > > >> >> mapped area to the other file.
> >> > > >> >>
> >> > > >> >> This write Success on pmem ramdisk(memmap=2G!20G like)
> >> > > >> >> This write Fail(Bad address) on nvdimm pmem devices.
> >> > > >> >> This write Fail(Bad address) on brd based ramdisk.
> >> > > >> >>
> >> > > >> >> If we skip the O_DIRECT flag, all tests pass.
> >> > > >> >>
> >> > > >> >> If we write from DAX to DAX, all tests pass.
> >> > > >> >> If we write from non-DAX to DAX, all tests pass.
> >> > > >> >>
> >> > > >> > snip..
> >> > > >> >
> >> > > >> > To falloc instead of pwrite while initiating test files,
> >> > > >> > ( Thanks Ross! :)
> >> > > >> > the write call returned success, however the following
> >> > > >> > read back to mmaped area FAILED the same way:
> >> > > >> >
> >> > > >> > return (Bad address) on raw-mode nvdimm device;
> >> > > >> > return (Success)     on memory-mode nvdimm device;
> >> > > >> > return (Bad address) on brd based ramdisk.
> >> > > >> >
> >> > > >> > Also, this only happends with O_DIRECT flag on.
> >> > > >> >
> >> > > >> > This smells like an issue to me, still looking into why
> >> > > >> > read can't get that page..
> >> > > >> >
> >> > > >>
> >> > > >> Why does it smell like an issue? Any path that calls get_user_pages()
> >> > > >
> >> > > > Because the write call gets its page and succeeds, while read back fails.
> >> > > > __get_user_pages on the same address first pass, then fail.
> >> > >
> >> > > Ok, I might have misread your description. Can you tell me the exact
> >> > > reproduction steps so I can give it a try?
> >> >
> >> > Reproducer attached.
> >> >
> >
> > Attachment issue..
> >
> > You need root to run this, assuming your pmem device is /dev/pmem0.
> >
> > Steps:
> >   sh test.sh /dev/pmem0
> >
> > Thanks for your time!
> >
> > ----- test.sh --------------------------------------
> > #!/bin/bash
> > [ -z "$1" ] && { echo "$0 <dev>"; exit 1; }
> >
> > DEV="$1"
> > MNT=/tbdmnt
> > cc t_mmap_dio.c
> > mkdir -p $MNT
> > wipefs -af $DEV > /dev/null
> > #mkfs.xfs -fq -d su=2m,sw=1 $DEV && \
> > mkfs.xfs -fq $DEV && \
> > mount -o dax $DEV $MNT && \
> > #xfs_io -f -c "w 0 268435456" $MNT/ts > /dev/null && \
> > #xfs_io -f -c "w 0 268435456" /root/td > /dev/null
> > xfs_io -f -c "falloc 0 268435456" $MNT/ts > /dev/null && \
> > xfs_io -f -c "falloc 0 268435456" /root/td > /dev/null
> > if ./a.out $MNT/ts /root/td 16777216 "$DEV" ; then
> >         echo dio PASS
> > else
> >         echo dio FAIL
> > fi
> >
> > rm -f $MNT/ts /root/td
> > xfs_io -f -c "falloc 0 268435456" $MNT/ts > /dev/null
> > xfs_io -f -c "falloc 0 268435456" /root/td > /dev/null
> >
> > if ./a.out -b $MNT/ts /root/td 16777216 "$DEV" ; then
> >         echo buffered IO PASS
> > else
> >         echo buffered IO FAIL
> > fi
> > umount $MNT
> >
> > --------------------------------------------------------
> >
> >
> > ----- t_mmap_dio.c ----------------------------------
> > /*
> >  * This programme was originally written by
> >  *     Jeff Moyer <jmoyer-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> >  *
> >  * Copyright (C) 2016, Red Hat, Inc.
> >  */
> > #define _GNU_SOURCE 1
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <string.h>
> > #include <unistd.h>
> > #include <fcntl.h>
> > #include <sys/mman.h>
> > #include <libaio.h>
> > #include <errno.h>
> > #include <sys/time.h>
> >
> > void usage(char *prog)
> > {
> >         fprintf(stderr,
> >                 "usage: %s <src file> <dest file> <size> <msg>\n",
> >                 prog);
> >         exit(1);
> > }
> >
> > void err_exit(char *op, unsigned long len, char *s)
> > {
> >         fprintf(stderr, "%s(%s) len %lu %s\n",
> >                 op, strerror(errno), len, s);
> >         exit(1);
> > }
> >
> > int main(int argc, char **argv)
> > {
> >         int fd, fd2, ret, dio = 1;
> >         char *map;
> >         char *msg;
> >         char *sfile;
> >         char *dfile;
> >         unsigned long len, opt;
> >
> >         if (argc < 4)
> >                 usage(basename(argv[0]));
> >
> >         while ((opt = getopt(argc, argv, "b")) != -1)
> >                 dio = 0;
> >
> >         sfile = argv[optind];
> >         dfile = argv[optind + 1];
> >         msg = argv[optind + 3];
> >         len = strtoul(argv[optind + 2], NULL, 10);
> >         if (errno == ERANGE)
> >                 err_exit("strtoul", 0, msg);
> >
> >         /* Open source file and mmap*/
> >         fd = open(sfile, O_RDWR, 0644);
> >         if (fd < 0)
> >                 err_exit("open src", len, msg);
> >
> >         map = (char *)mmap(NULL, len,
> >                 PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
> >         if (map == MAP_FAILED)
> >                 err_exit("mmap", len, msg);
> >
> >         if (dio == 1) {
> >                 /* Open dest file with O_DIRECT */
> >                 fd2 = open(dfile, O_RDWR|O_DIRECT, 0644);
> >                 if (fd2 < 0)
> >                         err_exit("open dest", len, msg);
> >         } else {
> >                 /* Open dest file without O_DIRECT */
> >                 fd2 = open(dfile, O_RDWR, 0644);
> >                 if (fd2 < 0)
> >                         err_exit("open dest", len, msg);
> >         }
> >
> >         /* First, test storing to dest file from source mapping */
> >         ret = write(fd2, map, len);
> >         if (ret != len)
> >                 err_exit("write", len, msg);
> >
> >         ret = fsync(fd2);
> >         if (ret != 0)
> >                 err_exit("fsync", len, msg);
> >
> >         ret = (int)lseek(fd2, 0, SEEK_SET);
> >         if (ret == -1)
> >                 err_exit("lseek", len, msg);
> >
> >         /* Next, test reading from dest file into source mapping */
> >         ret = read(fd2, map, len);
> >         if (ret != len)
> >                 err_exit("read", len, msg);
> >         ret = msync(map, len, MS_SYNC);
> >         if (ret < 0)
> >                 err_exit("msync", len, msg);
> >
> >         ret = munmap(map, len);
> >         if (ret < 0)
> >                 err_exit("munmap", len, msg);
> >
> >         ret = close(fd);
> >         if (ret < 0)
> >                 err_exit("clsoe fd", len, msg);
> >
> >         ret = close(fd2);
> >         if (ret < 0)
> >                 err_exit("close fd2", len, msg);
> >
> >         exit(0);
> > }
> >
> > ----------------------------------------------
> > ----- my log -------------
> 
> Thanks for the reproducer!
> 
> > sh-4.2# uname -r
> > 4.10.0-rc7-master-f7d6040+
> > sh-4.2# whoami
> > root
> > sh-4.2# pwd
> > /root
> > sh-4.2# sh test.sh /dev/pmem0
> > dio PASS
> > buffered IO PASS
> > sh-4.2# sh test.sh /dev/pmem2
> > read(Bad address) len 16777216 /dev/pmem2
> > dio FAIL
> > buffered IO PASS
> 
> This is expected. In the raw case we can't do the direct-I/O access to
> read() into the buffer since there's no page. The reason the write()
> from the buffer succeeds is because the extent is unwritten, so the
> filesystem uses the zero page.
> 
> This is why the:
> 
> xfs_io -f -c 'w 0 268435456' /tbdmnt/ts
> 
> ...setup fails at the write(), while the:
> 
> xfs_io -f -c 'falloc 0 268435456' /tbdmnt/ts
> 
> ...setup fails later at the read() when the test switches from hitting
> the zero page to trying to lookup a "dax" page.
> 
> > sh-4.2# modprobe brd rd_size=$((1*1024*1024))
> > sh-4.2# sh test.sh /dev/ram0
> > read(Bad address) len 16777216 /dev/ram0
> > dio FAIL
> 
> This fails because dax on /dev/ramX does not support direct-I/O. The
> write() works for the same "zero-page" reason above, but the read()
> fails because the pte entry for the mapping is marked pte_special()
> and we don't have a ->find_special_page() in the vm_ops to go from pte
> back to the page that the brd driver is using.  I don't think this is
> a problem worth solving since brd is more of a test vehicle than a
> production driver.

Thank you very much for your time Dan ! So detailed !

Thanks,
Xiong

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2017-02-23  9:53 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-20  4:40 mmap dio write failure Xiong Zhou
     [not found] ` <20170120044007.kwevjo7nawwolagy-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
2017-01-20  6:04   ` Dan Williams
2017-02-07 10:01     ` Xiong Zhou
2017-02-08  3:51   ` read failure (was Re: mmap dio write failure) Xiong Zhou
     [not found]     ` <20170208035105.lvewz5ce7xbu5zud-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
2017-02-08  4:10       ` Dan Williams
     [not found]         ` <CAPcyv4iF2xH=9FHDMqb8OYMFO45GkiEEKf1y73Q7scGQSJniag-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-02-08  4:49           ` Xiong Zhou
     [not found]             ` <20170208044959.rq3ofmsjth2oeu2u-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
2017-02-08  5:05               ` Dan Williams
     [not found]                 ` <CAPcyv4iCz9SqJkoKYr-RWxV4FmqPB1Wfkx_A_jE-S5Unpo-5Pw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-02-08  6:56                   ` Xiong Zhou
     [not found]                     ` <20170208065651.ab74cytferq4ha57-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
2017-02-08  7:09                       ` Xiong Zhou
     [not found]                         ` <20170208070907.m6cwwz47hrow5vui-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
2017-02-08  7:22                           ` Xiong Zhou
     [not found]                             ` <20170208072253.erl4esloglvq2zet-E9dkjZ7ERC1QcClZ3XN9yxcY2uh10dtjAL8bYrjMMd8@public.gmane.org>
2017-02-22 23:05                               ` Dan Williams
     [not found]                                 ` <CAPcyv4gKtLp07ta6PmWKrFJa9RZOPhShFmBcXiRS2izturw4QQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-02-23  9:53                                   ` Xiong Zhou

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.