linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
@ 2022-01-25 22:05 Daniel Black
  2022-01-26  3:02 ` Matthew Wilcox
  0 siblings, 1 reply; 7+ messages in thread
From: Daniel Black @ 2022-01-25 22:05 UTC (permalink / raw)
  To: linux-fsdevel

Folks,

I've been testing the following on a 5.15.14-200.fc35.x86_64 kernel
with /mnt/nas as a CIFS mount.

//192.168.178.171/dan on /mnt/nas type cifs
(rw,relatime,vers=3.0,cache=strict,username=dan,domain=WORKGROUP,uid=0,noforceuid,gid=0,noforcegid,addr=192.168.178.171,file_mode=0777,dir_mode=0777,iocharset=utf8,soft,nounix,serverino,mapposix,rsize=4194304,wsize=4194304,bsize=1048576,echo_interval=60,actimeo=1)

The following is on MariaDB-10.5 but I've tested on MariaDB-10.2 and
it looks to be similarly implemented in MySQL-5.7 and MySQL-8.0.
/mnt/nas/datadir is empty so its an initialization failure for
simplicity.

strace -f -s 99   -e trace=%file,fcntl,io_submit,write,io_getevents -o
/tmp/mysqld.strace sql/mysqld --no-defaults --bootstrap
--datadir=/mnt/nas/datadir --innodb_flush_method=O_DIRECT

an extracted summary is:

65412 openat(AT_FDCWD, "./ibdata1", O_RDWR|O_CLOEXEC) = 10
65412 fcntl(10, F_SETFL, O_RDONLY|O_DIRECT) = 0
...
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\4\377\377\377\377\377\377\377\377\0\0\0\0\0\0#\373E\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\10\1\0\0\3"...,
aio_nbytes=16384, aio_offset=65536}]) = 1
65411 <... io_getevents resumed>[{data=0, obj=0x7f4efb46c740, res=-22,
res2=0}], NULL) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\0\377\377\377\377\377\377\377\377\0\0\0\0\0\0(M\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\1@\0\0\0\25\0\0\0\r\0\0\0\4\0\0\0\0\0\306\0\0\0\0\1>\0\0\0\1\0\0\0\0\0\236\0\0\0\0\0\236\0\0\0\0\377"...,
aio_nbytes=16384, aio_offset=0}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\2\377\377\377\377\377\377\377\377\0\0\0\0\0\0(M\0\3\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\377"...,
aio_nbytes=16384, aio_offset=32768}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\3\377\377\377\377\377\377\377\377\0\0\0\0\0\0#\373\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
aio_nbytes=16384, aio_offset=49152}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\1\377\377\377\377\377\377\377\377\0\0\0\0\0\0#\373\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
aio_nbytes=16384, aio_offset=16384}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\6\377\377\377\377\377\377\377\377\0\0\0\0\0\0$\335\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\0\0\0\2\1\262\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377"...,
aio_nbytes=16384, aio_offset=98304}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\5\377\377\377\377\377\377\377\377\0\0\0\0\0\0$\335\0\7\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\362\0\0\0\0\0\0\0\6\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377"...,
aio_nbytes=16384, aio_offset=81920}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\f\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\4\0\0\0\0\0\0\0\2\t\362\0\0\0\0\0\0\0\2\t2\10\1\0\0\3"...,
aio_nbytes=16384, aio_offset=196608}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\v\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\2\10r\0\0\0\0\0\0\0\2\7\262\10\1\0\0\3"...,
aio_nbytes=16384, aio_offset=180224}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\n\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\2\6\362\0\0\0\0\0\0\0\2\0062\10\1\0\0\3"...,
aio_nbytes=16384, aio_offset=163840}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\t\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\2\5r\0\0\0\0\0\0\0\2\4\262\10\1\0\0\3"...,
aio_nbytes=16384, aio_offset=147456}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\10\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\2\3\362\0\0\0\0\0\0\0\2\0032\10\1\0\0\3"...,
aio_nbytes=16384, aio_offset=131072}]) = 1
65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
aio_buf="\0\0\0\0\0\0\0\7\377\377\377\377\377\377\377\377\0\0\0\0\0\0(M\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\10\0\0\0\t\0\0\0\n\0\0\0\v\0\0\0\f\0\0\0\0\0\0\0\0\0"...,
aio_nbytes=16384, aio_offset=114688}]) = 1
65411 io_getevents(0x7f4efb83b000, 1, 256, [{data=0,
obj=0x7f4efb46c680, res=-22, res2=0}, {data=0, obj=0x7f4efb46c5c0,
res=-22, res2=0}, {data=0, obj=0x7f4efb46c500, res=-22, res2=0},
{data=0, obj=0x7f4efb46c440, res=-22, res2=0}, {data=0,
obj=0x7f4efb46c380, res=-22, res2=0}, {data=0, obj=0x7f4efb46c2c0,
res=-22, res2=0}, {data=0, obj=0x7f4efb46c200, res=-22, res2=0},
{data=0, obj=0x7f4efb46c140, res=-22, res2=0}, {data=0,
obj=0x7f4efb46c080, res=-22, res2=0}, {data=0, obj=0x7f4efb46bfc0,
res=-22, res2=0}, {data=0, obj=0x7f4efb46bf00, res=-22, res2=0},
{data=0, obj=0x7f4efb46be40, res=-22, res2=0}], NULL) = 12
65413 write(2, "2022-01-25 10:36:50 0 [ERROR] [FATAL] InnoDB: IO Error
Invalid argument on file descriptor 10 writi"..., 130) = 130

The error message I added is for clarity that the errno=-22 EVINAL is
getting returned for the writes.

The same with --innodb_flush_method=fsync omits the fcntl(10, F_SETFL,
O_RDONLY|O_DIRECT) = 0 (btw no idea how O_RDONLY is there, its not in
the code - masked out by SETFL_MASK anyway) succeeds without a
problem.

The kernel code in setfl seems to want to return EINVAL for
filesystems without a direct_IO structure member assigned,

A noop_direct_IO seems to be used frequently to just return EINVAL
(like cifs_direct_io).

So fcntl( SET_FL, O_DIRECT) frequently succeeds on filesystems that
will EINVAL as soon as a write/read occurs. Can this be corrected?
Maybe with something better than:

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9c6c6a3e2de5..ff55a904bb4e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -58,7 +58,8 @@ static int setfl(int fd, struct file * filp,
unsigned long arg)
        /* Pipe packetized mode is controlled by O_DIRECT flag */
        if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
                if (!filp->f_mapping || !filp->f_mapping->a_ops ||
-                       !filp->f_mapping->a_ops->direct_IO)
+                       !filp->f_mapping->a_ops->direct_IO ||
+                       filp->f_mapping->a_ops->direct_IO == noop_direct_IO)
                                return -EINVAL;
        }


Can cifs_direct_io be replaced with noop_direct_IO?

Lastly on the list of peculiar behaviors here, is tmpfs will return
EINVAL from the fcntl call however it works fine with O_DIRECT
(https://bugs.mysql.com/bug.php?id=26662). MySQL (and MariaDB still
has the same code) that currently ignores EINVAL, but I'm willing to
make that code better.

Does a userspace have to fully try to write to an O_DIRECT file, note
the failure, reopen or clear O_DIRECT, and resubmit to use O_DIRECT?

While I see that the success/failure of a O_DIRECT read/write can be
related to the capabilities of the underlying block device depending
on offset/length of the read/write, are there other traps?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
  2022-01-25 22:05 fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write Daniel Black
@ 2022-01-26  3:02 ` Matthew Wilcox
  2022-01-26 22:03   ` Daniel Black
  2022-01-27  2:38   ` Daniel Black
  0 siblings, 2 replies; 7+ messages in thread
From: Matthew Wilcox @ 2022-01-26  3:02 UTC (permalink / raw)
  To: Daniel Black; +Cc: linux-fsdevel

On Wed, Jan 26, 2022 at 09:05:48AM +1100, Daniel Black wrote:
On Wed, Jan 26, 2022 at 09:05:48AM +1100, Daniel Black wrote:
> Folks,
> 
> I've been testing the following on a 5.15.14-200.fc35.x86_64 kernel
> with /mnt/nas as a CIFS mount.
> 
> //192.168.178.171/dan on /mnt/nas type cifs
> (rw,relatime,vers=3.0,cache=strict,username=dan,domain=WORKGROUP,uid=0,noforceuid,gid=0,noforcegid,addr=192.168.178.171,file_mode=0777,dir_mode=0777,iocharset=utf8,soft,nounix,serverino,mapposix,rsize=4194304,wsize=4194304,bsize=1048576,echo_interval=60,actimeo=1)
> 
> The following is on MariaDB-10.5 but I've tested on MariaDB-10.2 and
> it looks to be similarly implemented in MySQL-5.7 and MySQL-8.0.
> /mnt/nas/datadir is empty so its an initialization failure for
> simplicity.
> 
> strace -f -s 99   -e trace=%file,fcntl,io_submit,write,io_getevents -o
> /tmp/mysqld.strace sql/mysqld --no-defaults --bootstrap
> --datadir=/mnt/nas/datadir --innodb_flush_method=O_DIRECT
> 
> an extracted summary is:
> 
> 65412 openat(AT_FDCWD, "./ibdata1", O_RDWR|O_CLOEXEC) = 10
> 65412 fcntl(10, F_SETFL, O_RDONLY|O_DIRECT) = 0
> ...
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\4\377\377\377\377\377\377\377\377\0\0\0\0\0\0#\373E\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\10\1\0\0\3"...,
> aio_nbytes=16384, aio_offset=65536}]) = 1
> 65411 <... io_getevents resumed>[{data=0, obj=0x7f4efb46c740, res=-22,
> res2=0}], NULL) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\0\377\377\377\377\377\377\377\377\0\0\0\0\0\0(M\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\1@\0\0\0\25\0\0\0\r\0\0\0\4\0\0\0\0\0\306\0\0\0\0\1>\0\0\0\1\0\0\0\0\0\236\0\0\0\0\0\236\0\0\0\0\377"...,
> aio_nbytes=16384, aio_offset=0}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\2\377\377\377\377\377\377\377\377\0\0\0\0\0\0(M\0\3\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\377"...,
> aio_nbytes=16384, aio_offset=32768}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\3\377\377\377\377\377\377\377\377\0\0\0\0\0\0#\373\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> aio_nbytes=16384, aio_offset=49152}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\1\377\377\377\377\377\377\377\377\0\0\0\0\0\0#\373\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> aio_nbytes=16384, aio_offset=16384}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\6\377\377\377\377\377\377\377\377\0\0\0\0\0\0$\335\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0\377\377\377\377\0\0\0\0\0\0\0\0\0\2\1\262\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377"...,
> aio_nbytes=16384, aio_offset=98304}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\5\377\377\377\377\377\377\377\377\0\0\0\0\0\0$\335\0\7\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\362\0\0\0\0\0\0\0\6\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377"...,
> aio_nbytes=16384, aio_offset=81920}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\f\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\4\0\0\0\0\0\0\0\2\t\362\0\0\0\0\0\0\0\2\t2\10\1\0\0\3"...,
> aio_nbytes=16384, aio_offset=196608}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\v\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\2\10r\0\0\0\0\0\0\0\2\7\262\10\1\0\0\3"...,
> aio_nbytes=16384, aio_offset=180224}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\n\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\2\6\362\0\0\0\0\0\0\0\2\0062\10\1\0\0\3"...,
> aio_nbytes=16384, aio_offset=163840}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\t\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\2\5r\0\0\0\0\0\0\0\2\4\262\10\1\0\0\3"...,
> aio_nbytes=16384, aio_offset=147456}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\10\377\377\377\377\377\377\377\377\0\0\0\0\0\0(ME\277\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0}\0\2\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\2\3\362\0\0\0\0\0\0\0\2\0032\10\1\0\0\3"...,
> aio_nbytes=16384, aio_offset=131072}]) = 1
> 65412 io_submit(0x7f4efb83b000, 1, [{aio_data=0,
> aio_lio_opcode=IOCB_CMD_PWRITE, aio_fildes=10,
> aio_buf="\0\0\0\0\0\0\0\7\377\377\377\377\377\377\377\377\0\0\0\0\0\0(M\0\6\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\0\0\0\0\n\0\0\0\10\0\0\0\t\0\0\0\n\0\0\0\v\0\0\0\f\0\0\0\0\0\0\0\0\0"...,
> aio_nbytes=16384, aio_offset=114688}]) = 1
> 65411 io_getevents(0x7f4efb83b000, 1, 256, [{data=0,
> obj=0x7f4efb46c680, res=-22, res2=0}, {data=0, obj=0x7f4efb46c5c0,
> res=-22, res2=0}, {data=0, obj=0x7f4efb46c500, res=-22, res2=0},
> {data=0, obj=0x7f4efb46c440, res=-22, res2=0}, {data=0,
> obj=0x7f4efb46c380, res=-22, res2=0}, {data=0, obj=0x7f4efb46c2c0,
> res=-22, res2=0}, {data=0, obj=0x7f4efb46c200, res=-22, res2=0},
> {data=0, obj=0x7f4efb46c140, res=-22, res2=0}, {data=0,
> obj=0x7f4efb46c080, res=-22, res2=0}, {data=0, obj=0x7f4efb46bfc0,
> res=-22, res2=0}, {data=0, obj=0x7f4efb46bf00, res=-22, res2=0},
> {data=0, obj=0x7f4efb46be40, res=-22, res2=0}], NULL) = 12
> 65413 write(2, "2022-01-25 10:36:50 0 [ERROR] [FATAL] InnoDB: IO Error
> Invalid argument on file descriptor 10 writi"..., 130) = 130
> 
> The error message I added is for clarity that the errno=-22 EVINAL is
> getting returned for the writes.
> 
> The same with --innodb_flush_method=fsync omits the fcntl(10, F_SETFL,
> O_RDONLY|O_DIRECT) = 0 (btw no idea how O_RDONLY is there, its not in
> the code - masked out by SETFL_MASK anyway) succeeds without a
> problem.

O_RDONLY is defined to be 0, so don't worry about it.

> The kernel code in setfl seems to want to return EINVAL for
> filesystems without a direct_IO structure member assigned,
> 
> A noop_direct_IO seems to be used frequently to just return EINVAL
> (like cifs_direct_io).

Sorry for the confusion.  You've caught us mid-transition.  Eventually,
->direct_IO will be deleted, but for now it signifies whether or not the
filesystem supports O_DIRECT, even though it's not used (except in some
scenarios you don't care about).

> Lastly on the list of peculiar behaviors here, is tmpfs will return
> EINVAL from the fcntl call however it works fine with O_DIRECT
> (https://bugs.mysql.com/bug.php?id=26662). MySQL (and MariaDB still
> has the same code) that currently ignores EINVAL, but I'm willing to
> make that code better.

Out of interest, what behaviour do you _want_ from doing O_DIRECT
to tmpfs?  O_DIRECT is defined to bypass the page cache, but tmpfs
only stores data in the page cache.  So what do you intend to happen?

> Does a userspace have to fully try to write to an O_DIRECT file, note
> the failure, reopen or clear O_DIRECT, and resubmit to use O_DIRECT?
> 
> While I see that the success/failure of a O_DIRECT read/write can be
> related to the capabilities of the underlying block device depending
> on offset/length of the read/write, are there other traps?

It also must be aligned in memory, but I'm not quite sure what
limitations cifs imposes.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
  2022-01-26  3:02 ` Matthew Wilcox
@ 2022-01-26 22:03   ` Daniel Black
  2022-01-26 22:15     ` Matthew Wilcox
  2022-01-27  2:38   ` Daniel Black
  1 sibling, 1 reply; 7+ messages in thread
From: Daniel Black @ 2022-01-26 22:03 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel

On Wed, Jan 26, 2022 at 2:02 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Jan 26, 2022 at 09:05:48AM +1100, Daniel Black wrote:
>
> O_RDONLY is defined to be 0, so don't worry about it.

Thanks.

> > The kernel code in setfl seems to want to return EINVAL for
> > filesystems without a direct_IO structure member assigned,
> >
> > A noop_direct_IO seems to be used frequently to just return EINVAL
> > (like cifs_direct_io).
>
> Sorry for the confusion.  You've caught us mid-transition.  Eventually,
> ->direct_IO will be deleted, but for now it signifies whether or not the
> filesystem supports O_DIRECT, even though it's not used (except in some
> scenarios you don't care about).

Is it going to be reasonable to expect fcntl(fd, F_SETFL, O_DIRECT) to
return EINVAL if O_DIRECT isn't supported?

> > Lastly on the list of peculiar behaviors here, is tmpfs will return
> > EINVAL from the fcntl call however it works fine with O_DIRECT
> > (https://bugs.mysql.com/bug.php?id=26662). MySQL (and MariaDB still
> > has the same code) that currently ignores EINVAL, but I'm willing to
> > make that code better.
>
> Out of interest, what behaviour do you _want_ from doing O_DIRECT
> to tmpfs?  O_DIRECT is defined to bypass the page cache, but tmpfs
> only stores data in the page cache.  So what do you intend to happen?

It occurs to me because EINVAL is returned, it's just operating in
non-O_DIRECT mode.

It occurs to me that someone probably added this because (too much)
MySQL/MariaDB
testing is done on tmpfs and someone didn't want to adjust the test
suite to handle
failures everywhere on O_DIRECT. I don't think there was any kernel
expectation there.

My problem it seems, I'll see what I can do to get back to using real
filesystems more.

> > Does a userspace have to fully try to write to an O_DIRECT file, note
> > the failure, reopen or clear O_DIRECT, and resubmit to use O_DIRECT?
> >
> > While I see that the success/failure of a O_DIRECT read/write can be
> > related to the capabilities of the underlying block device depending
> > on offset/length of the read/write, are there other traps?
>
> It also must be aligned in memory,

yep, knew this one.

> but I'm not quite sure what
> limitations cifs imposes.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
  2022-01-26 22:03   ` Daniel Black
@ 2022-01-26 22:15     ` Matthew Wilcox
  2022-01-26 23:16       ` Daniel Black
  0 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2022-01-26 22:15 UTC (permalink / raw)
  To: Daniel Black; +Cc: linux-fsdevel

On Thu, Jan 27, 2022 at 09:03:36AM +1100, Daniel Black wrote:
> On Wed, Jan 26, 2022 at 2:02 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Wed, Jan 26, 2022 at 09:05:48AM +1100, Daniel Black wrote:
> >
> > O_RDONLY is defined to be 0, so don't worry about it.
> 
> Thanks.
> 
> > > The kernel code in setfl seems to want to return EINVAL for
> > > filesystems without a direct_IO structure member assigned,
> > >
> > > A noop_direct_IO seems to be used frequently to just return EINVAL
> > > (like cifs_direct_io).
> >
> > Sorry for the confusion.  You've caught us mid-transition.  Eventually,
> > ->direct_IO will be deleted, but for now it signifies whether or not the
> > filesystem supports O_DIRECT, even though it's not used (except in some
> > scenarios you don't care about).
> 
> Is it going to be reasonable to expect fcntl(fd, F_SETFL, O_DIRECT) to
> return EINVAL if O_DIRECT isn't supported?

That is a reasonable expectation.  I can't guarantee that we won't have
bugs, of course ...

> > > Lastly on the list of peculiar behaviors here, is tmpfs will return
> > > EINVAL from the fcntl call however it works fine with O_DIRECT
> > > (https://bugs.mysql.com/bug.php?id=26662). MySQL (and MariaDB still
> > > has the same code) that currently ignores EINVAL, but I'm willing to
> > > make that code better.
> >
> > Out of interest, what behaviour do you _want_ from doing O_DIRECT
> > to tmpfs?  O_DIRECT is defined to bypass the page cache, but tmpfs
> > only stores data in the page cache.  So what do you intend to happen?
> 
> It occurs to me because EINVAL is returned, it's just operating in
> non-O_DIRECT mode.
> 
> It occurs to me that someone probably added this because (too much)
> MySQL/MariaDB
> testing is done on tmpfs and someone didn't want to adjust the test
> suite to handle
> failures everywhere on O_DIRECT. I don't think there was any kernel
> expectation there.
> 
> My problem it seems, I'll see what I can do to get back to using real
> filesystems more.

Heh.  I know Hugh is looking at "supporting" O_DIRECT on tmpfs, at least
for his internal testing.  Not sure what his plans are for merging
that support.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
  2022-01-26 22:15     ` Matthew Wilcox
@ 2022-01-26 23:16       ` Daniel Black
  0 siblings, 0 replies; 7+ messages in thread
From: Daniel Black @ 2022-01-26 23:16 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel

On Thu, Jan 27, 2022 at 9:15 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Jan 27, 2022 at 09:03:36AM +1100, Daniel Black wrote:

> > Is it going to be reasonable to expect fcntl(fd, F_SETFL, O_DIRECT) to
> > return EINVAL if O_DIRECT isn't supported?
>
> That is a reasonable expectation.  I can't guarantee that we won't have
> bugs, of course ...

Ha, sure.

I've begun to https://kernelci.org/ options to try to catch at least
some of them
pre-release.

> > My problem it seems, I'll see what I can do to get back to using real
> > filesystems more.
>
> Heh.  I know Hugh is looking at "supporting" O_DIRECT on tmpfs, at least
> for his internal testing.  Not sure what his plans are for merging
> that support.

I'd be happy to see it in that it will remove a long standing cludge
bit of weakly
commented user space code in the fullness of supported kernel lifespans, but
no great urgency.

Thanks Matthew.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
  2022-01-26  3:02 ` Matthew Wilcox
  2022-01-26 22:03   ` Daniel Black
@ 2022-01-27  2:38   ` Daniel Black
  2022-01-27  4:37     ` Matthew Wilcox
  1 sibling, 1 reply; 7+ messages in thread
From: Daniel Black @ 2022-01-27  2:38 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel

On Wed, Jan 26, 2022 at 2:02 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Jan 26, 2022 at 09:05:48AM +1100, Daniel Black wrote:
> > The kernel code in setfl seems to want to return EINVAL for
> > filesystems without a direct_IO structure member assigned,
> >
> > A noop_direct_IO seems to be used frequently to just return EINVAL
> > (like cifs_direct_io).
>
> Sorry for the confusion.  You've caught us mid-transition.  Eventually,
> ->direct_IO will be deleted, but for now it signifies whether or not the
> filesystem supports O_DIRECT, even though it's not used (except in some
> scenarios you don't care about).

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9c6c6a3e2de5..ff55a904bb4e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -58,7 +58,8 @@ static int setfl(int fd, struct file * filp,
unsigned long arg)
        /* Pipe packetized mode is controlled by O_DIRECT flag */
        if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
                if (!filp->f_mapping || !filp->f_mapping->a_ops ||
-                       !filp->f_mapping->a_ops->direct_IO)
+                       !filp->f_mapping->a_ops->direct_IO ||
+                       filp->f_mapping->a_ops->direct_IO == noop_direct_IO)
                                return -EINVAL;
        }

The above patch prevents:

  filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);

being executed at the bottom of setfl which keeps the file descriptor
out of O_DIRECT mode when
the filesystem (like CIFS doesn't support it). In the original strace

So while you are mid-transition, the relatively simple flag of
direct_IO is good enough a protection
for a file descriptor entering a mode that isn't supported.

Is this an acceptable stop gap concept and/or stable backport?

>.., but I'm not quite sure what limitations cifs imposes.

Given cifs_direct_io is equivalent to the noop_direct_IO return
-EINVAL now, there's no direct io
there as I discovered testing the bug report
https://jira.mariadb.org/browse/MDEV-26970.

My patch two of the series would be to replace cifs_direct_io with
noop_direct_IO.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write
  2022-01-27  2:38   ` Daniel Black
@ 2022-01-27  4:37     ` Matthew Wilcox
  0 siblings, 0 replies; 7+ messages in thread
From: Matthew Wilcox @ 2022-01-27  4:37 UTC (permalink / raw)
  To: Daniel Black; +Cc: linux-fsdevel

On Thu, Jan 27, 2022 at 01:38:58PM +1100, Daniel Black wrote:
> > Sorry for the confusion.  You've caught us mid-transition.  Eventually,
> > ->direct_IO will be deleted, but for now it signifies whether or not the
> > filesystem supports O_DIRECT, even though it's not used (except in some
> > scenarios you don't care about).
> 
> being executed at the bottom of setfl which keeps the file descriptor
> out of O_DIRECT mode when
> the filesystem (like CIFS doesn't support it). In the original strace

Apparently I wasn't clear ...

CIFS absolutely does support O_DIRECT.  It does not do it by calling
->direct_IO; instead it's handled in cifs_loose_read_iter().


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-01-27  4:37 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-25 22:05 fcntl(fd, F_SETFL, O_DIRECT) succeeds followed by EINVAL in write Daniel Black
2022-01-26  3:02 ` Matthew Wilcox
2022-01-26 22:03   ` Daniel Black
2022-01-26 22:15     ` Matthew Wilcox
2022-01-26 23:16       ` Daniel Black
2022-01-27  2:38   ` Daniel Black
2022-01-27  4:37     ` Matthew Wilcox

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).