All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
@ 2023-09-20  2:39 Tyler Fanelli
  2023-09-20  2:40 ` [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Tyler Fanelli
                   ` (3 more replies)
  0 siblings, 4 replies; 48+ messages in thread
From: Tyler Fanelli @ 2023-09-20  2:39 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: mszeredi, gmaglione, hreitz, Tyler Fanelli

At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
of allowing shared mmap of files opened/created with DIRECT_IO enabled.
However, it leaves open the possibility of further relaxing the
DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
DIRECT_IO) in the future.

The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
only serves to allow shared mmap of DIRECT_IO files, while still
bypassing the cache on regular reads and writes. The shared mmap is the
only loosening of the cache policy that can take place with the flag.
This removes some ambiguity and introduces a more stable flag to be used
in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.

Tyler Fanelli (2):
  fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
  docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP

 Documentation/filesystems/fuse-io.rst | 3 ++-
 fs/fuse/file.c                        | 6 +++---
 fs/fuse/fuse_i.h                      | 4 ++--
 fs/fuse/inode.c                       | 6 +++---
 include/uapi/linux/fuse.h             | 7 +++----
 5 files changed, 13 insertions(+), 13 deletions(-)

-- 
2.40.1


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
  2023-09-20  2:39 [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Tyler Fanelli
@ 2023-09-20  2:40 ` Tyler Fanelli
  2023-09-20  8:31   ` Hanna Czenczek
  2023-09-20  2:40 ` [PATCH 2/2] docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP Tyler Fanelli
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 48+ messages in thread
From: Tyler Fanelli @ 2023-09-20  2:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: mszeredi, gmaglione, hreitz, Tyler Fanelli

Although DIRECT_IO_RELAX's initial usage is to allow shared mmap, its
description indicates a purpose of reducing memory footprint. This
may imply that it could be further used to relax other DIRECT_IO
operations in the future.

Replace it with a flag DIRECT_IO_ALLOW_MMAP which does only one thing,
allow shared mmap of DIRECT_IO files while still bypassing the cache
on regular reads and writes.

Signed-off-by: Tyler Fanelli <tfanelli@redhat.com>
---
 fs/fuse/file.c            | 6 +++---
 fs/fuse/fuse_i.h          | 4 ++--
 fs/fuse/inode.c           | 6 +++---
 include/uapi/linux/fuse.h | 7 +++----
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..89e870d1a526 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1448,7 +1448,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 	if (!ia)
 		return -ENOMEM;
 
-	if (fopen_direct_io && fc->direct_io_relax) {
+	if (fopen_direct_io && fc->direct_io_allow_mmap) {
 		res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
 		if (res) {
 			fuse_io_free(ia);
@@ -2466,9 +2466,9 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	if (ff->open_flags & FOPEN_DIRECT_IO) {
 		/* Can't provide the coherency needed for MAP_SHARED
-		 * if FUSE_DIRECT_IO_RELAX isn't set.
+		 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
 		 */
-		if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax)
+		if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
 			return -ENODEV;
 
 		invalidate_inode_pages2(file->f_mapping);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..bc3b7d10b929 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -797,8 +797,8 @@ struct fuse_conn {
 	/* Is tmpfile not implemented by fs? */
 	unsigned int no_tmpfile:1;
 
-	/* relax restrictions in FOPEN_DIRECT_IO mode */
-	unsigned int direct_io_relax:1;
+	/* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */
+	unsigned int direct_io_allow_mmap:1;
 
 	/* Is statx not implemented by fs? */
 	unsigned int no_statx:1;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..444418e240c8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1232,8 +1232,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				fc->init_security = 1;
 			if (flags & FUSE_CREATE_SUPP_GROUP)
 				fc->create_supp_group = 1;
-			if (flags & FUSE_DIRECT_IO_RELAX)
-				fc->direct_io_relax = 1;
+			if (flags & FUSE_DIRECT_IO_ALLOW_MMAP)
+				fc->direct_io_allow_mmap = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1280,7 +1280,7 @@ void fuse_send_init(struct fuse_mount *fm)
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
 		FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
 		FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
-		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX;
+		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		flags |= FUSE_MAP_ALIGNMENT;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index db92a7202b34..f4e3c083aede 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -209,7 +209,7 @@
  *  - add FUSE_HAS_EXPIRE_ONLY
  *
  *  7.39
- *  - add FUSE_DIRECT_IO_RELAX
+ *  - add FUSE_DIRECT_IO_ALLOW_MMAP
  *  - add FUSE_STATX and related structures
  */
 
@@ -409,8 +409,7 @@ struct fuse_file_lock {
  * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir,
  *			symlink and mknod (single group that matches parent)
  * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation
- * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now
- *                       allow shared mmap
+ * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode.
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -449,7 +448,7 @@ struct fuse_file_lock {
 #define FUSE_HAS_INODE_DAX	(1ULL << 33)
 #define FUSE_CREATE_SUPP_GROUP	(1ULL << 34)
 #define FUSE_HAS_EXPIRE_ONLY	(1ULL << 35)
-#define FUSE_DIRECT_IO_RELAX	(1ULL << 36)
+#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36)
 
 /**
  * CUSE INIT request/reply flags
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* [PATCH 2/2] docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
  2023-09-20  2:39 [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Tyler Fanelli
  2023-09-20  2:40 ` [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Tyler Fanelli
@ 2023-09-20  2:40 ` Tyler Fanelli
  2023-09-20  8:15 ` [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Miklos Szeredi
  2023-09-20  8:42 ` Bernd Schubert
  3 siblings, 0 replies; 48+ messages in thread
From: Tyler Fanelli @ 2023-09-20  2:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: mszeredi, gmaglione, hreitz, Tyler Fanelli

By default, shared mmap is disabled in FUSE DIRECT_IO mode. However,
when the DIRECT_IO_ALLOW_MMAP flag is enabled in the FUSE_INIT reply,
shared mmap is allowed.

Signed-off-by: Tyler Fanelli <tfanelli@redhat.com>
---
 Documentation/filesystems/fuse-io.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/filesystems/fuse-io.rst b/Documentation/filesystems/fuse-io.rst
index 255a368fe534..6464de4266ad 100644
--- a/Documentation/filesystems/fuse-io.rst
+++ b/Documentation/filesystems/fuse-io.rst
@@ -15,7 +15,8 @@ The direct-io mode can be selected with the FOPEN_DIRECT_IO flag in the
 FUSE_OPEN reply.
 
 In direct-io mode the page cache is completely bypassed for reads and writes.
-No read-ahead takes place. Shared mmap is disabled.
+No read-ahead takes place. Shared mmap is disabled by default. To allow shared
+mmap, the FUSE_DIRECT_IO_ALLOW_MMAP flag may be enabled in the FUSE_INIT reply.
 
 In cached mode reads may be satisfied from the page cache, and data may be
 read-ahead by the kernel to fill the cache.  The cache is always kept consistent
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-09-20  2:39 [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Tyler Fanelli
  2023-09-20  2:40 ` [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Tyler Fanelli
  2023-09-20  2:40 ` [PATCH 2/2] docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP Tyler Fanelli
@ 2023-09-20  8:15 ` Miklos Szeredi
  2023-11-06 14:08   ` Bernd Schubert
  2023-09-20  8:42 ` Bernd Schubert
  3 siblings, 1 reply; 48+ messages in thread
From: Miklos Szeredi @ 2023-09-20  8:15 UTC (permalink / raw)
  To: Tyler Fanelli; +Cc: linux-fsdevel, mszeredi, gmaglione, hreitz

On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
>
> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
> However, it leaves open the possibility of further relaxing the
> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
> DIRECT_IO) in the future.
>
> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
> only serves to allow shared mmap of DIRECT_IO files, while still
> bypassing the cache on regular reads and writes. The shared mmap is the
> only loosening of the cache policy that can take place with the flag.
> This removes some ambiguity and introduces a more stable flag to be used
> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>
> Tyler Fanelli (2):
>   fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>   docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP

Looks good.

Applied, thanks.  Will send the PR during this merge window, since the
rename could break stuff if already released.

Miklos

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
  2023-09-20  2:40 ` [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Tyler Fanelli
@ 2023-09-20  8:31   ` Hanna Czenczek
  2023-09-20  9:04     ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Hanna Czenczek @ 2023-09-20  8:31 UTC (permalink / raw)
  To: Tyler Fanelli, linux-fsdevel; +Cc: mszeredi, gmaglione

On 20.09.23 04:40, Tyler Fanelli wrote:
> Although DIRECT_IO_RELAX's initial usage is to allow shared mmap, its
> description indicates a purpose of reducing memory footprint. This
> may imply that it could be further used to relax other DIRECT_IO
> operations in the future.
>
> Replace it with a flag DIRECT_IO_ALLOW_MMAP which does only one thing,
> allow shared mmap of DIRECT_IO files while still bypassing the cache
> on regular reads and writes.

Thanks!

I prefer the definition to be narrow so that FUSE servers (virtiofsd, 
specifically) can rely on what exact behavior this flag enables.  As it 
is, I think it’s hard to use the flag, because:

It is not clear what the flag does. 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e78662e818f9 
gives a goal for using it (in case you want to reduce memory footprint), 
but doesn’t say what it will do.  This makes it difficult for us (in 
virtiofsd) to expose it, because we in turn can’t tell users in 
documentation what it’ll do.  For example, the commit correctly advises 
“to make sure it doesn't break coherency in your use case”, but that 
isn’t really possible when it isn’t well-defined what coherency 
properties are changed.

Further, it is implied that what the flag does may change in the future, 
but how so is left unclear.  The goal given is to reduce memory 
footprint, but that’s actually done by using DIRECT_IO, not by using 
DIRECT_IO_RELAX, so what restrictions the latter may relax is left 
open.  Allowing mmap specifically kind of increases memory footprint, so 
it seems to me as if the combination of both flags is supposed to 
optimize for memory usage under the hard restriction of allowing every 
operation to work still, and mmap() is the one operation identified so 
far.  But if so, it should be possible to exhaustively identify all 
other operations besides mmap() that are affected by DIRECT_IO, so that 
they can all be enabled by the new flag, and exhaustively listed in its 
documentation.  (I assume mmap() is the only operation that’s affected, 
though.)  Without knowing what the flag will do in the future, any name 
under which we (in virtiofsd) choose to expose this flag might be 
outright wrong in the future.

> Signed-off-by: Tyler Fanelli <tfanelli@redhat.com>
> ---
>   fs/fuse/file.c            | 6 +++---
>   fs/fuse/fuse_i.h          | 4 ++--
>   fs/fuse/inode.c           | 6 +++---
>   include/uapi/linux/fuse.h | 7 +++----
>   4 files changed, 11 insertions(+), 12 deletions(-)

[...]

> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index db92a7202b34..f4e3c083aede 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -209,7 +209,7 @@
>    *  - add FUSE_HAS_EXPIRE_ONLY
>    *
>    *  7.39
> - *  - add FUSE_DIRECT_IO_RELAX
> + *  - add FUSE_DIRECT_IO_ALLOW_MMAP
>    *  - add FUSE_STATX and related structures
>    */
>   
> @@ -409,8 +409,7 @@ struct fuse_file_lock {
>    * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir,
>    *			symlink and mknod (single group that matches parent)
>    * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation
> - * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now
> - *                       allow shared mmap
> + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode.
>    */
>   #define FUSE_ASYNC_READ		(1 << 0)
>   #define FUSE_POSIX_LOCKS	(1 << 1)
> @@ -449,7 +448,7 @@ struct fuse_file_lock {
>   #define FUSE_HAS_INODE_DAX	(1ULL << 33)
>   #define FUSE_CREATE_SUPP_GROUP	(1ULL << 34)
>   #define FUSE_HAS_EXPIRE_ONLY	(1ULL << 35)
> -#define FUSE_DIRECT_IO_RELAX	(1ULL << 36)
> +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36)

Is it allowed to remove FUSE_DIRECT_IO_RELAX now that it was already 
present in a uapi header?

Personally, I don’t mind keeping the name for the flag and just changing 
the documentation.  Or we might keep the old name as an alias for the 
new one.

>   
>   /**
>    * CUSE INIT request/reply flags


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-09-20  2:39 [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Tyler Fanelli
                   ` (2 preceding siblings ...)
  2023-09-20  8:15 ` [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Miklos Szeredi
@ 2023-09-20  8:42 ` Bernd Schubert
  3 siblings, 0 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-09-20  8:42 UTC (permalink / raw)
  To: Tyler Fanelli, linux-fsdevel; +Cc: mszeredi, gmaglione, hreitz, Hao Xu



On 9/20/23 04:39, Tyler Fanelli wrote:
> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
> However, it leaves open the possibility of further relaxing the
> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
> DIRECT_IO) in the future.
> 
> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
> only serves to allow shared mmap of DIRECT_IO files, while still
> bypassing the cache on regular reads and writes. The shared mmap is the
> only loosening of the cache policy that can take place with the flag.
> This removes some ambiguity and introduces a more stable flag to be used
> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
> 
> Tyler Fanelli (2):
>    fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>    docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> 
>   Documentation/filesystems/fuse-io.rst | 3 ++-
>   fs/fuse/file.c                        | 6 +++---
>   fs/fuse/fuse_i.h                      | 4 ++--
>   fs/fuse/inode.c                       | 6 +++---
>   include/uapi/linux/fuse.h             | 7 +++----
>   5 files changed, 13 insertions(+), 13 deletions(-)
> 

I guess it would be good to add Hao (in CC here), as the author and in case 
the flag is already used in production (on an internal kernel version).


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
  2023-09-20  8:31   ` Hanna Czenczek
@ 2023-09-20  9:04     ` Bernd Schubert
  0 siblings, 0 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-09-20  9:04 UTC (permalink / raw)
  To: Hanna Czenczek, Tyler Fanelli, linux-fsdevel, Hao Xu; +Cc: mszeredi, gmaglione



On 9/20/23 10:31, Hanna Czenczek wrote:
> On 20.09.23 04:40, Tyler Fanelli wrote:
>> - *                       allow shared mmap
>> + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode.
>>    */
>>   #define FUSE_ASYNC_READ        (1 << 0)
>>   #define FUSE_POSIX_LOCKS    (1 << 1)
>> @@ -449,7 +448,7 @@ struct fuse_file_lock {
>>   #define FUSE_HAS_INODE_DAX    (1ULL << 33)
>>   #define FUSE_CREATE_SUPP_GROUP    (1ULL << 34)
>>   #define FUSE_HAS_EXPIRE_ONLY    (1ULL << 35)
>> -#define FUSE_DIRECT_IO_RELAX    (1ULL << 36)
>> +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36)
> 
> Is it allowed to remove FUSE_DIRECT_IO_RELAX now that it was already 
> present in a uapi header?
> 
> Personally, I don’t mind keeping the name for the flag and just change 
> the documentation.  Or we might keep the old name as an alias for the 
> new one.

It is only in linux-6.6-rcX, not in any released version. Which is why 
Miklos posted that he is going to send these patches as -rc update and 
not only in 6.7.

Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-09-20  8:15 ` [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Miklos Szeredi
@ 2023-11-06 14:08   ` Bernd Schubert
  2023-11-08 18:19     ` Tyler Fanelli
  2023-12-02 15:06     ` Amir Goldstein
  0 siblings, 2 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-11-06 14:08 UTC (permalink / raw)
  To: Miklos Szeredi, Tyler Fanelli; +Cc: linux-fsdevel, mszeredi, gmaglione, hreitz

Hi Miklos,

On 9/20/23 10:15, Miklos Szeredi wrote:
> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
>>
>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
>> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
>> However, it leaves open the possibility of further relaxing the
>> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
>> DIRECT_IO) in the future.
>>
>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
>> only serves to allow shared mmap of DIRECT_IO files, while still
>> bypassing the cache on regular reads and writes. The shared mmap is the
>> only loosening of the cache policy that can take place with the flag.
>> This removes some ambiguity and introduces a more stable flag to be used
>> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>
>> Tyler Fanelli (2):
>>    fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>    docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> 
> Looks good.
> 
> Applied, thanks.  Will send the PR during this merge window, since the
> rename could break stuff if already released.

I'm just porting back this feature to our internal fuse module and it 
looks like these rename patches have been forgotten?


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-11-06 14:08   ` Bernd Schubert
@ 2023-11-08 18:19     ` Tyler Fanelli
  2023-12-02 15:06     ` Amir Goldstein
  1 sibling, 0 replies; 48+ messages in thread
From: Tyler Fanelli @ 2023-11-08 18:19 UTC (permalink / raw)
  To: Bernd Schubert, Miklos Szeredi; +Cc: linux-fsdevel, mszeredi, gmaglione, hreitz

On 11/6/23 9:08 AM, Bernd Schubert wrote:
> Hi Miklos,
>
> On 9/20/23 10:15, Miklos Szeredi wrote:
>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> 
>> wrote:
>>>
>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
>>> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
>>> However, it leaves open the possibility of further relaxing the
>>> DIRECT_IO restrictions (and in-effect, the cache coherency 
>>> guarantees of
>>> DIRECT_IO) in the future.
>>>
>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
>>> only serves to allow shared mmap of DIRECT_IO files, while still
>>> bypassing the cache on regular reads and writes. The shared mmap is the
>>> only loosening of the cache policy that can take place with the flag.
>>> This removes some ambiguity and introduces a more stable flag to be 
>>> used
>>> in FUSE_INIT. Furthermore, we can document that to allow shared 
>>> mmap'ing
>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>>
>>> Tyler Fanelli (2):
>>>    fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>>    docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
>>
>> Looks good.
>>
>> Applied, thanks.  Will send the PR during this merge window, since the
>> rename could break stuff if already released.
>
> I'm just porting back this feature to our internal fuse module and it 
> looks these rename patches have been forgotten?
>
>
> Thanks,
> Bernd
>
 From a conversation with Miklos, I believe the patches will be modified 
to make DIRECT_IO_RELAX an obsolete alias and still add 
DIRECT_IO_ALLOW_MMAP.


Tyler


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-11-06 14:08   ` Bernd Schubert
  2023-11-08 18:19     ` Tyler Fanelli
@ 2023-12-02 15:06     ` Amir Goldstein
  2023-12-03 11:20       ` Amir Goldstein
  1 sibling, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-02 15:06 UTC (permalink / raw)
  To: Bernd Schubert, Miklos Szeredi
  Cc: Tyler Fanelli, linux-fsdevel, mszeredi, gmaglione, hreitz, Hao Xu

On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
> Hi Miklos,
>
> On 9/20/23 10:15, Miklos Szeredi wrote:
> > On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
> >>
> >> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
> >> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
> >> However, it leaves open the possibility of further relaxing the
> >> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
> >> DIRECT_IO) in the future.
> >>
> >> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
> >> only serves to allow shared mmap of DIRECT_IO files, while still
> >> bypassing the cache on regular reads and writes. The shared mmap is the
> >> only loosening of the cache policy that can take place with the flag.
> >> This removes some ambiguity and introduces a more stable flag to be used
> >> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
> >> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
> >>
> >> Tyler Fanelli (2):
> >>    fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
> >>    docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> >
> > Looks good.
> >
> > Applied, thanks.  Will send the PR during this merge window, since the
> > rename could break stuff if already released.
>
> I'm just porting back this feature to our internal fuse module and it
> looks these rename patches have been forgotten?
>
>

Hi Miklos, Bernd,

I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
commit b5a2a3a0b776 ("fuse: write back dirty pages before direct write in
direct_io_relax mode") and I was wondering - isn't dirty pages writeback
needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
direct_io_allow_mmap case?

For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
for munmap of files also in direct-io mode [1], so I was considering installing
fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
and regardless of direct_io_allow_mmap.

I was asking myself if there was a good reason why fuse_page_mkwrite()/
fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
should NOT be called for the FOPEN_DIRECT_IO case regardless of
direct_io_allow_mmap?

I mean, maybe an unmap of a read-only private map does not need to
flush dirty pages (IDK), but caching mode seems to do it anyway?

Thanks,
Amir.

[1] https://lore.kernel.org/linux-fsdevel/CAJfpegtOt6MDFM3vsK+syJhpLMSm7wBazkXuxjRTXtAsn9gCuA@mail.gmail.com/

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-02 15:06     ` Amir Goldstein
@ 2023-12-03 11:20       ` Amir Goldstein
  2023-12-03 23:00         ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-03 11:20 UTC (permalink / raw)
  To: Bernd Schubert, Miklos Szeredi
  Cc: Tyler Fanelli, linux-fsdevel, mszeredi, gmaglione, hreitz, Hao Xu

On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
> >
> > Hi Miklos,
> >
> > On 9/20/23 10:15, Miklos Szeredi wrote:
> > > On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
> > >>
> > >> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
> > >> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
> > >> However, it leaves open the possibility of further relaxing the
> > >> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
> > >> DIRECT_IO) in the future.
> > >>
> > >> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
> > >> only serves to allow shared mmap of DIRECT_IO files, while still
> > >> bypassing the cache on regular reads and writes. The shared mmap is the
> > >> only loosening of the cache policy that can take place with the flag.
> > >> This removes some ambiguity and introduces a more stable flag to be used
> > >> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
> > >> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
> > >>
> > >> Tyler Fanelli (2):
> > >>    fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
> > >>    docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> > >
> > > Looks good.
> > >
> > > Applied, thanks.  Will send the PR during this merge window, since the
> > > rename could break stuff if already released.
> >
> > I'm just porting back this feature to our internal fuse module and it
> > looks these rename patches have been forgotten?
> >
> >
>
> Hi Miklos, Bernd,
>
> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct write in
> direct_io_relax mode") and I was wondering - isn't dirty pages writeback
> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
> direct_io_allow_mmap case?
>
> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
> for munmap of files also in direct-io mode [1], so I was considering installing
> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
> and regardless of direct_io_allow_mmap.
>
> I was asking myself if there was a good reason why fuse_page_mkwrite()/
> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
> should NOT be called for the FOPEN_DIRECT_IO case regardless of
> direct_io_allow_mmap?
>

Before trying to make changes to fuse_file_mmap() I tried to test
DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
passthrough_hp --direct-io.

The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio, mmap, splice)
on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
fuse_set_nowrite()

I am wondering how this code was tested?

I could not figure out the problem and how to fix it.
Please suggest a fix and let me know which adjustments are needed
if I want to use fuse_file_vm_ops for all mmap modes.

Thanks,
Amir.

generic/095 5s ...  [10:53:05][   61.185656] kernel BUG at fs/fuse/dir.c:1756!
[   61.186653] invalid opcode: 0000 [#1] PREEMPT SMP PTI
[   61.187447] CPU: 2 PID: 3599 Comm: fio Not tainted 6.6.0-xfstests #2025
[   61.188461] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
BIOS 1.15.0-1 04/01/2014
[   61.189529] RIP: 0010:fuse_set_nowrite+0x47/0xdd
[   61.190117] Code: 48 8b 87 e8 00 00 00 48 85 c0 75 02 0f 0b 48 8d
af 38 06 00 00 48 89 fb 48 89 ef e8 e8 2b 8f 00 8b 83 28 05 00 00 85
c0 79 02 <0f> 0b 05 00 00 00 80 48 89 ef 89 83 28 05 00 00 e8 86 30 8f
00 be
[   61.192497] RSP: 0018:ffffc9000313fc98 EFLAGS: 00010282
[   61.193109] RAX: 0000000080000001 RBX: ffff88800cfb21c0 RCX: ffffc9000313fc3c
[   61.193937] RDX: 0000000000000003 RSI: ffffffff827ce6be RDI: ffffffff828a86cd
[   61.194736] RBP: ffff88800cfb27f8 R08: 0000000e3ef2354a R09: 0000000000000000
[   61.195509] R10: ffffffff82b74f20 R11: 0000000000000002 R12: ffff888009bf1f00
[   61.196291] R13: ffffc9000313fe70 R14: 0000000000000002 R15: ffff88800cfb23f0
[   61.197069] FS:  00007fa089f64740(0000) GS:ffff88807da00000(0000)
knlGS:0000000000000000
[   61.198024] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   61.198701] CR2: 00007fa089f17fe0 CR3: 0000000009202001 CR4: 0000000000370ee0
[   61.199817] Call Trace:
[   61.200198]  <TASK>
[   61.200486]  ? __die_body+0x1b/0x59
[   61.200975]  ? die+0x35/0x4f
[   61.201379]  ? do_trap+0x7c/0xff
[   61.201828]  ? fuse_set_nowrite+0x47/0xdd
[   61.202303]  ? do_error_trap+0xbe/0xeb
[   61.202733]  ? fuse_set_nowrite+0x47/0xdd
[   61.203196]  ? fuse_set_nowrite+0x47/0xdd
[   61.203723]  ? exc_invalid_op+0x52/0x69
[   61.204202]  ? fuse_set_nowrite+0x47/0xdd
[   61.204720]  ? asm_exc_invalid_op+0x1a/0x20
[   61.205204]  ? fuse_set_nowrite+0x47/0xdd
[   61.205628]  ? fuse_set_nowrite+0x3d/0xdd
[   61.206061]  ? do_raw_spin_unlock+0x88/0x8f
[   61.206498]  ? _raw_spin_unlock+0x2d/0x43
[   61.206915]  ? fuse_range_is_writeback+0x71/0x84
[   61.207383]  fuse_sync_writes+0xf/0x19
[   61.207857]  fuse_direct_io+0x167/0x5bd
[   61.208375]  fuse_direct_write_iter+0xf0/0x146
[   61.208990]  vfs_write+0x11d/0x1c4
[   61.209458]  ksys_pwrite64+0x68/0x87
[   61.209959]  do_syscall_64+0x6e/0x88

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-03 11:20       ` Amir Goldstein
@ 2023-12-03 23:00         ` Bernd Schubert
  2023-12-04  6:50           ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-03 23:00 UTC (permalink / raw)
  To: Amir Goldstein, Miklos Szeredi
  Cc: Tyler Fanelli, linux-fsdevel, mszeredi, gmaglione, hreitz, Hao Xu

Hi Amir,

On 12/3/23 12:20, Amir Goldstein wrote:
> On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com> wrote:
>>
>> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
>> <bernd.schubert@fastmail.fm> wrote:
>>>
>>> Hi Miklos,
>>>
>>> On 9/20/23 10:15, Miklos Szeredi wrote:
>>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
>>>>>
>>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
>>>>> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
>>>>> However, it leaves open the possibility of further relaxing the
>>>>> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
>>>>> DIRECT_IO) in the future.
>>>>>
>>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
>>>>> only serves to allow shared mmap of DIRECT_IO files, while still
>>>>> bypassing the cache on regular reads and writes. The shared mmap is the
>>>>> only loosening of the cache policy that can take place with the flag.
>>>>> This removes some ambiguity and introduces a more stable flag to be used
>>>>> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
>>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>>>>
>>>>> Tyler Fanelli (2):
>>>>>     fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>>>>     docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
>>>>
>>>> Looks good.
>>>>
>>>> Applied, thanks.  Will send the PR during this merge window, since the
>>>> rename could break stuff if already released.
>>>
>>> I'm just porting back this feature to our internal fuse module and it
>>> looks these rename patches have been forgotten?
>>>
>>>
>>
>> Hi Miklos, Bernd,
>>
>> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
>> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct write in
>> direct_io_relax mode") and I was wondering - isn't dirty pages writeback
>> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
>> direct_io_allow_mmap case?
>>
>> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
>> for munmap of files also in direct-io mode [1], so I was considering installing
>> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
>> and regardless of direct_io_allow_mmap.
>>
>> I was asking myself if there was a good reason why fuse_page_mkwrite()/
>> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
>> should NOT be called for the FOPEN_DIRECT_IO case regardless of
>> direct_io_allow_mmap?
>>
> 
> Before trying to make changes to fuse_file_mmap() I tried to test
> DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
> passthrough_hp --direct-io.
> 
> The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio, mmap, splice)
> on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
> fuse_set_nowrite()
> 
> I am wondering how this code was tested?
> 
> I could not figure out the problem and how to fix it.
> Please suggest a fix and let me know which adjustments are needed
> if I want to use fuse_file_vm_ops for all mmap modes.

So fuse_set_nowrite() tests for inode_is_locked(), but that also 
succeeds for a shared lock. It gets late here (and I might miss 
something), but I think we have an issue with 
FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT and 
mmap, the same issue might be triggered? Hmm, well, so far plain O_DIRECT 
does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for that 
are still pending.


Will look into it in more detail in the morning.

Thanks,
Bernd

> 
> Thanks,
> Amir.
> 
> generic/095 5s ...  [10:53:05][   61.185656] kernel BUG at fs/fuse/dir.c:1756!
> [   61.186653] invalid opcode: 0000 [#1] PREEMPT SMP PTI
> [   61.187447] CPU: 2 PID: 3599 Comm: fio Not tainted 6.6.0-xfstests #2025
> [   61.188461] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
> BIOS 1.15.0-1 04/01/2014
> [   61.189529] RIP: 0010:fuse_set_nowrite+0x47/0xdd
> [   61.190117] Code: 48 8b 87 e8 00 00 00 48 85 c0 75 02 0f 0b 48 8d
> af 38 06 00 00 48 89 fb 48 89 ef e8 e8 2b 8f 00 8b 83 28 05 00 00 85
> c0 79 02 <0f> 0b 05 00 00 00 80 48 89 ef 89 83 28 05 00 00 e8 86 30 8f
> 00 be
> [   61.192497] RSP: 0018:ffffc9000313fc98 EFLAGS: 00010282
> [   61.193109] RAX: 0000000080000001 RBX: ffff88800cfb21c0 RCX: ffffc9000313fc3c
> [   61.193937] RDX: 0000000000000003 RSI: ffffffff827ce6be RDI: ffffffff828a86cd
> [   61.194736] RBP: ffff88800cfb27f8 R08: 0000000e3ef2354a R09: 0000000000000000
> [   61.195509] R10: ffffffff82b74f20 R11: 0000000000000002 R12: ffff888009bf1f00
> [   61.196291] R13: ffffc9000313fe70 R14: 0000000000000002 R15: ffff88800cfb23f0
> [   61.197069] FS:  00007fa089f64740(0000) GS:ffff88807da00000(0000)
> knlGS:0000000000000000
> [   61.198024] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   61.198701] CR2: 00007fa089f17fe0 CR3: 0000000009202001 CR4: 0000000000370ee0
> [   61.199817] Call Trace:
> [   61.200198]  <TASK>
> [   61.200486]  ? __die_body+0x1b/0x59
> [   61.200975]  ? die+0x35/0x4f
> [   61.201379]  ? do_trap+0x7c/0xff
> [   61.201828]  ? fuse_set_nowrite+0x47/0xdd
> [   61.202303]  ? do_error_trap+0xbe/0xeb
> [   61.202733]  ? fuse_set_nowrite+0x47/0xdd
> [   61.203196]  ? fuse_set_nowrite+0x47/0xdd
> [   61.203723]  ? exc_invalid_op+0x52/0x69
> [   61.204202]  ? fuse_set_nowrite+0x47/0xdd
> [   61.204720]  ? asm_exc_invalid_op+0x1a/0x20
> [   61.205204]  ? fuse_set_nowrite+0x47/0xdd
> [   61.205628]  ? fuse_set_nowrite+0x3d/0xdd
> [   61.206061]  ? do_raw_spin_unlock+0x88/0x8f
> [   61.206498]  ? _raw_spin_unlock+0x2d/0x43
> [   61.206915]  ? fuse_range_is_writeback+0x71/0x84
> [   61.207383]  fuse_sync_writes+0xf/0x19
> [   61.207857]  fuse_direct_io+0x167/0x5bd
> [   61.208375]  fuse_direct_write_iter+0xf0/0x146
> [   61.208990]  vfs_write+0x11d/0x1c4
> [   61.209458]  ksys_pwrite64+0x68/0x87
> [   61.209959]  do_syscall_64+0x6e/0x88
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-03 23:00         ` Bernd Schubert
@ 2023-12-04  6:50           ` Amir Goldstein
  2023-12-04  9:27             ` Miklos Szeredi
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-04  6:50 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu

[-- Attachment #1: Type: text/plain, Size: 4467 bytes --]

On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
> Hi Amir,
>
> On 12/3/23 12:20, Amir Goldstein wrote:
> > On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >>
> >> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
> >> <bernd.schubert@fastmail.fm> wrote:
> >>>
> >>> Hi Miklos,
> >>>
> >>> On 9/20/23 10:15, Miklos Szeredi wrote:
> >>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
> >>>>>
> >>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
> >>>>> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
> >>>>> However, it leaves open the possibility of further relaxing the
> >>>>> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
> >>>>> DIRECT_IO) in the future.
> >>>>>
> >>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
> >>>>> only serves to allow shared mmap of DIRECT_IO files, while still
> >>>>> bypassing the cache on regular reads and writes. The shared mmap is the
> >>>>> only loosening of the cache policy that can take place with the flag.
> >>>>> This removes some ambiguity and introduces a more stable flag to be used
> >>>>> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
> >>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
> >>>>>
> >>>>> Tyler Fanelli (2):
> >>>>>     fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
> >>>>>     docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> >>>>
> >>>> Looks good.
> >>>>
> >>>> Applied, thanks.  Will send the PR during this merge window, since the
> >>>> rename could break stuff if already released.
> >>>
> >>> I'm just porting back this feature to our internal fuse module and it
> >>> looks these rename patches have been forgotten?
> >>>
> >>>
> >>
> >> Hi Miklos, Bernd,
> >>
> >> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
> >> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct write in
> >> direct_io_relax mode") and I was wondering - isn't dirty pages writeback
> >> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
> >> direct_io_allow_mmap case?
> >>
> >> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
> >> for munmap of files also in direct-io mode [1], so I was considering installing
> >> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
> >> and regardless of direct_io_allow_mmap.
> >>
> >> I was asking myself if there was a good reason why fuse_page_mkwrite()/
> >> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
> >> should NOT be called for the FOPEN_DIRECT_IO case regardless of
> >> direct_io_allow_mmap?
> >>
> >
> > Before trying to make changes to fuse_file_mmap() I tried to test
> > DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
> > passthrough_hp --direct-io.
> >
> > The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio, mmap, splice)
> > on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
> > fuse_set_nowrite()
> >
> > I am wondering how this code was tested?
> >
> > I could not figure out the problem and how to fix it.
> > Please suggest a fix and let me know which adjustments are needed
> > if I want to use fuse_file_vm_ops for all mmap modes.
>
> So fuse_set_nowrite() tests for inode_is_locked(), but that also
> succeeds for a shared lock. It gets late here (and I might miss
> something), but I think we have an issue with
> FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT and
> mmap, the same issue might triggered? Hmm, well, so far plain O_DIRECT
> does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for that
> are still pending.
>

Your analysis seems to be correct.

Attached patch fixes the problem and should be backported to 6.6.y.

Miklos,

I prepared the patch on top of master and not on top of the rename to
FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
6.6.y, although if you are planning to send the flag rename to v6.7 as a fix,
you may prefer to apply the fix after the rename and request to backport
the flag rename along with the fix to 6.6.y.

Having the final flag name in v6.6.y would be a nice bonus.

Let me know if you want me to post the fix patch based on for-next.

Thanks,
Amir.

[-- Attachment #2: 0001-fuse-disable-FOPEN_PARALLEL_DIRECT_WRITES-with-FUSE_.patch --]
[-- Type: text/x-patch, Size: 1962 bytes --]

From 46865e2660d0e9d64cd8c56c905eafee7f4c03b5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 3 Dec 2023 09:42:33 +0200
Subject: [PATCH] fuse: disable FOPEN_PARALLEL_DIRECT_WRITES with
 FUSE_DIRECT_IO_RELAX

The new fuse init flag FUSE_DIRECT_IO_RELAX breaks assumptions made by
FOPEN_PARALLEL_DIRECT_WRITES and causes test generic/095 to hit
BUG_ON(fi->writectr < 0) assertions in fuse_set_nowrite():

generic/095 5s ...
  kernel BUG at fs/fuse/dir.c:1756!
...
  ? fuse_set_nowrite+0x3d/0xdd
  ? do_raw_spin_unlock+0x88/0x8f
  ? _raw_spin_unlock+0x2d/0x43
  ? fuse_range_is_writeback+0x71/0x84
  fuse_sync_writes+0xf/0x19
  fuse_direct_io+0x167/0x5bd
  fuse_direct_write_iter+0xf0/0x146

Auto disable FOPEN_PARALLEL_DIRECT_WRITES when server negotiated
FUSE_DIRECT_IO_RELAX.

Fixes: e78662e818f9 ("fuse: add a new fuse init flag to relax restrictions in no cache mode")
Cc: <stable@vger.kernel.org> # v6.6
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/fuse/file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..5b5297805675 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1574,6 +1574,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t res;
 	bool exclusive_lock =
 		!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
+		get_fuse_conn(inode)->direct_io_relax ||
 		iocb->ki_flags & IOCB_APPEND ||
 		fuse_direct_write_extending_i_size(iocb, from);
 
@@ -1581,6 +1582,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	 * Take exclusive lock if
 	 * - Parallel direct writes are disabled - a user space decision
 	 * - Parallel direct writes are enabled and i_size is being extended.
+	 * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_RELAX).
 	 *   This might not be needed at all, but needs further investigation.
 	 */
 	if (exclusive_lock)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-04  6:50           ` Amir Goldstein
@ 2023-12-04  9:27             ` Miklos Szeredi
  2023-12-04 10:04               ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Miklos Szeredi @ 2023-12-04  9:27 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Bernd Schubert, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu

On Mon, 4 Dec 2023 at 07:50, Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
> >
> > Hi Amir,
> >
> > On 12/3/23 12:20, Amir Goldstein wrote:
> > > On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >>
> > >> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
> > >> <bernd.schubert@fastmail.fm> wrote:
> > >>>
> > >>> Hi Miklos,
> > >>>
> > >>> On 9/20/23 10:15, Miklos Szeredi wrote:
> > >>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
> > >>>>>
> > >>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
> > >>>>> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
> > >>>>> However, it leaves open the possibility of further relaxing the
> > >>>>> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
> > >>>>> DIRECT_IO) in the future.
> > >>>>>
> > >>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
> > >>>>> only serves to allow shared mmap of DIRECT_IO files, while still
> > >>>>> bypassing the cache on regular reads and writes. The shared mmap is the
> > >>>>> only loosening of the cache policy that can take place with the flag.
> > >>>>> This removes some ambiguity and introduces a more stable flag to be used
> > >>>>> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
> > >>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
> > >>>>>
> > >>>>> Tyler Fanelli (2):
> > >>>>>     fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
> > >>>>>     docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> > >>>>
> > >>>> Looks good.
> > >>>>
> > >>>> Applied, thanks.  Will send the PR during this merge window, since the
> > >>>> rename could break stuff if already released.
> > >>>
> > >>> I'm just porting back this feature to our internal fuse module and it
> > >>> looks these rename patches have been forgotten?
> > >>>
> > >>>
> > >>
> > >> Hi Miklos, Bernd,
> > >>
> > >> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
> > >> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct write in
> > >> direct_io_relax mode") and I was wondering - isn't dirty pages writeback
> > >> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
> > >> direct_io_allow_mmap case?
> > >>
> > >> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
> > >> for munmap of files also in direct-io mode [1], so I was considering installing
> > >> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
> > >> and regardless of direct_io_allow_mmap.
> > >>
> > >> I was asking myself if there was a good reason why fuse_page_mkwrite()/
> > >> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
> > >> should NOT be called for the FOPEN_DIRECT_IO case regardless of
> > >> direct_io_allow_mmap?
> > >>
> > >
> > > Before trying to make changes to fuse_file_mmap() I tried to test
> > > DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
> > > passthrough_hp --direct-io.
> > >
> > > The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio, mmap, splice)
> > > on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
> > > fuse_set_nowrite()
> > >
> > > I am wondering how this code was tested?
> > >
> > > I could not figure out the problem and how to fix it.
> > > Please suggest a fix and let me know which adjustments are needed
> > > if I want to use fuse_file_vm_ops for all mmap modes.
> >
> > So fuse_set_nowrite() tests for inode_is_locked(), but that also
> > succeeds for a shared lock. It gets late here (and I might miss
> > something), but I think we have an issue with
> > FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT and
> > mmap, the same issue might triggered? Hmm, well, so far plain O_DIRECT
> > does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for that
> > are still pending.
> >
>
> Your analysis seems to be correct.
>
> Attached patch fixes the problem and should be backported to 6.6.y.
>
> Miklos,
>
> I prepared the patch on top of master and not on top of the rename to
> FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
> 6.6.y, although if you are planning send the flag rename to v6.7 as a fix,
> you may prefer to apply the fix after the rename and request to backport
> the flag rename along with the fix to 6.6.y.

I've done that.   Thanks for the fix and testing.

Miklos

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-04  9:27             ` Miklos Szeredi
@ 2023-12-04 10:04               ` Bernd Schubert
  2023-12-04 23:42                 ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-04 10:04 UTC (permalink / raw)
  To: Miklos Szeredi, Amir Goldstein
  Cc: Tyler Fanelli, linux-fsdevel, mszeredi, gmaglione, hreitz, Hao Xu



On 12/4/23 10:27, Miklos Szeredi wrote:
> On Mon, 4 Dec 2023 at 07:50, Amir Goldstein <amir73il@gmail.com> wrote:
>>
>> On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
>> <bernd.schubert@fastmail.fm> wrote:
>>>
>>> Hi Amir,
>>>
>>> On 12/3/23 12:20, Amir Goldstein wrote:
>>>> On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com> wrote:
>>>>>
>>>>> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>
>>>>>> Hi Miklos,
>>>>>>
>>>>>> On 9/20/23 10:15, Miklos Szeredi wrote:
>>>>>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli <tfanelli@redhat.com> wrote:
>>>>>>>>
>>>>>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the purpose
>>>>>>>> of allowing shared mmap of files opened/created with DIRECT_IO enabled.
>>>>>>>> However, it leaves open the possibility of further relaxing the
>>>>>>>> DIRECT_IO restrictions (and in-effect, the cache coherency guarantees of
>>>>>>>> DIRECT_IO) in the future.
>>>>>>>>
>>>>>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its purpose. It
>>>>>>>> only serves to allow shared mmap of DIRECT_IO files, while still
>>>>>>>> bypassing the cache on regular reads and writes. The shared mmap is the
>>>>>>>> only loosening of the cache policy that can take place with the flag.
>>>>>>>> This removes some ambiguity and introduces a more stable flag to be used
>>>>>>>> in FUSE_INIT. Furthermore, we can document that to allow shared mmap'ing
>>>>>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>>>>>>>
>>>>>>>> Tyler Fanelli (2):
>>>>>>>>      fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>>>>>>>      docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
>>>>>>>
>>>>>>> Looks good.
>>>>>>>
>>>>>>> Applied, thanks.  Will send the PR during this merge window, since the
>>>>>>> rename could break stuff if already released.
>>>>>>
>>>>>> I'm just porting back this feature to our internal fuse module and it
>>>>>> looks these rename patches have been forgotten?
>>>>>>
>>>>>>
>>>>>
>>>>> Hi Miklos, Bernd,
>>>>>
>>>>> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
>>>>> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct write in
>>>>> direct_io_relax mode") and I was wondering - isn't dirty pages writeback
>>>>> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
>>>>> direct_io_allow_mmap case?
>>>>>
>>>>> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
>>>>> for munmap of files also in direct-io mode [1], so I was considering installing
>>>>> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
>>>>> and regardless of direct_io_allow_mmap.
>>>>>
>>>>> I was asking myself if there was a good reason why fuse_page_mkwrite()/
>>>>> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
>>>>> should NOT be called for the FOPEN_DIRECT_IO case regardless of
>>>>> direct_io_allow_mmap?
>>>>>
>>>>
>>>> Before trying to make changes to fuse_file_mmap() I tried to test
>>>> DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
>>>> passthrough_hp --direct-io.
>>>>
>>>> The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio, mmap, splice)
>>>> on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
>>>> fuse_set_nowrite()
>>>>
>>>> I am wondering how this code was tested?
>>>>
>>>> I could not figure out the problem and how to fix it.
>>>> Please suggest a fix and let me know which adjustments are needed
>>>> if I want to use fuse_file_vm_ops for all mmap modes.
>>>
>>> So fuse_set_nowrite() tests for inode_is_locked(), but that also
>>> succeeds for a shared lock. It gets late here (and I might miss
>>> something), but I think we have an issue with
>>> FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT and
>>> mmap, the same issue might triggered? Hmm, well, so far plain O_DIRECT
>>> does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for that
>>> are still pending.
>>>
>>
>> Your analysis seems to be correct.
>>
>> Attached patch fixes the problem and should be backported to 6.6.y.
>>
>> Miklos,
>>
>> I prepared the patch on top of master and not on top of the rename to
>> FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
>> 6.6.y, although if you are planning send the flag rename to v6.7 as a fix,
>> you may prefer to apply the fix after the rename and request to backport
>> the flag rename along with the fix to 6.6.y.
> 
> I've done that.   Thanks for the fix and testing.

Hi Amir, hi Miklos,

could you please hold on a bit before sending the patch upstream?
I think we can just test for fuse_range_is_writeback in 
fuse_direct_write_iter. I will have a patch in a few minutes.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-04 10:04               ` Bernd Schubert
@ 2023-12-04 23:42                 ` Bernd Schubert
  2023-12-05  7:00                   ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-04 23:42 UTC (permalink / raw)
  To: Miklos Szeredi, Amir Goldstein
  Cc: Tyler Fanelli, linux-fsdevel, mszeredi, gmaglione, hreitz, Hao Xu



On 12/4/23 11:04, Bernd Schubert wrote:
> 
> 
> On 12/4/23 10:27, Miklos Szeredi wrote:
>> On Mon, 4 Dec 2023 at 07:50, Amir Goldstein <amir73il@gmail.com> wrote:
>>>
>>> On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
>>> <bernd.schubert@fastmail.fm> wrote:
>>>>
>>>> Hi Amir,
>>>>
>>>> On 12/3/23 12:20, Amir Goldstein wrote:
>>>>> On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com> 
>>>>> wrote:
>>>>>>
>>>>>> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
>>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>>
>>>>>>> Hi Miklos,
>>>>>>>
>>>>>>> On 9/20/23 10:15, Miklos Szeredi wrote:
>>>>>>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli 
>>>>>>>> <tfanelli@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the 
>>>>>>>>> purpose
>>>>>>>>> of allowing shared mmap of files opened/created with DIRECT_IO 
>>>>>>>>> enabled.
>>>>>>>>> However, it leaves open the possibility of further relaxing the
>>>>>>>>> DIRECT_IO restrictions (and in-effect, the cache coherency 
>>>>>>>>> guarantees of
>>>>>>>>> DIRECT_IO) in the future.
>>>>>>>>>
>>>>>>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its 
>>>>>>>>> purpose. It
>>>>>>>>> only serves to allow shared mmap of DIRECT_IO files, while still
>>>>>>>>> bypassing the cache on regular reads and writes. The shared 
>>>>>>>>> mmap is the
>>>>>>>>> only loosening of the cache policy that can take place with the 
>>>>>>>>> flag.
>>>>>>>>> This removes some ambiguity and introduces a more stable flag 
>>>>>>>>> to be used
>>>>>>>>> in FUSE_INIT. Furthermore, we can document that to allow shared 
>>>>>>>>> mmap'ing
>>>>>>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>>>>>>>>
>>>>>>>>> Tyler Fanelli (2):
>>>>>>>>>      fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>>>>>>>>      docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
>>>>>>>>
>>>>>>>> Looks good.
>>>>>>>>
>>>>>>>> Applied, thanks.  Will send the PR during this merge window, 
>>>>>>>> since the
>>>>>>>> rename could break stuff if already released.
>>>>>>>
>>>>>>> I'm just porting back this feature to our internal fuse module 
>>>>>>> and it
>>>>>>> looks these rename patches have been forgotten?
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Hi Miklos, Bernd,
>>>>>>
>>>>>> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
>>>>>> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct 
>>>>>> write in
>>>>>> direct_io_relax mode") and I was wondering - isn't dirty pages 
>>>>>> writeback
>>>>>> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
>>>>>> direct_io_allow_mmap case?
>>>>>>
>>>>>> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
>>>>>> for munmap of files also in direct-io mode [1], so I was 
>>>>>> considering installing
>>>>>> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
>>>>>> and regardless of direct_io_allow_mmap.
>>>>>>
>>>>>> I was asking myself if there was a good reason why 
>>>>>> fuse_page_mkwrite()/
>>>>>> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
>>>>>> should NOT be called for the FOPEN_DIRECT_IO case regardless of
>>>>>> direct_io_allow_mmap?
>>>>>>
>>>>>
>>>>> Before trying to make changes to fuse_file_mmap() I tried to test
>>>>> DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
>>>>> passthrough_hp --direct-io.
>>>>>
>>>>> The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio, 
>>>>> mmap, splice)
>>>>> on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
>>>>> fuse_set_nowrite()
>>>>>
>>>>> I am wondering how this code was tested?
>>>>>
>>>>> I could not figure out the problem and how to fix it.
>>>>> Please suggest a fix and let me know which adjustments are needed
>>>>> if I want to use fuse_file_vm_ops for all mmap modes.
>>>>
>>>> So fuse_set_nowrite() tests for inode_is_locked(), but that also
>>>> succeeds for a shared lock. It gets late here (and I might miss
>>>> something), but I think we have an issue with
>>>> FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT 
>>>> and
>>>> mmap, the same issue might triggered? Hmm, well, so far plain O_DIRECT
>>>> does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for 
>>>> that
>>>> are still pending.
>>>>
>>>
>>> Your analysis seems to be correct.
>>>
>>> Attached patch fixes the problem and should be backported to 6.6.y.
>>>
>>> Miklos,
>>>
>>> I prepared the patch on top of master and not on top of the rename to
>>> FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
>>> 6.6.y, although if you are planning send the flag rename to v6.7 as a 
>>> fix,
>>> you may prefer to apply the fix after the rename and request to backport
>>> the flag rename along with the fix to 6.6.y.
>>
>> I've done that.   Thanks for the fix and testing.
> 
> Hi Amir, hi Miklos,
> 
> could you please hold on a bit before sending the patch upstream?
> I think we can just test for fuse_range_is_writeback in 
> fuse_direct_write_iter. I will have a patch in a few minutes.

Hmm, that actually doesn't work as we would need to hold the inode lock 
in page write functions.
Then I tried to do it per inode and only when the inode gets cached writes 
or mmap - this triggers a lockdep lock order warning, because 
fuse_file_mmap is called with mm->mmap_lock and would take the inode 
lock. But through 
fuse_direct_io/iov_iter_get_pages2/__iov_iter_get_pages_alloc these 
locks are taken the other way around.
So right now I don't see a way out - we need to go with Amir's patch first.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-04 23:42                 ` Bernd Schubert
@ 2023-12-05  7:00                   ` Amir Goldstein
  2023-12-05 14:01                     ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-05  7:00 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu

On Tue, Dec 5, 2023 at 1:42 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/4/23 11:04, Bernd Schubert wrote:
> >
> >
> > On 12/4/23 10:27, Miklos Szeredi wrote:
> >> On Mon, 4 Dec 2023 at 07:50, Amir Goldstein <amir73il@gmail.com> wrote:
> >>>
> >>> On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
> >>> <bernd.schubert@fastmail.fm> wrote:
> >>>>
> >>>> Hi Amir,
> >>>>
> >>>> On 12/3/23 12:20, Amir Goldstein wrote:
> >>>>> On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com>
> >>>>> wrote:
> >>>>>>
> >>>>>> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
> >>>>>> <bernd.schubert@fastmail.fm> wrote:
> >>>>>>>
> >>>>>>> Hi Miklos,
> >>>>>>>
> >>>>>>> On 9/20/23 10:15, Miklos Szeredi wrote:
> >>>>>>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli
> >>>>>>>> <tfanelli@redhat.com> wrote:
> >>>>>>>>>
> >>>>>>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the
> >>>>>>>>> purpose
> >>>>>>>>> of allowing shared mmap of files opened/created with DIRECT_IO
> >>>>>>>>> enabled.
> >>>>>>>>> However, it leaves open the possibility of further relaxing the
> >>>>>>>>> DIRECT_IO restrictions (and in-effect, the cache coherency
> >>>>>>>>> guarantees of
> >>>>>>>>> DIRECT_IO) in the future.
> >>>>>>>>>
> >>>>>>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its
> >>>>>>>>> purpose. It
> >>>>>>>>> only serves to allow shared mmap of DIRECT_IO files, while still
> >>>>>>>>> bypassing the cache on regular reads and writes. The shared
> >>>>>>>>> mmap is the
> >>>>>>>>> only loosening of the cache policy that can take place with the
> >>>>>>>>> flag.
> >>>>>>>>> This removes some ambiguity and introduces a more stable flag
> >>>>>>>>> to be used
> >>>>>>>>> in FUSE_INIT. Furthermore, we can document that to allow shared
> >>>>>>>>> mmap'ing
> >>>>>>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
> >>>>>>>>>
> >>>>>>>>> Tyler Fanelli (2):
> >>>>>>>>>      fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
> >>>>>>>>>      docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
> >>>>>>>>
> >>>>>>>> Looks good.
> >>>>>>>>
> >>>>>>>> Applied, thanks.  Will send the PR during this merge window,
> >>>>>>>> since the
> >>>>>>>> rename could break stuff if already released.
> >>>>>>>
> >>>>>>> I'm just porting back this feature to our internal fuse module
> >>>>>>> and it
> >>>>>>> looks these rename patches have been forgotten?
> >>>>>>>
> >>>>>>>
> >>>>>>
> >>>>>> Hi Miklos, Bernd,
> >>>>>>
> >>>>>> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
> >>>>>> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct
> >>>>>> write in
> >>>>>> direct_io_relax mode") and I was wondering - isn't dirty pages
> >>>>>> writeback
> >>>>>> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
> >>>>>> direct_io_allow_mmap case?
> >>>>>>
> >>>>>> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
> >>>>>> for munmap of files also in direct-io mode [1], so I was
> >>>>>> considering installing
> >>>>>> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
> >>>>>> and regardless of direct_io_allow_mmap.
> >>>>>>
> >>>>>> I was asking myself if there was a good reason why
> >>>>>> fuse_page_mkwrite()/
> >>>>>> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
> >>>>>> should NOT be called for the FOPEN_DIRECT_IO case regardless of
> >>>>>> direct_io_allow_mmap?
> >>>>>>
> >>>>>
> >>>>> Before trying to make changes to fuse_file_mmap() I tried to test
> >>>>> DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
> >>>>> passthrough_hp --direct-io.
> >>>>>
> >>>>> The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio,
> >>>>> mmap, splice)
> >>>>> on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
> >>>>> fuse_set_nowrite()
> >>>>>
> >>>>> I am wondering how this code was tested?
> >>>>>
> >>>>> I could not figure out the problem and how to fix it.
> >>>>> Please suggest a fix and let me know which adjustments are needed
> >>>>> if I want to use fuse_file_vm_ops for all mmap modes.
> >>>>
> >>>> So fuse_set_nowrite() tests for inode_is_locked(), but that also
> >>>> succeeds for a shared lock. It gets late here (and I might miss
> >>>> something), but I think we have an issue with
> >>>> FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT
> >>>> and
> >>>> mmap, the same issue might triggered? Hmm, well, so far plain O_DIRECT
> >>>> does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for
> >>>> that
> >>>> are still pending.
> >>>>
> >>>
> >>> Your analysis seems to be correct.
> >>>
> >>> Attached patch fixes the problem and should be backported to 6.6.y.
> >>>
> >>> Miklos,
> >>>
> >>> I prepared the patch on top of master and not on top of the rename to
> >>> FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
> >>> 6.6.y, although if you are planning send the flag rename to v6.7 as a
> >>> fix,
> >>> you may prefer to apply the fix after the rename and request to backport
> >>> the flag rename along with the fix to 6.6.y.
> >>
> >> I've done that.   Thanks for the fix and testing.
> >
> > Hi Amir, hi Miklos,
> >
> > could you please hold on a bit before sending the patch upstream?
> > I think we can just test for fuse_range_is_writeback in
> > fuse_direct_write_iter. I will have a patch in a few minutes.
>
> Hmm, that actually doesn't work as we would need to hold the inode lock
> in page write functions.
> Then tried to do it per inode and only when the inode gets cached writes
> or mmap - this triggers a lockdep lock order warning, because
> fuse_file_mmap is called with mm->mmap_lock and would take the inode
> lock. But through
> fuse_direct_io/iov_iter_get_pages2/__iov_iter_get_pages_alloc these
> locks are taken the other way around.
> So right now I don't see a way out - we need to go with Amirs patch first.
>
>

Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
(e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
I guess not; otherwise the combination would have been tested.

FOPEN_PARALLEL_DIRECT_WRITES is typically important for
network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
for network fs. Right?

FWIW, with FUSE_PASSTHROUGH, I plan that a shared mmap of an inode
in "passthrough mode" (i.e. has an open FOPEN_PASSTHROUGH file) will
be allowed (maps the backing file) regardless of fc->direct_io_allow_mmap.
FOPEN_PARALLEL_DIRECT_WRITES will also be allowed on an inode in
"passthrough mode", because an inode in "passthrough mode" cannot have
any pending page cache writes.

This makes me realize that I will also need to handle passthrough of
->direct_IO() on an FOPEN_PASSTHROUGH file.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-05  7:00                   ` Amir Goldstein
@ 2023-12-05 14:01                     ` Bernd Schubert
  2023-12-05 19:18                       ` Amir Goldstein
  2023-12-05 23:56                       ` Bernd Schubert
  0 siblings, 2 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-12-05 14:01 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu



On 12/5/23 08:00, Amir Goldstein wrote:
> On Tue, Dec 5, 2023 at 1:42 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/4/23 11:04, Bernd Schubert wrote:
>>>
>>>
>>> On 12/4/23 10:27, Miklos Szeredi wrote:
>>>> On Mon, 4 Dec 2023 at 07:50, Amir Goldstein <amir73il@gmail.com> wrote:
>>>>>
>>>>> On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>
>>>>>> Hi Amir,
>>>>>>
>>>>>> On 12/3/23 12:20, Amir Goldstein wrote:
>>>>>>> On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com>
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
>>>>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>>>>
>>>>>>>>> Hi Miklos,
>>>>>>>>>
>>>>>>>>> On 9/20/23 10:15, Miklos Szeredi wrote:
>>>>>>>>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli
>>>>>>>>>> <tfanelli@redhat.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the
>>>>>>>>>>> purpose
>>>>>>>>>>> of allowing shared mmap of files opened/created with DIRECT_IO
>>>>>>>>>>> enabled.
>>>>>>>>>>> However, it leaves open the possibility of further relaxing the
>>>>>>>>>>> DIRECT_IO restrictions (and in-effect, the cache coherency
>>>>>>>>>>> guarantees of
>>>>>>>>>>> DIRECT_IO) in the future.
>>>>>>>>>>>
>>>>>>>>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its
>>>>>>>>>>> purpose. It
>>>>>>>>>>> only serves to allow shared mmap of DIRECT_IO files, while still
>>>>>>>>>>> bypassing the cache on regular reads and writes. The shared
>>>>>>>>>>> mmap is the
>>>>>>>>>>> only loosening of the cache policy that can take place with the
>>>>>>>>>>> flag.
>>>>>>>>>>> This removes some ambiguity and introduces a more stable flag
>>>>>>>>>>> to be used
>>>>>>>>>>> in FUSE_INIT. Furthermore, we can document that to allow shared
>>>>>>>>>>> mmap'ing
>>>>>>>>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>>>>>>>>>>
>>>>>>>>>>> Tyler Fanelli (2):
>>>>>>>>>>>       fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>>>>>>>>>>       docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
>>>>>>>>>>
>>>>>>>>>> Looks good.
>>>>>>>>>>
>>>>>>>>>> Applied, thanks.  Will send the PR during this merge window,
>>>>>>>>>> since the
>>>>>>>>>> rename could break stuff if already released.
>>>>>>>>>
>>>>>>>>> I'm just porting back this feature to our internal fuse module
>>>>>>>>> and it
>>>>>>>>> looks these rename patches have been forgotten?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Miklos, Bernd,
>>>>>>>>
>>>>>>>> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
>>>>>>>> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct
>>>>>>>> write in
>>>>>>>> direct_io_relax mode") and I was wondering - isn't dirty pages
>>>>>>>> writeback
>>>>>>>> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
>>>>>>>> direct_io_allow_mmap case?
>>>>>>>>
>>>>>>>> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
>>>>>>>> for munmap of files also in direct-io mode [1], so I was
>>>>>>>> considering installing
>>>>>>>> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching case,
>>>>>>>> and regardless of direct_io_allow_mmap.
>>>>>>>>
>>>>>>>> I was asking myself if there was a good reason why
>>>>>>>> fuse_page_mkwrite()/
>>>>>>>> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
>>>>>>>> should NOT be called for the FOPEN_DIRECT_IO case regardless of
>>>>>>>> direct_io_allow_mmap?
>>>>>>>>
>>>>>>>
>>>>>>> Before trying to make changes to fuse_file_mmap() I tried to test
>>>>>>> DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
>>>>>>> passthrough_hp --direct-io.
>>>>>>>
>>>>>>> The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio,
>>>>>>> mmap, splice)
>>>>>>> on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
>>>>>>> fuse_set_nowrite()
>>>>>>>
>>>>>>> I am wondering how this code was tested?
>>>>>>>
>>>>>>> I could not figure out the problem and how to fix it.
>>>>>>> Please suggest a fix and let me know which adjustments are needed
>>>>>>> if I want to use fuse_file_vm_ops for all mmap modes.
>>>>>>
>>>>>> So fuse_set_nowrite() tests for inode_is_locked(), but that also
>>>>>> succeeds for a shared lock. It gets late here (and I might miss
>>>>>> something), but I think we have an issue with
>>>>>> FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT
>>>>>> and
>>>>>> mmap, the same issue might triggered? Hmm, well, so far plain O_DIRECT
>>>>>> does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for
>>>>>> that
>>>>>> are still pending.
>>>>>>
>>>>>
>>>>> Your analysis seems to be correct.
>>>>>
>>>>> Attached patch fixes the problem and should be backported to 6.6.y.
>>>>>
>>>>> Miklos,
>>>>>
>>>>> I prepared the patch on top of master and not on top of the rename to
>>>>> FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
>>>>> 6.6.y, although if you are planning send the flag rename to v6.7 as a
>>>>> fix,
>>>>> you may prefer to apply the fix after the rename and request to backport
>>>>> the flag rename along with the fix to 6.6.y.
>>>>
>>>> I've done that.   Thanks for the fix and testing.
>>>
>>> Hi Amir, hi Miklos,
>>>
>>> could you please hold on a bit before sending the patch upstream?
>>> I think we can just test for fuse_range_is_writeback in
>>> fuse_direct_write_iter. I will have a patch in a few minutes.
>>
>> Hmm, that actually doesn't work as we would need to hold the inode lock
>> in page write functions.
>> Then tried to do it per inode and only when the inode gets cached writes
>> or mmap - this triggers a lockdep lock order warning, because
>> fuse_file_mmap is called with mm->mmap_lock and would take the inode
>> lock. But through
>> fuse_direct_io/iov_iter_get_pages2/__iov_iter_get_pages_alloc these
>> locks are taken the other way around.
>> So right now I don't see a way out - we need to go with Amirs patch first.
>>
>>
> 
> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
> I guess not otherwise, the combination would have been tested.

I'm not sure how many people are aware of these different flags/features.
I had just finalized the backport of the related patches to RHEL8 on 
Friday, as we (or our customers) need both for different jobs.

> 
> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
> for network fs. Right?

We kind of have these use cases for our network file systems

FOPEN_PARALLEL_DIRECT_WRITES:
    - Traditional HPC, large files, parallel IO
    - Large file used on local node as container for many small files

FUSE_DIRECT_IO_ALLOW_MMAP:
    - compilation through gcc (not so important, just not nice when it 
does not work)
    - rather recent: Python libraries using mmap _reads_. As it is
read-only, there is no consistency issue.


These jobs do not intermix, so they do not hit the issue seen in
generic/095. If applications that do intermix them really exist, I have
no issue with a serialization penalty for those.
Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.

The final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES work on plain
O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
and post the next version:
https://github.com/bsbernd/linux/commits/fuse-dio-v4


In the meantime I have another idea for how to solve the
FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP problem.

> 
> FWIW, with FUSE_PASSTHROUGH, I plan that a shared mmap of an inode
> in "passthrough mode" (i.e. has an open FOPEN_PASSTHROUGH file) will
> be allowed (maps the backing file) regardless of fc->direct_io_allow_mmap.
> FOPEN_PARALLEL_DIRECT_WRITES will also be allowed on an inode in
> "passthrough mode", because an inode in "passthrough mode" cannot have
> any pending page cache writes.
> 
> This makes me realize that I will also need to handle passthrough of
> ->direct_IO() on an FOPEN_PASSTHROUGH file.

I really need to take a few hours to look at your patches.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-05 14:01                     ` Bernd Schubert
@ 2023-12-05 19:18                       ` Amir Goldstein
  2023-12-05 23:56                       ` Bernd Schubert
  1 sibling, 0 replies; 48+ messages in thread
From: Amir Goldstein @ 2023-12-05 19:18 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu

> >
> > FWIW, with FUSE_PASSTHROUGH, I plan that a shared mmap of an inode
> > in "passthrough mode" (i.e. has an open FOPEN_PASSTHROUGH file) will
> > be allowed (maps the backing file) regardless of fc->direct_io_allow_mmap.
> > FOPEN_PARALLEL_DIRECT_WRITES will also be allowed on an inode in
> > "passthrough mode", because an inode in "passthrough mode" cannot have
> > any pending page cache writes.
> >
> > This makes me realize that I will also need to handle passthrough of
> > ->direct_IO() on an FOPEN_PASSTHROUGH file.
>
> I really need to take a few hours to look at your patches.
>

Only if you want to glimpse at WIP.
There is still time before the direct_io/passthrough + mmap logic
is worked out.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-05 14:01                     ` Bernd Schubert
  2023-12-05 19:18                       ` Amir Goldstein
@ 2023-12-05 23:56                       ` Bernd Schubert
  2023-12-06  8:25                         ` Amir Goldstein
  1 sibling, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-05 23:56 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

[-- Attachment #1: Type: text/plain, Size: 8114 bytes --]



On 12/5/23 15:01, Bernd Schubert wrote:
> 
> 
> On 12/5/23 08:00, Amir Goldstein wrote:
>> On Tue, Dec 5, 2023 at 1:42 AM Bernd Schubert
>> <bernd.schubert@fastmail.fm> wrote:
>>>
>>>
>>>
>>> On 12/4/23 11:04, Bernd Schubert wrote:
>>>>
>>>>
>>>> On 12/4/23 10:27, Miklos Szeredi wrote:
>>>>> On Mon, 4 Dec 2023 at 07:50, Amir Goldstein <amir73il@gmail.com> 
>>>>> wrote:
>>>>>>
>>>>>> On Mon, Dec 4, 2023 at 1:00 AM Bernd Schubert
>>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>>
>>>>>>> Hi Amir,
>>>>>>>
>>>>>>> On 12/3/23 12:20, Amir Goldstein wrote:
>>>>>>>> On Sat, Dec 2, 2023 at 5:06 PM Amir Goldstein <amir73il@gmail.com>
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>> On Mon, Nov 6, 2023 at 4:08 PM Bernd Schubert
>>>>>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>>>>>
>>>>>>>>>> Hi Miklos,
>>>>>>>>>>
>>>>>>>>>> On 9/20/23 10:15, Miklos Szeredi wrote:
>>>>>>>>>>> On Wed, 20 Sept 2023 at 04:41, Tyler Fanelli
>>>>>>>>>>> <tfanelli@redhat.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> At the moment, FUSE_INIT's DIRECT_IO_RELAX flag only serves the
>>>>>>>>>>>> purpose
>>>>>>>>>>>> of allowing shared mmap of files opened/created with DIRECT_IO
>>>>>>>>>>>> enabled.
>>>>>>>>>>>> However, it leaves open the possibility of further relaxing the
>>>>>>>>>>>> DIRECT_IO restrictions (and in-effect, the cache coherency
>>>>>>>>>>>> guarantees of
>>>>>>>>>>>> DIRECT_IO) in the future.
>>>>>>>>>>>>
>>>>>>>>>>>> The DIRECT_IO_ALLOW_MMAP flag leaves no ambiguity of its
>>>>>>>>>>>> purpose. It
>>>>>>>>>>>> only serves to allow shared mmap of DIRECT_IO files, while 
>>>>>>>>>>>> still
>>>>>>>>>>>> bypassing the cache on regular reads and writes. The shared
>>>>>>>>>>>> mmap is the
>>>>>>>>>>>> only loosening of the cache policy that can take place with the
>>>>>>>>>>>> flag.
>>>>>>>>>>>> This removes some ambiguity and introduces a more stable flag
>>>>>>>>>>>> to be used
>>>>>>>>>>>> in FUSE_INIT. Furthermore, we can document that to allow shared
>>>>>>>>>>>> mmap'ing
>>>>>>>>>>>> of DIRECT_IO files, a user must enable DIRECT_IO_ALLOW_MMAP.
>>>>>>>>>>>>
>>>>>>>>>>>> Tyler Fanelli (2):
>>>>>>>>>>>>       fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP
>>>>>>>>>>>>       docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP
>>>>>>>>>>>
>>>>>>>>>>> Looks good.
>>>>>>>>>>>
>>>>>>>>>>> Applied, thanks.  Will send the PR during this merge window,
>>>>>>>>>>> since the
>>>>>>>>>>> rename could break stuff if already released.
>>>>>>>>>>
>>>>>>>>>> I'm just porting back this feature to our internal fuse module
>>>>>>>>>> and it
>>>>>>>>>> looks these rename patches have been forgotten?
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Hi Miklos, Bernd,
>>>>>>>>>
>>>>>>>>> I was looking at the DIRECT_IO_ALLOW_MMAP code and specifically at
>>>>>>>>> commit b5a2a3a0b776 ("fuse: write back dirty pages before direct
>>>>>>>>> write in
>>>>>>>>> direct_io_relax mode") and I was wondering - isn't dirty pages
>>>>>>>>> writeback
>>>>>>>>> needed *before* invalidate_inode_pages2() in fuse_file_mmap() for
>>>>>>>>> direct_io_allow_mmap case?
>>>>>>>>>
>>>>>>>>> For FUSE_PASSTHROUGH, I am going to need to call fuse_vma_close()
>>>>>>>>> for munmap of files also in direct-io mode [1], so I was
>>>>>>>>> considering installing
>>>>>>>>> fuse_file_vm_ops for the FOPEN_DIRECT_IO case, same as caching 
>>>>>>>>> case,
>>>>>>>>> and regardless of direct_io_allow_mmap.
>>>>>>>>>
>>>>>>>>> I was asking myself if there was a good reason why
>>>>>>>>> fuse_page_mkwrite()/
>>>>>>>>> fuse_wait_on_page_writeback()/fuse_vma_close()/write_inode_now()
>>>>>>>>> should NOT be called for the FOPEN_DIRECT_IO case regardless of
>>>>>>>>> direct_io_allow_mmap?
>>>>>>>>>
>>>>>>>>
>>>>>>>> Before trying to make changes to fuse_file_mmap() I tried to test
>>>>>>>> DIRECT_IO_RELAX - I enabled it in libfuse and ran fstest with
>>>>>>>> passthrough_hp --direct-io.
>>>>>>>>
>>>>>>>> The test generic/095 - "Concurrent mixed I/O (buffer I/O, aiodio,
>>>>>>>> mmap, splice)
>>>>>>>> on the same files" blew up hitting BUG_ON(fi->writectr < 0) in
>>>>>>>> fuse_set_nowrite()
>>>>>>>>
>>>>>>>> I am wondering how this code was tested?
>>>>>>>>
>>>>>>>> I could not figure out the problem and how to fix it.
>>>>>>>> Please suggest a fix and let me know which adjustments are needed
>>>>>>>> if I want to use fuse_file_vm_ops for all mmap modes.
>>>>>>>
>>>>>>> So fuse_set_nowrite() tests for inode_is_locked(), but that also
>>>>>>> succeeds for a shared lock. It gets late here (and I might miss
>>>>>>> something), but I think we have an issue with
>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES. Assuming there would be plain O_DIRECT
>>>>>>> and
>>>>>>> mmap, the same issue might triggered? Hmm, well, so far plain 
>>>>>>> O_DIRECT
>>>>>>> does not support FOPEN_PARALLEL_DIRECT_WRITES yet - the patches for
>>>>>>> that
>>>>>>> are still pending.
>>>>>>>
>>>>>>
>>>>>> Your analysis seems to be correct.
>>>>>>
>>>>>> Attached patch fixes the problem and should be backported to 6.6.y.
>>>>>>
>>>>>> Miklos,
>>>>>>
>>>>>> I prepared the patch on top of master and not on top of the rename to
>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP in for-next for ease of backport to
>>>>>> 6.6.y, although if you are planning send the flag rename to v6.7 as a
>>>>>> fix,
>>>>>> you may prefer to apply the fix after the rename and request to 
>>>>>> backport
>>>>>> the flag rename along with the fix to 6.6.y.
>>>>>
>>>>> I've done that.   Thanks for the fix and testing.
>>>>
>>>> Hi Amir, hi Miklos,
>>>>
>>>> could you please hold on a bit before sending the patch upstream?
>>>> I think we can just test for fuse_range_is_writeback in
>>>> fuse_direct_write_iter. I will have a patch in a few minutes.
>>>
>>> Hmm, that actually doesn't work as we would need to hold the inode lock
>>> in page write functions.
>>> Then tried to do it per inode and only when the inode gets cached writes
>>> or mmap - this triggers a lockdep lock order warning, because
>>> fuse_file_mmap is called with mm->mmap_lock and would take the inode
>>> lock. But through
>>> fuse_direct_io/iov_iter_get_pages2/__iov_iter_get_pages_alloc these
>>> locks are taken the other way around.
>>> So right now I don't see a way out - we need to go with Amirs patch 
>>> first.
>>>
>>>
>>
>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>> I guess not otherwise, the combination would have been tested.
> 
> I'm not sure how many people are aware of these different flags/features.
> I had just finalized the backport of the related patches to RHEL8 on 
> Friday, as we (or our customers) need both for different jobs.
> 
>>
>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>> for network fs. Right?
> 
> We kind of have these use cases for our network file systems
> 
> FOPEN_PARALLEL_DIRECT_WRITES:
>     - Traditional HPC, large files, parallel IO
>     - Large file used on local node as container for many small files
> 
> FUSE_DIRECT_IO_ALLOW_MMAP:
>     - compilation through gcc (not so important, just not nice when it 
> does not work)
>     - rather recent: python libraries using mmap _reads_. As it is read 
> only no issue of consistency.
> 
> 
> These jobs do not intermix - no issue as in generic/095. If such 
> applications really exist, I have no issue with a serialization penalty. 
> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other 
> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
> 
> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain 
> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch 
> and post the next version
> https://github.com/bsbernd/linux/commits/fuse-dio-v4
> 
> 
> In the mean time I have another idea how to solve 
> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP

Please find attached what I had in mind. With that, generic/095 is not
crashing for me anymore. I just finished the initial coding - it still
needs a bit of cleanup and maybe a few comments.


Thanks,
Bernd

[-- Attachment #2: 01-helper-function.patch --]
[-- Type: text/x-patch, Size: 3314 bytes --]

fuse: Create helper function if DIO write needs exclusive lock

From: Bernd Schubert <bschubert@ddn.com>

This is just preparation for follow-up patches.

Cc: Hao Xu <howeyxu@tencent.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Dharmendra Singh <dsingh@ddn.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Bernd Schubert <bschubert@ddn.com>
Cc: stable@vger.kernel.org
Preparation for Fixes: 153524053bbb ("fuse: allow non-extending parallel direct writes on the same file")
---
 fs/fuse/file.c |   53 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..60d4e1e50843 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1298,6 +1298,38 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
 	return res;
 }
 
+static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
+/*
+ * @return true if an exclusive lock for direct IO writes is needed
+ */
+static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+
+	/* server side has to advise that it supports parallel dio writes */
+	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
+		return true;
+
+	/* append will need to know the eventual eof - always needs an
+	 * exclusive lock
+	 */
+	if (iocb->ki_flags & IOCB_APPEND)
+		return true;
+
+	/* parallel dio beyond eof is at least for now not supported */
+	if (fuse_io_past_eof(iocb, from))
+		return true;
+
+	return false;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1557,25 +1589,12 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return res;
 }
 
-static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
-					       struct iov_iter *iter)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-
-	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
-}
-
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	struct file *file = iocb->ki_filp;
-	struct fuse_file *ff = file->private_data;
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
-	bool exclusive_lock =
-		!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
-		iocb->ki_flags & IOCB_APPEND ||
-		fuse_direct_write_extending_i_size(iocb, from);
+	bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from);
 
 	/*
 	 * Take exclusive lock if
@@ -1588,10 +1607,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	else {
 		inode_lock_shared(inode);
 
-		/* A race with truncate might have come up as the decision for
-		 * the lock type was done without holding the lock, check again.
+		/*
+		 * Previous check was without any lock and might have raced.
 		 */
-		if (fuse_direct_write_extending_i_size(iocb, from)) {
+		if (fuse_dio_wr_exclusive_lock(iocb, from)) {
 			inode_unlock_shared(inode);
 			inode_lock(inode);
 			exclusive_lock = true;

[-- Attachment #3: 02-fix-dio-write-shared-lock.patch --]
[-- Type: text/x-patch, Size: 7877 bytes --]

fuse: Test for page cache writes in the shared lock DIO decision

From: Bernd Schubert <bschubert@ddn.com>

xfstest generic/095 triggers BUG_ON(fi->writectr < 0) in
fuse_set_nowrite().
This happens when a shared lock is taken for FOPEN_DIRECT_IO writes while
mmap writes happen in parallel (FUSE_DIRECT_IO_RELAX is set).
The reason is that multiple DIO writers see that the inode has pending
page IO writes and try to set FUSE_NOWRITE, but this code path requires
serialization. Ideally fuse_dio_wr_exclusive_lock would detect if
there are outstanding writes, but that would require holding an inode
lock in the related page/folio write paths. Another solution would be to
disable the shared inode lock for FOPEN_DIRECT_IO when FUSE_DIRECT_IO_RELAX
is set, but typically userspace/server side will set these flags for all
inodes (or not at all). Hence, FUSE_DIRECT_IO_RELAX would entirely disable
the shared lock and impose serialization even for inodes on which no page
IO is ever done.
The solution here stores a flag in the fuse inode if page writes ever
happened to that inode, and enforces the exclusive lock only in that case.

Cc: Hao Xu <howeyxu@tencent.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Dharmendra Singh <dsingh@ddn.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Bernd Schubert <bschubert@ddn.com>
Cc: stable@vger.kernel.org
Fixes: 153524053bbb ("fuse: allow non-extending parallel direct writes on the same file")
---
 fs/fuse/dir.c    |    1 +
 fs/fuse/file.c   |   72 ++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/fuse/fuse_i.h |    9 +++++++
 3 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d19cbf34c634..09aaaa31ae28 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1751,6 +1751,7 @@ void fuse_set_nowrite(struct inode *inode)
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	BUG_ON(!inode_is_locked(inode));
+	lockdep_assert_held_write(&inode->i_rwsem);
 
 	spin_lock(&fi->lock);
 	BUG_ON(fi->writectr < 0);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 60d4e1e50843..9959eafca0a0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1308,26 +1308,59 @@ static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
 /*
  * @return true if an exclusive lock for direct IO writes is needed
  */
-static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from,
+				       bool *cnt_increased)
 {
 	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fm->fc;
+	bool excl_lock = true;
 
 	/* server side has to advise that it supports parallel dio writes */
 	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
-		return true;
+		goto out;
 
 	/* append will need to know the eventual eof - always needs an
 	 * exclusive lock
 	 */
 	if (iocb->ki_flags & IOCB_APPEND)
-		return true;
+		goto out;
 
 	/* parallel dio beyond eof is at least for now not supported */
 	if (fuse_io_past_eof(iocb, from))
-		return true;
+		goto out;
 
-	return false;
+	/* no need to optimize async requests */
+	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT &&
+	    fc->async_dio)
+		goto out;
+
+	/* The inode ever got page writes and we do not know for sure
+	 * in the DIO path if these are pending - shared lock not possible */
+	spin_lock(&fi->lock);
+	if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
+		if (!(*cnt_increased)) {
+			fi->shared_lock_direct_io_ctr++;
+			*cnt_increased = true;
+		}
+		excl_lock = false;
+	}
+	spin_unlock(&fi->lock);
+
+out:
+	if (excl_lock && *cnt_increased) {
+		bool wake = false;
+		spin_lock(&fi->lock);
+		if (--fi->shared_lock_direct_io_ctr == 0)
+			wake = true;
+		spin_unlock(&fi->lock);
+		if (wake)
+			wake_up(&fi->direct_io_waitq);
+	}
+
+	return excl_lock;
 }
 
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1549,6 +1582,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 				break;
 		}
 	}
+
 	if (ia)
 		fuse_io_free(ia);
 	if (res > 0)
@@ -1592,9 +1626,12 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
-	bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from);
+	bool shared_lock_cnt_inc = false;
+	bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from,
+							 &shared_lock_cnt_inc);
 
 	/*
 	 * Take exclusive lock if
@@ -1610,7 +1647,8 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		/*
 		 * Previous check was without any lock and might have raced.
 		 */
-		if (fuse_dio_wr_exclusive_lock(iocb, from)) {
+		if (fuse_dio_wr_exclusive_lock(iocb, from,
+					       &shared_lock_cnt_inc)) {
 			inode_unlock_shared(inode);
 			inode_lock(inode);
 			exclusive_lock = true;
@@ -1629,8 +1667,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 	if (exclusive_lock)
 		inode_unlock(inode);
-	else
+	else {
+		bool wake = false;
+
 		inode_unlock_shared(inode);
+		spin_lock(&fi->lock);
+		if (--fi->shared_lock_direct_io_ctr == 0)
+			wake = true;
+		spin_unlock(&fi->lock);
+		if (wake)
+			wake_up(&fi->direct_io_waitq);
+	}
 
 	return res;
 }
@@ -1719,6 +1766,13 @@ __acquires(fi->lock)
 	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
 	int err;
 
+	if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
+		set_bit(FUSE_I_CACHE_WRITES, &fi->state);
+		spin_unlock(&fi->lock);
+		wait_event(fi->direct_io_waitq, fi->shared_lock_direct_io_ctr == 0);
+		spin_lock(&fi->lock);
+	}
+
 	fi->writectr++;
 	if (inarg->offset + data_size <= size) {
 		inarg->size = data_size;
@@ -3261,7 +3315,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	fi->writectr = 0;
+	fi->shared_lock_direct_io_ctr = 0;
 	init_waitqueue_head(&fi->page_waitq);
+	init_waitqueue_head(&fi->direct_io_waitq);
 	fi->writepages = RB_ROOT;
 
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6e6e721f421b..febb1f5cd53f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -110,11 +110,17 @@ struct fuse_inode {
 			 * (FUSE_NOWRITE) means more writes are blocked */
 			int writectr;
 
+			/* counter of tasks with shared lock direct-io writes */
+			int shared_lock_direct_io_ctr;
+
 			/* Waitq for writepage completion */
 			wait_queue_head_t page_waitq;
 
 			/* List of writepage requestst (pending or sent) */
 			struct rb_root writepages;
+
+			/* waitq for direct-io completion */
+			wait_queue_head_t direct_io_waitq;
 		};
 
 		/* readdir cache (directory only) */
@@ -172,6 +178,9 @@ enum {
 	FUSE_I_BAD,
 	/* Has btime */
 	FUSE_I_BTIME,
+	/* Has paged writes */
+	FUSE_I_CACHE_WRITES,
+
 };
 
 struct fuse_conn;
/fs/fuse/fuse_i.h
@@ -110,11 +110,17 @@ struct fuse_inode {
 			 * (FUSE_NOWRITE) means more writes are blocked */
 			int writectr;
 
+			/* counter of tasks with shared lock direct-io writes */
+			int shared_lock_direct_io_ctr;
+
 			/* Waitq for writepage completion */
 			wait_queue_head_t page_waitq;
 
 			/* List of writepage requestst (pending or sent) */
 			struct rb_root writepages;
+
+			/* waitq for direct-io completion */
+			wait_queue_head_t direct_io_waitq;
 		};
 
 		/* readdir cache (directory only) */
@@ -172,6 +178,9 @@ enum {
 	FUSE_I_BAD,
 	/* Has btime */
 	FUSE_I_BTIME,
+	/* Has paged writes */
+	FUSE_I_CACHE_WRITES,
+
 };
 
 struct fuse_conn;

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-05 23:56                       ` Bernd Schubert
@ 2023-12-06  8:25                         ` Amir Goldstein
  2023-12-06 23:28                           ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-06  8:25 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

> >> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
> >> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
> >> I guess not otherwise, the combination would have been tested.
> >
> > I'm not sure how many people are aware of these different flags/features.
> > I had just finalized the backport of the related patches to RHEL8 on
> > Friday, as we (or our customers) need both for different jobs.
> >
> >>
> >> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
> >> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
> >> for network fs. Right?
> >
> > We kind of have these use cases for our network file systems
> >
> > FOPEN_PARALLEL_DIRECT_WRITES:
> >     - Traditional HPC, large files, parallel IO
> >     - Large file used on local node as container for many small files
> >
> > FUSE_DIRECT_IO_ALLOW_MMAP:
> >     - compilation through gcc (not so important, just not nice when it
> > does not work)
> >     - rather recent: python libraries using mmap _reads_. As it is read
> > only no issue of consistency.
> >
> >
> > These jobs do not intermix - no issue as in generic/095. If such
> > applications really exist, I have no issue with a serialization penalty.
> > Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
> > nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
> >
> > Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
> > O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
> > and post the next version
> > https://github.com/bsbernd/linux/commits/fuse-dio-v4
> >
> >
> > In the mean time I have another idea how to solve
> > FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>
> Please find attached what I had in my mind. With that generic/095 is not
> crashing for me anymore. I just finished the initial coding - it still
> needs a bit cleanup and maybe a few comments.
>

Nice. I like the FUSE_I_CACHE_WRITES state.
For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
of the last open file of the inode.

I did not understand some of the complexity here:

>        /* The inode ever got page writes and we do not know for sure
>         * in the DIO path if these are pending - shared lock not possible */
>        spin_lock(&fi->lock);
>        if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>                if (!(*cnt_increased)) {

How can *cnt_increased be true here?

>                        fi->shared_lock_direct_io_ctr++;
>                        *cnt_increased = true;
>                }
>                excl_lock = false;

Seems like in every outcome of this function
*cnt_increased = !excl_lock
so there is no need for the out arg cnt_increased

>        }
>        spin_unlock(&fi->lock);
>
>out:
>        if (excl_lock && *cnt_increased) {
>                bool wake = false;
>                spin_lock(&fi->lock);
>                if (--fi->shared_lock_direct_io_ctr == 0)
>                        wake = true;
>                spin_unlock(&fi->lock);
>                if (wake)
>                        wake_up(&fi->direct_io_waitq);
>        }

I don't see how this wake_up code is reachable.

TBH, I don't fully understand the expected result.
Surely, the behavior of dio mixed with mmap is undefined. Right?
IIUC, your patch does not prevent dirtying page cache while dio is in
flight. It only prevents writeback while dio is in flight, which is the same
behavior as with exclusive inode lock. Right?

Maybe this interaction is spelled out somewhere else, but if not, it would be
better to spell it out for people like me who are new to this code.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-06  8:25                         ` Amir Goldstein
@ 2023-12-06 23:28                           ` Bernd Schubert
  2023-12-07  7:39                             ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-06 23:28 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/6/23 09:25, Amir Goldstein wrote:
>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>> I guess not otherwise, the combination would have been tested.
>>>
>>> I'm not sure how many people are aware of these different flags/features.
>>> I had just finalized the backport of the related patches to RHEL8 on
>>> Friday, as we (or our customers) need both for different jobs.
>>>
>>>>
>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>> for network fs. Right?
>>>
>>> We kind of have these use cases for our network file systems
>>>
>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>      - Traditional HPC, large files, parallel IO
>>>      - Large file used on local node as container for many small files
>>>
>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>      - compilation through gcc (not so important, just not nice when it
>>> does not work)
>>>      - rather recent: python libraries using mmap _reads_. As it is read
>>> only no issue of consistency.
>>>
>>>
>>> These jobs do not intermix - no issue as in generic/095. If such
>>> applications really exist, I have no issue with a serialization penalty.
>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>
>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>> and post the next version
>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>
>>>
>>> In the mean time I have another idea how to solve
>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>
>> Please find attached what I had in my mind. With that generic/095 is not
>> crashing for me anymore. I just finished the initial coding - it still
>> needs a bit cleanup and maybe a few comments.
>>
> 
> Nice. I like the FUSE_I_CACHE_WRITES state.
> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
> of the last open file of the inode.
> 
> I did not understand some of the complexity here:
> 
>>         /* The inode ever got page writes and we do not know for sure
>>          * in the DIO path if these are pending - shared lock not possible */
>>         spin_lock(&fi->lock);
>>         if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>                 if (!(*cnt_increased)) {
> 
> How can *cnt_increased be true here?

I think you missed the 2nd entry into this function, when the shared 
lock was already taken? I have changed the code now to have all 
complexity in this function (test, lock, retest with lock, release, 
wakeup). I hope that will make it easier to see the intention of the 
code. Will post the new patches in the morning.


> 
>>                         fi->shared_lock_direct_io_ctr++;
>>                         *cnt_increased = true;
>>                 }
>>                 excl_lock = false;
> 
> Seems like in every outcome of this function
> *cnt_increased = !excl_lock
> so there is not need for out arg cnt_increased

If excl_lock were used as input - yeah, that would have worked as well.
Or a parameter like "retest-under-lock". The code is changed now to avoid
going in and out.

> 
>>         }
>>         spin_unlock(&fi->lock);
>>
>> out:
>>         if (excl_lock && *cnt_increased) {
>>                 bool wake = false;
>>                 spin_lock(&fi->lock);
>>                 if (--fi->shared_lock_direct_io_ctr == 0)
>>                         wake = true;
>>                 spin_unlock(&fi->lock);
>>                 if (wake)
>>                         wake_up(&fi->direct_io_waitq);
>>         }
> 
> I don't see how this wake_up code is reachable.
> 
> TBH, I don't fully understand the expected result.
> Surely, the behavior of dio mixed with mmap is undefined. Right?
> IIUC, your patch does not prevent dirtying page cache while dio is in
> flight. It only prevents writeback while dio is in flight, which is the same
> behavior as with exclusive inode lock. Right?

Yeah, thanks. I will add it in the patch description.

And there was actually an issue with the patch, as cache flushing needs 
to be initiated before doing the lock decision, fixed now.

> 
> Maybe this interaction is spelled out somewhere else, but if not
> better spell it out for people like me that are new to this code.

Sure, thanks a lot for your helpful comments!



Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-06 23:28                           ` Bernd Schubert
@ 2023-12-07  7:39                             ` Amir Goldstein
  2023-12-07  9:12                               ` Bernd Schubert
  2023-12-07 18:37                               ` Bernd Schubert
  0 siblings, 2 replies; 48+ messages in thread
From: Amir Goldstein @ 2023-12-07  7:39 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/6/23 09:25, Amir Goldstein wrote:
> >>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
> >>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
> >>>> I guess not otherwise, the combination would have been tested.
> >>>
> >>> I'm not sure how many people are aware of these different flags/features.
> >>> I had just finalized the backport of the related patches to RHEL8 on
> >>> Friday, as we (or our customers) need both for different jobs.
> >>>
> >>>>
> >>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
> >>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
> >>>> for network fs. Right?
> >>>
> >>> We kind of have these use cases for our network file systems
> >>>
> >>> FOPEN_PARALLEL_DIRECT_WRITES:
> >>>      - Traditional HPC, large files, parallel IO
> >>>      - Large file used on local node as container for many small files
> >>>
> >>> FUSE_DIRECT_IO_ALLOW_MMAP:
> >>>      - compilation through gcc (not so important, just not nice when it
> >>> does not work)
> >>>      - rather recent: python libraries using mmap _reads_. As it is read
> >>> only no issue of consistency.
> >>>
> >>>
> >>> These jobs do not intermix - no issue as in generic/095. If such
> >>> applications really exist, I have no issue with a serialization penalty.
> >>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
> >>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
> >>>
> >>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
> >>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
> >>> and post the next version
> >>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
> >>>
> >>>
> >>> In the mean time I have another idea how to solve
> >>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
> >>
> >> Please find attached what I had in my mind. With that generic/095 is not
> >> crashing for me anymore. I just finished the initial coding - it still
> >> needs a bit cleanup and maybe a few comments.
> >>
> >
> > Nice. I like the FUSE_I_CACHE_WRITES state.
> > For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
> > in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
> > of the last open file of the inode.
> >
> > I did not understand some of the complexity here:
> >
> >>         /* The inode ever got page writes and we do not know for sure
> >>          * in the DIO path if these are pending - shared lock not possible */
> >>         spin_lock(&fi->lock);
> >>         if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
> >>                 if (!(*cnt_increased)) {
> >
> > How can *cnt_increased be true here?
>
> I think you missed the 2nd entry into this function, when the shared
> lock was already taken?

Yeh, I did.

> I have changed the code now to have all
> complexity in this function (test, lock, retest with lock, release,
> wakeup). I hope that will make it easier to see the intention of the
> code. Will post the new patches in the morning.
>

Sounds good. Current version was a bit hard to follow.

>
> >
> >>                         fi->shared_lock_direct_io_ctr++;
> >>                         *cnt_increased = true;
> >>                 }
> >>                 excl_lock = false;
> >
> > Seems like in every outcome of this function
> > *cnt_increased = !excl_lock
> > so there is not need for out arg cnt_increased
>
> If excl_lock would be used as input - yeah, would have worked as well.
> Or a parameter like "retest-under-lock". Code is changed now to avoid
> going in and out.
>
> >
> >>         }
> >>         spin_unlock(&fi->lock);
> >>
> >> out:
> >>         if (excl_lock && *cnt_increased) {
> >>                 bool wake = false;
> >>                 spin_lock(&fi->lock);
> >>                 if (--fi->shared_lock_direct_io_ctr == 0)
> >>                         wake = true;
> >>                 spin_unlock(&fi->lock);
> >>                 if (wake)
> >>                         wake_up(&fi->direct_io_waitq);
> >>         }
> >
> > I don't see how this wake_up code is reachable.
> >
> > TBH, I don't fully understand the expected result.
> > Surely, the behavior of dio mixed with mmap is undefined. Right?
> > IIUC, your patch does not prevent dirtying page cache while dio is in
> > flight. It only prevents writeback while dio is in flight, which is the same
> > behavior as with exclusive inode lock. Right?
>
> Yeah, thanks. I will add it in the patch description.
>
> And there was actually an issue with the patch, as cache flushing needs
> to be initiated before doing the lock decision, fixed now.
>

I thought there was, because of the wait in fuse_send_writepage()
but wasn't sure if I was following the flow correctly.

> >
> > Maybe this interaction is spelled out somewhere else, but if not
> > better spell it out for people like me that are new to this code.
>
> Sure, thanks a lot for your helpful comments!
>

Just to be clear, this patch looks like a good improvement and
is mostly independent of the "inode caching mode" and
FOPEN_CACHE_MMAP idea that I suggested.

The only thing that my idea changes is replacing the
FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
state, which is set earlier than FUSE_I_CACHE_WRITES
on caching file open or first direct_io mmap and unlike
FUSE_I_CACHE_WRITES, it is cleared on the last file close.

FUSE_I_CACHE_WRITES means that caching writes happened.
FUSE_I_CACHE_IO_MODE means that caching writes and reads
may happen.

FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
about "caching reads may happen", but IMO that is a small trade off
to make for maintaining the same state for
"do not allow parallel dio" and "do not allow passthrough open".

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-07  7:39                             ` Amir Goldstein
@ 2023-12-07  9:12                               ` Bernd Schubert
  2023-12-07 18:37                               ` Bernd Schubert
  1 sibling, 0 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-12-07  9:12 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/7/23 08:39, Amir Goldstein wrote:
> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/6/23 09:25, Amir Goldstein wrote:
>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>>>> I guess not otherwise, the combination would have been tested.
>>>>>
>>>>> I'm not sure how many people are aware of these different flags/features.
>>>>> I had just finalized the backport of the related patches to RHEL8 on
>>>>> Friday, as we (or our customers) need both for different jobs.
>>>>>
>>>>>>
>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>>>> for network fs. Right?
>>>>>
>>>>> We kind of have these use cases for our network file systems
>>>>>
>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>>>       - Traditional HPC, large files, parallel IO
>>>>>       - Large file used on local node as container for many small files
>>>>>
>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>>>       - compilation through gcc (not so important, just not nice when it
>>>>> does not work)
>>>>>       - rather recent: python libraries using mmap _reads_. As it is read
>>>>> only no issue of consistency.
>>>>>
>>>>>
>>>>> These jobs do not intermix - no issue as in generic/095. If such
>>>>> applications really exist, I have no issue with a serialization penalty.
>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>>>
>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>>>> and post the next version
>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>>>
>>>>>
>>>>> In the mean time I have another idea how to solve
>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>>>
>>>> Please find attached what I had in my mind. With that generic/095 is not
>>>> crashing for me anymore. I just finished the initial coding - it still
>>>> needs a bit cleanup and maybe a few comments.
>>>>
>>>
>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
>>> of the last open file of the inode.
>>>
>>> I did not understand some of the complexity here:
>>>
>>>>          /* The inode ever got page writes and we do not know for sure
>>>>           * in the DIO path if these are pending - shared lock not possible */
>>>>          spin_lock(&fi->lock);
>>>>          if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>>>                  if (!(*cnt_increased)) {
>>>
>>> How can *cnt_increased be true here?
>>
>> I think you missed the 2nd entry into this function, when the shared
>> lock was already taken?
> 
> Yeh, I did.
> 
>> I have changed the code now to have all
>> complexity in this function (test, lock, retest with lock, release,
>> wakeup). I hope that will make it easier to see the intention of the
>> code. Will post the new patches in the morning.
>>
> 
> Sounds good. Current version was a bit hard to follow.
> 
>>
>>>
>>>>                          fi->shared_lock_direct_io_ctr++;
>>>>                          *cnt_increased = true;
>>>>                  }
>>>>                  excl_lock = false;
>>>
>>> Seems like in every outcome of this function
>>> *cnt_increased = !excl_lock
>>> so there is not need for out arg cnt_increased
>>
>> If excl_lock would be used as input - yeah, would have worked as well.
>> Or a parameter like "retest-under-lock". Code is changed now to avoid
>> going in and out.
>>
>>>
>>>>          }
>>>>          spin_unlock(&fi->lock);
>>>>
>>>> out:
>>>>          if (excl_lock && *cnt_increased) {
>>>>                  bool wake = false;
>>>>                  spin_lock(&fi->lock);
>>>>                  if (--fi->shared_lock_direct_io_ctr == 0)
>>>>                          wake = true;
>>>>                  spin_unlock(&fi->lock);
>>>>                  if (wake)
>>>>                          wake_up(&fi->direct_io_waitq);
>>>>          }
>>>
>>> I don't see how this wake_up code is reachable.
>>>
>>> TBH, I don't fully understand the expected result.
>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
>>> IIUC, your patch does not prevent dirtying page cache while dio is in
>>> flight. It only prevents writeback while dio is in flight, which is the same
>>> behavior as with exclusive inode lock. Right?
>>
>> Yeah, thanks. I will add it in the patch description.
>>
>> And there was actually an issue with the patch, as cache flushing needs
>> to be initiated before doing the lock decision, fixed now.
>>
> 
> I thought there was, because of the wait in fuse_send_writepage()
> but wasn't sure if I was following the flow correctly.
> 
>>>
>>> Maybe this interaction is spelled out somewhere else, but if not
>>> better spell it out for people like me that are new to this code.
>>
>> Sure, thanks a lot for your helpful comments!
>>
> 
> Just to be clear, this patch looks like a good improvement and
> is mostly independent of the "inode caching mode" and
> FOPEN_CACHE_MMAP idea that I suggested.
> 
> The only thing that my idea changes is replacing the
> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
> state, which is set earlier than FUSE_I_CACHE_WRITES
> on caching file open or first direct_io mmap and unlike
> FUSE_I_CACHE_WRITES, it is cleared on the last file close.

That is actually an improvement over my patch, I can add this in as
well. Will also change to FUSE_I_CACHE_IO_MODE and update the patch to
set the flag in fuse_file_mmap.

First need to investigate why a loop of generic/095 triggers a locking 
issue, probably something wrong with my latest patch version.

> 
> FUSE_I_CACHE_WRITES means that caching writes happened.
> FUSE_I_CACHE_IO_MODE means the caching writes and reads
> may happen.
> 
> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
> about "caching reads may happen", but IMO that is a small trade off
> to make for maintaining the same state for
> "do not allow parallel dio" and "do not allow passthrough open".

Yeah, and better to improve things (and add more complexity) when someone
has a real-life workload.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-07  7:39                             ` Amir Goldstein
  2023-12-07  9:12                               ` Bernd Schubert
@ 2023-12-07 18:37                               ` Bernd Schubert
  2023-12-08  8:39                                 ` Amir Goldstein
  1 sibling, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-07 18:37 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

[-- Attachment #1: Type: text/plain, Size: 6932 bytes --]



On 12/7/23 08:39, Amir Goldstein wrote:
> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/6/23 09:25, Amir Goldstein wrote:
>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>>>> I guess not otherwise, the combination would have been tested.
>>>>>
>>>>> I'm not sure how many people are aware of these different flags/features.
>>>>> I had just finalized the backport of the related patches to RHEL8 on
>>>>> Friday, as we (or our customers) need both for different jobs.
>>>>>
>>>>>>
>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>>>> for network fs. Right?
>>>>>
>>>>> We kind of have these use cases for our network file systems
>>>>>
>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>>>       - Traditional HPC, large files, parallel IO
>>>>>       - Large file used on local node as container for many small files
>>>>>
>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>>>       - compilation through gcc (not so important, just not nice when it
>>>>> does not work)
>>>>>       - rather recent: python libraries using mmap _reads_. As it is read
>>>>> only no issue of consistency.
>>>>>
>>>>>
>>>>> These jobs do not intermix - no issue as in generic/095. If such
>>>>> applications really exist, I have no issue with a serialization penalty.
>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>>>
>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>>>> and post the next version
>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>>>
>>>>>
>>>>> In the mean time I have another idea how to solve
>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>>>
>>>> Please find attached what I had in my mind. With that generic/095 is not
>>>> crashing for me anymore. I just finished the initial coding - it still
>>>> needs a bit cleanup and maybe a few comments.
>>>>
>>>
>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
>>> of the last open file of the inode.
>>>
>>> I did not understand some of the complexity here:
>>>
>>>>          /* The inode ever got page writes and we do not know for sure
>>>>           * in the DIO path if these are pending - shared lock not possible */
>>>>          spin_lock(&fi->lock);
>>>>          if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>>>                  if (!(*cnt_increased)) {
>>>
>>> How can *cnt_increased be true here?
>>
>> I think you missed the 2nd entry into this function, when the shared
>> lock was already taken?
> 
> Yeh, I did.
> 
>> I have changed the code now to have all
>> complexity in this function (test, lock, retest with lock, release,
>> wakeup). I hope that will make it easier to see the intention of the
>> code. Will post the new patches in the morning.
>>
> 
> Sounds good. Current version was a bit hard to follow.
> 
>>
>>>
>>>>                          fi->shared_lock_direct_io_ctr++;
>>>>                          *cnt_increased = true;
>>>>                  }
>>>>                  excl_lock = false;
>>>
>>> Seems like in every outcome of this function
>>> *cnt_increased = !excl_lock
>>> so there is not need for out arg cnt_increased
>>
>> If excl_lock would be used as input - yeah, would have worked as well.
>> Or a parameter like "retest-under-lock". Code is changed now to avoid
>> going in and out.
>>
>>>
>>>>          }
>>>>          spin_unlock(&fi->lock);
>>>>
>>>> out:
>>>>          if (excl_lock && *cnt_increased) {
>>>>                  bool wake = false;
>>>>                  spin_lock(&fi->lock);
>>>>                  if (--fi->shared_lock_direct_io_ctr == 0)
>>>>                          wake = true;
>>>>                  spin_unlock(&fi->lock);
>>>>                  if (wake)
>>>>                          wake_up(&fi->direct_io_waitq);
>>>>          }
>>>
>>> I don't see how this wake_up code is reachable.
>>>
>>> TBH, I don't fully understand the expected result.
>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
>>> IIUC, your patch does not prevent dirtying page cache while dio is in
>>> flight. It only prevents writeback while dio is in flight, which is the same
>>> behavior as with exclusive inode lock. Right?
>>
>> Yeah, thanks. I will add it in the patch description.
>>
>> And there was actually an issue with the patch, as cache flushing needs
>> to be initiated before doing the lock decision, fixed now.
>>
> 
> I thought there was, because of the wait in fuse_send_writepage()
> but wasn't sure if I was following the flow correctly.
> 
>>>
>>> Maybe this interaction is spelled out somewhere else, but if not
>>> better spell it out for people like me that are new to this code.
>>
>> Sure, thanks a lot for your helpful comments!
>>
> 
> Just to be clear, this patch looks like a good improvement and
> is mostly independent of the "inode caching mode" and
> FOPEN_CACHE_MMAP idea that I suggested.
> 
> The only thing that my idea changes is replacing the
> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
> state, which is set earlier than FUSE_I_CACHE_WRITES
> on caching file open or first direct_io mmap and unlike
> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
> 
> FUSE_I_CACHE_WRITES means that caching writes happened.
> FUSE_I_CACHE_IO_MODE means the caching writes and reads
> may happen.
> 
> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
> about "caching reads may happen", but IMO that is a small trade off
> to make for maintaining the same state for
> "do not allow parallel dio" and "do not allow passthrough open".

I think the attached patches should do, it now also unsets 
FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from 
fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall, 
but that aligns with passthrough anyway? Amir, right now it only sets
FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
for passthrough there?

@Miklos, could you please tell me how to move forward? I definitely need to 
rebase to fuse-next, but my question is if this patch here should 
replace Amir's fix (and get backported) or if we should apply it on top
of Amir's patch and so let that simple fix get backported? Given this is 
all features and new flags - I'm all for the simple fix.
If you agree on the general approach, I can put this on top of my dio
consolidate branch and rebase the rest of the patches on top of it. That 
part will get a bit more complicated, as we will also need to handle 
plain O_DIRECT.


Thanks,
Bernd

[-- Attachment #2: 01-fuse-Create-helper-function-if.patch --]
[-- Type: text/x-patch, Size: 3366 bytes --]

fuse: Create helper function if DIO write needs exclusive lock

From: Bernd Schubert <bschubert@ddn.com>

This is just a preparation for follow up patches.

Cc: Hao Xu <howeyxu@tencent.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Dharmendra Singh <dsingh@ddn.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Bernd Schubert <bschubert@ddn.com>
Cc: stable@vger.kernel.org
Preparation for Fixes: 153524053bbb ("fuse: allow non-extending parallel direct writes on the same file")
---
 fs/fuse/file.c |   57 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..9cc7184241e5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1298,6 +1298,42 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
 	return res;
 }
 
+static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
+/*
+ * @return true if an exclusive lock for direct IO writes is needed
+ */
+static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	bool excl_lock = true;
+
+	/* server side has to advise that it supports parallel dio writes */
+	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
+		goto out;
+
+	/* append will need to know the eventual eof - always needs an
+	 * exclusive lock
+	 */
+	if (iocb->ki_flags & IOCB_APPEND)
+		goto out;
+
+	/* parallel dio beyond eof is at least for now not supported */
+	if (fuse_io_past_eof(iocb, from))
+		goto out;
+
+	excl_lock = false;
+
+out:
+	return excl_lock;
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
@@ -1557,25 +1593,12 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return res;
 }
 
-static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
-					       struct iov_iter *iter)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-
-	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
-}
-
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	struct file *file = iocb->ki_filp;
-	struct fuse_file *ff = file->private_data;
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
-	bool exclusive_lock =
-		!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
-		iocb->ki_flags & IOCB_APPEND ||
-		fuse_direct_write_extending_i_size(iocb, from);
+	bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from);
 
 	/*
 	 * Take exclusive lock if
@@ -1588,10 +1611,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	else {
 		inode_lock_shared(inode);
 
-		/* A race with truncate might have come up as the decision for
-		 * the lock type was done without holding the lock, check again.
+		/*
+		 * Previous check was without any lock and might have raced.
 		 */
-		if (fuse_direct_write_extending_i_size(iocb, from)) {
+		if (fuse_dio_wr_exclusive_lock(iocb, from)) {
 			inode_unlock_shared(inode);
 			inode_lock(inode);
 			exclusive_lock = true;

[-- Attachment #3: 02-fix-dio-write-shared-lock.patch --]
[-- Type: text/x-patch, Size: 11304 bytes --]

fuse: Test for page cache writes in the shared lock DIO decision

From: Bernd Schubert <bschubert@ddn.com>

xfstest generic/095 triggers BUG_ON(fi->writectr < 0) in
fuse_set_nowrite().
This happens with a shared lock for FOPEN_DIRECT_IO and when in parallel
mmap writes happen (FUSE_DIRECT_IO_RELAX is set).
Reason is that multiple DIO writers see that the inode has pending
page IO writes and try to set FUSE_NOWRITE, but this code path requires
serialization. Ideal would be to let fuse_dio_wr_exclusive_lock detect if
there are outstanding writes, but that would require to hold an inode
lock in related page/folio write paths. Another solution would be to
disable the shared inode lock for FOPEN_DIRECT_IO, when
FUSE_DIRECT_IO_RELAX is set, but typically userspace/server side will set
these flags for all inodes (or not at all). With that FUSE_DIRECT_IO_RELAX
would entirely disable the shared lock and impose serialization even
though no page IO is ever done for inodes.  The solution here stores a
flag into the fuse inode when mmap is started. This flag is used to
enforce the exclusive inode lock for FOPEN_DIRECT_IO.
Other than that, the patch does not help to improve consistency for
concurrent page cache (so far only mmap) and direct IO file writes.

Cc: Hao Xu <howeyxu@tencent.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Dharmendra Singh <dsingh@ddn.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Bernd Schubert <bschubert@ddn.com>
Cc: stable@vger.kernel.org
Fixes: 153524053bbb ("fuse: allow non-extending parallel direct writes on the same file")
---
 fs/fuse/dir.c    |    1 
 fs/fuse/file.c   |  153 ++++++++++++++++++++++++++++++++++++++++++------------
 fs/fuse/fuse_i.h |   12 ++++
 fs/fuse/inode.c  |    1 
 4 files changed, 133 insertions(+), 34 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d19cbf34c634..09aaaa31ae28 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1751,6 +1751,7 @@ void fuse_set_nowrite(struct inode *inode)
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	BUG_ON(!inode_is_locked(inode));
+	lockdep_assert_held_write(&inode->i_rwsem);
 
 	spin_lock(&fi->lock);
 	BUG_ON(fi->writectr < 0);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9cc7184241e5..5d76ebd5419c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -99,6 +99,16 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
 			     int error)
 {
 	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
+	struct fuse_inode *fi = get_fuse_inode(ra->inode);
+
+	spin_lock(&fi->lock);
+	if (--fi->open_ctr == 0) {
+		/* no open file left anymore, remove restrictions from
+		 * the cache bit
+		 */
+		clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+	}
+	spin_unlock(&fi->lock);
 
 	iput(ra->inode);
 	kfree(ra);
@@ -121,6 +131,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 						   GFP_KERNEL | __GFP_NOFAIL))
 				fuse_release_end(ff->fm, args, -ENOTCONN);
 		}
+
 		kfree(ff);
 	}
 }
@@ -198,6 +209,7 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	if (ff->open_flags & FOPEN_STREAM)
 		stream_open(inode, file);
@@ -205,8 +217,6 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 		nonseekable_open(inode, file);
 
 	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-
 		spin_lock(&fi->lock);
 		fi->attr_version = atomic64_inc_return(&fc->attr_version);
 		i_size_write(inode, 0);
@@ -216,6 +226,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 	}
 	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 		fuse_link_write_file(file);
+
+	spin_lock(&fi->lock);
+	fi->open_ctr++;
+	spin_unlock(&fi->lock);
 }
 
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -1306,13 +1320,19 @@ static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
 }
 
 /*
- * @return true if an exclusive lock for direct IO writes is needed
+ * @return true if an exclusive lock for direct IO writes is taken, false
+ *	   for the shared lock
  */
-static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+bool fuse_dio_lock_inode(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fm->fc;
 	bool excl_lock = true;
+	bool retest = false;
+	bool wake = false;
 
 	/* server side has to advise that it supports parallel dio writes */
 	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
@@ -1324,13 +1344,67 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
 	if (iocb->ki_flags & IOCB_APPEND)
 		goto out;
 
+retest_with_lock:
 	/* parallel dio beyond eof is at least for now not supported */
 	if (fuse_io_past_eof(iocb, from))
 		goto out;
 
-	excl_lock = false;
+	/* no need to optimize async requests */
+	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT &&
+	    fc->async_dio)
+		goto out;
+
+	/* If the inode ever got page writes, we do not know for sure
+	 * in the DIO path if these are pending - a shared lock is then
+	 * not possible
+	 */
+	spin_lock(&fi->lock);
+	if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) {
+		if (retest) {
+			excl_lock = true;
+			if (--fi->shared_lock_direct_io_ctr == 0)
+				wake = true;
+		}
+	} else {
+		if (!retest) {
+			excl_lock = false;
+			/* Increase the counter as soon as the decision for
+			 * shared locks was made to hold off page IO tasks
+			 */
+			if (!retest)
+				fi->shared_lock_direct_io_ctr++;
+		}
+	}
+	spin_unlock(&fi->lock);
 
 out:
+	if (retest) {
+		if (excl_lock) {
+			/* a race happened the lock type needs to change */
+			inode_unlock_shared(inode);
+
+			/* Increasing the shared_lock_direct_io_ctr counter
+			 *  might have hold off page cache tasks, wake these up.
+			 */
+			if (wake)
+				wake_up(&fi->direct_io_waitq);
+
+			inode_lock(inode);
+		}
+	} else {
+		if (excl_lock) {
+			inode_lock(inode);
+		} else {
+			inode_lock_shared(inode);
+
+			/* Need to retest after taken the shared lock, to see
+			 * if there are races
+			 */
+			retest = true;
+			goto retest_with_lock;
+		}
+	}
+
 	return excl_lock;
 }
 
@@ -1596,30 +1670,12 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
-	bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from);
-
-	/*
-	 * Take exclusive lock if
-	 * - Parallel direct writes are disabled - a user space decision
-	 * - Parallel direct writes are enabled and i_size is being extended.
-	 *   This might not be needed at all, but needs further investigation.
-	 */
-	if (exclusive_lock)
-		inode_lock(inode);
-	else {
-		inode_lock_shared(inode);
 
-		/*
-		 * Previous check was without any lock and might have raced.
-		 */
-		if (fuse_dio_wr_exclusive_lock(iocb, from)) {
-			inode_unlock_shared(inode);
-			inode_lock(inode);
-			exclusive_lock = true;
-		}
-	}
+	/* take inode_lock or inode_lock_shared */
+	bool exclusive = fuse_dio_lock_inode(iocb, from);
 
 	res = generic_write_checks(iocb, from);
 	if (res > 0) {
@@ -1631,10 +1687,20 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			fuse_write_update_attr(inode, iocb->ki_pos, res);
 		}
 	}
-	if (exclusive_lock)
+
+	if (exclusive)
 		inode_unlock(inode);
-	else
+	else {
+		bool wake = false;
+
 		inode_unlock_shared(inode);
+		spin_lock(&fi->lock);
+		if (--fi->shared_lock_direct_io_ctr == 0)
+			wake = true;
+		spin_unlock(&fi->lock);
+		if (wake)
+			wake_up(&fi->direct_io_waitq);
+	}
 
 	return res;
 }
@@ -2481,18 +2547,35 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = ff->fm->fc;
 
 	/* DAX mmap is superior to direct_io mmap */
-	if (FUSE_IS_DAX(file_inode(file)))
+	if (FUSE_IS_DAX(inode))
 		return fuse_dax_mmap(file, vma);
 
 	if (ff->open_flags & FOPEN_DIRECT_IO) {
-		/* Can't provide the coherency needed for MAP_SHARED
-		 * if FUSE_DIRECT_IO_RELAX isn't set.
-		 */
-		if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax)
-			return -ENODEV;
+		if (vma->vm_flags & VM_MAYSHARE) {
+			/* Can't provide the coherency needed for MAP_SHARED
+			 * if FUSE_DIRECT_IO_RELAX isn't set.
+			 */
+			if (!fc->direct_io_relax)
+				return -ENODEV;
+
+			if (vma->vm_flags & VM_MAYWRITE) {
+				if (!test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
+					set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+
+				/* direct-io with shared locks cannot handle
+				 * page cache io - wait until it is done
+				 */
+				if (fi->shared_lock_direct_io_ctr != 0) {
+					wait_event(fi->direct_io_waitq,
+						   READ_ONCE(fi->shared_lock_direct_io_ctr) == 0);
+				}
+			}
+		}
 
 		invalidate_inode_pages2(file->f_mapping);
 
@@ -3265,7 +3348,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	fi->writectr = 0;
+	fi->shared_lock_direct_io_ctr = 0;
 	init_waitqueue_head(&fi->page_waitq);
+	init_waitqueue_head(&fi->direct_io_waitq);
 	fi->writepages = RB_ROOT;
 
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6e6e721f421b..27750251d0e5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -84,6 +84,9 @@ struct fuse_inode {
 	/* Which attributes are invalid */
 	u32 inval_mask;
 
+	/* number of opened files by this inode */
+	u32 open_ctr;
+
 	/** The sticky bit in inode->i_mode may have been removed, so
 	    preserve the original mode */
 	umode_t orig_i_mode;
@@ -110,11 +113,17 @@ struct fuse_inode {
 			 * (FUSE_NOWRITE) means more writes are blocked */
 			int writectr;
 
+			/* counter of tasks with shared lock direct-io writes */
+			int shared_lock_direct_io_ctr;
+
 			/* Waitq for writepage completion */
 			wait_queue_head_t page_waitq;
 
 			/* List of writepage requestst (pending or sent) */
 			struct rb_root writepages;
+
+			/* waitq for direct-io completion */
+			wait_queue_head_t direct_io_waitq;
 		};
 
 		/* readdir cache (directory only) */
@@ -172,6 +181,9 @@ enum {
 	FUSE_I_BAD,
 	/* Has btime */
 	FUSE_I_BTIME,
+	/* Has page cache IO */
+	FUSE_I_CACHE_IO_MODE,
+
 };
 
 struct fuse_conn;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 74d4f09d5827..311d1ed73fb7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -83,6 +83,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->attr_version = 0;
 	fi->orig_ino = 0;
 	fi->state = 0;
+	fi->open_ctr = 0;
 	mutex_init(&fi->mutex);
 	spin_lock_init(&fi->lock);
 	fi->forget = fuse_alloc_forget();

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-07 18:37                               ` Bernd Schubert
@ 2023-12-08  8:39                                 ` Amir Goldstein
  2023-12-08 19:49                                   ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-08  8:39 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/7/23 08:39, Amir Goldstein wrote:
> > On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
> > <bernd.schubert@fastmail.fm> wrote:
> >>
> >>
> >>
> >> On 12/6/23 09:25, Amir Goldstein wrote:
> >>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
> >>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
> >>>>>> I guess not otherwise, the combination would have been tested.
> >>>>>
> >>>>> I'm not sure how many people are aware of these different flags/features.
> >>>>> I had just finalized the backport of the related patches to RHEL8 on
> >>>>> Friday, as we (or our customers) need both for different jobs.
> >>>>>
> >>>>>>
> >>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
> >>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
> >>>>>> for network fs. Right?
> >>>>>
> >>>>> We kind of have these use cases for our network file systems
> >>>>>
> >>>>> FOPEN_PARALLEL_DIRECT_WRITES:
> >>>>>       - Traditional HPC, large files, parallel IO
> >>>>>       - Large file used on local node as container for many small files
> >>>>>
> >>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
> >>>>>       - compilation through gcc (not so important, just not nice when it
> >>>>> does not work)
> >>>>>       - rather recent: python libraries using mmap _reads_. As it is read
> >>>>> only no issue of consistency.
> >>>>>
> >>>>>
> >>>>> These jobs do not intermix - no issue as in generic/095. If such
> >>>>> applications really exist, I have no issue with a serialization penalty.
> >>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
> >>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
> >>>>>
> >>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
> >>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
> >>>>> and post the next version
> >>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
> >>>>>
> >>>>>
> >>>>> In the mean time I have another idea how to solve
> >>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
> >>>>
> >>>> Please find attached what I had in my mind. With that generic/095 is not
> >>>> crashing for me anymore. I just finished the initial coding - it still
> >>>> needs a bit cleanup and maybe a few comments.
> >>>>
> >>>
> >>> Nice. I like the FUSE_I_CACHE_WRITES state.
> >>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
> >>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
> >>> of the last open file of the inode.
> >>>
> >>> I did not understand some of the complexity here:
> >>>
> >>>>          /* The inode ever got page writes and we do not know for sure
> >>>>           * in the DIO path if these are pending - shared lock not possible */
> >>>>          spin_lock(&fi->lock);
> >>>>          if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
> >>>>                  if (!(*cnt_increased)) {
> >>>
> >>> How can *cnt_increased be true here?
> >>
> >> I think you missed the 2nd entry into this function, when the shared
> >> lock was already taken?
> >
> > Yeh, I did.
> >
> >> I have changed the code now to have all
> >> complexity in this function (test, lock, retest with lock, release,
> >> wakeup). I hope that will make it easier to see the intention of the
> >> code. Will post the new patches in the morning.
> >>
> >
> > Sounds good. Current version was a bit hard to follow.
> >
> >>
> >>>
> >>>>                          fi->shared_lock_direct_io_ctr++;
> >>>>                          *cnt_increased = true;
> >>>>                  }
> >>>>                  excl_lock = false;
> >>>
> >>> Seems like in every outcome of this function
> >>> *cnt_increased = !excl_lock
> >>> so there is not need for out arg cnt_increased
> >>
> >> If excl_lock would be used as input - yeah, would have worked as well.
> >> Or a parameter like "retest-under-lock". Code is changed now to avoid
> >> going in and out.
> >>
> >>>
> >>>>          }
> >>>>          spin_unlock(&fi->lock);
> >>>>
> >>>> out:
> >>>>          if (excl_lock && *cnt_increased) {
> >>>>                  bool wake = false;
> >>>>                  spin_lock(&fi->lock);
> >>>>                  if (--fi->shared_lock_direct_io_ctr == 0)
> >>>>                          wake = true;
> >>>>                  spin_unlock(&fi->lock);
> >>>>                  if (wake)
> >>>>                          wake_up(&fi->direct_io_waitq);
> >>>>          }
> >>>
> >>> I don't see how this wake_up code is reachable.
> >>>
> >>> TBH, I don't fully understand the expected result.
> >>> Surely, the behavior of dio mixed with mmap is undefined. Right?
> >>> IIUC, your patch does not prevent dirtying page cache while dio is in
> >>> flight. It only prevents writeback while dio is in flight, which is the same
> >>> behavior as with exclusive inode lock. Right?
> >>
> >> Yeah, thanks. I will add it in the patch description.
> >>
> >> And there was actually an issue with the patch, as cache flushing needs
> >> to be initiated before doing the lock decision, fixed now.
> >>
> >
> > I thought there was, because of the wait in fuse_send_writepage()
> > but wasn't sure if I was following the flow correctly.
> >
> >>>
> >>> Maybe this interaction is spelled out somewhere else, but if not
> >>> better spell it out for people like me that are new to this code.
> >>
> >> Sure, thanks a lot for your helpful comments!
> >>
> >
> > Just to be clear, this patch looks like a good improvement and
> > is mostly independent of the "inode caching mode" and
> > FOPEN_CACHE_MMAP idea that I suggested.
> >
> > The only thing that my idea changes is replacing the
> > FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
> > state, which is set earlier than FUSE_I_CACHE_WRITES
> > on caching file open or first direct_io mmap and unlike
> > FUSE_I_CACHE_WRITES, it is cleared on the last file close.
> >
> > FUSE_I_CACHE_WRITES means that caching writes happened.
> > FUSE_I_CACHE_IO_MODE means the caching writes and reads
> > may happen.
> >
> > FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
> > about "caching reads may happen", but IMO that is a small trade off
> > to make for maintaining the same state for
> > "do not allow parallel dio" and "do not allow passthrough open".
>
> I think the attached patches should do, it now also unsets

IMO, your patch is still more complicated than it should be.
There is no need for the complicated retest state machine.
If you split the helpers to:

bool exclusive = fuse_dio_wr_needs_exclusive_lock();
...
fuse_dio_lock_inode(iocb, &exclusive);
...
fuse_dio_unlock_inode(iocb, &exclusive);

Then you only need to test FUSE_I_CACHE_IO_MODE in
fuse_dio_wr_needs_exclusive_lock()
and you only need to increment shared_lock_direct_io_ctr
after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.

> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
> but that aligns with passthrough anyway?

Yes.

I see that shared_lock_direct_io_ctr is checked without lock or barriers
in fuse_file_mmap() and the wait_event() should be interruptible.
I am also not sure if it breaks any locking order for mmap because
the task that is going to wake it up is holding the shared inode lock...

While looking at this code, the invalidate_inode_pages2() looks suspicious.
If inode is already in FUSE_I_CACHE_IO_MODE when performing
another mmap, doesn't that have potential for data loss?
(even before your patch I mean)

> Amir, right now it only sets
> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
> for passthrough there?
>

We could add a condition, but I don't think that we should.
I think we should refrain from different behavior when it is not justified.
I think it is not justified to allow parallel dio if any file is open in
caching mode on the inode and any mmap (private or shared)
exists on the inode.

That means that FUSE_I_CACHE_IO_MODE should be set on
any mmap, and already on open for non direct_io files.

Mixing caching and direct io on the same inode is hard as it is
already and there is no need to add complexity by allowing
parallel dio in that case. IMO it wins us nothing.

The FUSE_I_CACHE_IO_MODE could be cleared on last file
close (as your patch did) but it could be cleared earlier if
instead of tracking refcount of open file, we track refcount of
files open in caching mode or mmaped, which is what the
FOPEN_MMAP_CACHE flag I suggested is for.

Not sure this is a big win over refcount of open files, which is simpler.
The use case is a db file which is open with concurrent dio writers
and some 3rd party app decides that it wants to mmap this file
for some other reason (indexer, virus scan, whatnot) and will taint
the inode with FUSE_I_CACHE_IO_MODE and degrade db performance
until db closes the file.

> @Miklos, could please tell me how to move forward? I definitely need to
> rebase to fuse-next, but my question is if this patch here should
> replace Amirs fix (and get back ported) or if we should apply it on top
> of Amirs patch and so let that simple fix get back ported? Given this is
> all features and new flags - I'm all for for the simple fix.
> If you agree on the general approach, I can put this on top of my dio
> consolidate branch and rebase the rest of the patches on top of it. That
> part will get a bit more complicated, as we will also need to handle
> plain O_DIRECT.
>

I was planning to post a patch for FUSE_I_CACHE_IO_MODE
myself, but feel free to work on your version and we could decide
which parts to take from which patch at the end.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-08  8:39                                 ` Amir Goldstein
@ 2023-12-08 19:49                                   ` Bernd Schubert
  2023-12-08 20:46                                     ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-08 19:49 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/8/23 09:39, Amir Goldstein wrote:
> On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/7/23 08:39, Amir Goldstein wrote:
>>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
>>> <bernd.schubert@fastmail.fm> wrote:
>>>>
>>>>
>>>>
>>>> On 12/6/23 09:25, Amir Goldstein wrote:
>>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>>>>>> I guess not otherwise, the combination would have been tested.
>>>>>>>
>>>>>>> I'm not sure how many people are aware of these different flags/features.
>>>>>>> I had just finalized the backport of the related patches to RHEL8 on
>>>>>>> Friday, as we (or our customers) need both for different jobs.
>>>>>>>
>>>>>>>>
>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>>>>>> for network fs. Right?
>>>>>>>
>>>>>>> We kind of have these use cases for our network file systems
>>>>>>>
>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>>>>>        - Traditional HPC, large files, parallel IO
>>>>>>>        - Large file used on local node as container for many small files
>>>>>>>
>>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>>>>>        - compilation through gcc (not so important, just not nice when it
>>>>>>> does not work)
>>>>>>>        - rather recent: python libraries using mmap _reads_. As it is read
>>>>>>> only no issue of consistency.
>>>>>>>
>>>>>>>
>>>>>>> These jobs do not intermix - no issue as in generic/095. If such
>>>>>>> applications really exist, I have no issue with a serialization penalty.
>>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>>>>>
>>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>>>>>> and post the next version
>>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>>>>>
>>>>>>>
>>>>>>> In the mean time I have another idea how to solve
>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>>>>>
>>>>>> Please find attached what I had in my mind. With that generic/095 is not
>>>>>> crashing for me anymore. I just finished the initial coding - it still
>>>>>> needs a bit cleanup and maybe a few comments.
>>>>>>
>>>>>
>>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>>>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
>>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
>>>>> of the last open file of the inode.
>>>>>
>>>>> I did not understand some of the complexity here:
>>>>>
>>>>>>           /* The inode ever got page writes and we do not know for sure
>>>>>>            * in the DIO path if these are pending - shared lock not possible */
>>>>>>           spin_lock(&fi->lock);
>>>>>>           if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>>>>>                   if (!(*cnt_increased)) {
>>>>>
>>>>> How can *cnt_increased be true here?
>>>>
>>>> I think you missed the 2nd entry into this function, when the shared
>>>> lock was already taken?
>>>
>>> Yeh, I did.
>>>
>>>> I have changed the code now to have all
>>>> complexity in this function (test, lock, retest with lock, release,
>>>> wakeup). I hope that will make it easier to see the intention of the
>>>> code. Will post the new patches in the morning.
>>>>
>>>
>>> Sounds good. Current version was a bit hard to follow.
>>>
>>>>
>>>>>
>>>>>>                           fi->shared_lock_direct_io_ctr++;
>>>>>>                           *cnt_increased = true;
>>>>>>                   }
>>>>>>                   excl_lock = false;
>>>>>
>>>>> Seems like in every outcome of this function
>>>>> *cnt_increased = !excl_lock
>>>>> so there is not need for out arg cnt_increased
>>>>
>>>> If excl_lock would be used as input - yeah, would have worked as well.
>>>> Or a parameter like "retest-under-lock". Code is changed now to avoid
>>>> going in and out.
>>>>
>>>>>
>>>>>>           }
>>>>>>           spin_unlock(&fi->lock);
>>>>>>
>>>>>> out:
>>>>>>           if (excl_lock && *cnt_increased) {
>>>>>>                   bool wake = false;
>>>>>>                   spin_lock(&fi->lock);
>>>>>>                   if (--fi->shared_lock_direct_io_ctr == 0)
>>>>>>                           wake = true;
>>>>>>                   spin_unlock(&fi->lock);
>>>>>>                   if (wake)
>>>>>>                           wake_up(&fi->direct_io_waitq);
>>>>>>           }
>>>>>
>>>>> I don't see how this wake_up code is reachable.
>>>>>
>>>>> TBH, I don't fully understand the expected result.
>>>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
>>>>> IIUC, your patch does not prevent dirtying page cache while dio is in
>>>>> flight. It only prevents writeback while dio is in flight, which is the same
>>>>> behavior as with exclusive inode lock. Right?
>>>>
>>>> Yeah, thanks. I will add it in the patch description.
>>>>
>>>> And there was actually an issue with the patch, as cache flushing needs
>>>> to be initiated before doing the lock decision, fixed now.
>>>>
>>>
>>> I thought there was, because of the wait in fuse_send_writepage()
>>> but wasn't sure if I was following the flow correctly.
>>>
>>>>>
>>>>> Maybe this interaction is spelled out somewhere else, but if not
>>>>> better spell it out for people like me that are new to this code.
>>>>
>>>> Sure, thanks a lot for your helpful comments!
>>>>
>>>
>>> Just to be clear, this patch looks like a good improvement and
>>> is mostly independent of the "inode caching mode" and
>>> FOPEN_CACHE_MMAP idea that I suggested.
>>>
>>> The only thing that my idea changes is replacing the
>>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
>>> state, which is set earlier than FUSE_I_CACHE_WRITES
>>> on caching file open or first direct_io mmap and unlike
>>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
>>>
>>> FUSE_I_CACHE_WRITES means that caching writes happened.
>>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
>>> may happen.
>>>
>>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
>>> about "caching reads may happen", but IMO that is a small trade off
>>> to make for maintaining the same state for
>>> "do not allow parallel dio" and "do not allow passthrough open".
>>
>> I think the attached patches should do, it now also unsets
> 
> IMO, your patch is still more complicated than it should be.
> There is no need for the complicated retest state machine.
> If you split the helpers to:
> 
> bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
> ...
> fuse_dio_lock_inode(iocb, &exclusive);
> ...
> fuse_dio_unlock_inode(iocb, &exclusive);
> 
> Then you only need to test FUSE_I_CACHE_IO_MODE in
> fuse_dio_wr_needs_exclusive_lock()
> and you only need to increment shared_lock_direct_io_ctr
> after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.

Hmm, I'm not sure.

I changed fuse_file_mmap() to call this function

/*
 * Direct-io with shared locks cannot handle page cache io - set an inode
 * flag to disable shared locks and wait until remaining threads are done.
 *
 * Marks the inode with FUSE_I_CACHE_IO_MODE while holding fi->lock, then
 * blocks until fi->shared_lock_direct_io_ctr has dropped to zero.
 *
 * The flag is set under fi->lock so that a writer which tests the flag and
 * bumps the counter under the same lock is either seen here via the counter
 * or sees the flag itself.
 * NOTE(review): the writer side is not visible in this snippet - confirm
 * that it really tests FUSE_I_CACHE_IO_MODE with fi->lock held.
 */
static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
	/* Counter is only read with fi->lock held; re-check after every wakeup */
	while (fi->shared_lock_direct_io_ctr > 0) {
		/* Drop the spinlock before waiting on the waitqueue */
		spin_unlock(&fi->lock);
		/*
		 * NOTE(review): the return value is discarded. If the wait is
		 * interrupted by a signal, the loop just re-checks the counter
		 * and waits again, which presumably spins for as long as the
		 * signal stays pending - confirm, and consider returning the
		 * error to the caller or using a non-interruptible wait.
		 */
		wait_event_interruptible(fi->direct_io_waitq,
					 fi->shared_lock_direct_io_ctr == 0);
		spin_lock(&fi->lock);
	}
	spin_unlock(&fi->lock);
}


Before, we indeed had a race. The idea for fuse_file_mmap_handle_dio_writers()
and fuse_dio_lock_inode() is that either FUSE_I_CACHE_IO_MODE is set,
or fi->shared_lock_direct_io_ctr is greater than 0, but that requires that
FUSE_I_CACHE_IO_MODE is checked while fi->lock is held.


I'm going to think over the weekend about whether your suggestion
to increase fi->shared_lock_direct_io_ctr only after taking the shared
lock is possible. Right now I don't see how to do that.


> 
>> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
>> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
>> but that aligns with passthrough anyway?
> 
> Yes.
> 
> I see that shared_lock_direct_io_ctr is checked without lock or barriers
> in and the wait_event() should be interruptible.

Thanks, fixed with the function above.

> I am also not sure if it breaks any locking order for mmap because
> the task that is going to wake it up is holding the shared inode lock...

The waitq has its own lock. We have

fuse_file_mmap - called under some mmap lock, waitq lock

fuse_dio_lock_inode: no lock taken before calling wakeup

fuse_direct_write_iter: wakeup after release of all locks

So I don't think we have a locking issue (lockdep also doesn't report
anything).
What we definitely cannot do is to take the inode i_rwsem lock in fuse_file_mmap

> 
> While looking at this code, the invalidate_inode_pages2() looks suspicious.
> If inode is already in FUSE_I_CACHE_IO_MODE when performing
> another mmap, doesn't that have potential for data loss?
> (even before your patch I mean)
> 
>> Amir, right now it only sets
>> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
>> for passthrough there?
>>
> 
> We could add a condition, but I don't think that we should.
> I think we should refrain from different behavior when it is not justified.
> I think it is not justified to allow parallel dio if any file is open in
> caching mode on the inode and any mmap (private or shared)
> exists on the inode.
> 
> That means that FUSE_I_CACHE_IO_MODE should be set on
> any mmap, and already on open for non direct_io files.

Ok, I can change and add that. Doing it in open is definitely needed
for O_DIRECT (in my other dio branch).

> 
> Mixing caching and direct io on the same inode is hard as it is
> already and there is no need to add complexity by allowing
> parallel dio in that case. IMO it wins us nothing.

So the slight issue I see are people like me, who check the content
of a file during a long running computation. Like an HPC application
is doing some long term runs. Then in the middle of
the run the user wants to see the current content of the file and
reads it - if that is done through mmap (and from a node that runs
the application), parallel DIO is disabled with the current patch
until the file is closed - I see the use case to check for writes.


> 
> The FUSE_I_CACHE_IO_MODE could be cleared on last file
> close (as your patch did) but it could be cleared earlier if
> instead of tracking refcount of open file, we track refcount of
> files open in caching mode or mmaped, which is what the
> FOPEN_MMAP_CACHE flag I suggested is for.

But how does open() know that a file/fd is used for mmap?

> 
> Not sure this is a big win over refount of open files, which is simpler.
> The use case is a db file which is open with concurrent dio writers
> and some 3rd party app decides that it wants to mmap this file
> for some other reason (indexer, virus scan, whatnot) and will taint
> the inode with FUSE_I_CACHE_IO_MODE and degrade db performance
> until db closes the file.

Yeah, so similar use case as mine.

> 
>> @Miklos, could please tell me how to move forward? I definitely need to
>> rebase to fuse-next, but my question is if this patch here should
>> replace Amirs fix (and get back ported) or if we should apply it on top
>> of Amirs patch and so let that simple fix get back ported? Given this is
>> all features and new flags - I'm all for for the simple fix.
>> If you agree on the general approach, I can put this on top of my dio
>> consolidate branch and rebase the rest of the patches on top of it. That
>> part will get a bit more complicated, as we will also need to handle
>> plain O_DIRECT.
>>
> 
> I was planning to post a patch for FUSE_I_CACHE_IO_MODE
> myself, but feel free to work on your version and we could decide
> which parts to take from which patch at the end.

Ok.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-08 19:49                                   ` Bernd Schubert
@ 2023-12-08 20:46                                     ` Amir Goldstein
  2023-12-08 22:38                                       ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-08 20:46 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Fri, Dec 8, 2023 at 9:50 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/8/23 09:39, Amir Goldstein wrote:
> > On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
> > <bernd.schubert@fastmail.fm> wrote:
> >>
> >>
> >>
> >> On 12/7/23 08:39, Amir Goldstein wrote:
> >>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
> >>> <bernd.schubert@fastmail.fm> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 12/6/23 09:25, Amir Goldstein wrote:
> >>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
> >>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
> >>>>>>>> I guess not otherwise, the combination would have been tested.
> >>>>>>>
> >>>>>>> I'm not sure how many people are aware of these different flags/features.
> >>>>>>> I had just finalized the backport of the related patches to RHEL8 on
> >>>>>>> Friday, as we (or our customers) need both for different jobs.
> >>>>>>>
> >>>>>>>>
> >>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
> >>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
> >>>>>>>> for network fs. Right?
> >>>>>>>
> >>>>>>> We kind of have these use cases for our network file systems
> >>>>>>>
> >>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
> >>>>>>>        - Traditional HPC, large files, parallel IO
> >>>>>>>        - Large file used on local node as container for many small files
> >>>>>>>
> >>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
> >>>>>>>        - compilation through gcc (not so important, just not nice when it
> >>>>>>> does not work)
> >>>>>>>        - rather recent: python libraries using mmap _reads_. As it is read
> >>>>>>> only no issue of consistency.
> >>>>>>>
> >>>>>>>
> >>>>>>> These jobs do not intermix - no issue as in generic/095. If such
> >>>>>>> applications really exist, I have no issue with a serialization penalty.
> >>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
> >>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
> >>>>>>>
> >>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
> >>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
> >>>>>>> and post the next version
> >>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
> >>>>>>>
> >>>>>>>
> >>>>>>> In the mean time I have another idea how to solve
> >>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
> >>>>>>
> >>>>>> Please find attached what I had in my mind. With that generic/095 is not
> >>>>>> crashing for me anymore. I just finished the initial coding - it still
> >>>>>> needs a bit cleanup and maybe a few comments.
> >>>>>>
> >>>>>
> >>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
> >>>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
> >>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
> >>>>> of the last open file of the inode.
> >>>>>
> >>>>> I did not understand some of the complexity here:
> >>>>>
> >>>>>>           /* The inode ever got page writes and we do not know for sure
> >>>>>>            * in the DIO path if these are pending - shared lock not possible */
> >>>>>>           spin_lock(&fi->lock);
> >>>>>>           if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
> >>>>>>                   if (!(*cnt_increased)) {
> >>>>>
> >>>>> How can *cnt_increased be true here?
> >>>>
> >>>> I think you missed the 2nd entry into this function, when the shared
> >>>> lock was already taken?
> >>>
> >>> Yeh, I did.
> >>>
> >>>> I have changed the code now to have all
> >>>> complexity in this function (test, lock, retest with lock, release,
> >>>> wakeup). I hope that will make it easier to see the intention of the
> >>>> code. Will post the new patches in the morning.
> >>>>
> >>>
> >>> Sounds good. Current version was a bit hard to follow.
> >>>
> >>>>
> >>>>>
> >>>>>>                           fi->shared_lock_direct_io_ctr++;
> >>>>>>                           *cnt_increased = true;
> >>>>>>                   }
> >>>>>>                   excl_lock = false;
> >>>>>
> >>>>> Seems like in every outcome of this function
> >>>>> *cnt_increased = !excl_lock
> >>>>> so there is not need for out arg cnt_increased
> >>>>
> >>>> If excl_lock would be used as input - yeah, would have worked as well.
> >>>> Or a parameter like "retest-under-lock". Code is changed now to avoid
> >>>> going in and out.
> >>>>
> >>>>>
> >>>>>>           }
> >>>>>>           spin_unlock(&fi->lock);
> >>>>>>
> >>>>>> out:
> >>>>>>           if (excl_lock && *cnt_increased) {
> >>>>>>                   bool wake = false;
> >>>>>>                   spin_lock(&fi->lock);
> >>>>>>                   if (--fi->shared_lock_direct_io_ctr == 0)
> >>>>>>                           wake = true;
> >>>>>>                   spin_unlock(&fi->lock);
> >>>>>>                   if (wake)
> >>>>>>                           wake_up(&fi->direct_io_waitq);
> >>>>>>           }
> >>>>>
> >>>>> I don't see how this wake_up code is reachable.
> >>>>>
> >>>>> TBH, I don't fully understand the expected result.
> >>>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
> >>>>> IIUC, your patch does not prevent dirtying page cache while dio is in
> >>>>> flight. It only prevents writeback while dio is in flight, which is the same
> >>>>> behavior as with exclusive inode lock. Right?
> >>>>
> >>>> Yeah, thanks. I will add it in the patch description.
> >>>>
> >>>> And there was actually an issue with the patch, as cache flushing needs
> >>>> to be initiated before doing the lock decision, fixed now.
> >>>>
> >>>
> >>> I thought there was, because of the wait in fuse_send_writepage()
> >>> but wasn't sure if I was following the flow correctly.
> >>>
> >>>>>
> >>>>> Maybe this interaction is spelled out somewhere else, but if not
> >>>>> better spell it out for people like me that are new to this code.
> >>>>
> >>>> Sure, thanks a lot for your helpful comments!
> >>>>
> >>>
> >>> Just to be clear, this patch looks like a good improvement and
> >>> is mostly independent of the "inode caching mode" and
> >>> FOPEN_CACHE_MMAP idea that I suggested.
> >>>
> >>> The only thing that my idea changes is replacing the
> >>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
> >>> state, which is set earlier than FUSE_I_CACHE_WRITES
> >>> on caching file open or first direct_io mmap and unlike
> >>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
> >>>
> >>> FUSE_I_CACHE_WRITES means that caching writes happened.
> >>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
> >>> may happen.
> >>>
> >>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
> >>> about "caching reads may happen", but IMO that is a small trade off
> >>> to make for maintaining the same state for
> >>> "do not allow parallel dio" and "do not allow passthrough open".
> >>
> >> I think the attached patches should do, it now also unsets
> >
> > IMO, your patch is still more complicated than it should be.
> > There is no need for the complicated retest state machine.
> > If you split the helpers to:
> >
> > bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
> > ...
> > fuse_dio_lock_inode(iocb, &exclusive);
> > ...
> > fuse_dio_unlock_inode(iocb, &exclusive);
> >
> > Then you only need to test FUSE_I_CACHE_IO_MODE in
> > fuse_dio_wr_needs_exclusive_lock()
> > and you only need to increment shared_lock_direct_io_ctr
> > after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.
>
> Hmm, I'm not sure.
>
> I changed fuse_file_mmap() to call this function
>
> /*
>   * direct-io with shared locks cannot handle page cache io - set an inode
>   * flag to disable shared locks and wait until remaining threads are done
>   */
> static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
> {
>         struct fuse_inode *fi = get_fuse_inode(inode);
>
>         spin_lock(&fi->lock);
>         set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
>         while (fi->shared_lock_direct_io_ctr > 0) {
>                 spin_unlock(&fi->lock);
>                 wait_event_interruptible(fi->direct_io_waitq,
>                                          fi->shared_lock_direct_io_ctr == 0);
>                 spin_lock(&fi->lock);
>         }
>         spin_unlock(&fi->lock);
> }
>
>
> Before we had indeed a race. Idea for fuse_file_mmap_handle_dio_writers()
> and fuse_dio_lock_inode() is to either have FUSE_I_CACHE_IO_MODE set,
> or fi->shared_lock_direct_io_ctr is greater 0, but that requires that
> FUSE_I_CACHE_IO_MODE is checked for when fi->lock is taken.
>
>
> I'm going to think about over the weekend if your suggestion
> to increase fi->shared_lock_direct_io_ctr only after taking the shared
> lock is possible. Right now I don't see how to do that.
>
>
> >
> >> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
> >> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
> >> but that aligns with passthrough anyway?
> >
> > Yes.
> >
> > I see that shared_lock_direct_io_ctr is checked without lock or barriers
> > in and the wait_event() should be interruptible.
>
> Thanks, fixed with the function above.
>
> > I am also not sure if it breaks any locking order for mmap because
> > the task that is going to wake it up is holding the shared inode lock...
>
> The waitq has its own lock. We have
>
> fuse_file_mmap - called under some mmap lock, waitq lock
>
> fuse_dio_lock_inode: no lock taken before calling wakeup
>
> fuse_direct_write_iter: wakeup after release of all locks
>
> So I don't think we have a locker issue (lockdep also doesn't annotate
> anything).

I don't think that lockdep can understand this dependency.

> What we definitely cannot do it to take the inode i_rwsem lock in fuse_file_mmap
>

It's complicated. I need to look at the whole thing again.

> >
> > While looking at this code, the invalidate_inode_pages2() looks suspicious.
> > If inode is already in FUSE_I_CACHE_IO_MODE when performing
> > another mmap, doesn't that have potential for data loss?
> > (even before your patch I mean)
> >
> >> Amir, right now it only sets
> >> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
> >> for passthrough there?
> >>
> >
> > We could add a condition, but I don't think that we should.
> > I think we should refrain from different behavior when it is not justified.
> > I think it is not justified to allow parallel dio if any file is open in
> > caching mode on the inode and any mmap (private or shared)
> > exists on the inode.
> >
> > That means that FUSE_I_CACHE_IO_MODE should be set on
> > any mmap, and already on open for non direct_io files.
>
> Ok, I can change and add that. Doing it in open is definitely needed
> for O_DIRECT (in my other dio branch).
>

Good, the more common code the better.

> >
> > Mixing caching and direct io on the same inode is hard as it is
> > already and there is no need to add complexity by allowing
> > parallel dio in that case. IMO it wins us nothing.
>
> So the slight issue I see are people like me, who check the content
> of a file during a long running computation. Like an HPC application
> is doing some long term runs. Then in the middle of
> the run the user wants to see the current content of the file and
> reads it - if that is done through mmap (and from a node that runs
> the application), parallel DIO is disabled with the current patch
> until the file is closed - I see the use case to check for writes.
>

That's what I thought.

>
> >
> > The FUSE_I_CACHE_IO_MODE could be cleared on last file
> > close (as your patch did) but it could be cleared earlier if
> > instead of tracking refcount of open file, we track refcount of
> > files open in caching mode or mmaped, which is what the
> > FOPEN_MMAP_CACHE flag I suggested is for.
>
> But how does open() know that a file/fd is used for mmap?
>

Because what I tried to suggest is a trick/hack:
first mmap on direct_io file sets FOPEN_MMAP_CACHE on the file
and bumps the cached_opens on the inode as if file was
opened in caching mode or in FOPEN_MMAP_CACHE mode.
When the file that was used for mmap is closed and all the rest
of the open files have only ever been used for direct_io, then
the inode exits the caching io mode.

Using an FOPEN flag for that is kind of a hack.
We could add internal file state bits for that as well,
but my thinking was that FOPEN_MMAP_CACHE could really
be set by the server to mean per-file ALLOW_MMAP instead of
the per-filesystem ALLOW_MMAP. Not sure if that will be useful.

Sorry for the hand waving. I was trying to send out a demo
patch that explains it better, but got caught up with other things.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-08 20:46                                     ` Amir Goldstein
@ 2023-12-08 22:38                                       ` Bernd Schubert
  2023-12-12 18:30                                         ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-08 22:38 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/8/23 21:46, Amir Goldstein wrote:
> On Fri, Dec 8, 2023 at 9:50 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/8/23 09:39, Amir Goldstein wrote:
>>> On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
>>> <bernd.schubert@fastmail.fm> wrote:
>>>>
>>>>
>>>>
>>>> On 12/7/23 08:39, Amir Goldstein wrote:
>>>>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 12/6/23 09:25, Amir Goldstein wrote:
>>>>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>>>>>>>> I guess not otherwise, the combination would have been tested.
>>>>>>>>>
>>>>>>>>> I'm not sure how many people are aware of these different flags/features.
>>>>>>>>> I had just finalized the backport of the related patches to RHEL8 on
>>>>>>>>> Friday, as we (or our customers) need both for different jobs.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>>>>>>>> for network fs. Right?
>>>>>>>>>
>>>>>>>>> We kind of have these use cases for our network file systems
>>>>>>>>>
>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>>>>>>>         - Traditional HPC, large files, parallel IO
>>>>>>>>>         - Large file used on local node as container for many small files
>>>>>>>>>
>>>>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>>>>>>>         - compilation through gcc (not so important, just not nice when it
>>>>>>>>> does not work)
>>>>>>>>>         - rather recent: python libraries using mmap _reads_. As it is read
>>>>>>>>> only no issue of consistency.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> These jobs do not intermix - no issue as in generic/095. If such
>>>>>>>>> applications really exist, I have no issue with a serialization penalty.
>>>>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>>>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>>>>>>>
>>>>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>>>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>>>>>>>> and post the next version
>>>>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> In the mean time I have another idea how to solve
>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>>>>>>>
>>>>>>>> Please find attached what I had in my mind. With that generic/095 is not
>>>>>>>> crashing for me anymore. I just finished the initial coding - it still
>>>>>>>> needs a bit cleanup and maybe a few comments.
>>>>>>>>
>>>>>>>
>>>>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>>>>>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
>>>>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
>>>>>>> of the last open file of the inode.
>>>>>>>
>>>>>>> I did not understand some of the complexity here:
>>>>>>>
>>>>>>>>            /* The inode ever got page writes and we do not know for sure
>>>>>>>>             * in the DIO path if these are pending - shared lock not possible */
>>>>>>>>            spin_lock(&fi->lock);
>>>>>>>>            if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>>>>>>>                    if (!(*cnt_increased)) {
>>>>>>>
>>>>>>> How can *cnt_increased be true here?
>>>>>>
>>>>>> I think you missed the 2nd entry into this function, when the shared
>>>>>> lock was already taken?
>>>>>
>>>>> Yeh, I did.
>>>>>
>>>>>> I have changed the code now to have all
>>>>>> complexity in this function (test, lock, retest with lock, release,
>>>>>> wakeup). I hope that will make it easier to see the intention of the
>>>>>> code. Will post the new patches in the morning.
>>>>>>
>>>>>
>>>>> Sounds good. Current version was a bit hard to follow.
>>>>>
>>>>>>
>>>>>>>
>>>>>>>>                            fi->shared_lock_direct_io_ctr++;
>>>>>>>>                            *cnt_increased = true;
>>>>>>>>                    }
>>>>>>>>                    excl_lock = false;
>>>>>>>
>>>>>>> Seems like in every outcome of this function
>>>>>>> *cnt_increased = !excl_lock
>>>>>>> so there is not need for out arg cnt_increased
>>>>>>
>>>>>> If excl_lock would be used as input - yeah, would have worked as well.
>>>>>> Or a parameter like "retest-under-lock". Code is changed now to avoid
>>>>>> going in and out.
>>>>>>
>>>>>>>
>>>>>>>>            }
>>>>>>>>            spin_unlock(&fi->lock);
>>>>>>>>
>>>>>>>> out:
>>>>>>>>            if (excl_lock && *cnt_increased) {
>>>>>>>>                    bool wake = false;
>>>>>>>>                    spin_lock(&fi->lock);
>>>>>>>>                    if (--fi->shared_lock_direct_io_ctr == 0)
>>>>>>>>                            wake = true;
>>>>>>>>                    spin_unlock(&fi->lock);
>>>>>>>>                    if (wake)
>>>>>>>>                            wake_up(&fi->direct_io_waitq);
>>>>>>>>            }
>>>>>>>
>>>>>>> I don't see how this wake_up code is reachable.
>>>>>>>
>>>>>>> TBH, I don't fully understand the expected result.
>>>>>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
>>>>>>> IIUC, your patch does not prevent dirtying page cache while dio is in
>>>>>>> flight. It only prevents writeback while dio is in flight, which is the same
>>>>>>> behavior as with exclusive inode lock. Right?
>>>>>>
>>>>>> Yeah, thanks. I will add it in the patch description.
>>>>>>
>>>>>> And there was actually an issue with the patch, as cache flushing needs
>>>>>> to be initiated before doing the lock decision, fixed now.
>>>>>>
>>>>>
>>>>> I thought there was, because of the wait in fuse_send_writepage()
>>>>> but wasn't sure if I was following the flow correctly.
>>>>>
>>>>>>>
>>>>>>> Maybe this interaction is spelled out somewhere else, but if not
>>>>>>> better spell it out for people like me that are new to this code.
>>>>>>
>>>>>> Sure, thanks a lot for your helpful comments!
>>>>>>
>>>>>
>>>>> Just to be clear, this patch looks like a good improvement and
>>>>> is mostly independent of the "inode caching mode" and
>>>>> FOPEN_CACHE_MMAP idea that I suggested.
>>>>>
>>>>> The only thing that my idea changes is replacing the
>>>>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
>>>>> state, which is set earlier than FUSE_I_CACHE_WRITES
>>>>> on caching file open or first direct_io mmap and unlike
>>>>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
>>>>>
>>>>> FUSE_I_CACHE_WRITES means that caching writes happened.
>>>>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
>>>>> may happen.
>>>>>
>>>>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
>>>>> about "caching reads may happen", but IMO that is a small trade off
>>>>> to make for maintaining the same state for
>>>>> "do not allow parallel dio" and "do not allow passthrough open".
>>>>
>>>> I think the attached patches should do, it now also unsets
>>>
>>> IMO, your patch is still more complicated than it should be.
>>> There is no need for the complicated retest state machine.
>>> If you split the helpers to:
>>>
>>> bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
>>> ...
>>> fuse_dio_lock_inode(iocb, &exclusive);
>>> ...
>>> fuse_dio_unlock_inode(iocb, &exclusive);
>>>
>>> Then you only need to test FUSE_I_CACHE_IO_MODE in
>>> fuse_dio_wr_needs_exclusive_lock()
>>> and you only need to increment shared_lock_direct_io_ctr
>>> after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.
>>
>> Hmm, I'm not sure.
>>
>> I changed fuse_file_mmap() to call this function
>>
>> /*
>>    * direct-io with shared locks cannot handle page cache io - set an inode
>>    * flag to disable shared locks and wait until remaining threads are done
>>    */
>> static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
>> {
>>          struct fuse_inode *fi = get_fuse_inode(inode);
>>
>>          spin_lock(&fi->lock);
>>          set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
>>          while (fi->shared_lock_direct_io_ctr > 0) {
>>                  spin_unlock(&fi->lock);
>>                  wait_event_interruptible(fi->direct_io_waitq,
>>                                           fi->shared_lock_direct_io_ctr == 0);
>>                  spin_lock(&fi->lock);
>>          }
>>          spin_unlock(&fi->lock);
>> }
>>
>>
>> Before we had indeed a race. Idea for fuse_file_mmap_handle_dio_writers()
>> and fuse_dio_lock_inode() is to either have FUSE_I_CACHE_IO_MODE set,
>> or fi->shared_lock_direct_io_ctr is greater 0, but that requires that
>> FUSE_I_CACHE_IO_MODE is checked for when fi->lock is taken.
>>
>>
>> I'm going to think about over the weekend if your suggestion
>> to increase fi->shared_lock_direct_io_ctr only after taking the shared
>> lock is possible. Right now I don't see how to do that.
>>
>>
>>>
>>>> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
>>>> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
>>>> but that aligns with passthrough anyway?
>>>
>>> Yes.
>>>
>>> I see that shared_lock_direct_io_ctr is checked without lock or barriers
>>> in and the wait_event() should be interruptible.
>>
>> Thanks, fixed with the function above.
>>
>>> I am also not sure if it breaks any locking order for mmap because
>>> the task that is going to wake it up is holding the shared inode lock...
>>
>> The waitq has its own lock. We have
>>
>> fuse_file_mmap - called under some mmap lock, waitq lock
>>
>> fuse_dio_lock_inode: no lock taken before calling wakeup
>>
>> fuse_direct_write_iter: wakeup after release of all locks
>>
>> So I don't think we have a locker issue (lockdep also doesn't annotate
>> anything).
> 
> I don't think that lockdep can understand this dependency.
> 
>> What we definitely cannot do it to take the inode i_rwsem lock in fuse_file_mmap
>>
> 
> It's complicated. I need to look at the whole thing again.
> 
>>>
>>> While looking at this code, the invalidate_inode_pages2() looks suspicious.
>>> If inode is already in FUSE_I_CACHE_IO_MODE when performing
>>> another mmap, doesn't that have potential for data loss?
>>> (even before your patch I mean)
>>>
>>>> Amir, right now it only sets
>>>> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
>>>> for passthrough there?
>>>>
>>>
>>> We could add a condition, but I don't think that we should.
>>> I think we should refrain from different behavior when it is not justified.
>>> I think it is not justified to allow parallel dio if any file is open in
>>> caching mode on the inode and any mmap (private or shared)
>>> exists on the inode.
>>>
>>> That means that FUSE_I_CACHE_IO_MODE should be set on
>>> any mmap, and already on open for non direct_io files.
>>
>> Ok, I can change and add that. Doing it in open is definitely needed
>> for O_DIRECT (in my other dio branch).
>>
> 
> Good, the more common code the better.
> 
>>>
>>> Mixing caching and direct io on the same inode is hard as it is
>>> already and there is no need to add complexity by allowing
>>> parallel dio in that case. IMO it wins us nothing.
>>
>> So the slight issue I see are people like me, who check the content
>> of a file during a long running computation. Like an HPC application
>> is doing some long term runs. Then in the middle of
>> the run the user wants to see the current content of the file and
>> reads it - if that is done through mmap (and from a node that runs
>> the application), parallel DIO is disabled with the current patch
>> until the file is closed - I see the use case to check for writes.
>>
> 
> That's what I thought.
> 
>>
>>>
>>> The FUSE_I_CACHE_IO_MODE could be cleared on last file
>>> close (as your patch did) but it could be cleared earlier if
>>> instead of tracking refcount of open file, we track refcount of
>>> files open in caching mode or mmaped, which is what the
>>> FOPEN_MMAP_CACHE flag I suggested is for.
>>
>> But how does open() know that a file/fd is used for mmap?
>>
> 
> Because what I tried to suggest is a trick/hack:
> first mmap on direct_io file sets FOPEN_MMAP_CACHE on the file
> and bumps the cached_opens on the inode as if file was
> opened in caching mode or in FOPEN_MMAP_CACHE mode.
> When the file that was used for mmap is closed and all the rest
> of the open files have only ever been used for direct_io, then
> inode exists the caching io mode.
> 
> Using an FOPEN flag for that is kind of a hack.
> We could add an internal file state bits for that as well,
> but my thinking was that FOPEN_MMAP_CACHE could really
> be set by the server to mean per-file ALLOW_MMAP instead of
> the per-filesystem ALLOW_MMAP. Not sure if that will be useful.

Ok, I will try to add that in a different patch to have better
visibility. I will also put these patches in front of my dio branch and
rebase these patches. This adds a bit of additional complexity to
handle O_DIRECT, but it also consolidates the direct-IO write code paths.
At least I hope this is still possible with the new changes.

> 
> Sorry for the hand waving. I was trying to send out a demo
> patch that explains it better, but got caught up with other things.

No problem at all, I think I know what you mean and I can try to add this
myself.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-08 22:38                                       ` Bernd Schubert
@ 2023-12-12 18:30                                         ` Amir Goldstein
  2023-12-12 22:07                                           ` Bernd Schubert
  2023-12-19  0:03                                           ` Bernd Schubert
  0 siblings, 2 replies; 48+ messages in thread
From: Amir Goldstein @ 2023-12-12 18:30 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Sat, Dec 9, 2023 at 12:38 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/8/23 21:46, Amir Goldstein wrote:
> > On Fri, Dec 8, 2023 at 9:50 PM Bernd Schubert
> > <bernd.schubert@fastmail.fm> wrote:
> >>
> >>
> >>
> >> On 12/8/23 09:39, Amir Goldstein wrote:
> >>> On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
> >>> <bernd.schubert@fastmail.fm> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 12/7/23 08:39, Amir Goldstein wrote:
> >>>>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
> >>>>> <bernd.schubert@fastmail.fm> wrote:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 12/6/23 09:25, Amir Goldstein wrote:
> >>>>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
> >>>>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
> >>>>>>>>>> I guess not otherwise, the combination would have been tested.
> >>>>>>>>>
> >>>>>>>>> I'm not sure how many people are aware of these different flags/features.
> >>>>>>>>> I had just finalized the backport of the related patches to RHEL8 on
> >>>>>>>>> Friday, as we (or our customers) need both for different jobs.
> >>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
> >>>>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
> >>>>>>>>>> for network fs. Right?
> >>>>>>>>>
> >>>>>>>>> We kind of have these use cases for our network file systems
> >>>>>>>>>
> >>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
> >>>>>>>>>         - Traditional HPC, large files, parallel IO
> >>>>>>>>>         - Large file used on local node as container for many small files
> >>>>>>>>>
> >>>>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
> >>>>>>>>>         - compilation through gcc (not so important, just not nice when it
> >>>>>>>>> does not work)
> >>>>>>>>>         - rather recent: python libraries using mmap _reads_. As it is read
> >>>>>>>>> only no issue of consistency.
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> These jobs do not intermix - no issue as in generic/095. If such
> >>>>>>>>> applications really exist, I have no issue with a serialization penalty.
> >>>>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
> >>>>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
> >>>>>>>>>
> >>>>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
> >>>>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
> >>>>>>>>> and post the next version
> >>>>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> In the mean time I have another idea how to solve
> >>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
> >>>>>>>>
> >>>>>>>> Please find attached what I had in my mind. With that generic/095 is not
> >>>>>>>> crashing for me anymore. I just finished the initial coding - it still
> >>>>>>>> needs a bit cleanup and maybe a few comments.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
> >>>>>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
> >>>>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
> >>>>>>> of the last open file of the inode.
> >>>>>>>
> >>>>>>> I did not understand some of the complexity here:
> >>>>>>>
> >>>>>>>>            /* The inode ever got page writes and we do not know for sure
> >>>>>>>>             * in the DIO path if these are pending - shared lock not possible */
> >>>>>>>>            spin_lock(&fi->lock);
> >>>>>>>>            if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
> >>>>>>>>                    if (!(*cnt_increased)) {
> >>>>>>>
> >>>>>>> How can *cnt_increased be true here?
> >>>>>>
> >>>>>> I think you missed the 2nd entry into this function, when the shared
> >>>>>> lock was already taken?
> >>>>>
> >>>>> Yeh, I did.
> >>>>>
> >>>>>> I have changed the code now to have all
> >>>>>> complexity in this function (test, lock, retest with lock, release,
> >>>>>> wakeup). I hope that will make it easier to see the intention of the
> >>>>>> code. Will post the new patches in the morning.
> >>>>>>
> >>>>>
> >>>>> Sounds good. Current version was a bit hard to follow.
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>>>                            fi->shared_lock_direct_io_ctr++;
> >>>>>>>>                            *cnt_increased = true;
> >>>>>>>>                    }
> >>>>>>>>                    excl_lock = false;
> >>>>>>>
> >>>>>>> Seems like in every outcome of this function
> >>>>>>> *cnt_increased = !excl_lock
> >>>>>>> so there is not need for out arg cnt_increased
> >>>>>>
> >>>>>> If excl_lock would be used as input - yeah, would have worked as well.
> >>>>>> Or a parameter like "retest-under-lock". Code is changed now to avoid
> >>>>>> going in and out.
> >>>>>>
> >>>>>>>
> >>>>>>>>            }
> >>>>>>>>            spin_unlock(&fi->lock);
> >>>>>>>>
> >>>>>>>> out:
> >>>>>>>>            if (excl_lock && *cnt_increased) {
> >>>>>>>>                    bool wake = false;
> >>>>>>>>                    spin_lock(&fi->lock);
> >>>>>>>>                    if (--fi->shared_lock_direct_io_ctr == 0)
> >>>>>>>>                            wake = true;
> >>>>>>>>                    spin_unlock(&fi->lock);
> >>>>>>>>                    if (wake)
> >>>>>>>>                            wake_up(&fi->direct_io_waitq);
> >>>>>>>>            }
> >>>>>>>
> >>>>>>> I don't see how this wake_up code is reachable.
> >>>>>>>
> >>>>>>> TBH, I don't fully understand the expected result.
> >>>>>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
> >>>>>>> IIUC, your patch does not prevent dirtying page cache while dio is in
> >>>>>>> flight. It only prevents writeback while dio is in flight, which is the same
> >>>>>>> behavior as with exclusive inode lock. Right?
> >>>>>>
> >>>>>> Yeah, thanks. I will add it in the patch description.
> >>>>>>
> >>>>>> And there was actually an issue with the patch, as cache flushing needs
> >>>>>> to be initiated before doing the lock decision, fixed now.
> >>>>>>
> >>>>>
> >>>>> I thought there was, because of the wait in fuse_send_writepage()
> >>>>> but wasn't sure if I was following the flow correctly.
> >>>>>
> >>>>>>>
> >>>>>>> Maybe this interaction is spelled out somewhere else, but if not
> >>>>>>> better spell it out for people like me that are new to this code.
> >>>>>>
> >>>>>> Sure, thanks a lot for your helpful comments!
> >>>>>>
> >>>>>
> >>>>> Just to be clear, this patch looks like a good improvement and
> >>>>> is mostly independent of the "inode caching mode" and
> >>>>> FOPEN_CACHE_MMAP idea that I suggested.
> >>>>>
> >>>>> The only thing that my idea changes is replacing the
> >>>>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
> >>>>> state, which is set earlier than FUSE_I_CACHE_WRITES
> >>>>> on caching file open or first direct_io mmap and unlike
> >>>>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
> >>>>>
> >>>>> FUSE_I_CACHE_WRITES means that caching writes happened.
> >>>>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
> >>>>> may happen.
> >>>>>
> >>>>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
> >>>>> about "caching reads may happen", but IMO that is a small trade off
> >>>>> to make for maintaining the same state for
> >>>>> "do not allow parallel dio" and "do not allow passthrough open".
> >>>>
> >>>> I think the attached patches should do, it now also unsets
> >>>
> >>> IMO, your patch is still more complicated than it should be.
> >>> There is no need for the complicated retest state machine.
> >>> If you split the helpers to:
> >>>
> >>> bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
> >>> ...
> >>> fuse_dio_lock_inode(iocb, &exclusive);
> >>> ...
> >>> fuse_dio_unlock_inode(iocb, &exclusive);
> >>>
> >>> Then you only need to test FUSE_I_CACHE_IO_MODE in
> >>> fuse_dio_wr_needs_exclusive_lock()
> >>> and you only need to increment shared_lock_direct_io_ctr
> >>> after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.
> >>
> >> Hmm, I'm not sure.
> >>
> >> I changed fuse_file_mmap() to call this function
> >>
> >> /*
> >>    * direct-io with shared locks cannot handle page cache io - set an inode
> >>    * flag to disable shared locks and wait until remaining threads are done
> >>    */
> >> static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
> >> {
> >>          struct fuse_inode *fi = get_fuse_inode(inode);
> >>
> >>          spin_lock(&fi->lock);
> >>          set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
> >>          while (fi->shared_lock_direct_io_ctr > 0) {
> >>                  spin_unlock(&fi->lock);
> >>                  wait_event_interruptible(fi->direct_io_waitq,
> >>                                           fi->shared_lock_direct_io_ctr == 0);
> >>                  spin_lock(&fi->lock);
> >>          }
> >>          spin_unlock(&fi->lock);
> >> }
> >>
> >>
> >> Before we had indeed a race. Idea for fuse_file_mmap_handle_dio_writers()
> >> and fuse_dio_lock_inode() is to either have FUSE_I_CACHE_IO_MODE set,
> >> or fi->shared_lock_direct_io_ctr is greater 0, but that requires that
> >> FUSE_I_CACHE_IO_MODE is checked for when fi->lock is taken.
> >>
> >>
> >> I'm going to think about over the weekend if your suggestion
> >> to increase fi->shared_lock_direct_io_ctr only after taking the shared
> >> lock is possible. Right now I don't see how to do that.
> >>
> >>
> >>>
> >>>> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
> >>>> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
> >>>> but that aligns with passthrough anyway?
> >>>
> >>> Yes.
> >>>
> >>> I see that shared_lock_direct_io_ctr is checked without lock or barriers
> >>> in and the wait_event() should be interruptible.
> >>
> >> Thanks, fixed with the function above.
> >>
> >>> I am also not sure if it breaks any locking order for mmap because
> >>> the task that is going to wake it up is holding the shared inode lock...
> >>
> >> The waitq has its own lock. We have
> >>
> >> fuse_file_mmap - called under some mmap lock, waitq lock
> >>
> >> fuse_dio_lock_inode: no lock taken before calling wakeup
> >>
> >> fuse_direct_write_iter: wakeup after release of all locks
> >>
> >> So I don't think we have a locker issue (lockdep also doesn't annotate
> >> anything).
> >
> > I don't think that lockdep can understand this dependency.
> >
> >> What we definitely cannot do it to take the inode i_rwsem lock in fuse_file_mmap
> >>
> >
> > It's complicated. I need to look at the whole thing again.
> >
> >>>
> >>> While looking at this code, the invalidate_inode_pages2() looks suspicious.
> >>> If inode is already in FUSE_I_CACHE_IO_MODE when performing
> >>> another mmap, doesn't that have potential for data loss?
> >>> (even before your patch I mean)
> >>>
> >>>> Amir, right now it only sets
> >>>> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
> >>>> for passthrough there?
> >>>>
> >>>
> >>> We could add a condition, but I don't think that we should.
> >>> I think we should refrain from different behavior when it is not justified.
> >>> I think it is not justified to allow parallel dio if any file is open in
> >>> caching mode on the inode and any mmap (private or shared)
> >>> exists on the inode.
> >>>
> >>> That means that FUSE_I_CACHE_IO_MODE should be set on
> >>> any mmap, and already on open for non direct_io files.
> >>
> >> Ok, I can change and add that. Doing it in open is definitely needed
> >> for O_DIRECT (in my other dio branch).
> >>
> >
> > Good, the more common code the better.
> >
> >>>
> >>> Mixing caching and direct io on the same inode is hard as it is
> >>> already and there is no need to add complexity by allowing
> >>> parallel dio in that case. IMO it wins us nothing.
> >>
> >> So the slight issue I see are people like me, who check the content
> >> of a file during a long running computation. Like an HPC application
> >> is doing some long term runs. Then in the middle of
> >> the run the user wants to see the current content of the file and
> >> reads it - if that is done through mmap (and from a node that runs
> >> the application), parallel DIO is disabled with the current patch
> >> until the file is closed - I see the use case to check for writes.
> >>
> >
> > That's what I thought.
> >
> >>
> >>>
> >>> The FUSE_I_CACHE_IO_MODE could be cleared on last file
> >>> close (as your patch did) but it could be cleared earlier if
> >>> instead of tracking refcount of open file, we track refcount of
> >>> files open in caching mode or mmaped, which is what the
> >>> FOPEN_MMAP_CACHE flag I suggested is for.
> >>
> >> But how does open() know that a file/fd is used for mmap?
> >>
> >
> > Because what I tried to suggest is a trick/hack:
> > first mmap on direct_io file sets FOPEN_MMAP_CACHE on the file
> > and bumps the cached_opens on the inode as if file was
> > opened in caching mode or in FOPEN_MMAP_CACHE mode.
> > When the file that was used for mmap is closed and all the rest
> > of the open files have only ever been used for direct_io, then
> > inode exists the caching io mode.
> >
> > Using an FOPEN flag for that is kind of a hack.
> > We could add an internal file state bits for that as well,
> > but my thinking was that FOPEN_MMAP_CACHE could really
> > be set by the server to mean per-file ALLOW_MMAP instead of
> > the per-filesystem ALLOW_MMAP. Not sure if that will be useful.
>
> Ok, I will try to add that in a different patch to have better
> visibility. Will also put these patch here in front of my dio branch and
> rebase these patches. There comes in a bit additional complexity to
> handle O_DIRECT, but it also consolidates direct-IO writes code paths.
> At least I hope this is still possible with the new changes.
>
> >
> > Sorry for the hand waving. I was trying to send out a demo
> > patch that explains it better, but got caught up with other things.
>
> No problem at all, I think I know what you mean and I can try add this
> myself.

Here is what I was thinking about:

https://github.com/amir73il/linux/commits/fuse_io_mode

The concept that I wanted to introduce was the
fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
helpers (akin to deny_write_access()/allow_write_access()).

In this patch, parallel dio in progress denies open in caching mode
and mmap, and I don't know if that is acceptable.
Technically, instead of denying open/mmap you can use additional
techniques to wait for in-progress dio and allow caching open/mmap.

Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
pattern when file is open in FOPEN_PASSTHROUGH mode, but
in this case, as agreed with Miklos, a server trying to mix open
in caching mode on the same inode is going to fail the open.

mmap is less of a problem for an inode in passthrough mode, because
mmap of a direct_io file on an inode in passthrough mode is a passthrough
mmap to the backing file.

Anyway, if you can use this patch or parts of it, be my guest and if you
want to use a different approach that is fine by me as well - in that case
I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-12 18:30                                         ` Amir Goldstein
@ 2023-12-12 22:07                                           ` Bernd Schubert
       [not found]                                             ` <CAOQ4uxh=aBFEiBVBErEA_d+mWcTOysLgbgWVztSzL+D2BvMLdA@mail.gmail.com>
  2023-12-19  0:03                                           ` Bernd Schubert
  1 sibling, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-12 22:07 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/12/23 19:30, Amir Goldstein wrote:
> On Sat, Dec 9, 2023 at 12:38 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/8/23 21:46, Amir Goldstein wrote:
>>> On Fri, Dec 8, 2023 at 9:50 PM Bernd Schubert
>>> <bernd.schubert@fastmail.fm> wrote:
>>>>
>>>>
>>>>
>>>> On 12/8/23 09:39, Amir Goldstein wrote:
>>>>> On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 12/7/23 08:39, Amir Goldstein wrote:
>>>>>>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
>>>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 12/6/23 09:25, Amir Goldstein wrote:
>>>>>>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>>>>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>>>>>>>>>> I guess not otherwise, the combination would have been tested.
>>>>>>>>>>>
>>>>>>>>>>> I'm not sure how many people are aware of these different flags/features.
>>>>>>>>>>> I had just finalized the backport of the related patches to RHEL8 on
>>>>>>>>>>> Friday, as we (or our customers) need both for different jobs.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>>>>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>>>>>>>>>> for network fs. Right?
>>>>>>>>>>>
>>>>>>>>>>> We kind of have these use cases for our network file systems
>>>>>>>>>>>
>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>>>>>>>>>          - Traditional HPC, large files, parallel IO
>>>>>>>>>>>          - Large file used on local node as container for many small files
>>>>>>>>>>>
>>>>>>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>>>>>>>>>          - compilation through gcc (not so important, just not nice when it
>>>>>>>>>>> does not work)
>>>>>>>>>>>          - rather recent: python libraries using mmap _reads_. As it is read
>>>>>>>>>>> only no issue of consistency.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> These jobs do not intermix - no issue as in generic/095. If such
>>>>>>>>>>> applications really exist, I have no issue with a serialization penalty.
>>>>>>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>>>>>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>>>>>>>>>
>>>>>>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>>>>>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>>>>>>>>>> and post the next version
>>>>>>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> In the mean time I have another idea how to solve
>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>>>>>>>>>
>>>>>>>>>> Please find attached what I had in my mind. With that generic/095 is not
>>>>>>>>>> crashing for me anymore. I just finished the initial coding - it still
>>>>>>>>>> needs a bit cleanup and maybe a few comments.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>>>>>>>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
>>>>>>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
>>>>>>>>> of the last open file of the inode.
>>>>>>>>>
>>>>>>>>> I did not understand some of the complexity here:
>>>>>>>>>
>>>>>>>>>>             /* The inode ever got page writes and we do not know for sure
>>>>>>>>>>              * in the DIO path if these are pending - shared lock not possible */
>>>>>>>>>>             spin_lock(&fi->lock);
>>>>>>>>>>             if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>>>>>>>>>                     if (!(*cnt_increased)) {
>>>>>>>>>
>>>>>>>>> How can *cnt_increased be true here?
>>>>>>>>
>>>>>>>> I think you missed the 2nd entry into this function, when the shared
>>>>>>>> lock was already taken?
>>>>>>>
>>>>>>> Yeh, I did.
>>>>>>>
>>>>>>>> I have changed the code now to have all
>>>>>>>> complexity in this function (test, lock, retest with lock, release,
>>>>>>>> wakeup). I hope that will make it easier to see the intention of the
>>>>>>>> code. Will post the new patches in the morning.
>>>>>>>>
>>>>>>>
>>>>>>> Sounds good. Current version was a bit hard to follow.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>                             fi->shared_lock_direct_io_ctr++;
>>>>>>>>>>                             *cnt_increased = true;
>>>>>>>>>>                     }
>>>>>>>>>>                     excl_lock = false;
>>>>>>>>>
>>>>>>>>> Seems like in every outcome of this function
>>>>>>>>> *cnt_increased = !excl_lock
>>>>>>>>> so there is not need for out arg cnt_increased
>>>>>>>>
>>>>>>>> If excl_lock would be used as input - yeah, would have worked as well.
>>>>>>>> Or a parameter like "retest-under-lock". Code is changed now to avoid
>>>>>>>> going in and out.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>             }
>>>>>>>>>>             spin_unlock(&fi->lock);
>>>>>>>>>>
>>>>>>>>>> out:
>>>>>>>>>>             if (excl_lock && *cnt_increased) {
>>>>>>>>>>                     bool wake = false;
>>>>>>>>>>                     spin_lock(&fi->lock);
>>>>>>>>>>                     if (--fi->shared_lock_direct_io_ctr == 0)
>>>>>>>>>>                             wake = true;
>>>>>>>>>>                     spin_unlock(&fi->lock);
>>>>>>>>>>                     if (wake)
>>>>>>>>>>                             wake_up(&fi->direct_io_waitq);
>>>>>>>>>>             }
>>>>>>>>>
>>>>>>>>> I don't see how this wake_up code is reachable.
>>>>>>>>>
>>>>>>>>> TBH, I don't fully understand the expected result.
>>>>>>>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
>>>>>>>>> IIUC, your patch does not prevent dirtying page cache while dio is in
>>>>>>>>> flight. It only prevents writeback while dio is in flight, which is the same
>>>>>>>>> behavior as with exclusive inode lock. Right?
>>>>>>>>
>>>>>>>> Yeah, thanks. I will add it in the patch description.
>>>>>>>>
>>>>>>>> And there was actually an issue with the patch, as cache flushing needs
>>>>>>>> to be initiated before doing the lock decision, fixed now.
>>>>>>>>
>>>>>>>
>>>>>>> I thought there was, because of the wait in fuse_send_writepage()
>>>>>>> but wasn't sure if I was following the flow correctly.
>>>>>>>
>>>>>>>>>
>>>>>>>>> Maybe this interaction is spelled out somewhere else, but if not
>>>>>>>>> better spell it out for people like me that are new to this code.
>>>>>>>>
>>>>>>>> Sure, thanks a lot for your helpful comments!
>>>>>>>>
>>>>>>>
>>>>>>> Just to be clear, this patch looks like a good improvement and
>>>>>>> is mostly independent of the "inode caching mode" and
>>>>>>> FOPEN_CACHE_MMAP idea that I suggested.
>>>>>>>
>>>>>>> The only thing that my idea changes is replacing the
>>>>>>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
>>>>>>> state, which is set earlier than FUSE_I_CACHE_WRITES
>>>>>>> on caching file open or first direct_io mmap and unlike
>>>>>>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
>>>>>>>
>>>>>>> FUSE_I_CACHE_WRITES means that caching writes happened.
>>>>>>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
>>>>>>> may happen.
>>>>>>>
>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
>>>>>>> about "caching reads may happen", but IMO that is a small trade off
>>>>>>> to make for maintaining the same state for
>>>>>>> "do not allow parallel dio" and "do not allow passthrough open".
>>>>>>
>>>>>> I think the attached patches should do, it now also unsets
>>>>>
>>>>> IMO, your patch is still more complicated than it should be.
>>>>> There is no need for the complicated retest state machine.
>>>>> If you split the helpers to:
>>>>>
>>>>> bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
>>>>> ...
>>>>> fuse_dio_lock_inode(iocb, &exclusive);
>>>>> ...
>>>>> fuse_dio_unlock_inode(iocb, &exclusive);
>>>>>
>>>>> Then you only need to test FUSE_I_CACHE_IO_MODE in
>>>>> fuse_dio_wr_needs_exclusive_lock()
>>>>> and you only need to increment shared_lock_direct_io_ctr
>>>>> after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.
>>>>
>>>> Hmm, I'm not sure.
>>>>
>>>> I changed fuse_file_mmap() to call this function
>>>>
>>>> /*
>>>>     * direct-io with shared locks cannot handle page cache io - set an inode
>>>>     * flag to disable shared locks and wait until remaining threads are done
>>>>     */
>>>> static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
>>>> {
>>>>           struct fuse_inode *fi = get_fuse_inode(inode);
>>>>
>>>>           spin_lock(&fi->lock);
>>>>           set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
>>>>           while (fi->shared_lock_direct_io_ctr > 0) {
>>>>                   spin_unlock(&fi->lock);
>>>>                   wait_event_interruptible(fi->direct_io_waitq,
>>>>                                            fi->shared_lock_direct_io_ctr == 0);
>>>>                   spin_lock(&fi->lock);
>>>>           }
>>>>           spin_unlock(&fi->lock);
>>>> }
>>>>
>>>>
>>>> Before we had indeed a race. Idea for fuse_file_mmap_handle_dio_writers()
>>>> and fuse_dio_lock_inode() is to either have FUSE_I_CACHE_IO_MODE set,
>>>> or fi->shared_lock_direct_io_ctr is greater 0, but that requires that
>>>> FUSE_I_CACHE_IO_MODE is checked for when fi->lock is taken.
>>>>
>>>>
>>>> I'm going to think about over the weekend if your suggestion
>>>> to increase fi->shared_lock_direct_io_ctr only after taking the shared
>>>> lock is possible. Right now I don't see how to do that.
>>>>
>>>>
>>>>>
>>>>>> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
>>>>>> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
>>>>>> but that aligns with passthrough anyway?
>>>>>
>>>>> Yes.
>>>>>
>>>>> I see that shared_lock_direct_io_ctr is checked without lock or barriers
>>>>> in and the wait_event() should be interruptible.
>>>>
>>>> Thanks, fixed with the function above.
>>>>
>>>>> I am also not sure if it breaks any locking order for mmap because
>>>>> the task that is going to wake it up is holding the shared inode lock...
>>>>
>>>> The waitq has its own lock. We have
>>>>
>>>> fuse_file_mmap - called under some mmap lock, waitq lock
>>>>
>>>> fuse_dio_lock_inode: no lock taken before calling wakeup
>>>>
>>>> fuse_direct_write_iter: wakeup after release of all locks
>>>>
>>>> So I don't think we have a locking issue (lockdep also doesn't annotate
>>>> anything).
>>>
>>> I don't think that lockdep can understand this dependency.
>>>
>>>> What we definitely cannot do is to take the inode i_rwsem lock in fuse_file_mmap
>>>>
>>>
>>> It's complicated. I need to look at the whole thing again.
>>>
>>>>>
>>>>> While looking at this code, the invalidate_inode_pages2() looks suspicious.
>>>>> If inode is already in FUSE_I_CACHE_IO_MODE when performing
>>>>> another mmap, doesn't that have potential for data loss?
>>>>> (even before your patch I mean)
>>>>>
>>>>>> Amir, right now it only sets
>>>>>> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
>>>>>> for passthrough there?
>>>>>>
>>>>>
>>>>> We could add a condition, but I don't think that we should.
>>>>> I think we should refrain from different behavior when it is not justified.
>>>>> I think it is not justified to allow parallel dio if any file is open in
>>>>> caching mode on the inode and any mmap (private or shared)
>>>>> exists on the inode.
>>>>>
>>>>> That means that FUSE_I_CACHE_IO_MODE should be set on
>>>>> any mmap, and already on open for non direct_io files.
>>>>
>>>> Ok, I can change and add that. Doing it in open is definitely needed
>>>> for O_DIRECT (in my other dio branch).
>>>>
>>>
>>> Good, the more common code the better.
>>>
>>>>>
>>>>> Mixing caching and direct io on the same inode is hard as it is
>>>>> already and there is no need to add complexity by allowing
>>>>> parallel dio in that case. IMO it wins us nothing.
>>>>
>>>> So the slight issue I see are people like me, who check the content
>>>> of a file during a long running computation. Like an HPC application
>>>> is doing some long term runs. Then in the middle of
>>>> the run the user wants to see the current content of the file and
>>>> reads it - if that is done through mmap (and from a node that runs
>>>> the application), parallel DIO is disabled with the current patch
>>>> until the file is closed - I see the use case to check for writes.
>>>>
>>>
>>> That's what I thought.
>>>
>>>>
>>>>>
>>>>> The FUSE_I_CACHE_IO_MODE could be cleared on last file
>>>>> close (as your patch did) but it could be cleared earlier if
>>>>> instead of tracking refcount of open file, we track refcount of
>>>>> files open in caching mode or mmaped, which is what the
>>>>> FOPEN_MMAP_CACHE flag I suggested is for.
>>>>
>>>> But how does open() know that a file/fd is used for mmap?
>>>>
>>>
>>> Because what I tried to suggest is a trick/hack:
>>> first mmap on direct_io file sets FOPEN_MMAP_CACHE on the file
>>> and bumps the cached_opens on the inode as if file was
>>> opened in caching mode or in FOPEN_MMAP_CACHE mode.
>>> When the file that was used for mmap is closed and all the rest
>>> of the open files have only ever been used for direct_io, then
>>> inode exits the caching io mode.
>>>
>>> Using an FOPEN flag for that is kind of a hack.
>>> We could add an internal file state bits for that as well,
>>> but my thinking was that FOPEN_MMAP_CACHE could really
>>> be set by the server to mean per-file ALLOW_MMAP instead of
>>> the per-filesystem ALLOW_MMAP. Not sure if that will be useful.
>>
>> Ok, I will try to add that in a different patch to have better
>> visibility. Will also put these patch here in front of my dio branch and
>> rebase these patches. There comes in a bit additional complexity to
>> handle O_DIRECT, but it also consolidates direct-IO writes code paths.
>> At least I hope this is still possible with the new changes.
>>
>>>
>>> Sorry for the hand waving. I was trying to send out a demo
>>> patch that explains it better, but got caught up with other things.
>>
>> No problem at all, I think I know what you mean and I can try add this
>> myself.
> 
> Here is what I was thinking about:
> 
> https://github.com/amir73il/linux/commits/fuse_io_mode
> 
> The concept that I wanted to introduce was the
> fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
> helpers (akin to deny_write_access()/allow_write_access()).
> 
> In this patch, parallel dio in progress denies open in caching mode
> and mmap, and I don't know if that is acceptable.
> Technically, instead of deny open/mmap you can use additional
> techniques to wait for in progress dio and allow caching open/mmap.
> 
> Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
> pattern when file is open in FOPEN_PASSTHROUGH mode, but
> in this case, as agreed with Miklos, a server trying to mix open
> in caching mode on the same inode is going to fail the open.
> 
> mmap is less of a problem for an inode in passthrough mode, because
> mmap of a direct_io file on an inode in passthrough mode is a passthrough
> mmap to the backing file.
> 
> Anyway, if you can use this patch or parts of it, be my guest and if you
> want to use a different approach that is fine by me as well - in that case
> I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.

Thanks Amir, I'm going to look at it in detail in the morning.
Btw, there is another bad direct_io_allow_mmap issue (part of it is
invalidate_inode_pages2, which you already noticed, but that is not all of it).
Going to send out the patch once xfstests have passed
https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
       [not found]                                             ` <CAOQ4uxh=aBFEiBVBErEA_d+mWcTOysLgbgWVztSzL+D2BvMLdA@mail.gmail.com>
@ 2023-12-13 10:11                                               ` Bernd Schubert
  2023-12-13 11:23                                                 ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-13 10:11 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, Miklos Szeredi,
	gmaglione, Max Reitz, Hao Xu, Dharmendra Singh



On 12/13/23 05:21, Amir Goldstein wrote:
> 
> 
> On Wed, Dec 13, 2023, 12:07 AM Bernd Schubert 
> <bernd.schubert@fastmail.fm <mailto:bernd.schubert@fastmail.fm>> wrote:
> 
> 
> 
>     On 12/12/23 19:30, Amir Goldstein wrote:
>      > On Sat, Dec 9, 2023 at 12:38 AM Bernd Schubert
>      > <bernd.schubert@fastmail.fm <mailto:bernd.schubert@fastmail.fm>>
>     wrote:
>      >>
>      >>
>      >>
>      >> On 12/8/23 21:46, Amir Goldstein wrote:
>      >>> On Fri, Dec 8, 2023 at 9:50 PM Bernd Schubert
>      >>> <bernd.schubert@fastmail.fm
>     <mailto:bernd.schubert@fastmail.fm>> wrote:
>      >>>>
>      >>>>
>      >>>>
>      >>>> On 12/8/23 09:39, Amir Goldstein wrote:
>      >>>>> On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
>      >>>>> <bernd.schubert@fastmail.fm
>     <mailto:bernd.schubert@fastmail.fm>> wrote:
>      >>>>>>
>      >>>>>>
>      >>>>>>
>      >>>>>> On 12/7/23 08:39, Amir Goldstein wrote:
>      >>>>>>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
>      >>>>>>> <bernd.schubert@fastmail.fm
>     <mailto:bernd.schubert@fastmail.fm>> wrote:
>      >>>>>>>>
>      >>>>>>>>
>      >>>>>>>>
>      >>>>>>>> On 12/6/23 09:25, Amir Goldstein wrote:
>      >>>>>>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>      >>>>>>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>      >>>>>>>>>>>> I guess not otherwise, the combination would have been
>     tested.
>      >>>>>>>>>>>
>      >>>>>>>>>>> I'm not sure how many people are aware of these
>     different flags/features.
>      >>>>>>>>>>> I had just finalized the backport of the related
>     patches to RHEL8 on
>      >>>>>>>>>>> Friday, as we (or our customers) need both for
>     different jobs.
>      >>>>>>>>>>>
>      >>>>>>>>>>>>
>      >>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>      >>>>>>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>      >>>>>>>>>>>> for network fs. Right?
>      >>>>>>>>>>>
>      >>>>>>>>>>> We kind of have these use cases for our network file
>     systems
>      >>>>>>>>>>>
>      >>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>      >>>>>>>>>>>          - Traditional HPC, large files, parallel IO
>      >>>>>>>>>>>          - Large file used on local node as container
>     for many small files
>      >>>>>>>>>>>
>      >>>>>>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>      >>>>>>>>>>>          - compilation through gcc (not so important,
>     just not nice when it
>      >>>>>>>>>>> does not work)
>      >>>>>>>>>>>          - rather recent: python libraries using mmap
>     _reads_. As it is read
>      >>>>>>>>>>> only no issue of consistency.
>      >>>>>>>>>>>
>      >>>>>>>>>>>
>      >>>>>>>>>>> These jobs do not intermix - no issue as in
>     generic/095. If such
>      >>>>>>>>>>> applications really exist, I have no issue with a
>     serialization penalty.
>      >>>>>>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>      >>>>>>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is
>     not so nice.
>      >>>>>>>>>>>
>      >>>>>>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES
>     to work on plain
>      >>>>>>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to
>     update this branch
>      >>>>>>>>>>> and post the next version
>      >>>>>>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>     <https://github.com/bsbernd/linux/commits/fuse-dio-v4>
>      >>>>>>>>>>>
>      >>>>>>>>>>>
>      >>>>>>>>>>> In the mean time I have another idea how to solve
>      >>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>      >>>>>>>>>>
>      >>>>>>>>>> Please find attached what I had in my mind. With that
>     generic/095 is not
>      >>>>>>>>>> crashing for me anymore. I just finished the initial
>     coding - it still
>      >>>>>>>>>> needs a bit cleanup and maybe a few comments.
>      >>>>>>>>>>
>      >>>>>>>>>
>      >>>>>>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>      >>>>>>>>> For FUSE_PASSTHROUGH I will need to track if inode is
>     open/mapped
>      >>>>>>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on
>     release
>      >>>>>>>>> of the last open file of the inode.
>      >>>>>>>>>
>      >>>>>>>>> I did not understand some of the complexity here:
>      >>>>>>>>>
>      >>>>>>>>>>             /* The inode ever got page writes and we do
>     not know for sure
>      >>>>>>>>>>              * in the DIO path if these are pending -
>     shared lock not possible */
>      >>>>>>>>>>             spin_lock(&fi->lock);
>      >>>>>>>>>>             if (!test_bit(FUSE_I_CACHE_WRITES,
>     &fi->state)) {
>      >>>>>>>>>>                     if (!(*cnt_increased)) {
>      >>>>>>>>>
>      >>>>>>>>> How can *cnt_increased be true here?
>      >>>>>>>>
>      >>>>>>>> I think you missed the 2nd entry into this function, when
>     the shared
>      >>>>>>>> lock was already taken?
>      >>>>>>>
>      >>>>>>> Yeh, I did.
>      >>>>>>>
>      >>>>>>>> I have changed the code now to have all
>      >>>>>>>> complexity in this function (test, lock, retest with lock,
>     release,
>      >>>>>>>> wakeup). I hope that will make it easier to see the
>     intention of the
>      >>>>>>>> code. Will post the new patches in the morning.
>      >>>>>>>>
>      >>>>>>>
>      >>>>>>> Sounds good. Current version was a bit hard to follow.
>      >>>>>>>
>      >>>>>>>>
>      >>>>>>>>>
>      >>>>>>>>>>                             fi->shared_lock_direct_io_ctr++;
>      >>>>>>>>>>                             *cnt_increased = true;
>      >>>>>>>>>>                     }
>      >>>>>>>>>>                     excl_lock = false;
>      >>>>>>>>>
>      >>>>>>>>> Seems like in every outcome of this function
>      >>>>>>>>> *cnt_increased = !excl_lock
>      >>>>>>>>> so there is not need for out arg cnt_increased
>      >>>>>>>>
>      >>>>>>>> If excl_lock would be used as input - yeah, would have
>     worked as well.
>      >>>>>>>> Or a parameter like "retest-under-lock". Code is changed
>     now to avoid
>      >>>>>>>> going in and out.
>      >>>>>>>>
>      >>>>>>>>>
>      >>>>>>>>>>             }
>      >>>>>>>>>>             spin_unlock(&fi->lock);
>      >>>>>>>>>>
>      >>>>>>>>>> out:
>      >>>>>>>>>>             if (excl_lock && *cnt_increased) {
>      >>>>>>>>>>                     bool wake = false;
>      >>>>>>>>>>                     spin_lock(&fi->lock);
>      >>>>>>>>>>                     if (--fi->shared_lock_direct_io_ctr
>     == 0)
>      >>>>>>>>>>                             wake = true;
>      >>>>>>>>>>                     spin_unlock(&fi->lock);
>      >>>>>>>>>>                     if (wake)
>      >>>>>>>>>>                             wake_up(&fi->direct_io_waitq);
>      >>>>>>>>>>             }
>      >>>>>>>>>
>      >>>>>>>>> I don't see how this wake_up code is reachable.
>      >>>>>>>>>
>      >>>>>>>>> TBH, I don't fully understand the expected result.
>      >>>>>>>>> Surely, the behavior of dio mixed with mmap is undefined.
>     Right?
>      >>>>>>>>> IIUC, your patch does not prevent dirtying page cache
>     while dio is in
>      >>>>>>>>> flight. It only prevents writeback while dio is in
>     flight, which is the same
>      >>>>>>>>> behavior as with exclusive inode lock. Right?
>      >>>>>>>>
>      >>>>>>>> Yeah, thanks. I will add it in the patch description.
>      >>>>>>>>
>      >>>>>>>> And there was actually an issue with the patch, as cache
>     flushing needs
>      >>>>>>>> to be initiated before doing the lock decision, fixed now.
>      >>>>>>>>
>      >>>>>>>
>      >>>>>>> I thought there was, because of the wait in
>     fuse_send_writepage()
>      >>>>>>> but wasn't sure if I was following the flow correctly.
>      >>>>>>>
>      >>>>>>>>>
>      >>>>>>>>> Maybe this interaction is spelled out somewhere else, but
>     if not
>      >>>>>>>>> better spell it out for people like me that are new to
>     this code.
>      >>>>>>>>
>      >>>>>>>> Sure, thanks a lot for your helpful comments!
>      >>>>>>>>
>      >>>>>>>
>      >>>>>>> Just to be clear, this patch looks like a good improvement and
>      >>>>>>> is mostly independent of the "inode caching mode" and
>      >>>>>>> FOPEN_CACHE_MMAP idea that I suggested.
>      >>>>>>>
>      >>>>>>> The only thing that my idea changes is replacing the
>      >>>>>>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
>      >>>>>>> state, which is set earlier than FUSE_I_CACHE_WRITES
>      >>>>>>> on caching file open or first direct_io mmap and unlike
>      >>>>>>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
>      >>>>>>>
>      >>>>>>> FUSE_I_CACHE_WRITES means that caching writes happened.
>      >>>>>>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
>      >>>>>>> may happen.
>      >>>>>>>
>      >>>>>>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
>      >>>>>>> about "caching reads may happen", but IMO that is a small
>     trade off
>      >>>>>>> to make for maintaining the same state for
>      >>>>>>> "do not allow parallel dio" and "do not allow passthrough
>     open".
>      >>>>>>
>      >>>>>> I think the attached patches should do, it now also unsets
>      >>>>>
>      >>>>> IMO, your patch is still more complicated than it should be.
>      >>>>> There is no need for the complicated retest state machine.
>      >>>>> If you split the helpers to:
>      >>>>>
>      >>>>> bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
>      >>>>> ...
>      >>>>> fuse_dio_lock_inode(iocb, &exclusive);
>      >>>>> ...
>      >>>>> fuse_dio_unlock_inode(iocb, &exclusive);
>      >>>>>
>      >>>>> Then you only need to test FUSE_I_CACHE_IO_MODE in
>      >>>>> fuse_dio_wr_needs_exclusive_lock()
>      >>>>> and you only need to increment shared_lock_direct_io_ctr
>      >>>>> after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.
>      >>>>
>      >>>> Hmm, I'm not sure.
>      >>>>
>      >>>> I changed fuse_file_mmap() to call this function
>      >>>>
>      >>>> /*
>      >>>>     * direct-io with shared locks cannot handle page cache io
>     - set an inode
>      >>>>     * flag to disable shared locks and wait until remaining
>     threads are done
>      >>>>     */
>      >>>> static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
>      >>>> {
>      >>>>           struct fuse_inode *fi = get_fuse_inode(inode);
>      >>>>
>      >>>>           spin_lock(&fi->lock);
>      >>>>           set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
>      >>>>           while (fi->shared_lock_direct_io_ctr > 0) {
>      >>>>                   spin_unlock(&fi->lock);
>      >>>>                   wait_event_interruptible(fi->direct_io_waitq,
>      >>>>                                           
>     fi->shared_lock_direct_io_ctr == 0);
>      >>>>                   spin_lock(&fi->lock);
>      >>>>           }
>      >>>>           spin_unlock(&fi->lock);
>      >>>> }
>      >>>>
>      >>>>
>      >>>> Before we had indeed a race. Idea for
>     fuse_file_mmap_handle_dio_writers()
>      >>>> and fuse_dio_lock_inode() is to either have
>     FUSE_I_CACHE_IO_MODE set,
>      >>>> or fi->shared_lock_direct_io_ctr is greater 0, but that
>     requires that
>      >>>> FUSE_I_CACHE_IO_MODE is checked for when fi->lock is taken.
>      >>>>
>      >>>>
>      >>>> I'm going to think about over the weekend if your suggestion
>      >>>> to increase fi->shared_lock_direct_io_ctr only after taking
>     the shared
>      >>>> lock is possible. Right now I don't see how to do that.
>      >>>>
>      >>>>
>      >>>>>
>      >>>>>> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be
>     done from
>      >>>>>> fuse_file_mmap (and not from fuse_send_writepage) to avoid a
>     dead stall,
>      >>>>>> but that aligns with passthrough anyway?
>      >>>>>
>      >>>>> Yes.
>      >>>>>
>      >>>>> I see that shared_lock_direct_io_ctr is checked without lock
>     or barriers
>      >>>>> in and the wait_event() should be interruptible.
>      >>>>
>      >>>> Thanks, fixed with the function above.
>      >>>>
>      >>>>> I am also not sure if it breaks any locking order for mmap
>     because
>      >>>>> the task that is going to wake it up is holding the shared
>     inode lock...
>      >>>>
>      >>>> The waitq has its own lock. We have
>      >>>>
>      >>>> fuse_file_mmap - called under some mmap lock, waitq lock
>      >>>>
>      >>>> fuse_dio_lock_inode: no lock taken before calling wakeup
>      >>>>
>      >>>> fuse_direct_write_iter: wakeup after release of all locks
>      >>>>
>      >>>> So I don't think we have a locking issue (lockdep also doesn't
>     annotate
>      >>>> anything).
>      >>>
>      >>> I don't think that lockdep can understand this dependency.
>      >>>
>      >>>> What we definitely cannot do is to take the inode i_rwsem lock
>     in fuse_file_mmap
>      >>>>
>      >>>
>      >>> It's complicated. I need to look at the whole thing again.
>      >>>
>      >>>>>
>      >>>>> While looking at this code, the invalidate_inode_pages2()
>     looks suspicious.
>      >>>>> If inode is already in FUSE_I_CACHE_IO_MODE when performing
>      >>>>> another mmap, doesn't that have potential for data loss?
>      >>>>> (even before your patch I mean)
>      >>>>>
>      >>>>>> Amir, right now it only sets
>      >>>>>> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a
>     condition
>      >>>>>> for passthrough there?
>      >>>>>>
>      >>>>>
>      >>>>> We could add a condition, but I don't think that we should.
>      >>>>> I think we should refrain from different behavior when it is
>     not justified.
>      >>>>> I think it is not justified to allow parallel dio if any file
>     is open in
>      >>>>> caching mode on the inode and any mmap (private or shared)
>      >>>>> exists on the inode.
>      >>>>>
>      >>>>> That means that FUSE_I_CACHE_IO_MODE should be set on
>      >>>>> any mmap, and already on open for non direct_io files.
>      >>>>
>      >>>> Ok, I can change and add that. Doing it in open is definitely
>     needed
>      >>>> for O_DIRECT (in my other dio branch).
>      >>>>
>      >>>
>      >>> Good, the more common code the better.
>      >>>
>      >>>>>
>      >>>>> Mixing caching and direct io on the same inode is hard as it is
>      >>>>> already and there is no need to add complexity by allowing
>      >>>>> parallel dio in that case. IMO it wins us nothing.
>      >>>>
>      >>>> So the slight issue I see are people like me, who check the
>     content
>      >>>> of a file during a long running computation. Like an HPC
>     application
>      >>>> is doing some long term runs. Then in the middle of
>      >>>> the run the user wants to see the current content of the file and
>      >>>> reads it - if that is done through mmap (and from a node that runs
>      >>>> the application), parallel DIO is disabled with the current patch
>      >>>> until the file is closed - I see the use case to check for writes.
>      >>>>
>      >>>
>      >>> That's what I thought.
>      >>>
>      >>>>
>      >>>>>
>      >>>>> The FUSE_I_CACHE_IO_MODE could be cleared on last file
>      >>>>> close (as your patch did) but it could be cleared earlier if
>      >>>>> instead of tracking refcount of open file, we track refcount of
>      >>>>> files open in caching mode or mmaped, which is what the
>      >>>>> FOPEN_MMAP_CACHE flag I suggested is for.
>      >>>>
>      >>>> But how does open() know that a file/fd is used for mmap?
>      >>>>
>      >>>
>      >>> Because what I tried to suggest is a trick/hack:
>      >>> first mmap on direct_io file sets FOPEN_MMAP_CACHE on the file
>      >>> and bumps the cached_opens on the inode as if file was
>      >>> opened in caching mode or in FOPEN_MMAP_CACHE mode.
>      >>> When the file that was used for mmap is closed and all the rest
>      >>> of the open files have only ever been used for direct_io, then
>      >>> inode exits the caching io mode.
>      >>>
>      >>> Using an FOPEN flag for that is kind of a hack.
>      >>> We could add an internal file state bits for that as well,
>      >>> but my thinking was that FOPEN_MMAP_CACHE could really
>      >>> be set by the server to mean per-file ALLOW_MMAP instead of
>      >>> the per-filesystem ALLOW_MMAP. Not sure if that will be useful.
>      >>
>      >> Ok, I will try to add that in a different patch to have better
>      >> visibility. Will also put these patch here in front of my dio
>     branch and
>      >> rebase these patches. There comes in a bit additional complexity to
>      >> handle O_DIRECT, but it also consolidates direct-IO writes code
>     paths.
>      >> At least I hope this is still possible with the new changes.
>      >>
>      >>>
>      >>> Sorry for the hand waving. I was trying to send out a demo
>      >>> patch that explains it better, but got caught up with other things.
>      >>
>      >> No problem at all, I think I know what you mean and I can try
>     add this
>      >> myself.
>      >
>      > Here is what I was thinking about:
>      >
>      > https://github.com/amir73il/linux/commits/fuse_io_mode
>     <https://github.com/amir73il/linux/commits/fuse_io_mode>
>      >
>      > The concept that I wanted to introduce was the
>      > fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
>      > helpers (akin to deny_write_access()/allow_write_access()).
>      >
>      > In this patch, parallel dio in progress deny open in caching mode
>      > and mmap, and I don't know if that is acceptable.
>      > Technically, instead of deny open/mmap you can use additional
>      > techniques to wait for in progress dio and allow caching open/mmap.
>      >
>      > Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
>      > pattern when file is open in FOPEN_PASSTHROUGH mode, but
>      > in this case, as agreed with Miklos, a server trying to mix open
>      > in caching mode on the same inode is going to fail the open.
>      >
>      > mmap is less of a problem for inode in passthrough mode, because
>      > mmap of direct_io file and inode in passthrough mode is
>     passthrough
>      > mmap to backing file.
>      >
>      > Anyway, if you can use this patch or parts of it, be my guest and
>     if you
>      > want to use a different approach that is fine by me as well - in
>     that case
>      > I will just remove the fuse_file_shared_dio_{start,end}() part
>     from my patch.
> 
>     Thanks Amir, I'm going to look at it in detail in the morning.
>     Btw, there is another bad direct_io_allow_mmap issue (part of it is
>     invalidate_inode_pages2, which you already noticed, but not alone).
>     Going to send out the patch once xfstests have passed
>     https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c <https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c>
> 
> 
> Nice!
> But I think that invalidate pages issue is not restricted to shared mmap?

So history for that is

commit 3121bfe7631126d1b13064855ac2cfa164381bb0
Author: Miklos Szeredi <mszeredi@suse.cz>
Date:   Thu Apr 9 17:37:53 2009 +0200

     fuse: fix "direct_io" private mmap
     
     MAP_PRIVATE mmap could return stale data from the cache for
     "direct_io" files.  Fix this by flushing the cache on mmap.
     
     Found with a slightly modified fsx-linux.
     
     Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0946861b10b7..06f30e965676 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
         if (vma->vm_flags & VM_MAYSHARE)
                 return -ENODEV;
  
+       invalidate_inode_pages2(file->f_mapping);
+
         return generic_file_mmap(file, vma);
  }


I don't have a strong opinion here - so the idea of this patch is to avoid
exposing stale data from a previous mmap. I guess the ideal semantics (probably
hard to achieve) would be to invalidate pages when the last mapping of that _area_
is done?
So now with a shared map, data are supposed to be stored in files and
close-to-open consistency with FOPEN_KEEP_CACHE should handle the invalidation?

> 
> I think that the mix of direct io file with private mmap is common and 
> doesn't have issues, but the mix of direct io files and caching files on 
> the same inode is probably not very common and has the same issues as the 
> direct_io_allow_mmap regression that you are fixing.

Yeah. I also find it interesting that generic_file_mmap is not doing such
things for files opened with O_DIRECT - FOPEN_DIRECT_IO tries to do
strong coherency?


I'm going to send out the patch for now as it is, as that might become a longer
discussion - maybe Miklos could comment on it.


Thanks,
Bernd

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-13 10:11                                               ` Bernd Schubert
@ 2023-12-13 11:23                                                 ` Amir Goldstein
  2023-12-13 13:03                                                   ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-13 11:23 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, Miklos Szeredi,
	gmaglione, Max Reitz, Hao Xu, Dharmendra Singh

> >
> >     Thanks Amir, I'm going to look at it in detail in the morning.
> >     Btw, there is another bad direct_io_allow_mmap issue (part of it is
> >     invalidate_inode_pages2, which you already noticed, but not alone).
> >     Going to send out the patch once xfstests have passed
> >     https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c <https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c>
> >
> >
> > Nice!
> > But I think that invalidate pages issue is not restricted to shared mmap?
>
> So history for that is
>
> commit 3121bfe7631126d1b13064855ac2cfa164381bb0
> Author: Miklos Szeredi <mszeredi@suse.cz>
> Date:   Thu Apr 9 17:37:53 2009 +0200
>
>      fuse: fix "direct_io" private mmap
>
>      MAP_PRIVATE mmap could return stale data from the cache for
>      "direct_io" files.  Fix this by flushing the cache on mmap.
>
>      Found with a slightly modified fsx-linux.
>
>      Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 0946861b10b7..06f30e965676 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
>          if (vma->vm_flags & VM_MAYSHARE)
>                  return -ENODEV;
>
> +       invalidate_inode_pages2(file->f_mapping);
> +
>          return generic_file_mmap(file, vma);
>   }
>
>
> I don't have a strong opinion here - so idea of this patch is to avoid
> exposing stale data from a previous mmap. I guess (and probably hard to achieve
> semantics) would be to invalidate pages when the last mapping of that _area_
> is done?
> So now with a shared map, data are supposed to be stored in files and
> close-to-open consistency with FOPEN_KEEP_CACHE should handle the invalidation?
>

Nevermind, it was just my bad understanding of invalidate_inode_pages2().
I think it calls fuse_launder_folio() for dirty pages, so data loss is
not a concern.

> >
> > I think that the mix of direct io file with private mmap is common and
> > doesn't have issues, but the mix of direct io files and caching files on
> > the same inode is probably not very common and has the same issues as the
> > direct_io_allow_mmap regression that you are fixing.
>
> Yeah. I also find it interesting that generic_file_mmap is not doing such
> things for files opened with O_DIRECT - FOPEN_DIRECT_IO tries to do
> strong coherency?
>
>
> I'm going to send out the patch for now as it is, as that might become a longer
> discussion - maybe Miklos could comment on it.
>

I think your patch should not be avoiding invalidate_inode_pages2()
in the shared mmap case.

You have done that part because of my comment which was wrong,
not because it reproduced a bug.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-13 11:23                                                 ` Amir Goldstein
@ 2023-12-13 13:03                                                   ` Bernd Schubert
  2023-12-13 14:09                                                     ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-13 13:03 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, Miklos Szeredi,
	gmaglione, Max Reitz, Hao Xu, Dharmendra Singh



On 12/13/23 12:23, Amir Goldstein wrote:
>>>
>>>      Thanks Amir, I'm going to look at it in detail in the morning.
>>>      Btw, there is another bad direct_io_allow_mmap issue (part of it is
>>>      invalidate_inode_pages2, which you already noticed, but not alone).
>>>      Going to send out the patch once xfstests have passed
>>>      https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c <https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c>
>>>
>>>
>>> Nice!
>>> But I think that invalidate pages issue is not restricted to shared mmap?
>>
>> So history for that is
>>
>> commit 3121bfe7631126d1b13064855ac2cfa164381bb0
>> Author: Miklos Szeredi <mszeredi@suse.cz>
>> Date:   Thu Apr 9 17:37:53 2009 +0200
>>
>>       fuse: fix "direct_io" private mmap
>>
>>       MAP_PRIVATE mmap could return stale data from the cache for
>>       "direct_io" files.  Fix this by flushing the cache on mmap.
>>
>>       Found with a slightly modified fsx-linux.
>>
>>       Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index 0946861b10b7..06f30e965676 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
>>           if (vma->vm_flags & VM_MAYSHARE)
>>                   return -ENODEV;
>>
>> +       invalidate_inode_pages2(file->f_mapping);
>> +
>>           return generic_file_mmap(file, vma);
>>    }
>>
>>
>> I don't have a strong opinion here - so idea of this patch is to avoid
>> exposing stale data from a previous mmap. I guess (and probably hard to achieve
>> semantics) would be to invalidate pages when the last mapping of that _area_
>> is done?
>> So now with a shared map, data are supposed to be stored in files and
>> close-to-open consistency with FOPEN_KEEP_CACHE should handle the invalidation?
>>
> 
> Nevermind, it was just my bad understanding of invalidate_inode_pages2().
> I think it calls fuse_launder_folio() for dirty pages, so data loss is
> not a concern.
> 
>>>
>>> I think that the mix of direct io file with private mmap is common and
>>> doesn't have issues, but the mix of direct io files and caching files on
>>> the same inode is probably not very common and has the same issues as the
>>> direct_io_allow_mmap regression that you are fixing.
>>
>> Yeah. I also find it interesting that generic_file_mmap is not doing such
>> things for files opened with O_DIRECT - FOPEN_DIRECT_IO tries to do
>> strong coherency?
>>
>>
>> I'm going to send out the patch for now as it is, as that might become a longer
>> discussion - maybe Miklos could comment on it.
>>
> 
> I think your patch should not be avoiding invalidate_inode_pages2()
> in the shared mmap case.
> 
> You have done that part because of my comment which was wrong,
> not because it reproduced a bug.

I have been debating with myself since yesterday where
invalidate_inode_pages2() belongs.

We have

FOPEN_KEEP_CACHE - if not set invalidate_inode_pages2 is done by 
fuse_open_common().  If set, server side signals that it wants to keep 
the cache. Also interesting, I don't see anything that prevents that 
FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE are set together.
Also to consider, fc->_no_open sets FOPEN_KEEP_CACHE by default.

MAP_PRIVATE - here I'm sure that invalidate_inode_pages2 is right, even 
with FOPEN_KEEP_CACHE. There is also zero risk to lose data, as 
MAP_PRIVATE does not write out data.

MAP_SHARED - was not allowed with FOPEN_DIRECT_IO before. Unless 
FOPEN_KEEP_CACHE is set, close-to-open semantics come in. My argument to 
avoid invalidate_inode_pages2 in the current patch is that MAP_SHARED 
wants to share data between processes. And also maybe important, there 
is no flush in that function - dirty pages would be thrown away - data 
corruption!?


Thanks,
Bernd


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-13 13:03                                                   ` Bernd Schubert
@ 2023-12-13 14:09                                                     ` Bernd Schubert
  2023-12-14 11:50                                                       ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-13 14:09 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, Miklos Szeredi,
	gmaglione, Max Reitz, Hao Xu, Dharmendra Singh



On 12/13/23 14:03, Bernd Schubert wrote:
> 
> 
> On 12/13/23 12:23, Amir Goldstein wrote:
>>>>
>>>>      Thanks Amir, I'm going to look at it in detail in the morning.
>>>>      Btw, there is another bad direct_io_allow_mmap issue (part of 
>>>> it is
>>>>      invalidate_inode_pages2, which you already noticed, but not 
>>>> alone).
>>>>      Going to send out the patch once xfstests have passed
>>>>      
>>>> https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c <https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c>
>>>>
>>>>
>>>> Nice!
>>>> But I think that invalidate pages issue is not restricted to shared 
>>>> mmap?
>>>
>>> So history for that is
>>>
>>> commit 3121bfe7631126d1b13064855ac2cfa164381bb0
>>> Author: Miklos Szeredi <mszeredi@suse.cz>
>>> Date:   Thu Apr 9 17:37:53 2009 +0200
>>>
>>>       fuse: fix "direct_io" private mmap
>>>
>>>       MAP_PRIVATE mmap could return stale data from the cache for
>>>       "direct_io" files.  Fix this by flushing the cache on mmap.
>>>
>>>       Found with a slightly modified fsx-linux.
>>>
>>>       Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
>>>
>>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>>> index 0946861b10b7..06f30e965676 100644
>>> --- a/fs/fuse/file.c
>>> +++ b/fs/fuse/file.c
>>> @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, 
>>> struct vm_area_struct *vma)
>>>           if (vma->vm_flags & VM_MAYSHARE)
>>>                   return -ENODEV;
>>>
>>> +       invalidate_inode_pages2(file->f_mapping);
>>> +
>>>           return generic_file_mmap(file, vma);
>>>    }
>>>
>>>
>>> I don't have a strong opinion here - so idea of this patch is to avoid
>>> exposing stale data from a previous mmap. I guess (and probably hard 
>>> to achieve
>>> semantics) would be to invalidate pages when the last mapping of that 
>>> _area_
>>> is done?
>>> So now with a shared map, data are supposed to be stored in files and
>>> close-to-open consistency with FOPEN_KEEP_CACHE should handle the 
>>> invalidation?
>>>
>>
>> Nevermind, it was just my bad understanding of invalidate_inode_pages2().
>> I think it calls fuse_launder_folio() for dirty pages, so data loss is
>> not a concern.
>>
>>>>
>>>> I think that the mix of direct io file with private mmap is common and
>>>> doesn't have issues, but the mix of direct io files and caching 
>>>> files on
>>>> the same inode is probably not very common and has the same issues as the
>>>> direct_io_allow_mmap regression that you are fixing.
>>>
>>> Yeah. I also find it interesting that generic_file_mmap is not doing 
>>> such
>>> things for files opened with O_DIRECT - FOPEN_DIRECT_IO tries to do
>>> strong coherency?
>>>
>>>
>>> I'm going to send out the patch for now as it is, as that might 
>>> become a longer
>>> discussion - maybe Miklos could comment on it.
>>>
>>
>> I think your patch should not be avoiding invalidate_inode_pages2()
>> in the shared mmap case.
>>
>> You have done that part because of my comment which was wrong,
>> not because it reproduced a bug.
> 
> I have been debating with myself since yesterday where
> invalidate_inode_pages2() belongs.
> 
> We have
> 
> FOPEN_KEEP_CACHE - if not set invalidate_inode_pages2 is done by 
> fuse_open_common().  If set, server side signals that it wants to keep 
> the cache. Also interesting, I don't see anything that prevents that 
> FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE are set together.
> Also to consider, fc->_no_open sets FOPEN_KEEP_CACHE by default.
> 
> MAP_PRIVATE - here I'm sure that invalidate_inode_pages2 is right, even 
> with FOPEN_KEEP_CACHE. There is also zero risk to lose data, as 
> MAP_PRIVATE does not write out data.
> 
> MAP_SHARED - was not allowed with FOPEN_DIRECT_IO before. Unless 
> FOPEN_KEEP_CACHE is set, close-to-open semantics come in. My argument to 
> avoid invalidate_inode_pages2 in the current patch is that MAP_SHARED 
> wants to share data between processes. And also maybe important, there 
> is no flush in that function - dirty pages would be thrown away - data 
> corruption!?


Ah sorry, I had actually missed folio_launder() -> fuse_launder_folio(), 
ok fine then, we can keep it for both cases.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-13 14:09                                                     ` Bernd Schubert
@ 2023-12-14 11:50                                                       ` Bernd Schubert
  0 siblings, 0 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-12-14 11:50 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, Miklos Szeredi,
	gmaglione, Max Reitz, Hao Xu, Dharmendra Singh, Matthew Wilcox



On 12/13/23 15:09, Bernd Schubert wrote:
> 
> 
> On 12/13/23 14:03, Bernd Schubert wrote:
>>
>>
>> On 12/13/23 12:23, Amir Goldstein wrote:
>>>>>
>>>>>      Thanks Amir, I'm going to look at it in detail in the morning.
>>>>>      Btw, there is another bad direct_io_allow_mmap issue (part of 
>>>>> it is
>>>>>      invalidate_inode_pages2, which you already noticed, but not 
>>>>> alone).
>>>>>      Going to send out the patch once xfstests have passed
>>>>> https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c <https://github.com/bsbernd/linux/commit/3dae6b05854c4fe84302889a5625c7e5428cdd6c>
>>>>>
>>>>>
>>>>> Nice!
>>>>> But I think that invalidate pages issue is not restricted to shared 
>>>>> mmap?
>>>>
>>>> So history for that is
>>>>
>>>> commit 3121bfe7631126d1b13064855ac2cfa164381bb0
>>>> Author: Miklos Szeredi <mszeredi@suse.cz>
>>>> Date:   Thu Apr 9 17:37:53 2009 +0200
>>>>
>>>>       fuse: fix "direct_io" private mmap
>>>>
>>>>       MAP_PRIVATE mmap could return stale data from the cache for
>>>>       "direct_io" files.  Fix this by flushing the cache on mmap.
>>>>
>>>>       Found with a slightly modified fsx-linux.
>>>>
>>>>       Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
>>>>
>>>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>>>> index 0946861b10b7..06f30e965676 100644
>>>> --- a/fs/fuse/file.c
>>>> +++ b/fs/fuse/file.c
>>>> @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, 
>>>> struct vm_area_struct *vma)
>>>>           if (vma->vm_flags & VM_MAYSHARE)
>>>>                   return -ENODEV;
>>>>
>>>> +       invalidate_inode_pages2(file->f_mapping);
>>>> +
>>>>           return generic_file_mmap(file, vma);
>>>>    }
>>>>
>>>>
>>>> I don't have a strong opinion here - so idea of this patch is to avoid
>>>> exposing stale data from a previous mmap. I guess (and probably hard 
>>>> to achieve
>>>> semantics) would be to invalidate pages when the last mapping of 
>>>> that _area_
>>>> is done?
>>>> So now with a shared map, data are supposed to be stored in files and
>>>> close-to-open consistency with FOPEN_KEEP_CACHE should handle the 
>>>> invalidation?
>>>>
>>>
>>> Nevermind, it was just my bad understanding of 
>>> invalidate_inode_pages2().
>>> I think it calls fuse_launder_folio() for dirty pages, so data loss is
>>> not a concern.
>>>
>>>>>
>>>>> I think that the mix of direct io file with private mmap is common and
>>>>> doesn't have issues, but the mix of direct io files and caching 
>>>>> files on
>>>>> the same inode is probably not very common and has the same issues as the
>>>>> direct_io_allow_mmap regression that you are fixing.
>>>>
>>>> Yeah. I also find it interesting that generic_file_mmap is not doing 
>>>> such
>>>> things for files opened with O_DIRECT - FOPEN_DIRECT_IO tries to do
>>>> strong coherency?
>>>>
>>>>
>>>> I'm going to send out the patch for now as it is, as that might 
>>>> become a longer
>>>> discussion - maybe Miklos could comment on it.
>>>>
>>>
>>> I think your patch should not be avoiding invalidate_inode_pages2()
>>> in the shared mmap case.
>>>
>>> You have done that part because of my comment which was wrong,
>>> not because it reproduced a bug.
>>
>> I have been debating with myself since yesterday where
>> invalidate_inode_pages2() belongs.
>>
>> We have
>>
>> FOPEN_KEEP_CACHE - if not set invalidate_inode_pages2 is done by 
>> fuse_open_common().  If set, server side signals that it wants to keep 
>> the cache. Also interesting, I don't see anything that prevents that 
>> FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE are set together.
>> Also to consider, fc->_no_open sets FOPEN_KEEP_CACHE by default.
>>
>> MAP_PRIVATE - here I'm sure that invalidate_inode_pages2 is right, 
>> even with FOPEN_KEEP_CACHE. There is also zero risk to lose data, as 
>> MAP_PRIVATE does not write out data.
>>
>> MAP_SHARED - was not allowed with FOPEN_DIRECT_IO before. Unless 
>> FOPEN_KEEP_CACHE is set, close-to-open semantics come in. My argument 
>> to avoid invalidate_inode_pages2 in the current patch is that 
>> MAP_SHARED wants to share data between processes. And also maybe 
>> important, there is no flush in that function - dirty pages would be 
>> thrown away - data corruption!?
> 
> 
> Ah sorry, I had actually missed folio_launder() -> fuse_launder_folio(), 
> ok fine then, we can keep it for both cases.


Hmm, actually, why is fuse_launder_folio only writing out one single 
page, instead of all pages in the folio? This actually applies to all 
file systems that have a .launder_folio method (nfs, orangefs, smb, 
fuse) with the exception of v9fs. The latter is the only one that writes out 
the entire folio.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-12 18:30                                         ` Amir Goldstein
  2023-12-12 22:07                                           ` Bernd Schubert
@ 2023-12-19  0:03                                           ` Bernd Schubert
  2023-12-19 13:01                                             ` Amir Goldstein
  1 sibling, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-19  0:03 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/12/23 19:30, Amir Goldstein wrote:
> On Sat, Dec 9, 2023 at 12:38 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/8/23 21:46, Amir Goldstein wrote:
>>> On Fri, Dec 8, 2023 at 9:50 PM Bernd Schubert
>>> <bernd.schubert@fastmail.fm> wrote:
>>>>
>>>>
>>>>
>>>> On 12/8/23 09:39, Amir Goldstein wrote:
>>>>> On Thu, Dec 7, 2023 at 8:38 PM Bernd Schubert
>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 12/7/23 08:39, Amir Goldstein wrote:
>>>>>>> On Thu, Dec 7, 2023 at 1:28 AM Bernd Schubert
>>>>>>> <bernd.schubert@fastmail.fm> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On 12/6/23 09:25, Amir Goldstein wrote:
>>>>>>>>>>>> Is it actually important for FUSE_DIRECT_IO_ALLOW_MMAP fs
>>>>>>>>>>>> (e.g. virtiofsd) to support FOPEN_PARALLEL_DIRECT_WRITES?
>>>>>>>>>>>> I guess not otherwise, the combination would have been tested.
>>>>>>>>>>>
>>>>>>>>>>> I'm not sure how many people are aware of these different flags/features.
>>>>>>>>>>> I had just finalized the backport of the related patches to RHEL8 on
>>>>>>>>>>> Friday, as we (or our customers) need both for different jobs.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES is typically important for
>>>>>>>>>>>> network fs and FUSE_DIRECT_IO_ALLOW_MMAP is typically not
>>>>>>>>>>>> for network fs. Right?
>>>>>>>>>>>
>>>>>>>>>>> We kind of have these use cases for our network file systems
>>>>>>>>>>>
>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES:
>>>>>>>>>>>          - Traditional HPC, large files, parallel IO
>>>>>>>>>>>          - Large file used on local node as container for many small files
>>>>>>>>>>>
>>>>>>>>>>> FUSE_DIRECT_IO_ALLOW_MMAP:
>>>>>>>>>>>          - compilation through gcc (not so important, just not nice when it
>>>>>>>>>>> does not work)
>>>>>>>>>>>          - rather recent: python libraries using mmap _reads_. As it is read
>>>>>>>>>>> only no issue of consistency.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> These jobs do not intermix - no issue as in generic/095. If such
>>>>>>>>>>> applications really exist, I have no issue with a serialization penalty.
>>>>>>>>>>> Just disabling FOPEN_PARALLEL_DIRECT_WRITES because other
>>>>>>>>>>> nodes/applications need FUSE_DIRECT_IO_ALLOW_MMAP is not so nice.
>>>>>>>>>>>
>>>>>>>>>>> Final goal is also to have FOPEN_PARALLEL_DIRECT_WRITES to work on plain
>>>>>>>>>>> O_DIRECT and not only for FUSE_DIRECT_IO - I need to update this branch
>>>>>>>>>>> and post the next version
>>>>>>>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v4
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> In the mean time I have another idea how to solve
>>>>>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES + FUSE_DIRECT_IO_ALLOW_MMAP
>>>>>>>>>>
>>>>>>>>>> Please find attached what I had in my mind. With that generic/095 is not
>>>>>>>>>> crashing for me anymore. I just finished the initial coding - it still
>>>>>>>>>> needs a bit cleanup and maybe a few comments.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Nice. I like the FUSE_I_CACHE_WRITES state.
>>>>>>>>> For FUSE_PASSTHROUGH I will need to track if inode is open/mapped
>>>>>>>>> in caching mode, so FUSE_I_CACHE_WRITES can be cleared on release
>>>>>>>>> of the last open file of the inode.
>>>>>>>>>
>>>>>>>>> I did not understand some of the complexity here:
>>>>>>>>>
>>>>>>>>>>             /* The inode ever got page writes and we do not know for sure
>>>>>>>>>>              * in the DIO path if these are pending - shared lock not possible */
>>>>>>>>>>             spin_lock(&fi->lock);
>>>>>>>>>>             if (!test_bit(FUSE_I_CACHE_WRITES, &fi->state)) {
>>>>>>>>>>                     if (!(*cnt_increased)) {
>>>>>>>>>
>>>>>>>>> How can *cnt_increased be true here?
>>>>>>>>
>>>>>>>> I think you missed the 2nd entry into this function, when the shared
>>>>>>>> lock was already taken?
>>>>>>>
>>>>>>> Yeh, I did.
>>>>>>>
>>>>>>>> I have changed the code now to have all
>>>>>>>> complexity in this function (test, lock, retest with lock, release,
>>>>>>>> wakeup). I hope that will make it easier to see the intention of the
>>>>>>>> code. Will post the new patches in the morning.
>>>>>>>>
>>>>>>>
>>>>>>> Sounds good. Current version was a bit hard to follow.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>                             fi->shared_lock_direct_io_ctr++;
>>>>>>>>>>                             *cnt_increased = true;
>>>>>>>>>>                     }
>>>>>>>>>>                     excl_lock = false;
>>>>>>>>>
>>>>>>>>> Seems like in every outcome of this function
>>>>>>>>> *cnt_increased = !excl_lock
>>>>>>>>> so there is no need for out arg cnt_increased
>>>>>>>>
>>>>>>>> If excl_lock would be used as input - yeah, would have worked as well.
>>>>>>>> Or a parameter like "retest-under-lock". Code is changed now to avoid
>>>>>>>> going in and out.
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>             }
>>>>>>>>>>             spin_unlock(&fi->lock);
>>>>>>>>>>
>>>>>>>>>> out:
>>>>>>>>>>             if (excl_lock && *cnt_increased) {
>>>>>>>>>>                     bool wake = false;
>>>>>>>>>>                     spin_lock(&fi->lock);
>>>>>>>>>>                     if (--fi->shared_lock_direct_io_ctr == 0)
>>>>>>>>>>                             wake = true;
>>>>>>>>>>                     spin_unlock(&fi->lock);
>>>>>>>>>>                     if (wake)
>>>>>>>>>>                             wake_up(&fi->direct_io_waitq);
>>>>>>>>>>             }
>>>>>>>>>
>>>>>>>>> I don't see how this wake_up code is reachable.
>>>>>>>>>
>>>>>>>>> TBH, I don't fully understand the expected result.
>>>>>>>>> Surely, the behavior of dio mixed with mmap is undefined. Right?
>>>>>>>>> IIUC, your patch does not prevent dirtying page cache while dio is in
>>>>>>>>> flight. It only prevents writeback while dio is in flight, which is the same
>>>>>>>>> behavior as with exclusive inode lock. Right?
>>>>>>>>
>>>>>>>> Yeah, thanks. I will add it in the patch description.
>>>>>>>>
>>>>>>>> And there was actually an issue with the patch, as cache flushing needs
>>>>>>>> to be initiated before doing the lock decision, fixed now.
>>>>>>>>
>>>>>>>
>>>>>>> I thought there was, because of the wait in fuse_send_writepage()
>>>>>>> but wasn't sure if I was following the flow correctly.
>>>>>>>
>>>>>>>>>
>>>>>>>>> Maybe this interaction is spelled out somewhere else, but if not
>>>>>>>>> better spell it out for people like me that are new to this code.
>>>>>>>>
>>>>>>>> Sure, thanks a lot for your helpful comments!
>>>>>>>>
>>>>>>>
>>>>>>> Just to be clear, this patch looks like a good improvement and
>>>>>>> is mostly independent of the "inode caching mode" and
>>>>>>> FOPEN_CACHE_MMAP idea that I suggested.
>>>>>>>
>>>>>>> The only thing that my idea changes is replacing the
>>>>>>> FUSE_I_CACHE_WRITES state with a FUSE_I_CACHE_IO_MODE
>>>>>>> state, which is set earlier than FUSE_I_CACHE_WRITES
>>>>>>> on caching file open or first direct_io mmap and unlike
>>>>>>> FUSE_I_CACHE_WRITES, it is cleared on the last file close.
>>>>>>>
>>>>>>> FUSE_I_CACHE_WRITES means that caching writes happened.
>>>>>>> FUSE_I_CACHE_IO_MODE means the caching writes and reads
>>>>>>> may happen.
>>>>>>>
>>>>>>> FOPEN_PARALLEL_DIRECT_WRITES obviously shouldn't care
>>>>>>> about "caching reads may happen", but IMO that is a small trade off
>>>>>>> to make for maintaining the same state for
>>>>>>> "do not allow parallel dio" and "do not allow passthrough open".
>>>>>>
>>>>>> I think the attached patches should do, it now also unsets
>>>>>
>>>>> IMO, your patch is still more complicated than it should be.
>>>>> There is no need for the complicated retest state machine.
>>>>> If you split the helpers to:
>>>>>
>>>>> bool exclusive_lock fuse_dio_wr_needs_exclusive_lock();
>>>>> ...
>>>>> fuse_dio_lock_inode(iocb, &exclusive);
>>>>> ...
>>>>> fuse_dio_unlock_inode(iocb, &exclusive);
>>>>>
>>>>> Then you only need to test FUSE_I_CACHE_IO_MODE in
>>>>> fuse_dio_wr_needs_exclusive_lock()
>>>>> and you only need to increment shared_lock_direct_io_ctr
>>>>> after taking shared lock and re-testing FUSE_I_CACHE_IO_MODE.
>>>>
>>>> Hmm, I'm not sure.
>>>>
>>>> I changed fuse_file_mmap() to call this function
>>>>
>>>> /*
>>>>     * direct-io with shared locks cannot handle page cache io - set an inode
>>>>     * flag to disable shared locks and wait until remaining threads are done
>>>>     */
>>>> static void fuse_file_mmap_handle_dio_writers(struct inode *inode)
>>>> {
>>>>           struct fuse_inode *fi = get_fuse_inode(inode);
>>>>
>>>>           spin_lock(&fi->lock);
>>>>           set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
>>>>           while (fi->shared_lock_direct_io_ctr > 0) {
>>>>                   spin_unlock(&fi->lock);
>>>>                   wait_event_interruptible(fi->direct_io_waitq,
>>>>                                            fi->shared_lock_direct_io_ctr == 0);
>>>>                   spin_lock(&fi->lock);
>>>>           }
>>>>           spin_unlock(&fi->lock);
>>>> }
>>>>
>>>>
>>>> Before we had indeed a race. Idea for fuse_file_mmap_handle_dio_writers()
>>>> and fuse_dio_lock_inode() is to either have FUSE_I_CACHE_IO_MODE set,
>>>> or fi->shared_lock_direct_io_ctr is greater 0, but that requires that
>>>> FUSE_I_CACHE_IO_MODE is checked for when fi->lock is taken.
>>>>
>>>>
>>>> I'm going to think about over the weekend if your suggestion
>>>> to increase fi->shared_lock_direct_io_ctr only after taking the shared
>>>> lock is possible. Right now I don't see how to do that.
>>>>
>>>>
>>>>>
>>>>>> FUSE_I_CACHE_IO_MODE. Setting the flag actually has to be done from
>>>>>> fuse_file_mmap (and not from fuse_send_writepage) to avoid a dead stall,
>>>>>> but that aligns with passthrough anyway?
>>>>>
>>>>> Yes.
>>>>>
>>>>> I see that shared_lock_direct_io_ctr is checked without lock or barriers
>>>>> in and the wait_event() should be interruptible.
>>>>
>>>> Thanks, fixed with the function above.
>>>>
>>>>> I am also not sure if it breaks any locking order for mmap because
>>>>> the task that is going to wake it up is holding the shared inode lock...
>>>>
>>>> The waitq has its own lock. We have
>>>>
>>>> fuse_file_mmap - called under some mmap lock, waitq lock
>>>>
>>>> fuse_dio_lock_inode: no lock taken before calling wakeup
>>>>
>>>> fuse_direct_write_iter: wakeup after release of all locks
>>>>
>>>> So I don't think we have a locker issue (lockdep also doesn't annotate
>>>> anything).
>>>
>>> I don't think that lockdep can understand this dependency.
>>>
>>>> What we definitely cannot do it to take the inode i_rwsem lock in fuse_file_mmap
>>>>
>>>
>>> It's complicated. I need to look at the whole thing again.
>>>
>>>>>
>>>>> While looking at this code, the invalidate_inode_pages2() looks suspicious.
>>>>> If inode is already in FUSE_I_CACHE_IO_MODE when performing
>>>>> another mmap, doesn't that have potential for data loss?
>>>>> (even before your patch I mean)
>>>>>
>>>>>> Amir, right now it only sets
>>>>>> FUSE_I_CACHE_IO_MODE for VM_MAYWRITE. Maybe you could add a condition
>>>>>> for passthrough there?
>>>>>>
>>>>>
>>>>> We could add a condition, but I don't think that we should.
>>>>> I think we should refrain from different behavior when it is not justified.
>>>>> I think it is not justified to allow parallel dio if any file is open in
>>>>> caching mode on the inode and any mmap (private or shared)
>>>>> exists on the inode.
>>>>>
>>>>> That means that FUSE_I_CACHE_IO_MODE should be set on
>>>>> any mmap, and already on open for non direct_io files.
>>>>
>>>> Ok, I can change and add that. Doing it in open is definitely needed
>>>> for O_DIRECT (in my other dio branch).
>>>>
>>>
>>> Good, the more common code the better.
>>>
>>>>>
>>>>> Mixing caching and direct io on the same inode is hard as it is
>>>>> already and there is no need to add complexity by allowing
>>>>> parallel dio in that case. IMO it wins us nothing.
>>>>
>>>> So the slight issue I see are people like me, who check the content
>>>> of a file during a long running computation. Like an HPC application
>>>> is doing some long term runs. Then in the middle of
>>>> the run the user wants to see the current content of the file and
>>>> reads it - if that is done through mmap (and from a node that runs
>>>> the application), parallel DIO is disabled with the current patch
>>>> until the file is closed - I see the use case to check for writes.
>>>>
>>>
>>> That's what I thought.
>>>
>>>>
>>>>>
>>>>> The FUSE_I_CACHE_IO_MODE could be cleared on last file
>>>>> close (as your patch did) but it could be cleared earlier if
>>>>> instead of tracking refcount of open file, we track refcount of
>>>>> files open in caching mode or mmaped, which is what the
>>>>> FOPEN_MMAP_CACHE flag I suggested is for.
>>>>
>>>> But how does open() know that a file/fd is used for mmap?
>>>>
>>>
>>> Because what I tried to suggest is a trick/hack:
>>> first mmap on direct_io file sets FOPEN_MMAP_CACHE on the file
>>> and bumps the cached_opens on the inode as if file was
>>> opened in caching mode or in FOPEN_MMAP_CACHE mode.
>>> When the file that was used for mmap is closed and all the rest
>>> of the open files have only ever been used for direct_io, then
>>> inode exists the caching io mode.
>>>
>>> Using an FOPEN flag for that is kind of a hack.
>>> We could add an internal file state bits for that as well,
>>> but my thinking was that FOPEN_MMAP_CACHE could really
>>> be set by the server to mean per-file ALLOW_MMAP instead of
>>> the per-filesystem ALLOW_MMAP. Not sure if that will be useful.
>>
>> Ok, I will try to add that in a different patch to have better
>> visibility. Will also put these patch here in front of my dio branch and
>> rebase these patches. There comes in a bit additional complexity to
>> handle O_DIRECT, but it also consolidates direct-IO writes code paths.
>> At least I hope this is still possible with the new changes.
>>
>>>
>>> Sorry for the hand waving. I was trying to send out a demo
>>> patch that explains it better, but got caught up with other things.
>>
>> No problem at all, I think I know what you mean and I can try add this
>> myself.
> 
> Here is what I was thinking about:
> 
> https://github.com/amir73il/linux/commits/fuse_io_mode
> 
> The concept that I wanted to introduce was the
> fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
> helpers (akin to deny_write_access()/allow_write_access()).
> 
> In this patch, parallel dio in progress deny open in caching mode
> and mmap, and I don't know if that is acceptable.
> Technically, instead of deny open/mmap you can use additional
> techniques to wait for in progress dio and allow caching open/mmap.
> 
> Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
> pattern when file is open in FOPEN_PASSTHROUGH mode, but
> in this case, as agreed with Miklos, a server trying to mix open
> in caching mode on the same inode is going to fail the open.
> 
> mmap is less of a problem for inode in passthrough mode, because
> mmap in of direct_io file and inode in passthrough mode is passthrough
> mmap to backing file.
> 
> Anyway, if you can use this patch or parts of it, be my guest and if you
> want to use a different approach that is fine by me as well - in that case
> I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.

Hi Amir,

here is my fuse-dio-v5 branch:
https://github.com/bsbernd/linux/commits/fuse-dio-v5/

(v5 is just compilation tested, tests are running now over night)

This branch is basically about consolidating fuse write direct IO code 
paths and to allow a shared lock for O_DIRECT. I actually could have 
noticed the page cache issue with shared locks before with previous 
versions of these patches, just my VM kernel is optimized for 
compilation time and some SHM options had been missing - with that fio 
refused to run.

The branch includes a modified version of your patch:
https://github.com/bsbernd/linux/commit/6b05e52f7e253d9347d97de675b21b1707d6456e

Main changes are
- fuse_file_io_open() does not set the FOPEN_CACHE_IO flag for 
file->f_flags & O_DIRECT
- fuse_file_io_mmap() waits on a dio waitq
- fuse_file_shared_dio_start / fuse_file_shared_dio_end are moved up in 
the file, as I would like to entirely remove the fuse_direct_write iter 
function (all goes through cache_write_iter)


Thanks,
Bernd


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-19  0:03                                           ` Bernd Schubert
@ 2023-12-19 13:01                                             ` Amir Goldstein
  2023-12-19 20:47                                               ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-19 13:01 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

> > Here is what I was thinking about:
> >
> > https://github.com/amir73il/linux/commits/fuse_io_mode
> >
> > The concept that I wanted to introduce was the
> > fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
> > helpers (akin to deny_write_access()/allow_write_access()).
> >
> > In this patch, parallel dio in progress deny open in caching mode
> > and mmap, and I don't know if that is acceptable.
> > Technically, instead of deny open/mmap you can use additional
> > techniques to wait for in progress dio and allow caching open/mmap.
> >
> > Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
> > pattern when file is open in FOPEN_PASSTHROUGH mode, but
> > in this case, as agreed with Miklos, a server trying to mix open
> > in caching mode on the same inode is going to fail the open.
> >
> > mmap is less of a problem for inode in passthrough mode, because
> > mmap in of direct_io file and inode in passthrough mode is passthrough
> > mmap to backing file.
> >
> > Anyway, if you can use this patch or parts of it, be my guest and if you
> > want to use a different approach that is fine by me as well - in that case
> > I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.
>
> Hi Amir,
>
> here is my fuse-dio-v5 branch:
> https://github.com/bsbernd/linux/commits/fuse-dio-v5/
>
> (v5 is just compilation tested, tests are running now over night)

This looks very nice!
I left comments about some minor nits on github.

>
> This branch is basically about consolidating fuse write direct IO code
> paths and to allow a shared lock for O_DIRECT. I actually could have
> noticed the page cache issue with shared locks before with previous
> versions of these patches, just my VM kernel is optimized for
> compilation time and some SHM options had been missing - with that fio
> refused to run.
>
> The branch includes a modified version of your patch:
> https://github.com/bsbernd/linux/commit/6b05e52f7e253d9347d97de675b21b1707d6456e
>
> Main changes are
> - fuse_file_io_open() does not set the FOPEN_CACHE_IO flag for
> file->f_flags & O_DIRECT
> - fuse_file_io_mmap() waits on a dio waitq
> - fuse_file_shared_dio_start / fuse_file_shared_dio_end are moved up in
> the file, as I would like to entirely remove the fuse_direct_write iter
> function (all goes through cache_write_iter)
>

Looks mostly good, but I think that fuse_file_shared_dio_start() =>
fuse_inode_deny_io_cache() should actually be done after taking
the inode lock (shared or exclusive) and not like in my patch.

First of all, this comment in fuse_dio_wr_exclusive_lock():

        /*
         * fuse_file_shared_dio_start() must not be called on retest,
         * as it decreases a counter value - must not be done twice
         */
        if (!fuse_file_shared_dio_start(inode))
                return true;

...is suggesting that semantics are not clean and this check
must remain last, because if fuse_dio_wr_exclusive_lock()
returns false, iocachectr must not be elevated.
This is easy to get wrong in the future with current semantics.

The more important thing is that while fuse_file_io_mmap()
is waiting for iocachectr to drop to zero, new parallel dio can
come in and starve the mmap() caller forever.

I think that we are going to need to use some inode state flag
(e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
unless we do not care about this possibility?
We'd only need to set this in fuse_file_io_mmap() until we get
the iocachectr refcount.

I *think* that fuse_inode_deny_io_cache() should be called with
shared inode lock held, because of the existing lock chain
i_rwsem -> page lock -> mmap_lock for page faults, but I am
not sure. My brain is too cooked now to figure this out.
OTOH, I don't see any problem with calling
fuse_inode_deny_io_cache() with shared lock held?

I pushed this version to my fuse_io_mode branch [1].
Only tested generic/095 with FOPEN_DIRECT_IO and
DIRECT_IO_ALLOW_MMAP.

Thanks,
Amir.

[1] https://github.com/amir73il/linux/commits/fuse_io_mode

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-19 13:01                                             ` Amir Goldstein
@ 2023-12-19 20:47                                               ` Bernd Schubert
  2023-12-20  4:18                                                 ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-19 20:47 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

[-- Attachment #1: Type: text/plain, Size: 4582 bytes --]



On 12/19/23 14:01, Amir Goldstein wrote:
>>> Here is what I was thinking about:
>>>
>>> https://github.com/amir73il/linux/commits/fuse_io_mode
>>>
>>> The concept that I wanted to introduce was the
>>> fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
>>> helpers (akin to deny_write_access()/allow_write_access()).
>>>
>>> In this patch, parallel dio in progress deny open in caching mode
>>> and mmap, and I don't know if that is acceptable.
>>> Technically, instead of deny open/mmap you can use additional
>>> techniques to wait for in progress dio and allow caching open/mmap.
>>>
>>> Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
>>> pattern when file is open in FOPEN_PASSTHROUGH mode, but
>>> in this case, as agreed with Miklos, a server trying to mix open
>>> in caching mode on the same inode is going to fail the open.
>>>
>>> mmap is less of a problem for inode in passthrough mode, because
>>> mmap in of direct_io file and inode in passthrough mode is passthrough
>>> mmap to backing file.
>>>
>>> Anyway, if you can use this patch or parts of it, be my guest and if you
>>> want to use a different approach that is fine by me as well - in that case
>>> I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.
>>
>> Hi Amir,
>>
>> here is my fuse-dio-v5 branch:
>> https://github.com/bsbernd/linux/commits/fuse-dio-v5/
>>
>> (v5 is just compilation tested, tests are running now over night)
> 
> This looks very nice!
> I left comments about some minor nits on github.
> 
>>
>> This branch is basically about consolidating fuse write direct IO code
>> paths and to allow a shared lock for O_DIRECT. I actually could have
>> noticed the page cache issue with shared locks before with previous
>> versions of these patches, just my VM kernel is optimized for
>> compilation time and some SHM options had been missing - with that fio
>> refused to run.
>>
>> The branch includes a modified version of your patch:
>> https://github.com/bsbernd/linux/commit/6b05e52f7e253d9347d97de675b21b1707d6456e
>>
>> Main changes are
>> - fuse_file_io_open() does not set the FOPEN_CACHE_IO flag for
>> file->f_flags & O_DIRECT
>> - fuse_file_io_mmap() waits on a dio waitq
>> - fuse_file_shared_dio_start / fuse_file_shared_dio_end are moved up in
>> the file, as I would like to entirely remove the fuse_direct_write iter
>> function (all goes through cache_write_iter)
>>
> 
> Looks mostly good, but I think that fuse_file_shared_dio_start() =>
> fuse_inode_deny_io_cache() should actually be done after taking
> the inode lock (shared or exclusive) and not like in my patch.
> 
> First of all, this comment in fuse_dio_wr_exclusive_lock():
> 
>          /*
>           * fuse_file_shared_dio_start() must not be called on retest,
>           * as it decreases a counter value - must not be done twice
>           */
>          if (!fuse_file_shared_dio_start(inode))
>                  return true;
> 
> ...is suggesting that semantics are not clean and this check
> must remain last, because if fuse_dio_wr_exclusive_lock()
> returns false, iocachectr must not be elevated.
> This is easy to get wrong in the future with current semantics.
> 
> The more important thing is that while fuse_file_io_mmap()
> is waiting for iocachectr to drop to zero, new parallel dio can
> come in and starve the mmap() caller forever.
> 
> I think that we are going to need to use some inode state flag
> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
> unless we do not care about this possibility?
> We'd only need to set this in fuse_file_io_mmap() until we get
> the iocachectr refcount.
> 
> I *think* that fuse_inode_deny_io_cache() should be called with
> shared inode lock held, because of the existing lock chain
> i_rwsem -> page lock -> mmap_lock for page faults, but I am
> not sure. My brain is too cooked now to figure this out.
> OTOH, I don't see any problem with calling
> fuse_inode_deny_io_cache() with shared lock held?
> 
> I pushed this version to my fuse_io_mode branch [1].
> Only tested generic/095 with FOPEN_DIRECT_IO and
> DIRECT_IO_ALLOW_MMAP.
> 
> Thanks,
> Amir.
> 
> [1] https://github.com/amir73il/linux/commits/fuse_io_mode

Thanks, will look into your changes next. I was looking into the initial 
issue with generic/095 with my branch. Fixed by the attached patch. I 
think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
Interestingly, filemap_range_has_writeback() is exported, but there
was no user. Hopefully nobody submits an unexport patch in the meantime.


Thanks,
Bernd

[-- Attachment #2: dirty-pages.patch --]
[-- Type: text/x-patch, Size: 1662 bytes --]

commit bce66bf4b0b5d8cbeeb06ef3550ab4e02477f3e4
Author: Bernd Schubert <bschubert@ddn.com>
Date:   Tue Dec 19 20:36:10 2023 +0100

    dirty pages

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1fd3ba57accc8..26b13128b1e29 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -503,6 +503,19 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
 	return NULL;
 }
 
+static bool fuse_inode_has_writeback(struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_writepage_args *wpa;
+
+	spin_lock(&fi->lock);
+	wpa = rb_entry(fi->writepages.rb_node,
+		       struct fuse_writepage_args, writepages_entry);
+	spin_unlock(&fi->lock);
+
+	return wpa != NULL;
+}
+
 /*
  * Check if any page in a range is under writeback
  *
@@ -1449,8 +1462,18 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
 	/* fuse_file_shared_dio_start() must not be called on retest,
 	 * as it decreases a counter value - must not be done twice
 	 */
-	if (!fuse_file_shared_dio_start(inode))
+	if (!fuse_file_shared_dio_start(inode)) {
 		return true;
+	} else {
+		/* we succeeded to enable shared dio, but there still might be
+		 * dirty pages
+		 */
+		if (filemap_range_has_writeback(file->f_mapping, 0, LLONG_MAX) ||
+		    fuse_inode_has_writeback(inode)) {
+			fuse_file_shared_dio_end(inode);
+			return true;
+		}
+	}
 
 	return false;
 }
@@ -1472,6 +1495,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
 		inode_lock(inode);
 	} else {
 		inode_lock_shared(inode);
+
 		/*
 		 * Previous check was without inode lock and might have raced,
 		 * check again.

^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-19 20:47                                               ` Bernd Schubert
@ 2023-12-20  4:18                                                 ` Amir Goldstein
  2023-12-20  9:00                                                   ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-20  4:18 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Tue, Dec 19, 2023 at 10:47 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/19/23 14:01, Amir Goldstein wrote:
> >>> Here is what I was thinking about:
> >>>
> >>> https://github.com/amir73il/linux/commits/fuse_io_mode
> >>>
> >>> The concept that I wanted to introduce was the
> >>> fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
> >>> helpers (akin to deny_write_access()/allow_write_access()).
> >>>
> >>> In this patch, parallel dio in progress deny open in caching mode
> >>> and mmap, and I don't know if that is acceptable.
> >>> Technically, instead of deny open/mmap you can use additional
> >>> techniques to wait for in progress dio and allow caching open/mmap.
> >>>
> >>> Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
> >>> pattern when file is open in FOPEN_PASSTHROUGH mode, but
> >>> in this case, as agreed with Miklos, a server trying to mix open
> >>> in caching mode on the same inode is going to fail the open.
> >>>
> >>> mmap is less of a problem for inode in passthrough mode, because
> >>> mmap in of direct_io file and inode in passthrough mode is passthrough
> >>> mmap to backing file.
> >>>
> >>> Anyway, if you can use this patch or parts of it, be my guest and if you
> >>> want to use a different approach that is fine by me as well - in that case
> >>> I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.
> >>
> >> Hi Amir,
> >>
> >> here is my fuse-dio-v5 branch:
> >> https://github.com/bsbernd/linux/commits/fuse-dio-v5/
> >>
> >> (v5 is just compilation tested, tests are running now over night)
> >
> > This looks very nice!
> > I left comments about some minor nits on github.
> >
> >>
> >> This branch is basically about consolidating fuse write direct IO code
> >> paths and to allow a shared lock for O_DIRECT. I actually could have
> >> noticed the page cache issue with shared locks before with previous
> >> versions of these patches, just my VM kernel is optimized for
> >> compilation time and some SHM options had been missing - with that fio
> >> refused to run.
> >>
> >> The branch includes a modified version of your patch:
> >> https://github.com/bsbernd/linux/commit/6b05e52f7e253d9347d97de675b21b1707d6456e
> >>
> >> Main changes are
> >> - fuse_file_io_open() does not set the FOPEN_CACHE_IO flag for
> >> file->f_flags & O_DIRECT
> >> - fuse_file_io_mmap() waits on a dio waitq
> >> - fuse_file_shared_dio_start / fuse_file_shared_dio_end are moved up in
> >> the file, as I would like to entirely remove the fuse_direct_write iter
> >> function (all goes through cache_write_iter)
> >>
> >
> > Looks mostly good, but I think that fuse_file_shared_dio_start() =>
> > fuse_inode_deny_io_cache() should actually be done after taking
> > the inode lock (shared or exclusive) and not like in my patch.
> >
> > First of all, this comment in fuse_dio_wr_exclusive_lock():
> >
> >          /*
> >           * fuse_file_shared_dio_start() must not be called on retest,
> >           * as it decreases a counter value - must not be done twice
> >           */
> >          if (!fuse_file_shared_dio_start(inode))
> >                  return true;
> >
> > ...is suggesting that semantics are not clean and this check
> > must remain last, because if fuse_dio_wr_exclusive_lock()
> > returns false, iocachectr must not be elevated.
> > This is easy to get wrong in the future with current semantics.
> >
> > The more important thing is that while fuse_file_io_mmap()
> > is waiting for iocachectr to drop to zero, new parallel dio can
> > come in and starve the mmap() caller forever.
> >
> > I think that we are going to need to use some inode state flag
> > (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
> > unless we do not care about this possibility?
> > We'd only need to set this in fuse_file_io_mmap() until we get
> > the iocachectr refcount.
> >
> > I *think* that fuse_inode_deny_io_cache() should be called with
> > shared inode lock held, because of the existing lock chain
> > i_rwsem -> page lock -> mmap_lock for page faults, but I am
> > not sure. My brain is too cooked now to figure this out.
> > OTOH, I don't see any problem with calling
> > fuse_inode_deny_io_cache() with shared lock held?
> >
> > I pushed this version to my fuse_io_mode branch [1].
> > Only tested generic/095 with FOPEN_DIRECT_IO and
> > DIRECT_IO_ALLOW_MMAP.
> >
> > Thanks,
> > Amir.
> >
> > [1] https://github.com/amir73il/linux/commits/fuse_io_mode
>
> Thanks, will look into your changes next. I was looking into the initial
> issue with generic/095 with my branch. Fixed by the attached patch. I
> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
> Interesting is that filemap_range_has_writeback() is exported, but there
> was no user. Hopefully nobody submits an unexport patch in the mean time.
>

Ok. Now I am pretty sure that filemap_range_has_writeback() should be
checked after taking the shared lock in fuse_dio_lock() as in my branch and
not in fuse_dio_wr_exclusive_lock() outside the lock.

But at the same time, it is a little concerning that you are able to observe
dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
The whole point of fuse_inode_deny_io_cache() is that it should be
granted after all users of the inode page cache are gone.

Is it expected that fuse inode pages remain dirty after no more open files
and no more mmaps?

Did we miss some case of access to page cache? unaligned dio perhaps?

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-20  4:18                                                 ` Amir Goldstein
@ 2023-12-20  9:00                                                   ` Bernd Schubert
  2023-12-20 12:26                                                     ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-20  9:00 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/20/23 05:18, Amir Goldstein wrote:
> On Tue, Dec 19, 2023 at 10:47 PM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 12/19/23 14:01, Amir Goldstein wrote:
>>>>> Here is what I was thinking about:
>>>>>
>>>>> https://github.com/amir73il/linux/commits/fuse_io_mode
>>>>>
>>>>> The concept that I wanted to introduce was the
>>>>> fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
>>>>> helpers (akin to deny_write_access()/allow_write_access()).
>>>>>
>>>>> In this patch, parallel dio in progress deny open in caching mode
>>>>> and mmap, and I don't know if that is acceptable.
>>>>> Technically, instead of deny open/mmap you can use additional
>>>>> techniques to wait for in progress dio and allow caching open/mmap.
>>>>>
>>>>> Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
>>>>> pattern when file is open in FOPEN_PASSTHROUGH mode, but
>>>>> in this case, as agreed with Miklos, a server trying to mix open
>>>>> in caching mode on the same inode is going to fail the open.
>>>>>
>>>>> mmap is less of a problem for inode in passthrough mode, because
>>>>> mmap in of direct_io file and inode in passthrough mode is passthrough
>>>>> mmap to backing file.
>>>>>
>>>>> Anyway, if you can use this patch or parts of it, be my guest and if you
>>>>> want to use a different approach that is fine by me as well - in that case
>>>>> I will just remove the fuse_file_shared_dio_{start,end}() part from my patch.
>>>>
>>>> Hi Amir,
>>>>
>>>> here is my fuse-dio-v5 branch:
>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v5/
>>>>
>>>> (v5 is just compilation tested, tests are running now over night)
>>>
>>> This looks very nice!
>>> I left comments about some minor nits on github.
>>>
>>>>
>>>> This branch is basically about consolidating fuse write direct IO code
>>>> paths and to allow a shared lock for O_DIRECT. I actually could have
>>>> noticed the page cache issue with shared locks before with previous
>>>> versions of these patches, just my VM kernel is optimized for
>>>> compilation time and some SHM options had been missing - with that fio
>>>> refused to run.
>>>>
>>>> The branch includes a modified version of your patch:
>>>> https://github.com/bsbernd/linux/commit/6b05e52f7e253d9347d97de675b21b1707d6456e
>>>>
>>>> Main changes are
>>>> - fuse_file_io_open() does not set the FOPEN_CACHE_IO flag for
>>>> file->f_flags & O_DIRECT
>>>> - fuse_file_io_mmap() waits on a dio waitq
>>>> - fuse_file_shared_dio_start / fuse_file_shared_dio_end are moved up in
>>>> the file, as I would like to entirely remove the fuse_direct_write iter
>>>> function (all goes through cache_write_iter)
>>>>
>>>
>>> Looks mostly good, but I think that fuse_file_shared_dio_start() =>
>>> fuse_inode_deny_io_cache() should actually be done after taking
>>> the inode lock (shared or exclusive) and not like in my patch.
>>>
>>> First of all, this comment in fuse_dio_wr_exclusive_lock():
>>>
>>>           /*
>>>            * fuse_file_shared_dio_start() must not be called on retest,
>>>            * as it decreases a counter value - must not be done twice
>>>            */
>>>           if (!fuse_file_shared_dio_start(inode))
>>>                   return true;
>>>
>>> ...is suggesting that semantics are not clean and this check
>>> must remain last, because if fuse_dio_wr_exclusive_lock()
>>> returns false, iocachectr must not be elevated.
>>> This is easy to get wrong in the future with current semantics.
>>>
>>> The more important thing is that while fuse_file_io_mmap()
>>> is waiting for iocachectr to drop to zero, new parallel dio can
>>> come in and starve the mmap() caller forever.
>>>
>>> I think that we are going to need to use some inode state flag
>>> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
>>> unless we do not care about this possibility?
>>> We'd only need to set this in fuse_file_io_mmap() until we get
>>> the iocachectr refcount.
>>>
>>> I *think* that fuse_inode_deny_io_cache() should be called with
>>> shared inode lock held, because of the existing lock chain
>>> i_rwsem -> page lock -> mmap_lock for page faults, but I am
>>> not sure. My brain is too cooked now to figure this out.
>>> OTOH, I don't see any problem with calling
>>> fuse_inode_deny_io_cache() with shared lock held?
>>>
>>> I pushed this version to my fuse_io_mode branch [1].
>>> Only tested generic/095 with FOPEN_DIRECT_IO and
>>> DIRECT_IO_ALLOW_MMAP.
>>>
>>> Thanks,
>>> Amir.
>>>
>>> [1] https://github.com/amir73il/linux/commits/fuse_io_mode
>>
>> Thanks, will look into your changes next. I was looking into the initial
>> issue with generic/095 with my branch. Fixed by the attached patch. I
>> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
>> Interesting is that filemap_range_has_writeback() is exported, but there
>> was no user. Hopefully nobody submits an unexport patch in the mean time.
>>
> 
> Ok. Now I am pretty sure that filemap_range_has_writeback() should be
> check after taking the shared lock in fuse_dio_lock() as in my branch and
> not in fuse_dio_wr_exclusive_lock() outside the lock.



> 
> But at the same time, it is a little concerning that you are able to observe
> dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
> The whole point of fuse_inode_deny_io_cache() is that it should be
> granted after all users of the inode page cache are gone.
> 
> Is it expected that fuse inode pages remain dirty after no more open files
> and no more mmaps?


I'm actually not sure anymore if filemap_range_has_writeback() is 
actually needed. In fuse_flush() it calls write_inode_now(inode, 1), but 
I don't think that will flush queued fi->writectr (fi->writepages). Will 
report back in the afternoon.


> 
> Did we miss some case of access to page cache? unaligned dio perhaps?

Yeah, there is indeed another problem,
fuse_cache_write_iter() can fall back to page writes, but I had 
especially added a warning message in a temporary patch - I didn't manage 
to trigger that path yet. Will add a fix for that.

Thanks,
Bernd



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-20  9:00                                                   ` Bernd Schubert
@ 2023-12-20 12:26                                                     ` Bernd Schubert
  2023-12-20 22:13                                                       ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-20 12:26 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/20/23 10:00, Bernd Schubert wrote:
> 
> 
> On 12/20/23 05:18, Amir Goldstein wrote:
>> On Tue, Dec 19, 2023 at 10:47 PM Bernd Schubert
>> <bernd.schubert@fastmail.fm> wrote:
>>>
>>>
>>>
>>> On 12/19/23 14:01, Amir Goldstein wrote:
>>>>>> Here is what I was thinking about:
>>>>>>
>>>>>> https://github.com/amir73il/linux/commits/fuse_io_mode
>>>>>>
>>>>>> The concept that I wanted to introduce was the
>>>>>> fuse_inode_deny_io_cache()/fuse_inode_allow_io_cache()
>>>>>> helpers (akin to deny_write_access()/allow_write_access()).
>>>>>>
>>>>>> In this patch, parallel dio in progress deny open in caching mode
>>>>>> and mmap, and I don't know if that is acceptable.
>>>>>> Technically, instead of deny open/mmap you can use additional
>>>>>> techniques to wait for in progress dio and allow caching open/mmap.
>>>>>>
>>>>>> Anyway, I plan to use the iocachectr and fuse_inode_deny_io_cache()
>>>>>> pattern when file is open in FOPEN_PASSTHROUGH mode, but
>>>>>> in this case, as agreed with Miklos, a server trying to mix open
>>>>>> in caching mode on the same inode is going to fail the open.
>>>>>>
>>>>>> mmap is less of a problem for inode in passthrough mode, because
>>>>>> mmap in of direct_io file and inode in passthrough mode is 
>>>>>> passthrough
>>>>>> mmap to backing file.
>>>>>>
>>>>>> Anyway, if you can use this patch or parts of it, be my guest and 
>>>>>> if you
>>>>>> want to use a different approach that is fine by me as well - in 
>>>>>> that case
>>>>>> I will just remove the fuse_file_shared_dio_{start,end}() part 
>>>>>> from my patch.
>>>>>
>>>>> Hi Amir,
>>>>>
>>>>> here is my fuse-dio-v5 branch:
>>>>> https://github.com/bsbernd/linux/commits/fuse-dio-v5/
>>>>>
>>>>> (v5 is just compilation tested, tests are running now over night)
>>>>
>>>> This looks very nice!
>>>> I left comments about some minor nits on github.
>>>>
>>>>>
>>>>> This branch is basically about consolidating fuse write direct IO code
>>>>> paths and to allow a shared lock for O_DIRECT. I actually could have
>>>>> noticed the page cache issue with shared locks before with previous
>>>>> versions of these patches, just my VM kernel is optimized for
>>>>> compilation time and some SHM options had been missing - with that fio
>>>>> refused to run.
>>>>>
>>>>> The branch includes a modified version of your patch:
>>>>> https://github.com/bsbernd/linux/commit/6b05e52f7e253d9347d97de675b21b1707d6456e
>>>>>
>>>>> Main changes are
>>>>> - fuse_file_io_open() does not set the FOPEN_CACHE_IO flag for
>>>>> file->f_flags & O_DIRECT
>>>>> - fuse_file_io_mmap() waits on a dio waitq
>>>>> - fuse_file_shared_dio_start / fuse_file_shared_dio_end are moved 
>>>>> up in
>>>>> the file, as I would like to entirely remove the fuse_direct_write 
>>>>> iter
>>>>> function (all goes through cache_write_iter)
>>>>>
>>>>
>>>> Looks mostly good, but I think that fuse_file_shared_dio_start() =>
>>>> fuse_inode_deny_io_cache() should actually be done after taking
>>>> the inode lock (shared or exclusive) and not like in my patch.
>>>>
>>>> First of all, this comment in fuse_dio_wr_exclusive_lock():
>>>>
>>>>           /*
>>>>            * fuse_file_shared_dio_start() must not be called on retest,
>>>>            * as it decreases a counter value - must not be done twice
>>>>            */
>>>>           if (!fuse_file_shared_dio_start(inode))
>>>>                   return true;
>>>>
>>>> ...is suggesting that semantics are not clean and this check
>>>> must remain last, because if fuse_dio_wr_exclusive_lock()
>>>> returns false, iocachectr must not be elevated.
>>>> This is easy to get wrong in the future with current semantics.
>>>>
>>>> The more important thing is that while fuse_file_io_mmap()
>>>> is waiting for iocachectr to drop to zero, new parallel dio can
>>>> come in and starve the mmap() caller forever.
>>>>
>>>> I think that we are going to need to use some inode state flag
>>>> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
>>>> unless we do not care about this possibility?
>>>> We'd only need to set this in fuse_file_io_mmap() until we get
>>>> the iocachectr refcount.
>>>>
>>>> I *think* that fuse_inode_deny_io_cache() should be called with
>>>> shared inode lock held, because of the existing lock chain
>>>> i_rwsem -> page lock -> mmap_lock for page faults, but I am
>>>> not sure. My brain is too cooked now to figure this out.
>>>> OTOH, I don't see any problem with calling
>>>> fuse_inode_deny_io_cache() with shared lock held?
>>>>
>>>> I pushed this version to my fuse_io_mode branch [1].
>>>> Only tested generic/095 with FOPEN_DIRECT_IO and
>>>> DIRECT_IO_ALLOW_MMAP.
>>>>
>>>> Thanks,
>>>> Amir.
>>>>
>>>> [1] https://github.com/amir73il/linux/commits/fuse_io_mode
>>>
>>> Thanks, will look into your changes next. I was looking into the initial
>>> issue with generic/095 with my branch. Fixed by the attached patch. I
>>> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
>>> Interesting is that filemap_range_has_writeback() is exported, but there
>>> was no user. Hopefully nobody submits an unexport patch in the mean 
>>> time.
>>>
>>
>> Ok. Now I am pretty sure that filemap_range_has_writeback() should be
>> check after taking the shared lock in fuse_dio_lock() as in my branch and
>> not in fuse_dio_wr_exclusive_lock() outside the lock.
> 
> 
> 
>>
>> But at the same time, it is a little concerning that you are able to 
>> observe
>> dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
>> The whole point of fuse_inode_deny_io_cache() is that it should be
>> granted after all users of the inode page cache are gone.
>>
>> Is it expected that fuse inode pages remain dirty after no more open 
>> files
>> and no more mmaps?
> 
> 
> I'm actually not sure anymore if filemap_range_has_writeback() is 
> actually needed. In fuse_flush() it calls write_inode_now(inode, 1), but 
> I don't think that will flush queued fi->writectr (fi->writepages). Will 
> report back in the afternoon.

Sorry, my fault, please ignore the previous patch. Actually no dirty
pages are to be expected; I had missed that fuse_flush() calls
fuse_sync_writes(). The main bug in my branch was due to the different
handling of FOPEN_DIRECT_IO and O_DIRECT - for O_DIRECT I hadn't called
fuse_file_io_mmap().


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-20 12:26                                                     ` Bernd Schubert
@ 2023-12-20 22:13                                                       ` Bernd Schubert
  2023-12-21  9:18                                                         ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-20 22:13 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh




[...]

>>>>> I think that we are going to need to use some inode state flag
>>>>> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
>>>>> unless we do not care about this possibility?
>>>>> We'd only need to set this in fuse_file_io_mmap() until we get
>>>>> the iocachectr refcount.


I added back FUSE_I_CACHE_IO_MODE I had used previously.


>>>>>
>>>>> I *think* that fuse_inode_deny_io_cache() should be called with
>>>>> shared inode lock held, because of the existing lock chain
>>>>> i_rwsem -> page lock -> mmap_lock for page faults, but I am
>>>>> not sure. My brain is too cooked now to figure this out.
>>>>> OTOH, I don't see any problem with calling
>>>>> fuse_inode_deny_io_cache() with shared lock held?
>>>>>
>>>>> I pushed this version to my fuse_io_mode branch [1].
>>>>> Only tested generic/095 with FOPEN_DIRECT_IO and
>>>>> DIRECT_IO_ALLOW_MMAP.
>>>>>
>>>>> Thanks,
>>>>> Amir.
>>>>>
>>>>> [1] https://github.com/amir73il/linux/commits/fuse_io_mode
>>>>
>>>> Thanks, will look into your changes next. I was looking into the 
>>>> initial
>>>> issue with generic/095 with my branch. Fixed by the attached patch. I
>>>> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
>>>> Interesting is that filemap_range_has_writeback() is exported, but 
>>>> there
>>>> was no user. Hopefully nobody submits an unexport patch in the mean 
>>>> time.
>>>>
>>>
>>> Ok. Now I am pretty sure that filemap_range_has_writeback() should be
>>> check after taking the shared lock in fuse_dio_lock() as in my branch 
>>> and
>>> not in fuse_dio_wr_exclusive_lock() outside the lock.
>>
>>
>>
>>>
>>> But at the same time, it is a little concerning that you are able to 
>>> observe
>>> dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
>>> The whole point of fuse_inode_deny_io_cache() is that it should be
>>> granted after all users of the inode page cache are gone.
>>>
>>> Is it expected that fuse inode pages remain dirty after no more open 
>>> files
>>> and no more mmaps?
>>
>>
>> I'm actually not sure anymore if filemap_range_has_writeback() is 
>> actually needed. In fuse_flush() it calls write_inode_now(inode, 1), 
>> but I don't think that will flush queued fi->writectr 
>> (fi->writepages). Will report back in the afternoon.
> 
> Sorry, my fault, please ignore the previous patch. Actually no dirty 
> pages to be expected, I had missed the that fuse_flush calls 
> fuse_sync_writes(). The main bug in my branch was due to the different 
> handling of FOPEN_DIRECT_IO and O_DIRECT - for O_DIRECT I hadn't called 
> fuse_file_io_mmap().


I pushed a few fixes/updates into my fuse-dio-v5 branch and, to make
things simpler for you, also to my fuse_io_mode branch. The changes are
on top of the previous io-mode patch, so that you can easily see them
and possibly squash them into the main io patch.

https://github.com/bsbernd/linux/commits/fuse_io_mode/


Thanks,
Bernd


PS: I'm starting to feel a bit guilty about this long thread on
linux-fsdevel. It would be better to have it on fuse-devel, but the
sourceforge list is badly spammed.






^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-20 22:13                                                       ` Bernd Schubert
@ 2023-12-21  9:18                                                         ` Amir Goldstein
  2023-12-21 10:17                                                           ` Bernd Schubert
  0 siblings, 1 reply; 48+ messages in thread
From: Amir Goldstein @ 2023-12-21  9:18 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Thu, Dec 21, 2023 at 12:13 AM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
>
> [...]
>
> >>>>> I think that we are going to need to use some inode state flag
> >>>>> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
> >>>>> unless we do not care about this possibility?
> >>>>> We'd only need to set this in fuse_file_io_mmap() until we get
> >>>>> the iocachectr refcount.
>
>
> I added back FUSE_I_CACHE_IO_MODE I had used previously.
>

ACK.
Name is a bit confusing for the "want io mode" case, but IMO
a comment would be enough to make it clear.
Push a version with a comment to my branch.


>
> >>>>>
> >>>>> I *think* that fuse_inode_deny_io_cache() should be called with
> >>>>> shared inode lock held, because of the existing lock chain
> >>>>> i_rwsem -> page lock -> mmap_lock for page faults, but I am
> >>>>> not sure. My brain is too cooked now to figure this out.
> >>>>> OTOH, I don't see any problem with calling
> >>>>> fuse_inode_deny_io_cache() with shared lock held?
> >>>>>
> >>>>> I pushed this version to my fuse_io_mode branch [1].
> >>>>> Only tested generic/095 with FOPEN_DIRECT_IO and
> >>>>> DIRECT_IO_ALLOW_MMAP.
> >>>>>
> >>>>> Thanks,
> >>>>> Amir.
> >>>>>
> >>>>> [1] https://github.com/amir73il/linux/commits/fuse_io_mode
> >>>>
> >>>> Thanks, will look into your changes next. I was looking into the
> >>>> initial
> >>>> issue with generic/095 with my branch. Fixed by the attached patch. I
> >>>> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
> >>>> Interesting is that filemap_range_has_writeback() is exported, but
> >>>> there
> >>>> was no user. Hopefully nobody submits an unexport patch in the mean
> >>>> time.
> >>>>
> >>>
> >>> Ok. Now I am pretty sure that filemap_range_has_writeback() should be
> >>> check after taking the shared lock in fuse_dio_lock() as in my branch
> >>> and
> >>> not in fuse_dio_wr_exclusive_lock() outside the lock.
> >>
> >>
> >>
> >>>
> >>> But at the same time, it is a little concerning that you are able to
> >>> observe
> >>> dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
> >>> The whole point of fuse_inode_deny_io_cache() is that it should be
> >>> granted after all users of the inode page cache are gone.
> >>>
> >>> Is it expected that fuse inode pages remain dirty after no more open
> >>> files
> >>> and no more mmaps?
> >>
> >>
> >> I'm actually not sure anymore if filemap_range_has_writeback() is
> >> actually needed. In fuse_flush() it calls write_inode_now(inode, 1),
> >> but I don't think that will flush queued fi->writectr
> >> (fi->writepages). Will report back in the afternoon.
> >
> > Sorry, my fault, please ignore the previous patch. Actually no dirty
> > pages to be expected, I had missed the that fuse_flush calls
> > fuse_sync_writes(). The main bug in my branch was due to the different
> > handling of FOPEN_DIRECT_IO and O_DIRECT - for O_DIRECT I hadn't called
> > fuse_file_io_mmap().

But why would you need to call fuse_file_io_mmap() for O_DIRECT?
If a file was opened without FOPEN_DIRECT_IO, we already set inode to
caching mode on open.
Does your O_DIRECT patch to mmap solve an actual reproducible bug?

>
>
> I pushed a few fixes/updates into my fuse-dio-v5 branch and also to
> simplify it for you to my fuse_io_mode branch. Changes are onto of the
> previous patches io-mode patch to simplify it for you to see the changes
> and to possibly squash it into the main io patch.
>
> https://github.com/bsbernd/linux/commits/fuse_io_mode/
>

Cool. I squashed all your fixes to my branch, with minor comments
that I also left on github, except for the O_DIRECT patch, because
I do not understand why it is needed.

The 6.8 merge window is very close and the holidays are upon us,
so not sure if you and Miklos could be bothered, but do you think there
is a chance that we can get fuse_io_mode patches ready for queuing
in time for the 6.8 merge window?

They do have merit on their own for re-allowing parallel dio along with
FOPEN_PARALLEL_DIRECT_WRITES, but also, it would make it easier
for the both of us to develop fuse-dio and fuse-passthrough based on
the io cache mode during the 6.9 dev cycle.

>
> PS: I start to feel a bit guilty about this long thread on
> linux-fsdevel. Would be better to have that on fuse-devel, just the
> sourceforge list is badly spammed.
>

According to MAINTAINERS, linux-fsdevel is the list for linux FUSE
kernel development. The sourceforge fuse-devel is for libfuse.

We could open a linux-fuse list, but it has been this way forever
and I do not know of any complaints from fsdevel members.
The downside of not having a linux-fuse list IMO is that we do not
have a "fuse only" searchable archive, but we won't have it for all the
historic messages on fsdevel anyway.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-21  9:18                                                         ` Amir Goldstein
@ 2023-12-21 10:17                                                           ` Bernd Schubert
  2023-12-21 11:14                                                             ` Amir Goldstein
  0 siblings, 1 reply; 48+ messages in thread
From: Bernd Schubert @ 2023-12-21 10:17 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/21/23 10:18, Amir Goldstein wrote:
> On Thu, Dec 21, 2023 at 12:13 AM Bernd Schubert
> <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>>
>> [...]
>>
>>>>>>> I think that we are going to need to use some inode state flag
>>>>>>> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
>>>>>>> unless we do not care about this possibility?
>>>>>>> We'd only need to set this in fuse_file_io_mmap() until we get
>>>>>>> the iocachectr refcount.
>>
>>
>> I added back FUSE_I_CACHE_IO_MODE I had used previously.
>>
> 
> ACK.
> Name is a bit confusing for the "want io mode" case, but IMO
> a comment would be enough to make it clear.
> Push a version with a comment to my branch.
> 
> 
>>
>>>>>>>
>>>>>>> I *think* that fuse_inode_deny_io_cache() should be called with
>>>>>>> shared inode lock held, because of the existing lock chain
>>>>>>> i_rwsem -> page lock -> mmap_lock for page faults, but I am
>>>>>>> not sure. My brain is too cooked now to figure this out.
>>>>>>> OTOH, I don't see any problem with calling
>>>>>>> fuse_inode_deny_io_cache() with shared lock held?
>>>>>>>
>>>>>>> I pushed this version to my fuse_io_mode branch [1].
>>>>>>> Only tested generic/095 with FOPEN_DIRECT_IO and
>>>>>>> DIRECT_IO_ALLOW_MMAP.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Amir.
>>>>>>>
>>>>>>> [1] https://github.com/amir73il/linux/commits/fuse_io_mode
>>>>>>
>>>>>> Thanks, will look into your changes next. I was looking into the
>>>>>> initial
>>>>>> issue with generic/095 with my branch. Fixed by the attached patch. I
>>>>>> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
>>>>>> Interesting is that filemap_range_has_writeback() is exported, but
>>>>>> there
>>>>>> was no user. Hopefully nobody submits an unexport patch in the mean
>>>>>> time.
>>>>>>
>>>>>
>>>>> Ok. Now I am pretty sure that filemap_range_has_writeback() should be
>>>>> check after taking the shared lock in fuse_dio_lock() as in my branch
>>>>> and
>>>>> not in fuse_dio_wr_exclusive_lock() outside the lock.
>>>>
>>>>
>>>>
>>>>>
>>>>> But at the same time, it is a little concerning that you are able to
>>>>> observe
>>>>> dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
>>>>> The whole point of fuse_inode_deny_io_cache() is that it should be
>>>>> granted after all users of the inode page cache are gone.
>>>>>
>>>>> Is it expected that fuse inode pages remain dirty after no more open
>>>>> files
>>>>> and no more mmaps?
>>>>
>>>>
>>>> I'm actually not sure anymore if filemap_range_has_writeback() is
>>>> actually needed. In fuse_flush() it calls write_inode_now(inode, 1),
>>>> but I don't think that will flush queued fi->writectr
>>>> (fi->writepages). Will report back in the afternoon.
>>>
>>> Sorry, my fault, please ignore the previous patch. Actually no dirty
>>> pages to be expected, I had missed the that fuse_flush calls
>>> fuse_sync_writes(). The main bug in my branch was due to the different
>>> handling of FOPEN_DIRECT_IO and O_DIRECT - for O_DIRECT I hadn't called
>>> fuse_file_io_mmap().
> 
> But why would you need to call fuse_file_io_mmap() for O_DIRECT?
> If a file was opened without FOPEN_DIRECT_IO, we already set inode to
> caching mode on open.
> Does your O_DIRECT patch to mmap solve an actual reproducible bug?

Yeah it does, in my fuse-dio-v5 branch, which adds in shared locks for 
O_DIRECT writes without FOPEN_DIRECT_IO.

> 
>>
>>
>> I pushed a few fixes/updates into my fuse-dio-v5 branch and also to
>> simplify it for you to my fuse_io_mode branch. Changes are onto of the
>> previous patches io-mode patch to simplify it for you to see the changes
>> and to possibly squash it into the main io patch.
>>
>> https://github.com/bsbernd/linux/commits/fuse_io_mode/
>>
> 
> Cool. I squashed all your fixes to my branch, with minor comments
> that I also left on github, except for the O_DIRECT patch, because
> I do not understand why it is needed.

No issue with that, I can keep that patch on the branch that actually 
needs it.

Oh, I just saw your comments - I didn't get a github notification and so
missed your comments before. Sorry about that. Checking where I need to
enable it. I do get notifications for other projects, so didn't suspect
that anything would be missing...


> 
> The 6.8 merge window is very close and the holidays are upon us,
> so not sure if you and Miklos could be bothered, but do you think there
> is  a chance that we can get fuse_io_mode patches ready for queuing
> in time for the 6.8 merge window?
> 
> They do have merit on their own for re-allowing parallel dio along with
> FOPEN_PARALLEL_DIRECT_WRITES, but also, it would make it easier
> for the both of us to develop fuse-dio and fuse-passthrough based on
> the io cache mode during the 6.9 dev cycle.

I definitely would also like to get these patches in. Holidays have the 
merit that I don't need to get up at 7am to wake up kids and am then 
tired all the day. And no meetings ;)

 From my point of view my dio-v5 branch is also ready; it relies on these
patches. Not sure how to post it with the dependency.
I also have no issue with waiting for 6.9; for now I'm going to take these
patches to our fuse module for ubuntu and rhel9 kernels (quite heavily
patched, as it needs to live alongside the kernel-included module - symbol
renames, etc).


> 
>>
>> PS: I start to feel a bit guilty about this long thread on
>> linux-fsdevel. Would be better to have that on fuse-devel, just the
>> sourceforge list is badly spammed.
>>
> 
> According to MAINTAINERS, linux-fsdevel is the list for linux FUSE
> kernel development. The sourceforge fuse-devel is for libfuse.
> 
> We could open a linux-fuse list, but it has been this way forever
> and I do not know of any complaints from fsdevel members.
> the downside of not having linux-fuse list IMO is that we do not
> have a "fuse only" searchable archive, but we won't have it for all the
> historic messages on fsdevel anyway.

Sure, fine with me. I'm just a bit worried that others might get 
disturbed by all the fuse only messages.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-21 10:17                                                           ` Bernd Schubert
@ 2023-12-21 11:14                                                             ` Amir Goldstein
  2023-12-21 14:36                                                               ` Bernd Schubert
  2023-12-21 15:08                                                               ` Bernd Schubert
  0 siblings, 2 replies; 48+ messages in thread
From: Amir Goldstein @ 2023-12-21 11:14 UTC (permalink / raw)
  To: Bernd Schubert
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh

On Thu, Dec 21, 2023 at 12:17 PM Bernd Schubert
<bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 12/21/23 10:18, Amir Goldstein wrote:
> > On Thu, Dec 21, 2023 at 12:13 AM Bernd Schubert
> > <bernd.schubert@fastmail.fm> wrote:
> >>
> >>
> >>
> >>
> >> [...]
> >>
> >>>>>>> I think that we are going to need to use some inode state flag
> >>>>>>> (e.g. FUSE_I_DIO_WR_EXCL) to protect against this starvation,
> >>>>>>> unless we do not care about this possibility?
> >>>>>>> We'd only need to set this in fuse_file_io_mmap() until we get
> >>>>>>> the iocachectr refcount.
> >>
> >>
> >> I added back FUSE_I_CACHE_IO_MODE I had used previously.
> >>
> >
> > ACK.
> > Name is a bit confusing for the "want io mode" case, but IMO
> > a comment would be enough to make it clear.
> > Push a version with a comment to my branch.
> >
> >
> >>
> >>>>>>>
> >>>>>>> I *think* that fuse_inode_deny_io_cache() should be called with
> >>>>>>> shared inode lock held, because of the existing lock chain
> >>>>>>> i_rwsem -> page lock -> mmap_lock for page faults, but I am
> >>>>>>> not sure. My brain is too cooked now to figure this out.
> >>>>>>> OTOH, I don't see any problem with calling
> >>>>>>> fuse_inode_deny_io_cache() with shared lock held?
> >>>>>>>
> >>>>>>> I pushed this version to my fuse_io_mode branch [1].
> >>>>>>> Only tested generic/095 with FOPEN_DIRECT_IO and
> >>>>>>> DIRECT_IO_ALLOW_MMAP.
> >>>>>>>
> >>>>>>> Thanks,
> >>>>>>> Amir.
> >>>>>>>
> >>>>>>> [1] https://github.com/amir73il/linux/commits/fuse_io_mode
> >>>>>>
> >>>>>> Thanks, will look into your changes next. I was looking into the
> >>>>>> initial
> >>>>>> issue with generic/095 with my branch. Fixed by the attached patch. I
> >>>>>> think it is generic and also applies to FOPEN_DIRECT_IO + mmap.
> >>>>>> Interesting is that filemap_range_has_writeback() is exported, but
> >>>>>> there
> >>>>>> was no user. Hopefully nobody submits an unexport patch in the mean
> >>>>>> time.
> >>>>>>
> >>>>>
> >>>>> Ok. Now I am pretty sure that filemap_range_has_writeback() should be
> >>>>> check after taking the shared lock in fuse_dio_lock() as in my branch
> >>>>> and
> >>>>> not in fuse_dio_wr_exclusive_lock() outside the lock.
> >>>>
> >>>>
> >>>>
> >>>>>
> >>>>> But at the same time, it is a little concerning that you are able to
> >>>>> observe
> >>>>> dirty pages on a fuse inode after success of fuse_inode_deny_io_cache().
> >>>>> The whole point of fuse_inode_deny_io_cache() is that it should be
> >>>>> granted after all users of the inode page cache are gone.
> >>>>>
> >>>>> Is it expected that fuse inode pages remain dirty after no more open
> >>>>> files
> >>>>> and no more mmaps?
> >>>>
> >>>>
> >>>> I'm actually not sure anymore if filemap_range_has_writeback() is
> >>>> actually needed. In fuse_flush() it calls write_inode_now(inode, 1),
> >>>> but I don't think that will flush queued fi->writectr
> >>>> (fi->writepages). Will report back in the afternoon.
> >>>
> >>> Sorry, my fault, please ignore the previous patch. Actually no dirty
> >>> pages to be expected, I had missed the that fuse_flush calls
> >>> fuse_sync_writes(). The main bug in my branch was due to the different
> >>> handling of FOPEN_DIRECT_IO and O_DIRECT - for O_DIRECT I hadn't called
> >>> fuse_file_io_mmap().
> >
> > But why would you need to call fuse_file_io_mmap() for O_DIRECT?
> > If a file was opened without FOPEN_DIRECT_IO, we already set inode to
> > caching mode on open.
> > Does your O_DIRECT patch to mmap solve an actual reproducible bug?
>
> Yeah it does, in my fuse-dio-v5 branch, which adds in shared locks for
> O_DIRECT writes without FOPEN_DIRECT_IO.
>

Ah, right, because open(O_DIRECT) does not enter io cache mode
in your branch. I missed that.

But still, I think that a better fix for fuse_io_mode would be to treat
mmap of O_DIRECT exactly the same as mmap of FOPEN_DIRECT_IO,
including invalidating the page cache and requiring
FUSE_DIRECT_IO_ALLOW_MMAP.
I know this could be a change of behavior for applications doing mmap()
on an fd that was opened with O_DIRECT, but I doubt that such applications
exist, even if this really works with upstream code.

Something like this (pushed to my fuse_io_mode branch)?

+static bool fuse_file_is_direct_io(struct file *file)
+{
+       struct fuse_file *ff = file->private_data;
+
+       return ff->open_flags & FOPEN_DIRECT_IO || file->f_flags & O_DIRECT;
+}
+
 /* Request access to submit new io to inode via open file */
 static bool fuse_file_io_open(struct file *file, struct inode *inode)
 {
@@ -116,7 +121,7 @@ static bool fuse_file_io_open(struct file *file,
struct inode *inode)
                return true;

        /* Set explicit FOPEN_CACHE_IO flag for file open in caching mode */
-       if (!(ff->open_flags & FOPEN_DIRECT_IO) && !(file->f_flags & O_DIRECT))
+       if (!fuse_file_is_direct_io(file))
                ff->open_flags |= FOPEN_CACHE_IO;

        spin_lock(&fi->lock);
@@ -2622,8 +2627,9 @@ static int fuse_file_mmap(struct file *file,
struct vm_area_struct *vma)
        if (FUSE_IS_DAX(file_inode(file)))
                return fuse_dax_mmap(file, vma);

-       if (ff->open_flags & FOPEN_DIRECT_IO) {
-               /* Can't provide the coherency needed for MAP_SHARED
+       if (fuse_file_is_direct_io(file)) {
+               /*
+                * Can't provide the coherency needed for MAP_SHARED
                 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
                 */


> >
> >>
> >>
> >> I pushed a few fixes/updates into my fuse-dio-v5 branch and also to
> >> simplify it for you to my fuse_io_mode branch. Changes are onto of the
> >> previous patches io-mode patch to simplify it for you to see the changes
> >> and to possibly squash it into the main io patch.
> >>
> >> https://github.com/bsbernd/linux/commits/fuse_io_mode/
> >>
> >
> > Cool. I squashed all your fixes to my branch, with minor comments
> > that I also left on github, except for the O_DIRECT patch, because
> > I do not understand why it is needed.
>
> No issue with that, I can keep that patch on the branch that actually
> needs it.
>
> Oh, I just see your comments - I didn't get github notification and so
> missed your comments before. Sorry about that. Checking where I need to
> enable it. I do get notifications for other projects, so didn't suspect
> that anything would be missing...
>
>
> >
> > The 6.8 merge window is very close and the holidays are upon us,
> > so not sure if you and Miklos could be bothered, but do you think there
> > is  a chance that we can get fuse_io_mode patches ready for queuing
> > in time for the 6.8 merge window?
> >
> > They do have merit on their own for re-allowing parallel dio along with
> > FOPEN_PARALLEL_DIRECT_WRITES, but also, it would make it easier
> > for the both of us to develop fuse-dio and fuse-passthrough based on
> > the io cache mode during the 6.9 dev cycle.
>
> I definitely would also like to get these patches in. Holidays have the
> merit that I don't need to get up at 7am to wake up kids and am then
> tired all the day. And no meetings ;)
>

I think that between you and me, fuse_io_mode is getting very close to
converging, so queuing it for 6.8 really depends on Miklos' availability
during the following week.

I suggest that you incorporate my review comments from github
and/or use the patches that I pushed to my fuse_io_mode branch
and post the io mode patches for review on the list as soon as
possible. I could do that, but I trust that you are testing dio much
better than I am.

>  From my point my dio-v5 branch is also ready, it relies on these
> patches. Not sure how to post it with the dependency.

Basically, you just post the io mode patch set and then you
post the dio patches with a reference to the io mode patches
that they depend on.

> I also have no issue to wait for 6.9, for now I'm going to take these
> patches to our fuse module for ubuntu and rhel9 kernels (quite heavily
> patched, as it needs to live aside the kernel included module - symbol
> renames, etc).
>

Feels to me like the dio patches are a bit heavier to review than just the
io mode patches, so not likely to be ready for 6.8, but it's not up to me.
I can only say that my review of io mode patches is done and that I have
tested them, while my own ability to review fuse-dio patches for the 6.8
timeframe is limited.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-21 11:14                                                             ` Amir Goldstein
@ 2023-12-21 14:36                                                               ` Bernd Schubert
  2023-12-21 15:08                                                               ` Bernd Schubert
  1 sibling, 0 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-12-21 14:36 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh


>>> But why would you need to call fuse_file_io_mmap() for O_DIRECT?
>>> If a file was opened without FOPEN_DIRECT_IO, we already set inode to
>>> caching mode on open.
>>> Does your O_DIRECT patch to mmap solve an actual reproducible bug?
>>
>> Yeah it does, in my fuse-dio-v5 branch, which adds in shared locks for
>> O_DIRECT writes without FOPEN_DIRECT_IO.
>>
> 
> Ah. right, because open(O_DIRECT) does not enter io cache mode
> in your branch. I missed that.
> 
> But still, I think that a better fix for fuse_io_mode would be to treat
> mmap of O_DIRECT exactly the same as mmap of FOPEN_DIRECT_IO,
> including invalidate page cache and require FUSE_DIRECT_IO_ALLOW_MMAP.
> I know this could be a change of behavior of applications doing mmap()
> on an fd that was opened with O_DIRECT, but I doubt that such applications
> exist, even if this really works with upstream code.
> 
> Something like this (pushed to my fuse_io_mode branch)?
> 
> +static bool fuse_file_is_direct_io(struct file *file)
> +{
> +       struct fuse_file *ff = file->private_data;
> +
> +       return ff->open_flags & FOPEN_DIRECT_IO || file->f_flags & O_DIRECT;
> +}
> +
>   /* Request access to submit new io to inode via open file */
>   static bool fuse_file_io_open(struct file *file, struct inode *inode)
>   {
> @@ -116,7 +121,7 @@ static bool fuse_file_io_open(struct file *file,
> struct inode *inode)
>                  return true;
> 
>          /* Set explicit FOPEN_CACHE_IO flag for file open in caching mode */
> -       if (!(ff->open_flags & FOPEN_DIRECT_IO) && !(file->f_flags & O_DIRECT))
> +       if (!fuse_file_is_direct_io(file))
>                  ff->open_flags |= FOPEN_CACHE_IO;
> 
>          spin_lock(&fi->lock);
> @@ -2622,8 +2627,9 @@ static int fuse_file_mmap(struct file *file,
> struct vm_area_struct *vma)
>          if (FUSE_IS_DAX(file_inode(file)))
>                  return fuse_dax_mmap(file, vma);
> 
> -       if (ff->open_flags & FOPEN_DIRECT_IO) {
> -               /* Can't provide the coherency needed for MAP_SHARED
> +       if (fuse_file_is_direct_io(file)) {
> +               /*
> +                * Can't provide the coherency needed for MAP_SHARED
>                   * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
>                   */
> 

I cut off the rest of the discussion as this is from my point of view a 
rather major change.

I'm not sure about this. If an application opens with O_DIRECT and then 
does mmap - sure, weird options, but then none of the other network file 
systems requires a special setting for that? And none of the other file 
systems has page invalidations? Why is it needed for fuse's O_DIRECT? 
The initial invalidation was for MAP_PRIVATE and FOPEN_DIRECT_IO, but 
now it gets extended - I get really worried about this special fuse 
handling that none of the other file systems have.

Also, NFS and smb/cifs do not have the same coherency guarantees, but 
still allow mmap on O_DIRECT?

And assuming an application does this on an existing fuse file system, 
the application would now need to ask the fuse file system developer to 
set this flag with a new kernel?

At a minimum I wouldn't like to have this without its own change log 
entry, because otherwise there is the risk that everything needs to be 
reverted in case a regression is reported.


Thanks,
Bernd





^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP}
  2023-12-21 11:14                                                             ` Amir Goldstein
  2023-12-21 14:36                                                               ` Bernd Schubert
@ 2023-12-21 15:08                                                               ` Bernd Schubert
  1 sibling, 0 replies; 48+ messages in thread
From: Bernd Schubert @ 2023-12-21 15:08 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, Tyler Fanelli, linux-fsdevel, mszeredi,
	gmaglione, hreitz, Hao Xu, Dharmendra Singh



On 12/21/23 12:14, Amir Goldstein wrote:
> I think that between you and I, fuse_io_mode is getting very close to
> converging, so queuing it for 6.8 really depends on Miklos' availability
> during the following week.
> 
> I suggest that you incorporate my review comments from github
> and/or use the patches that I pushed to my fuse_io_mode branch
> and post the io mode patches for review on the list as soon as
> possible. I could do that, but I trust that you are testing dio much
> better than I am.

Sure, will do that, but I will back out 
FOPEN_DIRECT_IO/O_DIRECT-are-the-same in fuse_file_mmap. I don't think 
it is needed without parallel mmap and I think it deserves its own 
discussion first.

> 
>>   From my point my dio-v5 branch is also ready, it relies on these
>> patches. Not sure how to post it with the dependency.
> 
> Basically, you just post the io mode patch set and then you
> post the dio patches with a reference to the io mode patches
> that they depend on.
> 
>> I also have no issue to wait for 6.9, for now I'm going to take these
>> patches to our fuse module for ubuntu and rhel9 kernels (quite heavily
>> patched, as it needs to live aside the kernel included module - symbol
>> renames, etc).
>>
> 
> Feels to me like the dio patches are a bit heavier to review than just the
> io mode patches, so not likely to be ready for 6.8, but it's not up to me.
> I can only say that my review of io mode patches is done and that I have
> tested them, while my own ability to review fuse-dio patches for the 6.8
> timeframe is limited.

I'm first going to post the fuse_io_mode branch, no need to add in more 
distraction. Once Miklos has reviewed (and merged) that, I can 
immediately post the dio branch.


Thanks,
Bernd

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2023-12-21 15:08 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-20  2:39 [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Tyler Fanelli
2023-09-20  2:40 ` [PATCH 1/2] fs/fuse: Rename DIRECT_IO_RELAX to DIRECT_IO_ALLOW_MMAP Tyler Fanelli
2023-09-20  8:31   ` Hanna Czenczek
2023-09-20  9:04     ` Bernd Schubert
2023-09-20  2:40 ` [PATCH 2/2] docs/fuse-io: Document the usage of DIRECT_IO_ALLOW_MMAP Tyler Fanelli
2023-09-20  8:15 ` [PATCH 0/2] fuse: Rename DIRECT_IO_{RELAX -> ALLOW_MMAP} Miklos Szeredi
2023-11-06 14:08   ` Bernd Schubert
2023-11-08 18:19     ` Tyler Fanelli
2023-12-02 15:06     ` Amir Goldstein
2023-12-03 11:20       ` Amir Goldstein
2023-12-03 23:00         ` Bernd Schubert
2023-12-04  6:50           ` Amir Goldstein
2023-12-04  9:27             ` Miklos Szeredi
2023-12-04 10:04               ` Bernd Schubert
2023-12-04 23:42                 ` Bernd Schubert
2023-12-05  7:00                   ` Amir Goldstein
2023-12-05 14:01                     ` Bernd Schubert
2023-12-05 19:18                       ` Amir Goldstein
2023-12-05 23:56                       ` Bernd Schubert
2023-12-06  8:25                         ` Amir Goldstein
2023-12-06 23:28                           ` Bernd Schubert
2023-12-07  7:39                             ` Amir Goldstein
2023-12-07  9:12                               ` Bernd Schubert
2023-12-07 18:37                               ` Bernd Schubert
2023-12-08  8:39                                 ` Amir Goldstein
2023-12-08 19:49                                   ` Bernd Schubert
2023-12-08 20:46                                     ` Amir Goldstein
2023-12-08 22:38                                       ` Bernd Schubert
2023-12-12 18:30                                         ` Amir Goldstein
2023-12-12 22:07                                           ` Bernd Schubert
     [not found]                                             ` <CAOQ4uxh=aBFEiBVBErEA_d+mWcTOysLgbgWVztSzL+D2BvMLdA@mail.gmail.com>
2023-12-13 10:11                                               ` Bernd Schubert
2023-12-13 11:23                                                 ` Amir Goldstein
2023-12-13 13:03                                                   ` Bernd Schubert
2023-12-13 14:09                                                     ` Bernd Schubert
2023-12-14 11:50                                                       ` Bernd Schubert
2023-12-19  0:03                                           ` Bernd Schubert
2023-12-19 13:01                                             ` Amir Goldstein
2023-12-19 20:47                                               ` Bernd Schubert
2023-12-20  4:18                                                 ` Amir Goldstein
2023-12-20  9:00                                                   ` Bernd Schubert
2023-12-20 12:26                                                     ` Bernd Schubert
2023-12-20 22:13                                                       ` Bernd Schubert
2023-12-21  9:18                                                         ` Amir Goldstein
2023-12-21 10:17                                                           ` Bernd Schubert
2023-12-21 11:14                                                             ` Amir Goldstein
2023-12-21 14:36                                                               ` Bernd Schubert
2023-12-21 15:08                                                               ` Bernd Schubert
2023-09-20  8:42 ` Bernd Schubert

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.