* [RFC] Bypass filesystems for reading cached pages
@ 2020-06-19 15:50 Matthew Wilcox
  2020-06-19 19:06 ` Chaitanya Kulkarni
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-19 15:50 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, agruenba; +Cc: linux-kernel


This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
The advantage of this patch is that we can avoid taking any filesystem
lock, as long as the pages being accessed are in the cache (and we don't
need to readahead any pages into the cache).  We also avoid an indirect
function call in these cases.

I'm sure reusing the name call_read_iter() is the wrong way to go about
this, but renaming all the callers would make this a larger patch.
I'm happy to do it if something like this stands a chance of being
accepted.

Compared to Andreas' patch, I removed the -ECANCELED return value.
We can happily return 0 from generic_file_buffered_read() and it's less
code to handle that.  I bypass the attempt to read from the page cache
for O_DIRECT reads, and for inodes which have no cached pages.  Hopefully
this will avoid calling generic_file_buffered_read() for drivers which
implement read_iter() (although I haven't audited them all to check that).

This could go horribly wrong if filesystems rely on doing work in their
->read_iter implementation (eg checking i_size after acquiring their
lock) instead of keeping the page cache uptodate.  On the other hand,
the ->map_pages() method is already called without locks, so filesystems
should already be prepared for this.
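
For illustration, here is a minimal sketch of the kind of ->read_iter() that does work under its own lock and is bypassed here.  This is not code from any particular filesystem; inode_lock_shared() merely stands in for whatever lock the filesystem takes (the XFS iolock, a gfs2 glock, ...):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t example_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* Whatever the filesystem checks or revalidates under this lock
	 * (i_size, remote cache coherence, ...) is skipped when the read
	 * is served straight from the page cache. */
	inode_lock_shared(inode);
	ret = generic_file_read_iter(iocb, to);
	inode_unlock_shared(inode);

	return ret;
}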

Arguably we could do something similar for writes.  I'm a little more
scared of that patch since filesystems are more likely to want to do
things to keep their files in sync for writes.

diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..7b899538d3c0 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
 				read_write == READ ? MAY_READ : MAY_WRITE);
 }
 
+ssize_t call_read_iter(struct file *file, struct kiocb *iocb,
+				     struct iov_iter *iter)
+{
+	ssize_t written, ret = 0;
+
+	if (iocb->ki_flags & IOCB_DIRECT)
+		goto uncached;
+	if (!file->f_mapping->nrpages)
+		goto uncached;
+
+	iocb->ki_flags |= IOCB_CACHED;
+	ret = generic_file_buffered_read(iocb, iter, 0);
+	iocb->ki_flags &= ~IOCB_CACHED;
+
+	if (likely(!iov_iter_count(iter)))
+		return ret;
+
+	if (ret == -EAGAIN) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return ret;
+		ret = 0;
+	} else if (ret < 0) {
+		return ret;
+	}
+
+uncached:
+	written = ret;
+
+	ret = file->f_op->read_iter(iocb, iter);
+	if (ret > 0)
+		written += ret;
+
+	return written ? written : ret;
+}
+
 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
 	struct iovec iov = { .iov_base = buf, .iov_len = len };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..0985773feffd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+#define IOCB_CACHED		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -1895,11 +1896,7 @@ struct inode_operations {
 	int (*set_acl)(struct inode *, struct posix_acl *, int);
 } ____cacheline_aligned;
 
-static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
-				     struct iov_iter *iter)
-{
-	return file->f_op->read_iter(kio, iter);
-}
+ssize_t call_read_iter(struct file *, struct kiocb *, struct iov_iter *);
 
 static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
 				      struct iov_iter *iter)
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..4ee97941a1f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 		page = find_get_page(mapping, index);
 		if (!page) {
-			if (iocb->ki_flags & IOCB_NOWAIT)
+			if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED))
 				goto would_block;
 			page_cache_sync_readahead(mapping,
 					ra, filp,
@@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 				goto no_cached_page;
 		}
 		if (PageReadahead(page)) {
+			if (iocb->ki_flags & IOCB_CACHED) {
+				put_page(page);
+				goto out;
+			}
 			page_cache_async_readahead(mapping,
 					ra, filp, page,
 					index, last_index - index);
 		}
 		if (!PageUptodate(page)) {
-			if (iocb->ki_flags & IOCB_NOWAIT) {
+			if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) {
 				put_page(page);
 				goto would_block;
 			}



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
@ 2020-06-19 19:06 ` Chaitanya Kulkarni
  2020-06-19 20:12   ` Matthew Wilcox
  2020-06-20  6:19 ` Amir Goldstein
  2020-06-22  0:32 ` Dave Chinner
  2 siblings, 1 reply; 18+ messages in thread
From: Chaitanya Kulkarni @ 2020-06-19 19:06 UTC (permalink / raw)
  To: Matthew Wilcox, linux-fsdevel, linux-mm, agruenba; +Cc: linux-kernel

On 6/19/20 8:50 AM, Matthew Wilcox wrote:
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache).  We also avoid an indirect
> function call in these cases.
> 
> I'm sure reusing the name call_read_iter() is the wrong way to go about
> this, but renaming all the callers would make this a larger patch.
> I'm happy to do it if something like this stands a chance of being
> accepted.
> 
> Compared to Andreas' patch, I removed the -ECANCELED return value.
> We can happily return 0 from generic_file_buffered_read() and it's less
> code to handle that.  I bypass the attempt to read from the page cache
> for O_DIRECT reads, and for inodes which have no cached pages.  Hopefully
> this will avoid calling generic_file_buffered_read() for drivers which
> implement read_iter() (although I haven't audited them all to check that).
> 
> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate.  On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.
> 
> Arguably we could do something similar for writes.  I'm a little more
> scared of that patch since filesystems are more likely to want to do
> things to keep their files in sync for writes.

I did some testing with the NVMeOF target file backend with buffered I/O
enabled, applying your patch and setting IOCB_CACHED, OR'ed ('|') with
IOCB_NOWAIT, for each I/O when calling call_read_iter_cached() [1].

The name was changed from call_read_iter() to call_read_iter_cached() [2].

For the file system I used XFS, and the device was null_blk with memory
backing, so the entire file was cached in DRAM.

Following are the performance numbers :-

IOPS/Bandwidth :-

default-page-cache:      read:  IOPS=1389k,  BW=5424MiB/s  (5688MB/s)
default-page-cache:      read:  IOPS=1381k,  BW=5395MiB/s  (5657MB/s)
default-page-cache:      read:  IOPS=1391k,  BW=5432MiB/s  (5696MB/s)
iocb-cached-page-cache:  read:  IOPS=1403k,  BW=5481MiB/s  (5747MB/s)
iocb-cached-page-cache:  read:  IOPS=1393k,  BW=5439MiB/s  (5704MB/s)
iocb-cached-page-cache:  read:  IOPS=1399k,  BW=5465MiB/s  (5731MB/s)

Submission lat :-

default-page-cache:      slat  (usec):  min=2,  max=1076,  avg=  3.71,
default-page-cache:      slat  (usec):  min=2,  max=489,   avg=  3.72,
default-page-cache:      slat  (usec):  min=2,  max=1078,  avg=  3.70,
iocb-cached-page-cache:  slat  (usec):  min=2,  max=1731,  avg=  3.70,
iocb-cached-page-cache:  slat  (usec):  min=2,  max=2115,  avg=  3.69,
iocb-cached-page-cache:  slat  (usec):  min=2,  max=3055,  avg=  3.70,

CPU :-

default-page-cache:      cpu  :  usr=10.36%,  sys=49.29%,  ctx=5207722,
default-page-cache:      cpu  :  usr=10.49%,  sys=49.15%,  ctx=5179714,
default-page-cache:      cpu  :  usr=10.56%,  sys=49.22%,  ctx=5215474,
iocb-cached-page-cache:  cpu  :  usr=10.26%,  sys=49.53%,  ctx=5262214,
iocb-cached-page-cache:  cpu  :  usr=10.43%,  sys=49.35%,  ctx=5222433,
iocb-cached-page-cache:  cpu  :  usr=10.47%,  sys=49.59%,  ctx=5247344,


Regards,
Chaitanya

[1]

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..02fa272399b6 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -120,6 +120,9 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
         iocb->ki_filp = req->ns->file;
         iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);

+       if (rw == READ)
+               return call_read_iter_cached(req->ns->file, iocb, &iter);
+
         return call_iter(iocb, &iter);
  }

@@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)

         if (req->ns->buffered_io) {
                 if (likely(!req->f.mpool_alloc) &&
-                               nvmet_file_execute_io(req, IOCB_NOWAIT))
+                               nvmet_file_execute_io(req,
+                                       IOCB_NOWAIT |IOCB_CACHED))
                         return;
                 nvmet_file_submit_buffered_io(req);


[2]
diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..d4bf2581ff0b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
                                 read_write == READ ? MAY_READ : MAY_WRITE);
  }

+ssize_t call_read_iter_cached(struct file *file, struct kiocb *iocb,
+                                    struct iov_iter *iter)
+{
+       ssize_t written, ret = 0;
+
+       if (iocb->ki_flags & IOCB_DIRECT)
+               goto uncached;
+       if (!file->f_mapping->nrpages)
+               goto uncached;
+
+       iocb->ki_flags |= IOCB_CACHED;
+       ret = generic_file_buffered_read(iocb, iter, 0);
+       iocb->ki_flags &= ~IOCB_CACHED;
+
+       if (likely(!iov_iter_count(iter)))
+               return ret;
+
+       if (ret == -EAGAIN) {
+               if (iocb->ki_flags & IOCB_NOWAIT)
+                       return ret;
+               ret = 0;
+       } else if (ret < 0) {
+               return ret;
+       }
+
+uncached:
+       written = ret;
+
+       ret = file->f_op->read_iter(iocb, iter);
+       if (ret > 0)
+               written += ret;
+
+       return written ? written : ret;
+}
+
  static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  {
         struct iovec iov = { .iov_base = buf, .iov_len = len };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..3fc8b00cd140 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ enum rw_hint {
  #define IOCB_SYNC              (1 << 5)
  #define IOCB_WRITE             (1 << 6)
  #define IOCB_NOWAIT            (1 << 7)
+#define IOCB_CACHED            (1 << 8)

  struct kiocb {
         struct file             *ki_filp;
@@ -1901,6 +1902,8 @@ static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
         return file->f_op->read_iter(kio, iter);
  }

+ssize_t call_read_iter_cached(struct file *, struct kiocb *, struct iov_iter *);
+
  static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
                                       struct iov_iter *iter)
  {
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..4ee97941a1f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,

                 page = find_get_page(mapping, index);
                 if (!page) {
-                       if (iocb->ki_flags & IOCB_NOWAIT)
+                       if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED))
                                 goto would_block;
                         page_cache_sync_readahead(mapping,
                                         ra, filp,
@@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
                                 goto no_cached_page;
                 }
                 if (PageReadahead(page)) {
+                       if (iocb->ki_flags & IOCB_CACHED) {
+                               put_page(page);
+                               goto out;
+                       }
                         page_cache_async_readahead(mapping,
                                         ra, filp, page,
                                         index, last_index - index);
                 }
                 if (!PageUptodate(page)) {
-                       if (iocb->ki_flags & IOCB_NOWAIT) {
+                       if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) {
                                 put_page(page);
                                 goto would_block;
                         }
-- 
2.26.0



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 19:06 ` Chaitanya Kulkarni
@ 2020-06-19 20:12   ` Matthew Wilcox
  2020-06-19 21:25     ` Chaitanya Kulkarni
  0 siblings, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-19 20:12 UTC (permalink / raw)
  To: Chaitanya Kulkarni; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Fri, Jun 19, 2020 at 07:06:19PM +0000, Chaitanya Kulkarni wrote:
> On 6/19/20 8:50 AM, Matthew Wilcox wrote:
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
> 
> I did some testing with the NVMeOF target file backend with buffered I/O
> enabled, applying your patch and setting IOCB_CACHED, OR'ed ('|') with
> IOCB_NOWAIT, for each I/O when calling call_read_iter_cached() [1].
>
> The name was changed from call_read_iter() to call_read_iter_cached() [2].
> 
> For the file system I used XFS, and the device was null_blk with memory
> backing, so the entire file was cached in DRAM.

Thanks for testing!  Can you elaborate a little more on what the test does?
Are there many threads or tasks?  What is the I/O path?  XFS on an NVMEoF
device, talking over loopback to localhost with nullblk as the server?

The nullblk device will have all the data in its pagecache, but each XFS
file will have an empty pagecache initially.  Then it'll be populated by
the test, so does the I/O pattern revisit previously accessed data at all?

> Following are the performance numbers :-
> 
> IOPS/Bandwidth :-
> 
> default-page-cache:      read:  IOPS=1389k,  BW=5424MiB/s  (5688MB/s)
> default-page-cache:      read:  IOPS=1381k,  BW=5395MiB/s  (5657MB/s)
> default-page-cache:      read:  IOPS=1391k,  BW=5432MiB/s  (5696MB/s)
> iocb-cached-page-cache:  read:  IOPS=1403k,  BW=5481MiB/s  (5747MB/s)
> iocb-cached-page-cache:  read:  IOPS=1393k,  BW=5439MiB/s  (5704MB/s)
> iocb-cached-page-cache:  read:  IOPS=1399k,  BW=5465MiB/s  (5731MB/s)

That doesn't look bad at all ... about 0.7% increase in IOPS.

> Submission lat :-
> 
> default-page-cache:      slat  (usec):  min=2,  max=1076,  avg=  3.71,
> default-page-cache:      slat  (usec):  min=2,  max=489,   avg=  3.72,
> default-page-cache:      slat  (usec):  min=2,  max=1078,  avg=  3.70,
> iocb-cached-page-cache:  slat  (usec):  min=2,  max=1731,  avg=  3.70,
> iocb-cached-page-cache:  slat  (usec):  min=2,  max=2115,  avg=  3.69,
> iocb-cached-page-cache:  slat  (usec):  min=2,  max=3055,  avg=  3.70,

Average latency unchanged, max latency up a little ... makes sense,
since we'll do a little more work in the worst case.

> @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
> 
>          if (req->ns->buffered_io) {
>                  if (likely(!req->f.mpool_alloc) &&
> -                               nvmet_file_execute_io(req, IOCB_NOWAIT))
> +                               nvmet_file_execute_io(req,
> +                                       IOCB_NOWAIT |IOCB_CACHED))
>                          return;
>                  nvmet_file_submit_buffered_io(req);

You'll need a fallback path here, right?  IOCB_CACHED can get part-way
through doing a request, and then need to be finished off after taking
the mutex.



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 20:12   ` Matthew Wilcox
@ 2020-06-19 21:25     ` Chaitanya Kulkarni
  0 siblings, 0 replies; 18+ messages in thread
From: Chaitanya Kulkarni @ 2020-06-19 21:25 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

Matthew,

On 6/19/20 1:12 PM, Matthew Wilcox wrote:
> On Fri, Jun 19, 2020 at 07:06:19PM +0000, Chaitanya Kulkarni wrote:
>> On 6/19/20 8:50 AM, Matthew Wilcox wrote:
>>> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
>>> The advantage of this patch is that we can avoid taking any filesystem
>>> lock, as long as the pages being accessed are in the cache (and we don't
>>> need to readahead any pages into the cache).  We also avoid an indirect
>>> function call in these cases.
>>
>> I did some testing with the NVMeOF target file backend with buffered I/O
>> enabled, applying your patch and setting IOCB_CACHED, OR'ed ('|') with
>> IOCB_NOWAIT, for each I/O when calling call_read_iter_cached() [1].
>>
>> The name was changed from call_read_iter() to call_read_iter_cached() [2].
>>
>> For the file system I used XFS, and the device was null_blk with memory
>> backing, so the entire file was cached in DRAM.
> 
> Thanks for testing!  Can you elaborate a little more on what the test does?
> Are there many threads or tasks?  What is the I/O path?  XFS on an NVMEoF
> device, talking over loopback to localhost with nullblk as the server?
> 
> The nullblk device will have all the data in its pagecache, but each XFS
> file will have an empty pagecache initially.  Then it'll be populated by
> the test, so does the I/O pattern revisit previously accessed data at all?
> 

Will share details shortly.

>> Following are the performance numbers :-
>>
>> IOPS/Bandwidth :-
>>
>> default-page-cache:      read:  IOPS=1389k,  BW=5424MiB/s  (5688MB/s)
>> default-page-cache:      read:  IOPS=1381k,  BW=5395MiB/s  (5657MB/s)
>> default-page-cache:      read:  IOPS=1391k,  BW=5432MiB/s  (5696MB/s)
>> iocb-cached-page-cache:  read:  IOPS=1403k,  BW=5481MiB/s  (5747MB/s)
>> iocb-cached-page-cache:  read:  IOPS=1393k,  BW=5439MiB/s  (5704MB/s)
>> iocb-cached-page-cache:  read:  IOPS=1399k,  BW=5465MiB/s  (5731MB/s)
> 
> That doesn't look bad at all ... about 0.7% increase in IOPS.
> 
>> Submission lat :-
>>
>> default-page-cache:      slat  (usec):  min=2,  max=1076,  avg=  3.71,
>> default-page-cache:      slat  (usec):  min=2,  max=489,   avg=  3.72,
>> default-page-cache:      slat  (usec):  min=2,  max=1078,  avg=  3.70,
>> iocb-cached-page-cache:  slat  (usec):  min=2,  max=1731,  avg=  3.70,
>> iocb-cached-page-cache:  slat  (usec):  min=2,  max=2115,  avg=  3.69,
>> iocb-cached-page-cache:  slat  (usec):  min=2,  max=3055,  avg=  3.70,
> 
> Average latency unchanged, max latency up a little ... makes sense,
> since we'll do a little more work in the worst case.
> 
>> @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
>>
>>           if (req->ns->buffered_io) {
>>                   if (likely(!req->f.mpool_alloc) &&
>> -                               nvmet_file_execute_io(req, IOCB_NOWAIT))
>> +                               nvmet_file_execute_io(req,
>> +                                       IOCB_NOWAIT |IOCB_CACHED))
>>                           return;
>>                   nvmet_file_submit_buffered_io(req);
> 
> You'll need a fallback path here, right?  IOCB_CACHED can get part-way
> through doing a request, and then need to be finished off after taking
> the mutex.
> 
> 

That is something that needs to be done in call_read_iter_cached() itself
(or controlled with a flag there); otherwise callers will have to duplicate
that fallback code all over the kernel. Correct me if I'm wrong.

For now I'm populating all the data in the cache before the fio runs, so
the 2nd and 3rd runs are served from the cache.


Also, looking at the code now, I don't think callers need to set the
IOCB_CACHED flag themselves as long as they call call_read_iter_cached(),
since it takes care of adding the flag. That is the correct way of
abstracting this, rather than having every caller duplicate the flag all
over the kernel.

Am I missing something?

I'll run the test again and share more details.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
  2020-06-19 19:06 ` Chaitanya Kulkarni
@ 2020-06-20  6:19 ` Amir Goldstein
  2020-06-20 19:15   ` Matthew Wilcox
  2020-06-22  0:32 ` Dave Chinner
  2 siblings, 1 reply; 18+ messages in thread
From: Amir Goldstein @ 2020-06-20  6:19 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, Linux MM, Andreas Gruenbacher, linux-kernel

On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
>
>
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache).  We also avoid an indirect
> function call in these cases.
>
> I'm sure reusing the name call_read_iter() is the wrong way to go about
> this, but renaming all the callers would make this a larger patch.
> I'm happy to do it if something like this stands a chance of being
> accepted.
>
> Compared to Andreas' patch, I removed the -ECANCELED return value.
> We can happily return 0 from generic_file_buffered_read() and it's less
> code to handle that.  I bypass the attempt to read from the page cache
> for O_DIRECT reads, and for inodes which have no cached pages.  Hopefully
> this will avoid calling generic_file_buffered_read() for drivers which
> implement read_iter() (although I haven't audited them all to check that).
>
> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate.  On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.
>

XFS is taking i_rwsem lock in read_iter() for a surprising reason:
https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
In that post I claim that ocfs2 and cifs also do some work in read_iter().
I didn't go back to check what, but it sounds like cache coherence among
nodes.

So filesystems will need to opt-in to this behavior.

I wonder if we should also make this behavior opt-in from userspace,
for example via an RWF_OPPORTUNISTIC_CACHED flag.

Because if I am not mistaken, even though this change has the potential
to improve many workloads, it may also degrade some workloads in cases
where readahead is not properly tuned. Imagine reading a large file
and getting only a few pages' worth of data read on every syscall.
Or did I misunderstand your patch's behavior in that case?

Another upside of a user opt-in flag: it can be used to mitigate the objection
of XFS developers against changing the "atomic write vs. read" behavior.
A new flag means no commitment to an XFS-specific behavior that nobody knows
whether any application out there relies on.
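
For illustration, such a userspace opt-in might look roughly like the following with preadv2().  RWF_OPPORTUNISTIC_CACHED does not exist; the flag value below is a made-up placeholder, and a real implementation would define it in the uapi headers:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/uio.h>

/* Hypothetical flag, named after the suggestion above; the value is a
 * placeholder and does not correspond to any real RWF_* flag. */
#ifndef RWF_OPPORTUNISTIC_CACHED
#define RWF_OPPORTUNISTIC_CACHED	(1 << 10)
#endif

static ssize_t cached_read(int fd, void *buf, size_t len, off_t pos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	/* Ask the kernel to serve the read cheaply from the page cache;
	 * the caller would fall back to a plain pread() (possibly from a
	 * worker thread) if this fails or comes back short. */
	return preadv2(fd, &iov, 1, pos, RWF_OPPORTUNISTIC_CACHED);
}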

Thanks,
Amir.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-20  6:19 ` Amir Goldstein
@ 2020-06-20 19:15   ` Matthew Wilcox
  2020-06-21  6:00     ` Amir Goldstein
  2020-06-22  1:02     ` Dave Chinner
  0 siblings, 2 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-20 19:15 UTC (permalink / raw)
  To: Amir Goldstein; +Cc: linux-fsdevel, Linux MM, Andreas Gruenbacher, linux-kernel

On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote:
> On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
> 
> XFS is taking i_rwsem lock in read_iter() for a surprising reason:
> https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
> In that post I claim that ocfs2 and cifs also do some work in read_iter().
> I didn't go back to check what, but it sounds like cache coherence among
> nodes.

That's out of date.  Here's POSIX-2017:

https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html

  "I/O is intended to be atomic to ordinary files and pipes and
  FIFOs. Atomic means that all the bytes from a single operation that
  started out together end up together, without interleaving from other
  I/O operations. It is a known attribute of terminals that this is not
  honored, and terminals are explicitly (and implicitly permanently)
  excepted, making the behavior unspecified. The behavior for other
  device types is also left unspecified, but the wording is intended to
  imply that future standards might choose to specify atomicity (or not)."

That _doesn't_ say "a read cannot observe a write in progress".  It says
"Two writes cannot interleave".  Indeed, further down in that section, it says:

  "Earlier versions of this standard allowed two very different behaviors
  with regard to the handling of interrupts. In order to minimize the
  resulting confusion, it was decided that POSIX.1-2017 should support
  only one of these behaviors. Historical practice on AT&T-derived systems
  was to have read() and write() return -1 and set errno to [EINTR] when
  interrupted after some, but not all, of the data requested had been
  transferred. However, the US Department of Commerce FIPS 151-1 and FIPS
  151-2 require the historical BSD behavior, in which read() and write()
  return the number of bytes actually transferred before the interrupt. If
  -1 is returned when any data is transferred, it is difficult to recover
  from the error on a seekable device and impossible on a non-seekable
  device. Most new implementations support this behavior. The behavior
  required by POSIX.1-2017 is to return the number of bytes transferred."

That explicitly allows for a write to be interrupted by a signal and
later resumed, allowing a read to observe a half-complete write.

> Because if I am not mistaken, even though this change has the potential
> to improve many workloads, it may also degrade some workloads in cases
> where readahead is not properly tuned. Imagine reading a large file
> and getting only a few pages' worth of data read on every syscall.
> Or did I misunderstand your patch's behavior in that case?

I think you did.  If the IOCB_CACHED read hits a readahead page,
it returns early.  Then call_read_iter() notices the read is not yet
complete, and calls ->read_iter() to finish the read.  So it's two
calls to generic_file_buffered_read() rather than one, but it's still
one syscall.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-20 19:15   ` Matthew Wilcox
@ 2020-06-21  6:00     ` Amir Goldstein
  2020-06-22  1:02     ` Dave Chinner
  1 sibling, 0 replies; 18+ messages in thread
From: Amir Goldstein @ 2020-06-21  6:00 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-fsdevel, Linux MM, Andreas Gruenbacher, linux-kernel,
	linux-xfs, Dave Chinner, Jan Kara

[CC: Dave Chinner, Jan Kara, xfs]

On Sat, Jun 20, 2020 at 10:15 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote:
> > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> >
> > XFS is taking i_rwsem lock in read_iter() for a surprising reason:
> > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
> > In that post I claim that ocfs2 and cifs also do some work in read_iter().
> > I didn't go back to check what, but it sounds like cache coherence among
> > nodes.
>
> That's out of date.  Here's POSIX-2017:
>
> https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html
>
>   "I/O is intended to be atomic to ordinary files and pipes and
>   FIFOs. Atomic means that all the bytes from a single operation that
>   started out together end up together, without interleaving from other
>   I/O operations. It is a known attribute of terminals that this is not
>   honored, and terminals are explicitly (and implicitly permanently)
>   excepted, making the behavior unspecified. The behavior for other
>   device types is also left unspecified, but the wording is intended to
>   imply that future standards might choose to specify atomicity (or not)."
>
> That _doesn't_ say "a read cannot observe a write in progress".  It says
> "Two writes cannot interleave".  Indeed, further down in that section, it says:
>
>   "Earlier versions of this standard allowed two very different behaviors
>   with regard to the handling of interrupts. In order to minimize the
>   resulting confusion, it was decided that POSIX.1-2017 should support
>   only one of these behaviors. Historical practice on AT&T-derived systems
>   was to have read() and write() return -1 and set errno to [EINTR] when
>   interrupted after some, but not all, of the data requested had been
>   transferred. However, the US Department of Commerce FIPS 151-1 and FIPS
>   151-2 require the historical BSD behavior, in which read() and write()
>   return the number of bytes actually transferred before the interrupt. If
>   -1 is returned when any data is transferred, it is difficult to recover
>   from the error on a seekable device and impossible on a non-seekable
>   device. Most new implementations support this behavior. The behavior
>   required by POSIX.1-2017 is to return the number of bytes transferred."
>
> That explicitly allows for a write to be interrupted by a signal and
> later resumed, allowing a read to observe a half-complete write.
>

Tell that to Dave Chinner (cc). I, too, find it surprising that XFS developers
chose to "not regress" a behavior that is XFS specific when there is no
proof, or even a clue, of any application that could rely on such behavior,
while the price being paid by all real-world applications that do
mixed random read/write workloads is very real and very significant.

The discussion on the original post quickly steered towards the
behavior change of rwsem [1], which you, Matthew, also participated in.
The reason for taking the rwsem lock in the first place was never seriously
challenged.

I posted a followup patch that fixes the performance issue without breaking
the "atomic rw" behavior [2] by calling generic_file_read_iter() once without
i_rwsem to pre-populate the page cache.
Dave had some technical concerns about this patch, regarding racing
with truncate_pagecache_range(), which later led to a fix by Jan Kara to
solve a readahead(2) vs. hole punch race [3].

At the time, Jan Kara wrote [3]:
"...other filesystems need similar protections but e.g. in case of ext4 it isn't
so simple without seriously regressing mixed rw workload performance so
I'm pushing just xfs fix at this moment which is simple."

And w.r.t solving the race without taking i_rwsem:
"...So I have an idea how it could be solved: Change calling convention for
->readpage() so that it gets called without page locked and take
i_mmap_sem there (and in ->readpages()) to protect from the race..."

My question to both Jan and Matthew is - does the new aops ->readahead()
API make things any better in that regard?
Will it make it easier for us to address the readahead vs. hole punch race
without having to take i_rwsem before readahead()?

Thanks,
Amir.

[1] https://lore.kernel.org/linux-xfs/20190325154731.GT1183@magnolia/
[2] https://lore.kernel.org/linux-xfs/20190404165737.30889-1-amir73il@gmail.com/
[3] https://lore.kernel.org/linux-xfs/20200120165830.GB28285@quack2.suse.cz/


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
  2020-06-19 19:06 ` Chaitanya Kulkarni
  2020-06-20  6:19 ` Amir Goldstein
@ 2020-06-22  0:32 ` Dave Chinner
  2020-06-22 14:35   ` Andreas Gruenbacher
  2020-06-22 19:18   ` Matthew Wilcox
  2 siblings, 2 replies; 18+ messages in thread
From: Dave Chinner @ 2020-06-22  0:32 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> 
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache).  We also avoid an indirect
> function call in these cases.

What does this micro-optimisation actually gain us except for more
complexity in the IO path?

i.e. if a filesystem lock has such massive overhead that it slows
down the cached readahead path in production workloads, then that's
something the filesystem needs to address, not unconditionally
bypass the filesystem before the IO gets anywhere near it.

> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate.  On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.

Oh, gawd, we have *yet another* unlocked page cache read path that
can race with invalidations during fallocate() operations?

/me goes and looks at filemap_map_pages()

Yup, filemap_map_pages() is only safe against invalidations beyond
EOF (i.e. truncate) and can still race with invalidations within
EOF. So, yes, I'm right in that this path is not safe to run without
filesystem locking to serialise the IO against fallocate()...

Darrick, it looks like we need to wrap filemap_map_pages() with the
XFS_MMAPLOCK_SHARED like we do for all the other page fault paths
that can call into the IO path.

> Arguably we could do something similar for writes.  I'm a little more
> scared of that patch since filesystems are more likely to want to do
> things to keep their files in sync for writes.

Please, no.  We can have uptodate cached pages over holes, unwritten
extents, shared extents, etc but they all require filesystem level
serialisation and space/block allocation work *before* we copy data
into the page. i.e. if allocation/space reservation fails, we need
to abort before changing data.
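
For reference, a simplified sketch of the write path being described, loosely following generic_perform_write() as of this era (error handling, fault-in retries and dirty throttling elided, so treat it as a sketch of the ordering rather than the real code):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/uio.h>

static ssize_t example_perform_write(struct file *file, struct iov_iter *i,
				     loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	ssize_t written = 0;
	int status = 0;

	while (iov_iter_count(i)) {
		unsigned int offset = offset_in_page(pos);
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(i));
		size_t copied;
		struct page *page;
		void *fsdata;

		/* Filesystem hook: reserve space, allocate blocks, deal
		 * with holes/unwritten/shared extents.  This can fail,
		 * and it runs before any user data lands in the page. */
		status = a_ops->write_begin(file, mapping, pos, bytes, 0,
					    &page, &fsdata);
		if (status < 0)
			break;

		/* Only now is it safe to copy the data in. */
		copied = iov_iter_copy_from_user_atomic(page, i, offset,
							bytes);
		flush_dcache_page(page);

		/* Filesystem hook again: commit the range (dirty the page,
		 * update i_size, unlock, ...). */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
					  page, fsdata);
		if (status < 0)
			break;
		copied = status;
		if (!copied)
			break;	/* real code faults the page in and retries */

		iov_iter_advance(i, copied);
		pos += copied;
		written += copied;
	}

	return written ? written : status;
}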

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-20 19:15   ` Matthew Wilcox
  2020-06-21  6:00     ` Amir Goldstein
@ 2020-06-22  1:02     ` Dave Chinner
  1 sibling, 0 replies; 18+ messages in thread
From: Dave Chinner @ 2020-06-22  1:02 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Amir Goldstein, linux-fsdevel, Linux MM, Andreas Gruenbacher,
	linux-kernel

On Sat, Jun 20, 2020 at 12:15:21PM -0700, Matthew Wilcox wrote:
> On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote:
> > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> > 
> > XFS is taking i_rwsem lock in read_iter() for a surprising reason:
> > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/
> > In that post I claim that ocfs2 and cifs also do some work in read_iter().
> > I didn't go back to check what, but it sounds like cache coherence among
> > nodes.
> 
> That's out of date.  Here's POSIX-2017:
> 
> https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html
> 
>   "I/O is intended to be atomic to ordinary files and pipes and
>   FIFOs. Atomic means that all the bytes from a single operation that
>   started out together end up together, without interleaving from other
>   I/O operations. It is a known attribute of terminals that this is not
>   honored, and terminals are explicitly (and implicitly permanently)
>   excepted, making the behavior unspecified. The behavior for other
>   device types is also left unspecified, but the wording is intended to
>   imply that future standards might choose to specify atomicity (or not)."
> 
> That _doesn't_ say "a read cannot observe a write in progress".  It says
> "Two writes cannot interleave".  Indeed, further down in that section, it says:

Nope, it says "... without interleaving from other I/O operations".

That means read() needs to be atomic w.r.t truncate, hole punching,
extent zeroing, etc, not just other write() syscalls.

Really, though, I'm not going to get drawn into a language lawyering
argument here. We've discussed this before, and it's pretty clear
the language supports both arguments in one way or another.

And that means we are not going to change behaviour that XFS has
provided for 27 years now. Last time this came up, I said:

"XFS was designed with the intent that buffered writes are
atomic w.r.t. to all other file accesses."

Christoph said:

"Downgrading these long standing guarantees is simply not an option"

Darrick:

"I don't like the idea of adding a O_BROKENLOCKINGPONIES flag"

Nothing has changed since this was last discussed. 

Well, except for the fact that since then I've seen the source code
to some 20+ year old enterprise applications that have been ported
to Linux and that has made me even more certain that we need to
maintain XFS's existing behaviour....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22  0:32 ` Dave Chinner
@ 2020-06-22 14:35   ` Andreas Gruenbacher
  2020-06-22 18:13     ` Matthew Wilcox
  2020-06-23  0:52     ` Dave Chinner
  2020-06-22 19:18   ` Matthew Wilcox
  1 sibling, 2 replies; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-06-22 14:35 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Matthew Wilcox, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote:
> On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> >
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
>
> What does this micro-optimisation actually gain us except for more
> complexity in the IO path?
>
> i.e. if a filesystem lock has such massive overhead that it slows
> down the cached readahead path in production workloads, then that's
> something the filesystem needs to address, not unconditionally
> bypass the filesystem before the IO gets anywhere near it.

I'm fine with not moving that functionality into the VFS. The problem
I have in gfs2 is that taking glocks is really expensive. Part of that
overhead is accidental, but we definitely won't be able to fix it in
the short term. So something like the IOCB_CACHED flag that prevents
generic_file_read_iter from issuing readahead I/O would save the day
for us. Does that idea stand a chance?

We could theoretically also create a copy of
generic_file_buffered_read in gfs2 and strip out the I/O parts, but
that would be very messy.
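
For illustration, a minimal sketch of the kind of ->read_iter() this would enable.  This is not actual gfs2 code; the lock/unlock steps are placeholders for glock acquisition, and IOCB_CACHED is the flag from the patch at the top of the thread:

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t example_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t cached, ret;

	/* Fast path: no filesystem lock, serve only what is already in
	 * the page cache and start no readahead I/O. */
	iocb->ki_flags |= IOCB_CACHED;
	cached = generic_file_read_iter(iocb, to);
	iocb->ki_flags &= ~IOCB_CACHED;

	if (cached < 0 && cached != -EAGAIN)
		return cached;
	if (cached >= 0 && !iov_iter_count(to))
		return cached;			/* fully satisfied from cache */
	if (cached < 0)
		cached = 0;

	/* Slow path: take the expensive lock (the glock, in gfs2's case)
	 * and let readahead and real I/O happen for the remainder. */
	/* ... acquire filesystem lock ... */
	ret = generic_file_read_iter(iocb, to);
	/* ... release filesystem lock ... */

	if (ret < 0)
		return cached ? cached : ret;
	return cached + ret;
}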

Thanks,
Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 14:35   ` Andreas Gruenbacher
@ 2020-06-22 18:13     ` Matthew Wilcox
  2020-06-24 12:35       ` Andreas Gruenbacher
  2020-06-23  0:52     ` Dave Chinner
  1 sibling, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-22 18:13 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> I'm fine with not moving that functionality into the VFS. The problem
> I have in gfs2 is that taking glocks is really expensive. Part of that
> overhead is accidental, but we definitely won't be able to fix it in
> the short term. So something like the IOCB_CACHED flag that prevents
> generic_file_read_iter from issuing readahead I/O would save the day
> for us. Does that idea stand a chance?

For the short-term fix, is switching to a trylock in gfs2_readahead()
acceptable?

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..6ccd478c81ff 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -600,7 +600,7 @@ static void gfs2_readahead(struct readahead_control *rac)
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder gh;
 
-       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
        if (gfs2_glock_nq(&gh))
                goto out_uninit;
        if (!gfs2_is_stuffed(ip))



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22  0:32 ` Dave Chinner
  2020-06-22 14:35   ` Andreas Gruenbacher
@ 2020-06-22 19:18   ` Matthew Wilcox
  2020-06-23  2:35     ` Dave Chinner
  1 sibling, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-22 19:18 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Mon, Jun 22, 2020 at 10:32:15AM +1000, Dave Chinner wrote:
> On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > 
> > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > The advantage of this patch is that we can avoid taking any filesystem
> > lock, as long as the pages being accessed are in the cache (and we don't
> > need to readahead any pages into the cache).  We also avoid an indirect
> > function call in these cases.
> 
> What does this micro-optimisation actually gain us except for more
> complexity in the IO path?
> 
> i.e. if a filesystem lock has such massive overhead that it slows
> down the cached readahead path in production workloads, then that's
> something the filesystem needs to address, not unconditionally
> bypass the filesystem before the IO gets anywhere near it.

You've been talking about adding a range lock to XFS for a while now.
I remain quite sceptical that range locks are a good idea; they have not
worked out well as a replacement for the mmap_sem, although the workload
for the mmap_sem is quite different and they may yet show promise for
the XFS iolock.

There are production workloads that do not work well on top of a single
file on an XFS filesystem.  For example, using an XFS file in a host as
the backing store for a guest block device.  People tend to work around
that kind of performance bug rather than report it.

Do you agree that the guarantees that XFS currently supplies regarding
locked operation will be maintained if the I/O is contained within a
single page and the mutex is not taken?  ie add this check to the original
patch:

        if (iocb->ki_pos / PAGE_SIZE !=
            (iocb->ki_pos + iov_iter_count(iter) - 1) / PAGE_SIZE)
                goto uncached;

I think that gets me almost everything I want.  Small I/Os are going to
notice the pain of the mutex more than large I/Os.


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 14:35   ` Andreas Gruenbacher
  2020-06-22 18:13     ` Matthew Wilcox
@ 2020-06-23  0:52     ` Dave Chinner
  2020-06-23  7:41       ` Andreas Gruenbacher
  1 sibling, 1 reply; 18+ messages in thread
From: Dave Chinner @ 2020-06-23  0:52 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Matthew Wilcox, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote:
> > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > >
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> >
> > What does this micro-optimisation actually gain us except for more
> > complexity in the IO path?
> >
> > i.e. if a filesystem lock has such massive overhead that it slows
> > down the cached readahead path in production workloads, then that's
> > something the filesystem needs to address, not unconditionally
> > bypass the filesystem before the IO gets anywhere near it.
> 
> I'm fine with not moving that functionality into the VFS. The problem
> I have in gfs2 is that taking glocks is really expensive. Part of that
> overhead is accidental, but we definitely won't be able to fix it in
> the short term. So something like the IOCB_CACHED flag that prevents
> generic_file_read_iter from issuing readahead I/O would save the day
> for us. Does that idea stand a chance?

I have no problem with a "NOREADAHEAD" flag being passed to
generic_file_read_iter(). It's not a "already cached" flag though,
it's a "don't start any IO" directive, just like the NOWAIT flag is
a "don't block on locks or IO in progress" directive and not an
"already cached" flag. Readahead is something we should be doing,
unless a filesystem has a very good reason not to, such as the gfs2
locking case here...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 19:18   ` Matthew Wilcox
@ 2020-06-23  2:35     ` Dave Chinner
  0 siblings, 0 replies; 18+ messages in thread
From: Dave Chinner @ 2020-06-23  2:35 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, linux-mm, agruenba, linux-kernel

On Mon, Jun 22, 2020 at 08:18:57PM +0100, Matthew Wilcox wrote:
> On Mon, Jun 22, 2020 at 10:32:15AM +1000, Dave Chinner wrote:
> > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > > 
> > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > The advantage of this patch is that we can avoid taking any filesystem
> > > lock, as long as the pages being accessed are in the cache (and we don't
> > > need to readahead any pages into the cache).  We also avoid an indirect
> > > function call in these cases.
> > 
> > What does this micro-optimisation actually gain us except for more
> > complexity in the IO path?
> > 
> > i.e. if a filesystem lock has such massive overhead that it slows
> > down the cached readahead path in production workloads, then that's
> > something the filesystem needs to address, not unconditionally
> > bypass the filesystem before the IO gets anywhere near it.
> 
> You've been talking about adding a range lock to XFS for a while now.

I don't see what that has to do with this patch.

> I remain quite sceptical that range locks are a good idea; they have not
> worked out well as a replacement for the mmap_sem, although the workload
> for the mmap_sem is quite different and they may yet show promise for
> the XFS iolock.

<shrug>

That was a really poor implementation of a range lock. It had no
concurrency to speak of, because the tracking tree required a
spinlock to be taken for every lock or unlock the range lock
performed. Hence it had an expensive critical section that could not
scale past the number of ops a single CPU could perform on that
tree. IOWs, it topped out at about 150k lock cycles a second with
2-3 concurrent AIO+DIO threads, and only went slower as the number
of concurrent IO submitters went up.

So, yeah, if you are going to talk about range locks, you need to
forget about the what was tried on the mmap_sem because nobody
actually scalability tested the lock implementation by itself and it
turned out to be total crap....

> There are production workloads that do not work well on top of a single
> file on an XFS filesystem.  For example, using an XFS file in a host as
> the backing store for a guest block device.  People tend to work around
> that kind of performance bug rather than report it.

*cough* AIO+DIO *cough*

You may not like that answer, but anyone who cares about IO
performance, especially single file IO performance, is using
AIO+DIO. Buffered IO for VM image files in production environments
tends to be the exception, not the norm, because caching is done in
the guest by the guest page cache. Double caching IO data is
generally considered a waste of resources that could otherwise be
sold to customers.

> Do you agree that the guarantees that XFS currently supplies regarding
> locked operation will be maintained if the I/O is contained within a
> single page and the mutex is not taken?

Not at first glance because block size < page size configurations
exist and hence filesystems might be punching out extents from a
sub-page range....

> ie add this check to the original
> patch:
> 
>         if (iocb->ki_pos / PAGE_SIZE !=
>             (iocb->ki_pos + iov_iter_count(iter) - 1) / PAGE_SIZE)
>                 goto uncached;
> 
> I think that gets me almost everything I want.  Small I/Os are going to
> notice the pain of the mutex more than large I/Os.

Exactly what are you trying to optimise, Willy? You haven't
explained to anyone what workload needs these micro-optimisations,
and without understanding why you want to cut the filesystems out of
the readahead path, I can't suggest alternative solutions...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-23  0:52     ` Dave Chinner
@ 2020-06-23  7:41       ` Andreas Gruenbacher
  0 siblings, 0 replies; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-06-23  7:41 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Matthew Wilcox, linux-fsdevel, Linux-MM, LKML

On Tue, Jun 23, 2020 at 2:52 AM Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote:
> > > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote:
> > > >
> > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> > > > The advantage of this patch is that we can avoid taking any filesystem
> > > > lock, as long as the pages being accessed are in the cache (and we don't
> > > > need to readahead any pages into the cache).  We also avoid an indirect
> > > > function call in these cases.
> > >
> > > What does this micro-optimisation actually gain us except for more
> > > complexity in the IO path?
> > >
> > > i.e. if a filesystem lock has such massive overhead that it slows
> > > down the cached readahead path in production workloads, then that's
> > > something the filesystem needs to address, not unconditionally
> > > bypass the filesystem before the IO gets anywhere near it.
> >
> > I'm fine with not moving that functionality into the VFS. The problem
> > I have in gfs2 is that taking glocks is really expensive. Part of that
> > overhead is accidental, but we definitely won't be able to fix it in
> > the short term. So something like the IOCB_CACHED flag that prevents
> > generic_file_read_iter from issuing readahead I/O would save the day
> > for us. Does that idea stand a chance?
>
> I have no problem with a "NOREADAHEAD" flag being passed to
> generic_file_read_iter(). It's not a "already cached" flag though,
> it's a "don't start any IO" directive, just like the NOWAIT flag is
> a "don't block on locks or IO in progress" directive and not an
> "already cached" flag. Readahead is something we should be doing,
> unless a filesystem has a very good reason not to, such as the gfs2
> locking case here...

The requests coming in can have the IOCB_NOWAIT flag set or cleared.
The idea was to have an additional flag that implies IOCB_NOWAIT so
that you can do:

    iocb->ki_flags |= IOCB_NOIO;
    generic_file_read_iter()
    if ("failed because of IOCB_NOIO") {
        if ("failed because of IOCB_NOWAIT")
            return -EAGAIN;
        iocb->ki_flags &= ~IOCB_NOIO;
        "locking"
         generic_file_read_iter()
        "unlocking"
    }

without having to save iocb->ki_flags. The alternative would be:

    int flags = iocb->ki_flags;
    iocb->ki_flags |= IOCB_NOIO | IOCB_NOWAIT;
    ret = generic_file_read_iter()
    if ("failed because of IOCB_NOIO or IOCB_NOWAIT") {
        if ("failed because of IOCB_NOWAIT" && (flags & IOCB_NOWAIT))
            return -EAGAIN;
        iocb->ki_flags &= ~IOCB_NOIO;
        "locking"
         generic_file_read_iter()
        "unlocking"
    }

Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-22 18:13     ` Matthew Wilcox
@ 2020-06-24 12:35       ` Andreas Gruenbacher
  2020-07-02 15:16         ` Andreas Gruenbacher
  0 siblings, 1 reply; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-06-24 12:35 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote:
> On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > I'm fine with not moving that functionality into the VFS. The problem
> > I have in gfs2 is that taking glocks is really expensive. Part of that
> > overhead is accidental, but we definitely won't be able to fix it in
> > the short term. So something like the IOCB_CACHED flag that prevents
> > generic_file_read_iter from issuing readahead I/O would save the day
> > for us. Does that idea stand a chance?
>
> For the short-term fix, is switching to a trylock in gfs2_readahead()
> acceptable?

Well, it's the only thing we can do for now, right?

> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index 72c9560f4467..6ccd478c81ff 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -600,7 +600,7 @@ static void gfs2_readahead(struct readahead_control *rac)
>         struct gfs2_inode *ip = GFS2_I(inode);
>         struct gfs2_holder gh;
>
> -       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
> +       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
>         if (gfs2_glock_nq(&gh))
>                 goto out_uninit;
>         if (!gfs2_is_stuffed(ip))

Thanks,
Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-06-24 12:35       ` Andreas Gruenbacher
@ 2020-07-02 15:16         ` Andreas Gruenbacher
  2020-07-02 17:30           ` Matthew Wilcox
  0 siblings, 1 reply; 18+ messages in thread
From: Andreas Gruenbacher @ 2020-07-02 15:16 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Wed, Jun 24, 2020 at 2:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote:
> > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > > I'm fine with not moving that functionality into the VFS. The problem
> > > I have in gfs2 is that taking glocks is really expensive. Part of that
> > > overhead is accidental, but we definitely won't be able to fix it in
> > > the short term. So something like the IOCB_CACHED flag that prevents
> > > generic_file_read_iter from issuing readahead I/O would save the day
> > > for us. Does that idea stand a chance?
> >
> > For the short-term fix, is switching to a trylock in gfs2_readahead()
> > acceptable?
>
> Well, it's the only thing we can do for now, right?

It turns out that gfs2 can still deadlock with a trylock in
gfs2_readahead, just differently: in this instance, gfs2_glock_nq will
call inode_dio_wait. When there is pending direct I/O, we'll end up
waiting for iomap_dio_complete, which will call
invalidate_inode_pages2_range, which will try to lock the pages
already locked for gfs2_readahead.
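
Spelled out as a call chain (a restatement of the description above, for orientation only):

/*
 * gfs2_readahead()                       pages in the range are locked
 *   gfs2_glock_nq()
 *     inode_dio_wait()                   waits for the pending direct I/O
 *
 * ...while the pending direct I/O is completing:
 *
 * iomap_dio_complete()
 *   invalidate_inode_pages2_range()
 *     lock_page()                        blocks on the pages locked above
 */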

This late in the 5.8 release cycle, I'd like to propose converting
gfs2 back to use mpage_readpages. This requires reinstating
mpage_readpages, but it's otherwise relatively trivial.
We can then introduce an IOCB_CACHED or equivalent flag, fix the
locking order in gfs2, convert gfs2 to mpage_readahead, and finally
remove mpage_readpages in 5.9.

I'll post a patch queue that does this for comment.

Thanks,
Andreas



* Re: [RFC] Bypass filesystems for reading cached pages
  2020-07-02 15:16         ` Andreas Gruenbacher
@ 2020-07-02 17:30           ` Matthew Wilcox
  0 siblings, 0 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-07-02 17:30 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Dave Chinner, linux-fsdevel, Linux-MM, LKML

On Thu, Jul 02, 2020 at 05:16:43PM +0200, Andreas Gruenbacher wrote:
> On Wed, Jun 24, 2020 at 2:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote:
> > On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote:
> > > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote:
> > > > I'm fine with not moving that functionality into the VFS. The problem
> > > > I have in gfs2 is that taking glocks is really expensive. Part of that
> > > > overhead is accidental, but we definitely won't be able to fix it in
> > > > the short term. So something like the IOCB_CACHED flag that prevents
> > > > generic_file_read_iter from issuing readahead I/O would save the day
> > > > for us. Does that idea stand a chance?
> > >
> > > For the short-term fix, is switching to a trylock in gfs2_readahead()
> > > acceptable?
> >
> > Well, it's the only thing we can do for now, right?
> 
> It turns out that gfs2 can still deadlock with a trylock in
> gfs2_readahead, just differently: in this instance, gfs2_glock_nq will
> call inode_dio_wait. When there is pending direct I/O, we'll end up
> waiting for iomap_dio_complete, which will call
> invalidate_inode_pages2_range, which will try to lock the pages
> already locked for gfs2_readahead.

That seems like a bug in trylock.  If I trylock something I'm not
expecting it to wait; I'm expecting it to fail because it would have to wait.
ie something like this:

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c84887769b5a..97ca8f5ed22b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -470,7 +470,10 @@ static int inode_go_lock(struct gfs2_holder *gh)
                        return error;
        }
 
-       if (gh->gh_state != LM_ST_DEFERRED)
+       if (gh->gh_flags & LM_FLAG_TRY) {
+               if (atomic_read(&ip->i_inode.i_dio_count))
+                       return -EWOULDBLOCK;
+       } else if (gh->gh_state != LM_ST_DEFERRED)
                inode_dio_wait(&ip->i_inode);
 
        if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&

... but probably not exactly that because I didn't try to figure out the
calling conventions or whether I should set some state in the gfs2_holder.

> This late in the 5.8 release cycle, I'd like to propose converting
> gfs2 back to use mpage_readpages. This requires reinstating
> mpage_readpages, but it's otherwise relatively trivial.
> We can then introduce an IOCB_CACHED or equivalent flag, fix the
> locking order in gfs2, convert gfs2 to mpage_readahead, and finally
> remove mage_readpages in 5.9.

I would rather not do that.


Thread overview: 18+ messages
2020-06-19 15:50 [RFC] Bypass filesystems for reading cached pages Matthew Wilcox
2020-06-19 19:06 ` Chaitanya Kulkarni
2020-06-19 20:12   ` Matthew Wilcox
2020-06-19 21:25     ` Chaitanya Kulkarni
2020-06-20  6:19 ` Amir Goldstein
2020-06-20 19:15   ` Matthew Wilcox
2020-06-21  6:00     ` Amir Goldstein
2020-06-22  1:02     ` Dave Chinner
2020-06-22  0:32 ` Dave Chinner
2020-06-22 14:35   ` Andreas Gruenbacher
2020-06-22 18:13     ` Matthew Wilcox
2020-06-24 12:35       ` Andreas Gruenbacher
2020-07-02 15:16         ` Andreas Gruenbacher
2020-07-02 17:30           ` Matthew Wilcox
2020-06-23  0:52     ` Dave Chinner
2020-06-23  7:41       ` Andreas Gruenbacher
2020-06-22 19:18   ` Matthew Wilcox
2020-06-23  2:35     ` Dave Chinner
