* [PATCH 00/10] MM changes to improve swap-over-NFS support
@ 2022-03-06 23:49 NeilBrown
2022-03-06 23:49 ` [PATCH 06/10] MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw NeilBrown
` (9 more replies)
0 siblings, 10 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
Hi Andrew,
these are the non-NFS-specific parts of the swap-over-NFS series that
I've posted a few times. I've address all comments, most recently
style improvements in mm/swap.h.
Would it be possible for these to land in the next merge window?
Thanks,
NeilBrown
---
NeilBrown (10):
MM: create new mm/swap.h header file.
MM: drop swap_set_page_dirty
MM: move responsibility for setting SWP_FS_OPS to ->swap_activate
MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space
MM: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space
MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw
DOC: update documentation for swap_activate and swap_rw
MM: submit multipage reads for SWP_FS_OPS swap-space
MM: submit multipage write for SWP_FS_OPS swap-space
VFS: Add FMODE_CAN_ODIRECT file flag
Documentation/filesystems/locking.rst | 18 +-
Documentation/filesystems/vfs.rst | 17 +-
drivers/block/loop.c | 4 +-
fs/cifs/file.c | 7 +-
fs/fcntl.c | 9 +-
fs/nfs/file.c | 20 ++-
fs/open.c | 9 +-
fs/overlayfs/file.c | 13 +-
include/linux/fs.h | 4 +
include/linux/swap.h | 7 +-
include/linux/writeback.h | 7 +
mm/madvise.c | 8 +-
mm/memory.c | 2 +-
mm/page_io.c | 244 ++++++++++++++++++++------
mm/swap.h | 30 +++-
mm/swap_state.c | 22 ++-
mm/swapfile.c | 13 +-
mm/vmscan.c | 38 ++--
18 files changed, 347 insertions(+), 125 deletions(-)
--
Signature
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 01/10] MM: create new mm/swap.h header file.
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (7 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 08/10] MM: submit multipage reads for SWP_FS_OPS swap-space NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 04/10] MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space NeilBrown
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
Many functions declared in include/linux/swap.h are only used within mm/
Create a new "mm/swap.h" and move some of these declarations there.
Remove the redundant 'extern' from the function declarations.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
include/linux/swap.h | 121 ---------------------------------------------
mm/huge_memory.c | 1
mm/madvise.c | 1
mm/memcontrol.c | 1
mm/memory.c | 1
mm/mincore.c | 1
mm/page_alloc.c | 1
mm/page_io.c | 1
mm/shmem.c | 1
mm/swap.h | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++
mm/swap_state.c | 1
mm/swapfile.c | 1
mm/util.c | 1
mm/vmscan.c | 1
mm/zswap.c | 2 +
15 files changed, 147 insertions(+), 121 deletions(-)
create mode 100644 mm/swap.h
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1d38d9475c4d..3f54a8941c9d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -419,62 +419,19 @@ extern void kswapd_stop(int nid);
#ifdef CONFIG_SWAP
-#include <linux/blk_types.h> /* for bio_end_io_t */
-
-/* linux/mm/page_io.c */
-extern int swap_readpage(struct page *page, bool do_poll);
-extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern void end_swap_bio_write(struct bio *bio);
-extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func);
extern int swap_set_page_dirty(struct page *page);
-
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
sector_t *);
-/* linux/mm/swap_state.c */
-/* One swap address space for each 64M swap space */
-#define SWAP_ADDRESS_SPACE_SHIFT 14
-#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
-extern struct address_space *swapper_spaces[];
-#define swap_address_space(entry) \
- (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
- >> SWAP_ADDRESS_SPACE_SHIFT])
static inline unsigned long total_swapcache_pages(void)
{
return global_node_page_state(NR_SWAPCACHE);
}
-extern void show_swap_cache_info(void);
-extern int add_to_swap(struct page *page);
-extern void *get_shadow_from_swap_cache(swp_entry_t entry);
-extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
- gfp_t gfp, void **shadowp);
-extern void __delete_from_swap_cache(struct page *page,
- swp_entry_t entry, void *shadow);
-extern void delete_from_swap_cache(struct page *);
-extern void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end);
-extern void free_swap_cache(struct page *);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
-extern struct page *lookup_swap_cache(swp_entry_t entry,
- struct vm_area_struct *vma,
- unsigned long addr);
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
-extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
- struct vm_area_struct *vma, unsigned long addr,
- bool do_poll);
-extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
- struct vm_area_struct *vma, unsigned long addr,
- bool *new_page_allocated);
-extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
-extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
-
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
@@ -528,12 +485,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
}
#else /* CONFIG_SWAP */
-
-static inline int swap_readpage(struct page *page, bool do_poll)
-{
- return 0;
-}
-
static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return NULL;
@@ -548,11 +499,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
{
}
-static inline struct address_space *swap_address_space(swp_entry_t entry)
-{
- return NULL;
-}
-
#define get_nr_swap_pages() 0L
#define total_swap_pages 0L
#define total_swapcache_pages() 0UL
@@ -567,14 +513,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
#define free_pages_and_swap_cache(pages, nr) \
release_pages((pages), (nr));
-static inline void free_swap_cache(struct page *page)
-{
-}
-
-static inline void show_swap_cache_info(void)
-{
-}
-
/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
#define free_swap_and_cache(e) is_pfn_swap_entry(e)
@@ -600,65 +538,6 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp)
{
}
-static inline struct page *swap_cluster_readahead(swp_entry_t entry,
- gfp_t gfp_mask, struct vm_fault *vmf)
-{
- return NULL;
-}
-
-static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
- struct vm_fault *vmf)
-{
- return NULL;
-}
-
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
-{
- return 0;
-}
-
-static inline struct page *lookup_swap_cache(swp_entry_t swp,
- struct vm_area_struct *vma,
- unsigned long addr)
-{
- return NULL;
-}
-
-static inline
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
-{
- return find_get_page(mapping, index);
-}
-
-static inline int add_to_swap(struct page *page)
-{
- return 0;
-}
-
-static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
-{
- return NULL;
-}
-
-static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
- gfp_t gfp_mask, void **shadowp)
-{
- return -1;
-}
-
-static inline void __delete_from_swap_cache(struct page *page,
- swp_entry_t entry, void *shadow)
-{
-}
-
-static inline void delete_from_swap_cache(struct page *page)
-{
-}
-
-static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end)
-{
-}
static inline int page_swapcount(struct page *page)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 406a3c28c026..dae090f09038 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -38,6 +38,7 @@
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "swap.h"
/*
* By default, transparent hugepage support is disabled in order to avoid
diff --git a/mm/madvise.c b/mm/madvise.c
index 38d0f515d548..20e1a38b3adf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -35,6 +35,7 @@
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
struct madvise_walk_private {
struct mmu_gather *tlb;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36e9f38c919d..7856c3ad24b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,6 +66,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include "swap.h"
#include <linux/uaccess.h>
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..d25372340107 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -86,6 +86,7 @@
#include "pgalloc-track.h"
#include "internal.h"
+#include "swap.h"
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
diff --git a/mm/mincore.c b/mm/mincore.c
index 9122676b54d6..f4f627325e12 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -20,6 +20,7 @@
#include <linux/pgtable.h>
#include <linux/uaccess.h>
+#include "swap.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d31..221aa3c10b78 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -81,6 +81,7 @@
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
+#include "swap.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
diff --git a/mm/page_io.c b/mm/page_io.c
index 0bf8e40f4e57..f8c26092e869 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,6 +26,7 @@
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
+#include "swap.h"
void end_swap_bio_write(struct bio *bio)
{
diff --git a/mm/shmem.c b/mm/shmem.c
index a09b29ec2b45..c8b8819fe2e6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,6 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
+#include "swap.h"
static struct vfsmount *shm_mnt;
diff --git a/mm/swap.h b/mm/swap.h
new file mode 100644
index 000000000000..f8265bf0ce00
--- /dev/null
+++ b/mm/swap.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_SWAP_H
+#define _MM_SWAP_H
+
+#ifdef CONFIG_SWAP
+#include <linux/blk_types.h> /* for bio_end_io_t */
+
+/* linux/mm/page_io.c */
+int swap_readpage(struct page *page, bool do_poll);
+int swap_writepage(struct page *page, struct writeback_control *wbc);
+void end_swap_bio_write(struct bio *bio);
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ bio_end_io_t end_write_func);
+
+/* linux/mm/swap_state.c */
+/* One swap address space for each 64M swap space */
+#define SWAP_ADDRESS_SPACE_SHIFT 14
+#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
+extern struct address_space *swapper_spaces[];
+#define swap_address_space(entry) \
+ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
+ >> SWAP_ADDRESS_SPACE_SHIFT])
+
+void show_swap_cache_info(void);
+int add_to_swap(struct page *page);
+void *get_shadow_from_swap_cache(swp_entry_t entry);
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp, void **shadowp);
+void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow);
+void delete_from_swap_cache(struct page *page);
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end);
+void free_swap_cache(struct page *page);
+struct page *lookup_swap_cache(swp_entry_t entry,
+ struct vm_area_struct *vma,
+ unsigned long addr);
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
+
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ bool do_poll);
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ bool *new_page_allocated);
+struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
+struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
+
+#else /* CONFIG_SWAP */
+static inline int swap_readpage(struct page *page, bool do_poll)
+{
+ return 0;
+}
+
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline void free_swap_cache(struct page *page)
+{
+}
+
+static inline void show_swap_cache_info(void)
+{
+}
+
+static inline struct page *swap_cluster_readahead(swp_entry_t entry,
+ gfp_t gfp_mask, struct vm_fault *vmf)
+{
+ return NULL;
+}
+
+static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ return NULL;
+}
+
+static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+{
+ return 0;
+}
+
+static inline struct page *lookup_swap_cache(swp_entry_t swp,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return NULL;
+}
+
+static inline
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ return find_get_page(mapping, index);
+}
+
+static inline int add_to_swap(struct page *page)
+{
+ return 0;
+}
+
+static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp_mask, void **shadowp)
+{
+ return -1;
+}
+
+static inline void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow)
+{
+}
+
+static inline void delete_from_swap_cache(struct page *page)
+{
+}
+
+static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+}
+
+#endif /* CONFIG_SWAP */
+#endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8d4104242100..bb38453425c7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap.h"
/*
* swapper_space is a fiction, retained to simplify the path through
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bf0df7aa7158..71c7a31dd291 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -44,6 +44,7 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
diff --git a/mm/util.c b/mm/util.c
index d3102081add0..ce99f5ce8730 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include "internal.h"
+#include "swap.h"
/**
* kfree_const - conditionally free memory
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 59b14e0d696c..8a178400eea9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -58,6 +58,7 @@
#include <linux/balloon_compaction.h>
#include "internal.h"
+#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
diff --git a/mm/zswap.c b/mm/zswap.c
index cdf6950fcb2e..9192dc5f678f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -36,6 +36,8 @@
#include <linux/pagemap.h>
#include <linux/workqueue.h>
+#include "swap.h"
+
/*********************************
* statistics
**********************************/
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 02/10] MM: drop swap_set_page_dirty
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (2 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 05/10] MM: introduce ->swap_rw and use it for reads from " NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 07/10] DOC: update documentation for swap_activate and swap_rw NeilBrown
` (5 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
Pages that are written to swap are owned by the MM subsystem - not any
filesystem.
When such a page is passed to a filesystem to be written out to a
swap-file, the filesystem handles the data, but the page itself does not
belong to the filesystem. So calling the filesystem's set_page_dirty
address_space operation makes no sense. This is for pages in the given
address space, and a page to be written to swap does not exist in the
given address space.
So drop swap_set_page_dirty() which calls the address-space's
set_page_dirty, and alway use __set_page_dirty_no_writeback, which is
appropriate for pages being swapped out.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
include/linux/swap.h | 1 -
mm/page_io.c | 14 --------------
mm/swap_state.c | 2 +-
3 files changed, 1 insertion(+), 16 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3f54a8941c9d..a43929f7033e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -419,7 +419,6 @@ extern void kswapd_stop(int nid);
#ifdef CONFIG_SWAP
-extern int swap_set_page_dirty(struct page *page);
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
diff --git a/mm/page_io.c b/mm/page_io.c
index f8c26092e869..34b12d6f94d7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -438,17 +438,3 @@ int swap_readpage(struct page *page, bool synchronous)
delayacct_swapin_end();
return ret;
}
-
-int swap_set_page_dirty(struct page *page)
-{
- struct swap_info_struct *sis = page_swap_info(page);
-
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct address_space *mapping = sis->swap_file->f_mapping;
-
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- return mapping->a_ops->set_page_dirty(page);
- } else {
- return __set_page_dirty_no_writeback(page);
- }
-}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bb38453425c7..514b86b05488 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -31,7 +31,7 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .set_page_dirty = swap_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 03/10] MM: move responsibility for setting SWP_FS_OPS to ->swap_activate
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (4 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 07/10] DOC: update documentation for swap_activate and swap_rw NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 10/10] VFS: Add FMODE_CAN_ODIRECT file flag NeilBrown
` (3 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
If a filesystem wishes to handle all swap IO itself (via ->direct_IO and
->readpage), rather than just providing devices addresses for
submit_bio(), SWP_FS_OPS must be set.
Currently the protocol for setting this it to have ->swap_activate
return zero. In that case SWP_FS_OPS is set, and add_swap_extent()
is called for the entire file.
This is a little clumsy as different return values for ->swap_activate
have quite different meanings, and it makes it hard to search for which
filesystems require SWP_FS_OPS to be set.
So remove the special meaning of a zero return, and require the
filesystem to set SWP_FS_OPS if it so desires, and to always call
add_swap_extent() as required.
Currently only NFS and CIFS return zero for add_swap_extent().
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
fs/cifs/file.c | 3 ++-
fs/nfs/file.c | 13 +++++++++++--
include/linux/swap.h | 6 ++++++
mm/swapfile.c | 10 +++-------
4 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e7af802dcfa6..fe49f1cab018 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4917,7 +4917,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
* from reading or writing the file
*/
- return 0;
+ sis->flags |= SWP_FS_OPS;
+ return add_swap_extent(sis, 0, sis->max, 0);
}
static void cifs_swap_deactivate(struct file *file)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 76d76acbc594..d5aa55c7edb0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -488,6 +488,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
{
unsigned long blocks;
long long isize;
+ int ret;
struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
struct inode *inode = file->f_mapping->host;
@@ -500,9 +501,17 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
return -EINVAL;
}
+ ret = rpc_clnt_swap_activate(clnt);
+ if (ret)
+ return ret;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ if (ret < 0) {
+ rpc_clnt_swap_deactivate(clnt);
+ return ret;
+ }
*span = sis->pages;
-
- return rpc_clnt_swap_activate(clnt);
+ sis->flags |= SWP_FS_OPS;
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a43929f7033e..b57cff3c5ac2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -573,6 +573,12 @@ static inline swp_entry_t get_swap_page(struct page *page)
return entry;
}
+static inline int add_swap_extent(struct swap_info_struct *sis,
+ unsigned long start_page,
+ unsigned long nr_pages, sector_t start_block)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_SWAP */
#ifdef CONFIG_THP_SWAP
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71c7a31dd291..ed6028aea8bf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2347,13 +2347,9 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
- if (ret >= 0)
- sis->flags |= SWP_ACTIVATED;
- if (!ret) {
- sis->flags |= SWP_FS_OPS;
- ret = add_swap_extent(sis, 0, sis->max, 0);
- *span = sis->pages;
- }
+ if (ret < 0)
+ return ret;
+ sis->flags |= SWP_ACTIVATED;
return ret;
}
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 04/10] MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (8 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 01/10] MM: create new mm/swap.h header file NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
If swap-out is using filesystem operations (SWP_FS_OPS), then it is not
safe to enter the FS for reclaim.
So only down-grade the requirement for swap pages to __GFP_IO after
checking that SWP_FS_OPS are not being used.
This makes the calculation of "may_enter_fs" slightly more complex, so
move it into a separate function. with that done, there is little value
in maintaining the bool variable any more. So replace the
may_enter_fs variable with a may_enter_fs() function. This removes any
risk for the variable becoming out-of-date.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
mm/swap.h | 8 ++++++++
mm/vmscan.c | 29 ++++++++++++++++++++---------
2 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/mm/swap.h b/mm/swap.h
index f8265bf0ce00..e19f185df5e2 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -50,6 +50,10 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
+static inline unsigned int page_swap_flags(struct page *page)
+{
+ return page_swap_info(page)->flags;
+}
#else /* CONFIG_SWAP */
static inline int swap_readpage(struct page *page, bool do_poll)
{
@@ -129,5 +133,9 @@ static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
{
}
+static inline unsigned int page_swap_flags(struct page *page)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
#endif /* _MM_SWAP_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8a178400eea9..ffae4ba82eae 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1508,6 +1508,22 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return nr_succeeded;
}
+static bool may_enter_fs(struct page *page, gfp_t gfp_mask)
+{
+ if (gfp_mask & __GFP_FS)
+ return true;
+ if (!PageSwapCache(page) || !(gfp_mask & __GFP_IO))
+ return false;
+ /*
+ * We can "enter_fs" for swap-cache with only __GFP_IO
+ * providing this isn't SWP_FS_OPS.
+ * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ return !data_race(page_swap_flags(page) & SWP_FS_OPS);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1533,7 +1549,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
struct address_space *mapping;
struct page *page;
enum page_references references = PAGEREF_RECLAIM;
- bool dirty, writeback, may_enter_fs;
+ bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
@@ -1557,9 +1573,6 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1604,7 +1617,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
* not to fs). In this case mark the page for immediate
* reclaim and continue scanning.
*
- * Require may_enter_fs because we would wait on fs, which
+ * Require may_enter_fs() because we would wait on fs, which
* may not have submitted IO yet. And the loop driver might
* enter reclaim, and deadlock if it waits on a page for
* which it is needed to do the write (loop masks off
@@ -1636,7 +1649,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
- !PageReclaim(page) || !may_enter_fs) {
+ !PageReclaim(page) || !may_enter_fs(page, sc->gfp_mask)) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then
@@ -1726,8 +1739,6 @@ static unsigned int shrink_page_list(struct list_head *page_list,
goto activate_locked_split;
}
- may_enter_fs = true;
-
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
@@ -1797,7 +1808,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
- if (!may_enter_fs)
+ if (!may_enter_fs(page, sc->gfp_mask))
goto keep_locked;
if (!sc->may_writepage)
goto keep_locked;
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 05/10] MM: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
2022-03-06 23:49 ` [PATCH 06/10] MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw NeilBrown
2022-03-06 23:49 ` [PATCH 09/10] MM: submit multipage write for SWP_FS_OPS swap-space NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 02/10] MM: drop swap_set_page_dirty NeilBrown
` (6 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
swap currently uses ->readpage to read swap pages. This can only
request one page at a time from the filesystem, which is not most
efficient.
swap uses ->direct_IO for writes which while this is adequate is an
inappropriate over-loading. ->direct_IO may need to had handle allocate
space for holes or other details that are not relevant for swap.
So this patch introduces a new address_space operation: ->swap_rw.
In this patch it is used for reads, and a subsequent patch will switch
writes to use it.
No filesystem yet supports ->swap_rw, but that is not a problem because
no filesystem actually works with filesystem-based swap.
Only two filesystems set SWP_FS_OPS:
- cifs sets the flag, but ->direct_IO always fails so swap cannot work.
- nfs sets the flag, but ->direct_IO calls generic_write_checks()
which has failed on swap files for several releases.
To ensure that a NULL ->swap_rw isn't called, ->activate_swap() for both
NFS and cifs are changed to fail if ->swap_rw is not set. This can be
removed if/when the function is added.
Future patches will restore swap-over-NFS functionality.
To submit an async read with ->swap_rw() we need to allocate a structure
to hold the kiocb and other details. swap_readpage() cannot handle
transient failure, so we create a mempool to provide the structures.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
fs/cifs/file.c | 4 +++
fs/nfs/file.c | 4 +++
include/linux/fs.h | 1 +
mm/page_io.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-----
mm/swap.h | 1 +
mm/swapfile.c | 5 ++++
6 files changed, 77 insertions(+), 6 deletions(-)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fe49f1cab018..6ae5b404b04b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4889,6 +4889,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
cifs_dbg(FYI, "swap activate\n");
+ if (!swap_file->f_mapping->a_ops->swap_rw)
+ /* Cannot support swap */
+ return -EINVAL;
+
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
isize = inode->i_size;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d5aa55c7edb0..3dbef2c31567 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -492,6 +492,10 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
struct inode *inode = file->f_mapping->host;
+ if (!file->f_mapping->a_ops->swap_rw)
+ /* Cannot support swap */
+ return -EINVAL;
+
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
isize = inode->i_size;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2d892b201b0..57e3b387cb17 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -409,6 +409,7 @@ struct address_space_operations {
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
sector_t *span);
void (*swap_deactivate)(struct file *file);
+ int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};
extern const struct address_space_operations empty_aops;
diff --git a/mm/page_io.c b/mm/page_io.c
index 34b12d6f94d7..e90a3231f225 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -284,6 +284,25 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
#define bio_associate_blkg_from_page(bio, page) do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
+struct swap_iocb {
+ struct kiocb iocb;
+ struct bio_vec bvec;
+};
+static mempool_t *sio_pool;
+
+int sio_pool_init(void)
+{
+ if (!sio_pool) {
+ mempool_t *pool = mempool_create_kmalloc_pool(
+ SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
+ if (cmpxchg(&sio_pool, NULL, pool))
+ mempool_destroy(pool);
+ }
+ if (!sio_pool)
+ return -ENOMEM;
+ return 0;
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -355,6 +374,48 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
+static void sio_read_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ struct page *page = sio->bvec.bv_page;
+
+ if (ret != 0 && ret != PAGE_SIZE) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ pr_alert_ratelimited("Read-error on swap-device\n");
+ } else {
+ SetPageUptodate(page);
+ count_vm_event(PSWPIN);
+ }
+ unlock_page(page);
+ mempool_free(sio, sio_pool);
+}
+
+static int swap_readpage_fs(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct iov_iter from;
+ struct swap_iocb *sio;
+ loff_t pos = page_file_offset(page);
+ int ret;
+
+ sio = mempool_alloc(sio_pool, GFP_KERNEL);
+ init_sync_kiocb(&sio->iocb, swap_file);
+ sio->iocb.ki_pos = pos;
+ sio->iocb.ki_complete = sio_read_complete;
+ sio->bvec.bv_page = page;
+ sio->bvec.bv_len = PAGE_SIZE;
+ sio->bvec.bv_offset = 0;
+
+ iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_read_complete(&sio->iocb, ret);
+ return ret;
+}
+
int swap_readpage(struct page *page, bool synchronous)
{
struct bio *bio;
@@ -381,12 +442,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
if (data_race(sis->flags & SWP_FS_OPS)) {
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
-
- ret = mapping->a_ops->readpage(swap_file, page);
- if (!ret)
- count_vm_event(PSWPIN);
+ ret = swap_readpage_fs(page);
goto out;
}
diff --git a/mm/swap.h b/mm/swap.h
index e19f185df5e2..eafac80b18d9 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -6,6 +6,7 @@
#include <linux/blk_types.h> /* for bio_end_io_t */
/* linux/mm/page_io.c */
+int sio_pool_init(void);
int swap_readpage(struct page *page, bool do_poll);
int swap_writepage(struct page *page, struct writeback_control *wbc);
void end_swap_bio_write(struct bio *bio);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ed6028aea8bf..c800c17bf0c8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2350,6 +2350,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (ret < 0)
return ret;
sis->flags |= SWP_ACTIVATED;
+ if ((sis->flags & SWP_FS_OPS) &&
+ sio_pool_init() != 0) {
+ destroy_swap_extents(sis);
+ return -ENOMEM;
+ }
return ret;
}
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 06/10] MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 09/10] MM: submit multipage write for SWP_FS_OPS swap-space NeilBrown
` (8 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
This patch switches swap-out to SWP_FS_OPS swap-spaces to use ->swap_rw
and makes the writes asynchronous, like they are for other swap spaces.
To make it async we need to allocate the kiocb struct from a mempool.
This may block, but won't block as long as waiting for the write to
complete. At most it will wait for some previous swap IO to complete.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
mm/page_io.c | 98 ++++++++++++++++++++++++++++++++++------------------------
1 file changed, 58 insertions(+), 40 deletions(-)
diff --git a/mm/page_io.c b/mm/page_io.c
index e90a3231f225..0f20295d8632 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -303,6 +303,57 @@ int sio_pool_init(void)
return 0;
}
+static void sio_write_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ struct page *page = sio->bvec.bv_page;
+
+ if (ret != PAGE_SIZE) {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * folio_rotate_reclaimable but rate-limit the
+ * messages but do not flag PageError like
+ * the normal direct-to-bio case as it could
+ * be temporary.
+ */
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+ ret, page_file_offset(page));
+ } else
+ count_vm_event(PSWPOUT);
+ end_page_writeback(page);
+ mempool_free(sio, sio_pool);
+}
+
+static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
+{
+ struct swap_iocb *sio;
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct iov_iter from;
+ int ret;
+
+ set_page_writeback(page);
+ unlock_page(page);
+ sio = mempool_alloc(sio_pool, GFP_NOIO);
+ init_sync_kiocb(&sio->iocb, swap_file);
+ sio->iocb.ki_complete = sio_write_complete;
+ sio->iocb.ki_pos = page_file_offset(page);
+ sio->bvec.bv_page = page;
+ sio->bvec.bv_len = PAGE_SIZE;
+ sio->bvec.bv_offset = 0;
+ iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_write_complete(&sio->iocb, ret);
+ return ret;
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -311,46 +362,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct kiocb kiocb;
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct bio_vec bv = {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0
- };
- struct iov_iter from;
-
- iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
- init_sync_kiocb(&kiocb, swap_file);
- kiocb.ki_pos = page_file_offset(page);
-
- set_page_writeback(page);
- unlock_page(page);
- ret = mapping->a_ops->direct_IO(&kiocb, &from);
- if (ret == PAGE_SIZE) {
- count_vm_event(PSWPOUT);
- ret = 0;
- } else {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * folio_rotate_reclaimable but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
- */
- set_page_dirty(page);
- ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
- page_file_offset(page));
- }
- end_page_writeback(page);
- return ret;
- }
+ /*
+ * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ if (data_race(sis->flags & SWP_FS_OPS))
+ return swap_writepage_fs(page, wbc);
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 07/10] DOC: update documentation for swap_activate and swap_rw
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (3 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 02/10] MM: drop swap_set_page_dirty NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 03/10] MM: move responsibility for setting SWP_FS_OPS to ->swap_activate NeilBrown
` (4 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
This documentation for ->swap_activate() has been out-of-date for a long
time. This patch updates it to match recent changes, and adds
documentation for the associated ->swap_rw()
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
Documentation/filesystems/locking.rst | 18 ++++++++++++------
Documentation/filesystems/vfs.rst | 17 ++++++++++++-----
2 files changed, 24 insertions(+), 11 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 3f9b1497ebb8..fbb10378d5ee 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -260,8 +260,9 @@ prototypes::
int (*launder_page)(struct page *);
int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
- int (*swap_activate)(struct file *);
+ int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
int (*swap_deactivate)(struct file *);
+ int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
locking rules:
All except set_page_dirty and freepage may block
@@ -290,6 +291,7 @@ is_partially_uptodate: yes
error_remove_page: yes
swap_activate: no
swap_deactivate: no
+swap_rw: yes, unlocks
====================== ======================== ========= ===============
->write_begin(), ->write_end() and ->readpage() may be called from
@@ -392,15 +394,19 @@ cleaned, or an error value if not. Note that in order to prevent the page
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.
-->swap_activate will be called with a non-zero argument on
-files backing (non block device backed) swapfiles. A return value
-of zero indicates success, in which case this file can be used for
-backing swapspace. The swapspace operations will be proxied to the
-address space operations.
+->swap_activate() will be called to prepare the given file for swap. It
+should perform any validation and preparation necessary to ensure that
+writes can be performed with minimal memory allocation. It should call
+add_swap_extent(), or the helper iomap_swapfile_activate(), and return
+the number of extents added. If IO should be submitted through
+->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted
+directly to the block device ``sis->bdev``.
->swap_deactivate() will be called in the sys_swapoff()
path after ->swap_activate() returned success.
+->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate().
+
file_lock_operations
====================
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index bf5c48066fac..779d23fc7954 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -751,8 +751,9 @@ cache in your filesystem. The following members are defined:
unsigned long);
void (*is_dirty_writeback) (struct page *, bool *, bool *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
- int (*swap_activate)(struct file *);
+ int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
int (*swap_deactivate)(struct file *);
+ int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};
``writepage``
@@ -959,15 +960,21 @@ cache in your filesystem. The following members are defined:
unless you have them locked or reference counts increased.
``swap_activate``
- Called when swapon is used on a file to allocate space if
- necessary and pin the block lookup information in memory. A
- return value of zero indicates success, in which case this file
- can be used to back swapspace.
+
+ Called to prepare the given file for swap. It should perform
+ any validation and preparation necessary to ensure that writes
+ can be performed with minimal memory allocation. It should call
+ add_swap_extent(), or the helper iomap_swapfile_activate(), and
+ return the number of extents added. If IO should be submitted
+ through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will
+ be submitted directly to the block device ``sis->bdev``.
``swap_deactivate``
Called during swapoff on files where swap_activate was
successful.
+``swap_rw``
+ Called to read or write swap pages when SWP_FS_OPS is set.
The File Object
===============
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 08/10] MM: submit multipage reads for SWP_FS_OPS swap-space
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (6 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 10/10] VFS: Add FMODE_CAN_ODIRECT file flag NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 01/10] MM: create new mm/swap.h header file NeilBrown
2022-03-06 23:49 ` [PATCH 04/10] MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space NeilBrown
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
swap_readpage() is given one page at a time, but may be called
repeatedly in succession.
For block-device swap-space, the blk_plug functionality allows the
multiple pages to be combined together at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS
are single page reads.
With this patch we pass in a pointer-to-pointer when swap_readpage can
store state between calls - much like the effect of blk_plug. After
calling swap_readpage() some number of times, the state will be passed
to swap_read_unplug() which can submit the combined request.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
mm/madvise.c | 8 +++-
mm/memory.c | 2 +
mm/page_io.c | 104 ++++++++++++++++++++++++++++++++++++-------------------
mm/swap.h | 17 +++++++--
mm/swap_state.c | 20 +++++++----
5 files changed, 104 insertions(+), 47 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e1a38b3adf..eb73aff278ae 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -197,6 +197,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
+ struct swap_iocb *splug = NULL;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
@@ -218,10 +219,11 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false);
+ vma, index, false, &splug);
if (page)
put_page(page);
}
+ swap_read_unplug(splug);
return 0;
}
@@ -237,6 +239,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
struct page *page;
+ struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
@@ -249,13 +252,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false);
+ NULL, 0, false, &splug);
if (page)
put_page(page);
rcu_read_lock();
}
rcu_read_unlock();
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
}
diff --git a/mm/memory.c b/mm/memory.c
index d25372340107..8bd18c54eaa4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3559,7 +3559,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* To provide entry to swap_readpage() */
set_page_private(page, entry.val);
- swap_readpage(page, true);
+ swap_readpage(page, true, NULL);
set_page_private(page, 0);
}
} else {
diff --git a/mm/page_io.c b/mm/page_io.c
index 0f20295d8632..4e8abbfbe388 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -286,7 +286,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
struct swap_iocb {
struct kiocb iocb;
- struct bio_vec bvec;
+ struct bio_vec bvec[SWAP_CLUSTER_MAX];
+ int pages;
};
static mempool_t *sio_pool;
@@ -306,7 +307,7 @@ int sio_pool_init(void)
static void sio_write_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec.bv_page;
+ struct page *page = sio->bvec[0].bv_page;
if (ret != PAGE_SIZE) {
/*
@@ -344,10 +345,10 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
init_sync_kiocb(&sio->iocb, swap_file);
sio->iocb.ki_complete = sio_write_complete;
sio->iocb.ki_pos = page_file_offset(page);
- sio->bvec.bv_page = page;
- sio->bvec.bv_len = PAGE_SIZE;
- sio->bvec.bv_offset = 0;
- iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE);
+ sio->bvec[0].bv_page = page;
+ sio->bvec[0].bv_len = PAGE_SIZE;
+ sio->bvec[0].bv_offset = 0;
+ iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
@@ -395,46 +396,66 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
static void sio_read_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec.bv_page;
+ int p;
- if (ret != 0 && ret != PAGE_SIZE) {
- SetPageError(page);
- ClearPageUptodate(page);
- pr_alert_ratelimited("Read-error on swap-device\n");
+ if (ret == PAGE_SIZE * sio->pages) {
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ count_vm_events(PSWPIN, sio->pages);
} else {
- SetPageUptodate(page);
- count_vm_event(PSWPIN);
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageError(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ }
+ pr_alert_ratelimited("Read-error on swap-device\n");
}
- unlock_page(page);
mempool_free(sio, sio_pool);
}
-static int swap_readpage_fs(struct page *page)
+static void swap_readpage_fs(struct page *page,
+ struct swap_iocb **plug)
{
struct swap_info_struct *sis = page_swap_info(page);
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct iov_iter from;
- struct swap_iocb *sio;
+ struct swap_iocb *sio = NULL;
loff_t pos = page_file_offset(page);
- int ret;
-
- sio = mempool_alloc(sio_pool, GFP_KERNEL);
- init_sync_kiocb(&sio->iocb, swap_file);
- sio->iocb.ki_pos = pos;
- sio->iocb.ki_complete = sio_read_complete;
- sio->bvec.bv_page = page;
- sio->bvec.bv_len = PAGE_SIZE;
- sio->bvec.bv_offset = 0;
- iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE);
- ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
- if (ret != -EIOCBQUEUED)
- sio_read_complete(&sio->iocb, ret);
- return ret;
+ if (plug)
+ sio = *plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != sis->swap_file ||
+ sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ }
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_KERNEL);
+ init_sync_kiocb(&sio->iocb, sis->swap_file);
+ sio->iocb.ki_pos = pos;
+ sio->iocb.ki_complete = sio_read_complete;
+ sio->pages = 0;
+ }
+ sio->bvec[sio->pages].bv_page = page;
+ sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+ sio->bvec[sio->pages].bv_offset = 0;
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ if (plug)
+ *plug = sio;
}
-int swap_readpage(struct page *page, bool synchronous)
+int swap_readpage(struct page *page, bool synchronous,
+ struct swap_iocb **plug)
{
struct bio *bio;
int ret = 0;
@@ -460,7 +481,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
if (data_race(sis->flags & SWP_FS_OPS)) {
- ret = swap_readpage_fs(page);
+ swap_readpage_fs(page, plug);
goto out;
}
@@ -512,3 +533,16 @@ int swap_readpage(struct page *page, bool synchronous)
delayacct_swapin_end();
return ret;
}
+
+void __swap_read_unplug(struct swap_iocb *sio)
+{
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
+
+ iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
+ PAGE_SIZE * sio->pages);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_read_complete(&sio->iocb, ret);
+}
diff --git a/mm/swap.h b/mm/swap.h
index eafac80b18d9..0389ab147837 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -7,7 +7,15 @@
/* linux/mm/page_io.c */
int sio_pool_init(void);
-int swap_readpage(struct page *page, bool do_poll);
+struct swap_iocb;
+int swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+ if (unlikely(plug))
+ __swap_read_unplug(plug);
+}
int swap_writepage(struct page *page, struct writeback_control *wbc);
void end_swap_bio_write(struct bio *bio);
int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -41,7 +49,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr,
- bool do_poll);
+ bool do_poll,
+ struct swap_iocb **plug);
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr,
@@ -56,7 +65,9 @@ static inline unsigned int page_swap_flags(struct page *page)
return page_swap_info(page)->flags;
}
#else /* CONFIG_SWAP */
-static inline int swap_readpage(struct page *page, bool do_poll)
+struct swap_iocb;
+static inline int swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug)
{
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 514b86b05488..c84779e2518b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -520,14 +520,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* the swap entry is no longer in use.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+ struct vm_area_struct *vma,
+ unsigned long addr, bool do_poll,
+ struct swap_iocb **plug)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage, do_poll);
+ swap_readpage(retpage, do_poll, plug);
return retpage;
}
@@ -621,6 +623,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long mask;
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
bool do_poll = true, page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -647,7 +650,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (offset != entry_offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -656,10 +659,12 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+ /* The page was likely read above, so no need for plugging here */
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -790,6 +795,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
pte_t *pte, pentry;
@@ -820,7 +826,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (i != ra_info.offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -829,10 +835,12 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain();
skip:
+ /* The page was likely read above, so no need for plugging here */
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- ra_info.win == 1);
+ ra_info.win == 1, NULL);
}
/**
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 09/10] MM: submit multipage write for SWP_FS_OPS swap-space
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
2022-03-06 23:49 ` [PATCH 06/10] MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 05/10] MM: introduce ->swap_rw and use it for reads from " NeilBrown
` (7 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
swap_writepage() is given one page at a time, but may be called repeatedly
in succession.
For block-device swapspace, the blk_plug functionality allows the
multiple pages to be combined together at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS
are single page reads.
With this patch we pass a pointer-to-pointer via the wbc.
swap_writepage can store state between calls - much like the pointer
passed explicitly to swap_readpage. After calling swap_writepage() some
number of times, the state will be passed to swap_write_unplug() which
can submit the combined request.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
include/linux/writeback.h | 7 ++++
mm/page_io.c | 78 ++++++++++++++++++++++++++++++++-------------
mm/swap.h | 4 ++
mm/vmscan.c | 9 ++++-
4 files changed, 74 insertions(+), 24 deletions(-)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fec248ab1fec..32b35f21cb97 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -80,6 +80,13 @@ struct writeback_control {
unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */
+ /* To enable batching of swap writes to non-block-device backends,
+ * "plug" can be set point to a 'struct swap_iocb *'. When all swap
+ * writes have been submitted, if with swap_iocb is not NULL,
+ * swap_write_unplug() should be called.
+ */
+ struct swap_iocb **swap_plug;
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *wb; /* wb this writeback is issued under */
struct inode *inode; /* inode being written out */
diff --git a/mm/page_io.c b/mm/page_io.c
index 4e8abbfbe388..3bf6547d6789 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -308,8 +308,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
struct page *page = sio->bvec[0].bv_page;
+ int p;
- if (ret != PAGE_SIZE) {
+ if (ret != PAGE_SIZE * sio->pages) {
/*
* In the case of swap-over-nfs, this can be a
* temporary failure if the system has limited
@@ -320,43 +321,63 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
* the normal direct-to-bio case as it could
* be temporary.
*/
- set_page_dirty(page);
- ClearPageReclaim(page);
pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
ret, page_file_offset(page));
+ for (p = 0; p < sio->pages; p++) {
+ page = sio->bvec[p].bv_page;
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ }
} else
- count_vm_event(PSWPOUT);
- end_page_writeback(page);
+ count_vm_events(PSWPOUT, sio->pages);
+
+ for (p = 0; p < sio->pages; p++)
+ end_page_writeback(sio->bvec[p].bv_page);
+
mempool_free(sio, sio_pool);
}
static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
{
- struct swap_iocb *sio;
+ struct swap_iocb *sio = NULL;
struct swap_info_struct *sis = page_swap_info(page);
struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct iov_iter from;
- int ret;
+ loff_t pos = page_file_offset(page);
set_page_writeback(page);
unlock_page(page);
- sio = mempool_alloc(sio_pool, GFP_NOIO);
- init_sync_kiocb(&sio->iocb, swap_file);
- sio->iocb.ki_complete = sio_write_complete;
- sio->iocb.ki_pos = page_file_offset(page);
- sio->bvec[0].bv_page = page;
- sio->bvec[0].bv_len = PAGE_SIZE;
- sio->bvec[0].bv_offset = 0;
- iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
- ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
- if (ret != -EIOCBQUEUED)
- sio_write_complete(&sio->iocb, ret);
- return ret;
+ if (wbc->swap_plug)
+ sio = *wbc->swap_plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != swap_file ||
+ sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+ swap_write_unplug(sio);
+ sio = NULL;
+ }
+ }
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_NOIO);
+ init_sync_kiocb(&sio->iocb, swap_file);
+ sio->iocb.ki_complete = sio_write_complete;
+ sio->iocb.ki_pos = pos;
+ sio->pages = 0;
+ }
+ sio->bvec[sio->pages].bv_page = page;
+ sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+ sio->bvec[sio->pages].bv_offset = 0;
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+ swap_write_unplug(sio);
+ sio = NULL;
+ }
+ if (wbc->swap_plug)
+ *wbc->swap_plug = sio;
+
+ return 0;
}
int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func)
+ bio_end_io_t end_write_func)
{
struct bio *bio;
int ret;
@@ -393,6 +414,19 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
+void swap_write_unplug(struct swap_iocb *sio)
+{
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
+
+ iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages,
+ PAGE_SIZE * sio->pages);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_write_complete(&sio->iocb, ret);
+}
+
static void sio_read_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
diff --git a/mm/swap.h b/mm/swap.h
index 0389ab147837..a6da8f612904 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -16,6 +16,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
if (unlikely(plug))
__swap_read_unplug(plug);
}
+void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
void end_swap_bio_write(struct bio *bio);
int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -71,6 +72,9 @@ static inline int swap_readpage(struct page *page, bool do_poll,
{
return 0;
}
+static inline void swap_write_unplug(struct swap_iocb *sio)
+{
+}
static inline struct address_space *swap_address_space(swp_entry_t entry)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ffae4ba82eae..1918650abf39 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1166,7 +1166,8 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+ struct swap_iocb **plug)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -1213,6 +1214,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
+ .swap_plug = plug,
};
SetPageReclaim(page);
@@ -1539,6 +1541,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
+ struct swap_iocb *plug = NULL;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1819,7 +1822,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(page, mapping)) {
+ switch (pageout(page, mapping, &plug)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -1973,6 +1976,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ if (plug)
+ swap_write_unplug(plug);
return nr_reclaimed;
}
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 10/10] VFS: Add FMODE_CAN_ODIRECT file flag
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
` (5 preceding siblings ...)
2022-03-06 23:49 ` [PATCH 03/10] MM: move responsibility for setting SWP_FS_OPS to ->swap_activate NeilBrown
@ 2022-03-06 23:49 ` NeilBrown
2022-03-06 23:49 ` [PATCH 08/10] MM: submit multipage reads for SWP_FS_OPS swap-space NeilBrown
` (2 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-06 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
Currently various places test if direct IO is possible on a file by
checking for the existence of the direct_IO address space operation.
This is a poor choice, as the direct_IO operation may not be used - it is
only used if the generic_file_*_iter functions are called for direct IO
and some filesystems - particularly NFS - don't do this.
Instead, introduce a new f_mode flag: FMODE_CAN_ODIRECT and change the
various places to check this (avoiding pointer dereferences).
do_dentry_open() will set this flag if ->direct_IO is present, so
filesystems do not need to be changed.
NFS *is* changed, to set the flag explicitly and discard the direct_IO
entry in the address_space_operations for files.
Other filesystems which currently use noop_direct_IO could usefully be
changed to set this flag instead.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
drivers/block/loop.c | 4 ++--
fs/fcntl.c | 9 ++++-----
fs/nfs/file.c | 3 ++-
fs/open.c | 9 ++++-----
fs/overlayfs/file.c | 13 ++++---------
include/linux/fs.h | 3 +++
6 files changed, 19 insertions(+), 22 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 19fe19eaa50e..59166c7bbcc0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -185,8 +185,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
*/
if (dio) {
if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
- !(lo->lo_offset & dio_align) &&
- mapping->a_ops->direct_IO)
+ !(lo->lo_offset & dio_align) &&
+ (file->f_mode & FMODE_CAN_ODIRECT))
use_dio = true;
else
use_dio = false;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9c6c6a3e2de5..11e665242a76 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
arg |= O_NONBLOCK;
/* Pipe packetized mode is controlled by O_DIRECT flag */
- if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
- if (!filp->f_mapping || !filp->f_mapping->a_ops ||
- !filp->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if (!S_ISFIFO(inode->i_mode) &&
+ (arg & O_DIRECT) &&
+ !(filp->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3dbef2c31567..9e2def045111 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -74,6 +74,8 @@ nfs_file_open(struct inode *inode, struct file *filp)
return res;
res = nfs_open(inode, filp);
+ if (res == 0)
+ filp->f_mode |= FMODE_CAN_ODIRECT;
return res;
}
@@ -535,7 +537,6 @@ const struct address_space_operations nfs_file_aops = {
.write_end = nfs_write_end,
.invalidatepage = nfs_invalidate_page,
.releasepage = nfs_release_page,
- .direct_IO = nfs_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
#endif
diff --git a/fs/open.c b/fs/open.c
index 9ff2f621b760..76ddf9014499 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -834,17 +834,16 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
+ if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
+ f->f_mode |= FMODE_CAN_ODIRECT;
f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
- /* NB: we're sure to have correct a_ops only after f_op->open */
- if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fa125feed0ff..9d69b4dbb8c4 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -82,11 +82,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
return -EPERM;
- if (flags & O_DIRECT) {
- if (!file->f_mapping->a_ops ||
- !file->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (file->f_op->check_flags) {
err = file->f_op->check_flags(flags);
@@ -306,8 +303,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
@@ -367,8 +363,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 57e3b387cb17..c34c53267415 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -161,6 +161,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File is stream-like */
#define FMODE_STREAM ((__force fmode_t)0x200000)
+/* File supports DIRECT IO */
+#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 08/10] MM: submit multipage reads for SWP_FS_OPS swap-space
2022-03-29 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
@ 2022-03-29 23:49 ` NeilBrown
0 siblings, 0 replies; 12+ messages in thread
From: NeilBrown @ 2022-03-29 23:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, David Howells, linux-nfs, linux-mm, linux-kernel
swap_readpage() is given one page at a time, but may be called
repeatedly in succession.
For block-device swap-space, the blk_plug functionality allows the
multiple pages to be combined together at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS
are single page reads.
With this patch we pass in a pointer-to-pointer when swap_readpage can
store state between calls - much like the effect of blk_plug. After
calling swap_readpage() some number of times, the state will be passed
to swap_read_unplug() which can submit the combined request.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
mm/madvise.c | 8 +++-
mm/memory.c | 2 +
mm/page_io.c | 104 ++++++++++++++++++++++++++++++++++++-------------------
mm/swap.h | 17 +++++++--
mm/swap_state.c | 20 +++++++----
5 files changed, 104 insertions(+), 47 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 4f48e48432e8..297de11f73d6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -198,6 +198,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
+ struct swap_iocb *splug = NULL;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
@@ -219,10 +220,11 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false);
+ vma, index, false, &splug);
if (page)
put_page(page);
}
+ swap_read_unplug(splug);
return 0;
}
@@ -238,6 +240,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
struct page *page;
+ struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
@@ -250,13 +253,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false);
+ NULL, 0, false, &splug);
if (page)
put_page(page);
rcu_read_lock();
}
rcu_read_unlock();
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
}
diff --git a/mm/memory.c b/mm/memory.c
index 92ea8ac374a4..8de0ad307cb2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3586,7 +3586,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* To provide entry to swap_readpage() */
set_page_private(page, entry.val);
- swap_readpage(page, true);
+ swap_readpage(page, true, NULL);
set_page_private(page, 0);
}
} else {
diff --git a/mm/page_io.c b/mm/page_io.c
index a01cc273bb00..8735707ea349 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -286,7 +286,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
struct swap_iocb {
struct kiocb iocb;
- struct bio_vec bvec;
+ struct bio_vec bvec[SWAP_CLUSTER_MAX];
+ int pages;
};
static mempool_t *sio_pool;
@@ -306,7 +307,7 @@ int sio_pool_init(void)
static void sio_write_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec.bv_page;
+ struct page *page = sio->bvec[0].bv_page;
if (ret != PAGE_SIZE) {
/*
@@ -344,10 +345,10 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
init_sync_kiocb(&sio->iocb, swap_file);
sio->iocb.ki_complete = sio_write_complete;
sio->iocb.ki_pos = page_file_offset(page);
- sio->bvec.bv_page = page;
- sio->bvec.bv_len = PAGE_SIZE;
- sio->bvec.bv_offset = 0;
- iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE);
+ sio->bvec[0].bv_page = page;
+ sio->bvec[0].bv_len = PAGE_SIZE;
+ sio->bvec[0].bv_offset = 0;
+ iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
@@ -395,46 +396,66 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
static void sio_read_complete(struct kiocb *iocb, long ret)
{
struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec.bv_page;
+ int p;
- if (ret != 0 && ret != PAGE_SIZE) {
- SetPageError(page);
- ClearPageUptodate(page);
- pr_alert_ratelimited("Read-error on swap-device\n");
+ if (ret == PAGE_SIZE * sio->pages) {
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ count_vm_events(PSWPIN, sio->pages);
} else {
- SetPageUptodate(page);
- count_vm_event(PSWPIN);
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageError(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ }
+ pr_alert_ratelimited("Read-error on swap-device\n");
}
- unlock_page(page);
mempool_free(sio, sio_pool);
}
-static int swap_readpage_fs(struct page *page)
+static void swap_readpage_fs(struct page *page,
+ struct swap_iocb **plug)
{
struct swap_info_struct *sis = page_swap_info(page);
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct iov_iter from;
- struct swap_iocb *sio;
+ struct swap_iocb *sio = NULL;
loff_t pos = page_file_offset(page);
- int ret;
-
- sio = mempool_alloc(sio_pool, GFP_KERNEL);
- init_sync_kiocb(&sio->iocb, swap_file);
- sio->iocb.ki_pos = pos;
- sio->iocb.ki_complete = sio_read_complete;
- sio->bvec.bv_page = page;
- sio->bvec.bv_len = PAGE_SIZE;
- sio->bvec.bv_offset = 0;
- iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE);
- ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
- if (ret != -EIOCBQUEUED)
- sio_read_complete(&sio->iocb, ret);
- return ret;
+ if (plug)
+ sio = *plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != sis->swap_file ||
+ sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ }
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_KERNEL);
+ init_sync_kiocb(&sio->iocb, sis->swap_file);
+ sio->iocb.ki_pos = pos;
+ sio->iocb.ki_complete = sio_read_complete;
+ sio->pages = 0;
+ }
+ sio->bvec[sio->pages].bv_page = page;
+ sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+ sio->bvec[sio->pages].bv_offset = 0;
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ if (plug)
+ *plug = sio;
}
-int swap_readpage(struct page *page, bool synchronous)
+int swap_readpage(struct page *page, bool synchronous,
+ struct swap_iocb **plug)
{
struct bio *bio;
int ret = 0;
@@ -462,7 +483,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
if (data_race(sis->flags & SWP_FS_OPS)) {
- ret = swap_readpage_fs(page);
+ swap_readpage_fs(page, plug);
goto out;
}
@@ -513,3 +534,16 @@ int swap_readpage(struct page *page, bool synchronous)
delayacct_swapin_end();
return ret;
}
+
+void __swap_read_unplug(struct swap_iocb *sio)
+{
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
+
+ iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
+ PAGE_SIZE * sio->pages);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_read_complete(&sio->iocb, ret);
+}
diff --git a/mm/swap.h b/mm/swap.h
index eafac80b18d9..0389ab147837 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -7,7 +7,15 @@
/* linux/mm/page_io.c */
int sio_pool_init(void);
-int swap_readpage(struct page *page, bool do_poll);
+struct swap_iocb;
+int swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+ if (unlikely(plug))
+ __swap_read_unplug(plug);
+}
int swap_writepage(struct page *page, struct writeback_control *wbc);
void end_swap_bio_write(struct bio *bio);
int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -41,7 +49,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr,
- bool do_poll);
+ bool do_poll,
+ struct swap_iocb **plug);
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr,
@@ -56,7 +65,9 @@ static inline unsigned int page_swap_flags(struct page *page)
return page_swap_info(page)->flags;
}
#else /* CONFIG_SWAP */
-static inline int swap_readpage(struct page *page, bool do_poll)
+struct swap_iocb;
+static inline int swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug)
{
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f3ab01801629..d41746a572a2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -520,14 +520,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* the swap entry is no longer in use.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+ struct vm_area_struct *vma,
+ unsigned long addr, bool do_poll,
+ struct swap_iocb **plug)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage, do_poll);
+ swap_readpage(retpage, do_poll, plug);
return retpage;
}
@@ -621,6 +623,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long mask;
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
bool do_poll = true, page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -647,7 +650,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (offset != entry_offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -656,10 +659,12 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+ /* The page was likely read above, so no need for plugging here */
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -790,6 +795,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
pte_t *pte, pentry;
@@ -820,7 +826,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (i != ra_info.offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -829,10 +835,12 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain();
skip:
+ /* The page was likely read above, so no need for plugging here */
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- ra_info.win == 1);
+ ra_info.win == 1, NULL);
}
/**
^ permalink raw reply related [flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-03-29 23:53 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-06 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
2022-03-06 23:49 ` [PATCH 06/10] MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw NeilBrown
2022-03-06 23:49 ` [PATCH 09/10] MM: submit multipage write for SWP_FS_OPS swap-space NeilBrown
2022-03-06 23:49 ` [PATCH 05/10] MM: introduce ->swap_rw and use it for reads from " NeilBrown
2022-03-06 23:49 ` [PATCH 02/10] MM: drop swap_set_page_dirty NeilBrown
2022-03-06 23:49 ` [PATCH 07/10] DOC: update documentation for swap_activate and swap_rw NeilBrown
2022-03-06 23:49 ` [PATCH 03/10] MM: move responsibility for setting SWP_FS_OPS to ->swap_activate NeilBrown
2022-03-06 23:49 ` [PATCH 10/10] VFS: Add FMODE_CAN_ODIRECT file flag NeilBrown
2022-03-06 23:49 ` [PATCH 08/10] MM: submit multipage reads for SWP_FS_OPS swap-space NeilBrown
2022-03-06 23:49 ` [PATCH 01/10] MM: create new mm/swap.h header file NeilBrown
2022-03-06 23:49 ` [PATCH 04/10] MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space NeilBrown
2022-03-29 23:49 [PATCH 00/10] MM changes to improve swap-over-NFS support NeilBrown
2022-03-29 23:49 ` [PATCH 08/10] MM: submit multipage reads for SWP_FS_OPS swap-space NeilBrown
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).