All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <djwong@kernel.org>
To: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Cc: linux-kernel@vger.kernel.org, linux-xfs@vger.kernel.org,
	nvdimm@lists.linux.dev, linux-mm@kvack.org,
	linux-fsdevel@vger.kernel.org, dan.j.williams@intel.com,
	david@fromorbit.com, hch@infradead.org, jane.chu@oracle.com
Subject: Re: [PATCH v7 6/8] mm: Introduce mf_dax_kill_procs() for fsdax case
Date: Thu, 14 Oct 2021 12:32:41 -0700	[thread overview]
Message-ID: <20211014193241.GK24307@magnolia> (raw)
In-Reply-To: <20210924130959.2695749-7-ruansy.fnst@fujitsu.com>

On Fri, Sep 24, 2021 at 09:09:57PM +0800, Shiyang Ruan wrote:
> This function is called at the end of RMAP routine, i.e. filesystem
> recovery function, to collect and kill processes using a shared page of
> DAX file.  The difference between mf_generic_kill_procs() is,
> it accepts file's mapping,offset instead of struct page.  Because
> different file's mappings and offsets may share the same page in fsdax
> mode.  So, it is called when filesystem RMAP results are found.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  fs/dax.c            | 10 ------
>  include/linux/dax.h |  9 +++++
>  include/linux/mm.h  |  2 ++
>  mm/memory-failure.c | 83 ++++++++++++++++++++++++++++++++++++++++-----
>  4 files changed, 86 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 509b65e60478..2536c105ec7f 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -852,16 +852,6 @@ static void *dax_insert_entry(struct xa_state *xas,
>  	return entry;
>  }
>  
> -static inline
> -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
> -{
> -	unsigned long address;
> -
> -	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> -	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
> -	return address;
> -}
> -
>  /* Walk all mappings of a given index of a file and writeprotect them */
>  static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
>  		unsigned long pfn)
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 65411bee4312..3d90becbd160 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -258,6 +258,15 @@ static inline bool dax_mapping(struct address_space *mapping)
>  {
>  	return mapping->host && IS_DAX(mapping->host);
>  }
> +static inline unsigned long pgoff_address(pgoff_t pgoff,
> +		struct vm_area_struct *vma)
> +{
> +	unsigned long address;
> +
> +	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> +	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
> +	return address;
> +}
>  
>  #ifdef CONFIG_DEV_DAX_HMEM_DEVICES
>  void hmem_register_device(int target_nid, struct resource *r);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 73a52aba448f..d06af0051e53 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3114,6 +3114,8 @@ enum mf_flags {
>  	MF_MUST_KILL = 1 << 2,
>  	MF_SOFT_OFFLINE = 1 << 3,
>  };
> +extern int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> +			     size_t size, int flags);
>  extern int memory_failure(unsigned long pfn, int flags);
>  extern void memory_failure_queue(unsigned long pfn, int flags);
>  extern void memory_failure_queue_kick(int cpu);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 85eab206b68f..a9d0d487d205 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -302,10 +302,9 @@ void shake_page(struct page *p)
>  }
>  EXPORT_SYMBOL_GPL(shake_page);
>  
> -static unsigned long dev_pagemap_mapping_shift(struct page *page,
> +static unsigned long dev_pagemap_mapping_shift(unsigned long address,
>  		struct vm_area_struct *vma)
>  {
> -	unsigned long address = vma_address(page, vma);
>  	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
> @@ -345,7 +344,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
>   * Schedule a process for later kill.
>   * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
>   */
> -static void add_to_kill(struct task_struct *tsk, struct page *p,
> +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,

Hm, so I guess you're passing the page and the pgoff now because
page->index is meaningless for shared dax pages?  Ok.

>  		       struct vm_area_struct *vma,
>  		       struct list_head *to_kill)
>  {
> @@ -358,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
>  	}
>  
>  	tk->addr = page_address_in_vma(p, vma);
> -	if (is_zone_device_page(p))
> -		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> -	else
> +	if (is_zone_device_page(p)) {
> +		/*
> +		 * Since page->mapping is no more used for fsdax, we should
> +		 * calculate the address in a fsdax way.
> +		 */
> +		if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
> +			tk->addr = pgoff_address(pgoff, vma);
> +		tk->size_shift = dev_pagemap_mapping_shift(tk->addr, vma);
> +	} else
>  		tk->size_shift = page_shift(compound_head(p));
>  
>  	/*
> @@ -508,7 +513,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
>  			if (!page_mapped_in_vma(page, vma))
>  				continue;
>  			if (vma->vm_mm == t->mm)
> -				add_to_kill(t, page, vma, to_kill);
> +				add_to_kill(t, page, 0, vma, to_kill);
>  		}
>  	}
>  	read_unlock(&tasklist_lock);
> @@ -544,7 +549,32 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
>  			 * to be informed of all such data corruptions.
>  			 */
>  			if (vma->vm_mm == t->mm)
> -				add_to_kill(t, page, vma, to_kill);
> +				add_to_kill(t, page, 0, vma, to_kill);
> +		}
> +	}
> +	read_unlock(&tasklist_lock);
> +	i_mmap_unlock_read(mapping);
> +}
> +
> +/*
> + * Collect processes when the error hit a fsdax page.
> + */
> +static void collect_procs_fsdax(struct page *page, struct address_space *mapping,
> +		pgoff_t pgoff, struct list_head *to_kill)
> +{
> +	struct vm_area_struct *vma;
> +	struct task_struct *tsk;
> +
> +	i_mmap_lock_read(mapping);
> +	read_lock(&tasklist_lock);
> +	for_each_process(tsk) {
> +		struct task_struct *t = task_early_kill(tsk, true);
> +
> +		if (!t)
> +			continue;
> +		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> +			if (vma->vm_mm == t->mm)
> +				add_to_kill(t, page, pgoff, vma, to_kill);
>  		}
>  	}
>  	read_unlock(&tasklist_lock);
> @@ -1503,6 +1533,43 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags,
>  	return 0;
>  }
>  
> +/**
> + * mf_dax_kill_procs - Collect and kill processes who are using this file range
> + * @mapping:	the file in use
> + * @index:	start offset of the range
> + * @size:	length of the range

It feels odd that one argument is in units of pgoff_t but the other is
in bytes.

> + * @flags:	memory failure flags
> + */
> +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> +		size_t size, int flags)
> +{
> +	LIST_HEAD(to_kill);
> +	dax_entry_t cookie;
> +	struct page *page;
> +	size_t end = (index << PAGE_SHIFT) + size;
> +
> +	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;

Hm.  What flags will we be passing to the xfs_dax_notify_failure_fn?
Does XFS itself have to care about what's in the flags values, or is it
really just a magic cookie to be passed from the mm layer into the fs
and back to mf_dax_kill_procs?

--D

> +
> +	for (; (index << PAGE_SHIFT) < end; index++) {
> +		page = NULL;
> +		cookie = dax_lock_mapping_entry(mapping, index, &page);
> +		if (!cookie)
> +			return -EBUSY;
> +		if (!page)
> +			goto unlock;
> +
> +		SetPageHWPoison(page);
> +
> +		collect_procs_fsdax(page, mapping, index, &to_kill);
> +		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
> +				index, flags);
> +unlock:
> +		dax_unlock_mapping_entry(mapping, index, cookie);
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
> +
>  static int memory_failure_hugetlb(unsigned long pfn, int flags)
>  {
>  	struct page *p = pfn_to_page(pfn);
> -- 
> 2.33.0
> 
> 
> 

  reply	other threads:[~2021-10-14 19:32 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-24 13:09 [PATCH v7 0/8] [PATCH v7 0/8] fsdax: introduce fs query to support reflink Shiyang Ruan
2021-09-24 13:09 ` [PATCH v7 1/8] dax: Use rwsem for dax_{read,write}_lock() Shiyang Ruan
2021-10-14 17:48   ` Darrick J. Wong
2021-10-20  5:19     ` Shiyang Ruan
2021-10-15  6:30   ` Christoph Hellwig
2021-09-24 13:09 ` [PATCH v7 2/8] dax: Introduce holder for dax_device Shiyang Ruan
2021-10-14 18:00   ` Darrick J. Wong
2021-10-20  6:58     ` Shiyang Ruan
2021-09-24 13:09 ` [PATCH v7 3/8] mm: factor helpers for memory_failure_dev_pagemap Shiyang Ruan
2021-10-14 18:02   ` Darrick J. Wong
2021-10-15  6:33   ` Christoph Hellwig
2021-09-24 13:09 ` [PATCH v7 4/8] pagemap,pmem: Introduce ->memory_failure() Shiyang Ruan
2021-10-14 18:05   ` Darrick J. Wong
2021-10-20  5:25     ` Shiyang Ruan
2021-10-15  6:36   ` Christoph Hellwig
2021-09-24 13:09 ` [PATCH v7 5/8] fsdax: Introduce dax_lock_mapping_entry() Shiyang Ruan
2021-10-14 18:17   ` Darrick J. Wong
2021-09-24 13:09 ` [PATCH v7 6/8] mm: Introduce mf_dax_kill_procs() for fsdax case Shiyang Ruan
2021-10-14 19:32   ` Darrick J. Wong [this message]
2021-10-20  5:47     ` Shiyang Ruan
2021-09-24 13:09 ` [PATCH v7 7/8] xfs: Implement ->notify_failure() for XFS Shiyang Ruan
2021-10-14 19:21   ` Darrick J. Wong
2021-10-15  6:41   ` Christoph Hellwig
2021-09-24 13:09 ` [PATCH v7 8/8] fsdax: add exception for reflinked files Shiyang Ruan
2021-10-14 19:24   ` Darrick J. Wong
2021-10-15  6:38     ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211014193241.GK24307@magnolia \
    --to=djwong@kernel.org \
    --cc=dan.j.williams@intel.com \
    --cc=david@fromorbit.com \
    --cc=hch@infradead.org \
    --cc=jane.chu@oracle.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=nvdimm@lists.linux.dev \
    --cc=ruansy.fnst@fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.