* [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
@ 2023-08-21  9:13 Tong Tiangen
  2023-08-21 18:33 ` Matthew Wilcox
  0 siblings, 1 reply; 8+ messages in thread
From: Tong Tiangen @ 2023-08-21  9:13 UTC (permalink / raw)
  To: Andrew Morton, Naoya Horiguchi, Miaohe Lin, wangkefeng.wang
  Cc: linux-mm, linux-kernel, Tong Tiangen

We found a softlockup issue in our testing. Analysis of the logs shows the
relevant call traces on the involved CPUs as follows:

CPU0:
  _do_fork
    -> copy_process()
      -> write_lock_irq(&tasklist_lock)   // irqs disabled, waiting for tasklist_lock

CPU1:
  wp_page_copy()
    -> pte_offset_map_lock()
      -> spin_lock(&page->ptl)            // holds page->ptl
    -> ptep_clear_flush()
      -> flush_tlb_others() ...
        -> smp_call_function_many()
          -> arch_send_call_function_ipi_mask()
            -> csd_lock_wait()            // waiting for other CPUs to respond to the IPI

CPU2:
  collect_procs_anon()
    -> read_lock(&tasklist_lock)          // holds tasklist_lock
      -> for_each_process(tsk)
        -> page_mapped_in_vma()
          -> page_vma_mapped_walk()
            -> map_pte()
              -> spin_lock(&page->ptl)    // waiting for page->ptl

CPU1 is waiting for CPU0 to respond to the IPI, CPU0 is waiting for CPU2 to
release tasklist_lock, and CPU2 is waiting for CPU1 to release page->ptl.
This circular wait triggers the softlockup.

collect_procs_anon() never modifies the task list; it only traverses it for
reading. Therefore we can take the RCU read lock instead of read-locking
tasklist_lock, which breaks the dependency chain above.

The same logic can also be applied to:
 - collect_procs_file()
 - collect_procs_fsdax()
 - collect_procs_ksm()

Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
Since v1:
 - 1. Per Matthew's suggestion, only update the comment of
      find_early_kill_thread(); it does not need to take the RCU lock
      itself because the caller already holds it.

Changes since the RFC[1]:
 - 1. Per Naoya's suggestion, adjust the tasklist_lock entry in the
      locking-order comment in mm/filemap.c.
 - 2. Per Kefeng's suggestion, optimize the implementation of
      find_early_kill_thread() with no functional change.
 - 3. Reword the patch title.

[1] https://lore.kernel.org/lkml/20230815130154.1100779-1-tongtiangen@huawei.com/
---
 mm/filemap.c        |  3 ---
 mm/ksm.c            |  4 ++--
 mm/memory-failure.c | 16 ++++++++--------
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 014b73eb96a1..dfade1ef1765 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,9 +121,6 @@
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
- *
- * ->i_mmap_rwsem
- *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
 static void page_cache_delete(struct address_space *mapping,
diff --git a/mm/ksm.c b/mm/ksm.c
index 8d6aee05421d..981af9c72e7a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2925,7 +2925,7 @@ void collect_procs_ksm(struct page *page, struct list_head *to_kill,
 		struct anon_vma *av = rmap_item->anon_vma;
 
 		anon_vma_lock_read(av);
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		for_each_process(tsk) {
 			struct anon_vma_chain *vmac;
 			unsigned long addr;
@@ -2944,7 +2944,7 @@ void collect_procs_ksm(struct page *page, struct list_head *to_kill,
 				}
 			}
 		}
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		anon_vma_unlock_read(av);
 	}
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7b01fffe7a79..4d6e43c88489 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -547,8 +547,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
  * on behalf of the thread group. Return task_struct of the (first found)
  * dedicated thread if found, and return NULL otherwise.
  *
- * We already hold read_lock(&tasklist_lock) in the caller, so we don't
- * have to call rcu_read_lock/unlock() in this function.
+ * We already hold rcu lock in the caller, so we don't have to call
+ * rcu_read_lock/unlock() in this function.
  */
 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 {
@@ -609,7 +609,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 		return;
 
 	pgoff = page_to_pgoff(page);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(tsk) {
 		struct anon_vma_chain *vmac;
 		struct task_struct *t = task_early_kill(tsk, force_early);
@@ -626,7 +626,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			add_to_kill_anon_file(t, page, vma, to_kill);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	anon_vma_unlock_read(av);
 }
 
@@ -642,7 +642,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	pgoff_t pgoff;
 
 	i_mmap_lock_read(mapping);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	pgoff = page_to_pgoff(page);
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, force_early);
@@ -662,7 +662,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 				add_to_kill_anon_file(t, page, vma, to_kill);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
 
@@ -685,7 +685,7 @@ static void collect_procs_fsdax(struct page *page,
 	struct task_struct *tsk;
 
 	i_mmap_lock_read(mapping);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, true);
 
@@ -696,7 +696,7 @@ static void collect_procs_fsdax(struct page *page,
 				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
 #endif /* CONFIG_FS_DAX */
-- 
2.25.1



* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-21  9:13 [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs() Tong Tiangen
@ 2023-08-21 18:33 ` Matthew Wilcox
  2023-08-22  3:41   ` Tong Tiangen
  0 siblings, 1 reply; 8+ messages in thread
From: Matthew Wilcox @ 2023-08-21 18:33 UTC (permalink / raw)
  To: Tong Tiangen
  Cc: Andrew Morton, Naoya Horiguchi, Miaohe Lin, wangkefeng.wang,
	linux-mm, linux-kernel

On Mon, Aug 21, 2023 at 05:13:12PM +0800, Tong Tiangen wrote:
> We found a softlock issue in our test, analyzed the logs, and found that
> the relevant CPU call trace as follows:
> 
> CPU0:
>   _do_fork
>     -> copy_process()
>       -> write_lock_irq(&tasklist_lock)  //Disable irq,waiting for
>       					 //tasklist_lock
> 
> CPU1:
>   wp_page_copy()
>     ->pte_offset_map_lock()
>       -> spin_lock(&page->ptl);        //Hold page->ptl
>     -> ptep_clear_flush()
>       -> flush_tlb_others() ...
>         -> smp_call_function_many()
>           -> arch_send_call_function_ipi_mask()
>             -> csd_lock_wait()         //Waiting for other CPUs respond
> 	                               //IPI
> 
> CPU2:
>   collect_procs_anon()
>     -> read_lock(&tasklist_lock)       //Hold tasklist_lock
>       ->for_each_process(tsk)
>         -> page_mapped_in_vma()
>           -> page_vma_mapped_walk()
> 	    -> map_pte()
>               ->spin_lock(&page->ptl)  //Waiting for page->ptl
> 
> We can see that CPU1 waiting for CPU0 respond IPI,CPU0 waiting for CPU2
> unlock tasklist_lock, CPU2 waiting for CPU1 unlock page->ptl. As a result,
> softlockup is triggered.
> 
> For collect_procs_anon(), we will not modify the tasklist, but only perform
> read traversal. Therefore, we can use rcu lock instead of spin lock
> tasklist_lock, from this, we can break the softlock chain above.

The only thing that's giving me pause is that there's no discussion
about why this is safe.  "We're not modifying it" isn't really enough
to justify going from read_lock() to rcu_read_lock().  When you take a
normal read_lock(), writers are not permitted and so you see an atomic
snapshot of the list.  With rcu_read_lock() you can see inconsistencies.
For example, if new tasks can be added to the tasklist, they may not
be seen by an iteration.  Is this OK?  Tasks may be removed from the
tasklist after they have been seen by the iteration.  Is this OK?

As I understand the list RCU code, it guarantees that all tasks which
were on the list before rcu_read_lock() and remain on the list after
rcu_read_unlock() will be seen by a list iteration, while tasks which
are added or removed during that time may or may not be seen.
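
To make the difference concrete, here is a minimal sketch of the two reader
patterns (illustrative only, not the actual memory-failure code; consider()
is a placeholder):

/*
 * rwlock reader: writers are excluded for the whole walk, so the
 * iteration sees an atomic snapshot of the tasklist.
 */
read_lock(&tasklist_lock);
for_each_process(tsk)
	consider(tsk);
read_unlock(&tasklist_lock);

/*
 * RCU reader: the tasklist can change underneath us; tasks added or
 * removed during the walk may or may not be visited.
 */
rcu_read_lock();
for_each_process(tsk)
	consider(tsk);
rcu_read_unlock();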


* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-21 18:33 ` Matthew Wilcox
@ 2023-08-22  3:41   ` Tong Tiangen
  2023-08-22 12:08     ` Matthew Wilcox
  0 siblings, 1 reply; 8+ messages in thread
From: Tong Tiangen @ 2023-08-22  3:41 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, Naoya Horiguchi, Miaohe Lin, wangkefeng.wang,
	linux-mm, linux-kernel



On 2023/8/22 2:33, Matthew Wilcox wrote:
> On Mon, Aug 21, 2023 at 05:13:12PM +0800, Tong Tiangen wrote:
>> We found a softlock issue in our test, analyzed the logs, and found that
>> the relevant CPU call trace as follows:
>>
>> CPU0:
>>    _do_fork
>>      -> copy_process()
>>        -> write_lock_irq(&tasklist_lock)  //Disable irq,waiting for
>>        					 //tasklist_lock
>>
>> CPU1:
>>    wp_page_copy()
>>      ->pte_offset_map_lock()
>>        -> spin_lock(&page->ptl);        //Hold page->ptl
>>      -> ptep_clear_flush()
>>        -> flush_tlb_others() ...
>>          -> smp_call_function_many()
>>            -> arch_send_call_function_ipi_mask()
>>              -> csd_lock_wait()         //Waiting for other CPUs respond
>> 	                               //IPI
>>
>> CPU2:
>>    collect_procs_anon()
>>      -> read_lock(&tasklist_lock)       //Hold tasklist_lock
>>        ->for_each_process(tsk)
>>          -> page_mapped_in_vma()
>>            -> page_vma_mapped_walk()
>> 	    -> map_pte()
>>                ->spin_lock(&page->ptl)  //Waiting for page->ptl
>>
>> We can see that CPU1 waiting for CPU0 respond IPI,CPU0 waiting for CPU2
>> unlock tasklist_lock, CPU2 waiting for CPU1 unlock page->ptl. As a result,
>> softlockup is triggered.
>>
>> For collect_procs_anon(), we will not modify the tasklist, but only perform
>> read traversal. Therefore, we can use rcu lock instead of spin lock
>> tasklist_lock, from this, we can break the softlock chain above.
> 
> The only thing that's giving me pause is that there's no discussion
> about why this is safe.  "We're not modifying it" isn't really enough
> to justify going from read_lock() to rcu_read_lock().  When you take a
> normal read_lock(), writers are not permitted and so you see an atomic
> snapshot of the list.  With rcu_read_lock() you can see inconsistencies.

Hi Matthew:

When rcu_read_lock() is used, the task list can be modified during the
iteration, but those modifications are not seen by the iteration; the
updated list is only published by the RCU mechanism after the iteration
completes. Therefore, the task list used by the iteration can also be
regarded as a snapshot.

> For example, if new tasks can be added to the tasklist, they may not
> be seen by an iteration.  Is this OK?  

A newly added task cannot access the HWPoison page, because the page has
already been isolated from the buddy allocator
(memory_failure()->take_page_off_buddy()). Therefore, it is safe whether or
not a newly added task is seen by the iteration.

> Tasks may be removed from the
> tasklist after they have been seen by the iteration.  Is this OK?

A task that was seen by the iteration and is removed from the task list
afterwards does not have its task_struct released, because
__add_to_kill() takes a reference on it. Therefore the subsequent
processing in kill_procs() (sending signals to tasks that may have been
removed from the task list) is not affected, so I think this case is
safe too.
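
Roughly, the existing flow is (a simplified paraphrase, not the exact code):

collect_procs_anon()                    /* under rcu_read_lock() with this patch */
	add_to_kill_anon_file()
		__add_to_kill()
			get_task_struct(tsk);            /* pin the task for later */
			list_add_tail(&tk->nd, to_kill);

kill_procs()                            /* runs after the RCU read section ends */
	/* send SIGBUS to tk->tsk, then drop the reference */
	put_task_struct(tk->tsk);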

> 
> As I understand the list RCU code, it guarantees that all tasks which
> were on the list before rcu_read_lock() and remain on the list after
> rcu_read_unlock() will be seen by a list iteration, while tasks which
> are added or removed during that time may or may not be seen.

As described above, my understanding is that writer updates are not
visible during the RCU read-side critical section.

Thanks,
Tong.

> 
> .


* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-22  3:41   ` Tong Tiangen
@ 2023-08-22 12:08     ` Matthew Wilcox
  2023-08-25  6:02       ` Naoya Horiguchi
  0 siblings, 1 reply; 8+ messages in thread
From: Matthew Wilcox @ 2023-08-22 12:08 UTC (permalink / raw)
  To: Tong Tiangen
  Cc: Andrew Morton, Naoya Horiguchi, Miaohe Lin, wangkefeng.wang,
	linux-mm, linux-kernel

On Tue, Aug 22, 2023 at 11:41:41AM +0800, Tong Tiangen wrote:
> On 2023/8/22 2:33, Matthew Wilcox wrote:
> > On Mon, Aug 21, 2023 at 05:13:12PM +0800, Tong Tiangen wrote:
> > > We can see that CPU1 waiting for CPU0 respond IPI,CPU0 waiting for CPU2
> > > unlock tasklist_lock, CPU2 waiting for CPU1 unlock page->ptl. As a result,
> > > softlockup is triggered.
> > > 
> > > For collect_procs_anon(), we will not modify the tasklist, but only perform
> > > read traversal. Therefore, we can use rcu lock instead of spin lock
> > > tasklist_lock, from this, we can break the softlock chain above.
> > 
> > The only thing that's giving me pause is that there's no discussion
> > about why this is safe.  "We're not modifying it" isn't really enough
> > to justify going from read_lock() to rcu_read_lock().  When you take a
> > normal read_lock(), writers are not permitted and so you see an atomic
> > snapshot of the list.  With rcu_read_lock() you can see inconsistencies.
> 
> Hi Matthew:
> 
> When rcu_read_lock() is used, the task list can be modified during the
> iteration, but cannot be seen during iteration. After the iteration is
> complete, the task list can be updated in the RCU mechanism. Therefore, the
> task list used by iteration can also be considered as a snapshot.

No, that's not true!  You are not iterating a snapshot of the list,
you're iterating the live list.  It will change under you.  RCU provides
you with some guarantees about that list.  See Documentation/RCU/listRCU.rst

> > For example, if new tasks can be added to the tasklist, they may not
> > be seen by an iteration.  Is this OK?
> 
> The newly added tasks does not access the HWPoison page, because the
> HWPoison page has been isolated from the
> buddy(memory_failure()->take_page_off_buddy()). Therefore, it is safe to see
> the newly added task during the iteration and not be seen by iteration.
> 
> > Tasks may be removed from the
> > tasklist after they have been seen by the iteration.  Is this OK?
> 
> Task be seen during iteration are deleted from the task list after
> iteration, it's task_struct is not released because reference counting is
> added in __add_to_kill(). Therefore, the subsequent processing of
> kill_procs() is not affected (sending signals to the task deleted from task
> list). so i think it's safe too.

I don't know this code, but it seems unsafe to me.  Look:

collect_procs_anon:
        for_each_process(tsk) {
                struct task_struct *t = task_early_kill(tsk, force_early);
                        add_to_kill_anon_file(t, page, vma, to_kill);

add_to_kill_anon_file:
        __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);

__add_to_kill:
        get_task_struct(tsk);

static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller already has a
 * reference on the object.
 *
 * Will WARN if the refcount is 0, as this represents a possible use-after-free
 * condition.
 */

I don't see anything that prevents that refcount_inc from seeing a zero
refcount.  Usually that would be prevented by tasklist_lock, right?

Andrew, I think this patch is bad and needs to be dropped.


* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-22 12:08     ` Matthew Wilcox
@ 2023-08-25  6:02       ` Naoya Horiguchi
  2023-08-26  1:46         ` Tong Tiangen
  0 siblings, 1 reply; 8+ messages in thread
From: Naoya Horiguchi @ 2023-08-25  6:02 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Tong Tiangen, Andrew Morton, Naoya Horiguchi, Miaohe Lin,
	wangkefeng.wang, linux-mm, linux-kernel

On Tue, Aug 22, 2023 at 01:08:52PM +0100, Matthew Wilcox wrote:
> On Tue, Aug 22, 2023 at 11:41:41AM +0800, Tong Tiangen wrote:
> > On 2023/8/22 2:33, Matthew Wilcox wrote:
> > > On Mon, Aug 21, 2023 at 05:13:12PM +0800, Tong Tiangen wrote:
> > > > We can see that CPU1 waiting for CPU0 respond IPI,CPU0 waiting for CPU2
> > > > unlock tasklist_lock, CPU2 waiting for CPU1 unlock page->ptl. As a result,
> > > > softlockup is triggered.
> > > > 
> > > > For collect_procs_anon(), we will not modify the tasklist, but only perform
> > > > read traversal. Therefore, we can use rcu lock instead of spin lock
> > > > tasklist_lock, from this, we can break the softlock chain above.
> > > 
> > > The only thing that's giving me pause is that there's no discussion
> > > about why this is safe.  "We're not modifying it" isn't really enough
> > > to justify going from read_lock() to rcu_read_lock().  When you take a
> > > normal read_lock(), writers are not permitted and so you see an atomic
> > > snapshot of the list.  With rcu_read_lock() you can see inconsistencies.
> > 
> > Hi Matthew:
> > 
> > When rcu_read_lock() is used, the task list can be modified during the
> > iteration, but cannot be seen during iteration. After the iteration is
> > complete, the task list can be updated in the RCU mechanism. Therefore, the
> > task list used by iteration can also be considered as a snapshot.
> 
> No, that's not true!  You are not iterating a snapshot of the list,
> you're iterating the live list.  It will change under you.  RCU provides
> you with some guarantees about that list.  See Documentation/RCU/listRCU.rst
> 
> > > For example, if new tasks can be added to the tasklist, they may not
> > > be seen by an iteration.  Is this OK?
> > 
> > The newly added tasks does not access the HWPoison page, because the
> > HWPoison page has been isolated from the
> > buddy(memory_failure()->take_page_off_buddy()). Therefore, it is safe to see
> > the newly added task during the iteration and not be seen by iteration.
> > 
> > > Tasks may be removed from the
> > > tasklist after they have been seen by the iteration.  Is this OK?
> > 
> > Task be seen during iteration are deleted from the task list after
> > iteration, it's task_struct is not released because reference counting is
> > added in __add_to_kill(). Therefore, the subsequent processing of
> > kill_procs() is not affected (sending signals to the task deleted from task
> > list). so i think it's safe too.
> 
> I don't know this code, but it seems unsafe to me.  Look:
> 
> collect_procs_anon:
>         for_each_process(tsk) {
>                 struct task_struct *t = task_early_kill(tsk, force_early);
>                         add_to_kill_anon_file(t, page, vma, to_kill);
> 
> add_to_kill_anon_file:
>         __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
> 
> __add_to_kill:
>         get_task_struct(tsk);
> 
> static inline struct task_struct *get_task_struct(struct task_struct *t)
> {
>         refcount_inc(&t->usage);
>         return t;
> }
> 
> /**
>  * refcount_inc - increment a refcount
>  * @r: the refcount to increment
>  *
>  * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
>  *
>  * Provides no memory ordering, it is assumed the caller already has a
>  * reference on the object.
>  *
>  * Will WARN if the refcount is 0, as this represents a possible use-after-free
>  * condition.
>  */
> 
> I don't see anything that prevents that refcount_inc from seeing a zero
> refcount.  Usually that would be prevented by tasklist_lock, right?

This "calling get_task_struct in for_each_process loop with read_rcu_lock"
pattern seems to be used also in mm/oom_kill.c (for example in select_bad_process()),
so this might be more generic problem.
I tried to see how OOM code prevents the issue, but there seems nothing to do that.
oom_kill_process(), which is responsible for terminating the victim process, directly
tries to acquire task_lock(victim), despite *victim could be freed at this point.
If someone knows OOM code is safe for some reason, hwpoison could potentially follow
the approach.
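
For reference, the iteration in select_bad_process() looks roughly like this
(paraphrased and simplified, not the exact mm/oom_kill.c code):

rcu_read_lock();
for_each_process(p)
	if (oom_evaluate_task(p, oc))   /* may take a reference on the chosen task */
		break;
rcu_read_unlock();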

Thanks,
Naoya Horiguchi

> 
> Andrew, I think this patch is bad and needs to be dropped.
> 
> 


* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-25  6:02       ` Naoya Horiguchi
@ 2023-08-26  1:46         ` Tong Tiangen
  2023-08-26 20:28           ` Matthew Wilcox
  0 siblings, 1 reply; 8+ messages in thread
From: Tong Tiangen @ 2023-08-26  1:46 UTC (permalink / raw)
  To: Naoya Horiguchi, Matthew Wilcox, Paul E. McKenney
  Cc: Andrew Morton, Naoya Horiguchi, Miaohe Lin, wangkefeng.wang,
	linux-mm, linux-kernel



On 2023/8/25 14:02, Naoya Horiguchi wrote:
> On Tue, Aug 22, 2023 at 01:08:52PM +0100, Matthew Wilcox wrote:
>> On Tue, Aug 22, 2023 at 11:41:41AM +0800, Tong Tiangen wrote:
>>> On 2023/8/22 2:33, Matthew Wilcox wrote:
>>>> On Mon, Aug 21, 2023 at 05:13:12PM +0800, Tong Tiangen wrote:
>>>>> We can see that CPU1 waiting for CPU0 respond IPI,CPU0 waiting for CPU2
>>>>> unlock tasklist_lock, CPU2 waiting for CPU1 unlock page->ptl. As a result,
>>>>> softlockup is triggered.
>>>>>
>>>>> For collect_procs_anon(), we will not modify the tasklist, but only perform
>>>>> read traversal. Therefore, we can use rcu lock instead of spin lock
>>>>> tasklist_lock, from this, we can break the softlock chain above.
>>>>
>>>> The only thing that's giving me pause is that there's no discussion
>>>> about why this is safe.  "We're not modifying it" isn't really enough
>>>> to justify going from read_lock() to rcu_read_lock().  When you take a
>>>> normal read_lock(), writers are not permitted and so you see an atomic
>>>> snapshot of the list.  With rcu_read_lock() you can see inconsistencies.
>>>
>>> Hi Matthew:
>>>
>>> When rcu_read_lock() is used, the task list can be modified during the
>>> iteration, but cannot be seen during iteration. After the iteration is
>>> complete, the task list can be updated in the RCU mechanism. Therefore, the
>>> task list used by iteration can also be considered as a snapshot.
>>
>> No, that's not true!  You are not iterating a snapshot of the list,
>> you're iterating the live list.  It will change under you.  RCU provides
>> you with some guarantees about that list.  See Documentation/RCU/listRCU.rst
>>
>>>> For example, if new tasks can be added to the tasklist, they may not
>>>> be seen by an iteration.  Is this OK?
>>>
>>> The newly added tasks does not access the HWPoison page, because the
>>> HWPoison page has been isolated from the
>>> buddy(memory_failure()->take_page_off_buddy()). Therefore, it is safe to see
>>> the newly added task during the iteration and not be seen by iteration.
>>>
>>>> Tasks may be removed from the
>>>> tasklist after they have been seen by the iteration.  Is this OK?
>>>
>>> Task be seen during iteration are deleted from the task list after
>>> iteration, it's task_struct is not released because reference counting is
>>> added in __add_to_kill(). Therefore, the subsequent processing of
>>> kill_procs() is not affected (sending signals to the task deleted from task
>>> list). so i think it's safe too.
>>
>> I don't know this code, but it seems unsafe to me.  Look:
>>
>> collect_procs_anon:
>>          for_each_process(tsk) {
>>                  struct task_struct *t = task_early_kill(tsk, force_early);
>>                          add_to_kill_anon_file(t, page, vma, to_kill);
>>
>> add_to_kill_anon_file:
>>          __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
>>
>> __add_to_kill:
>>          get_task_struct(tsk);
>>
>> static inline struct task_struct *get_task_struct(struct task_struct *t)
>> {
>>          refcount_inc(&t->usage);
>>          return t;
>> }
>>
>> /**
>>   * refcount_inc - increment a refcount
>>   * @r: the refcount to increment
>>   *
>>   * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
>>   *
>>   * Provides no memory ordering, it is assumed the caller already has a
>>   * reference on the object.
>>   *
>>   * Will WARN if the refcount is 0, as this represents a possible use-after-free
>>   * condition.
>>   */
>>
>> I don't see anything that prevents that refcount_inc from seeing a zero
>> refcount.  Usually that would be prevented by tasklist_lock, right?
> 
> This "calling get_task_struct in for_each_process loop with read_rcu_lock"
> pattern seems to be used also in mm/oom_kill.c (for example in select_bad_process()),
> so this might be more generic problem.

Task list iteration is described in Documentation/RCU/listRCU.rst; part
of the description reads:

" the ``task_struct`` object is freed only after one or more
grace periods elapse, with the help of call_rcu(), which is invoked via
put_task_struct_rcu_user(). "

Combined with the code, when a task exits:

release_task()
	__exit_signal()
		__unhash_process()
			list_del_rcu(&p->tasks)

	put_task_struct_rcu_user()
		call_rcu(&task->rcu, delayed_put_task_struct)

delayed_put_task_struct()
	put_task_struct()
		if (refcount_sub_and_test(nr, &t->usage))
			__put_task_struct()
				free_task()

The code is consistent with the description in the document.

Based on this understanding, I think for_each_process() under the
protection of the RCU read lock is safe: a task_struct still reachable
from the list is not freed before a grace period elapses, so
get_task_struct() on it is also safe.
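
In other words, under this understanding the reader side is safe as follows
(a minimal sketch; remember_for_kill() is just a placeholder for
add_to_kill_anon_file()):

rcu_read_lock();
for_each_process(tsk) {
	/*
	 * tsk cannot be freed here: even if it is concurrently unhashed by
	 * release_task(), the final put only runs from call_rcu() after
	 * this read-side critical section ends.
	 */
	get_task_struct(tsk);
	remember_for_kill(tsk);
}
rcu_read_unlock();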

Maybe Paul can offer some answers :)

Thanks,
Tong.

> I tried to see how OOM code prevents the issue, but there seems nothing to do that.
> oom_kill_process(), which is responsible for terminating the victim process, directly
> tries to acquire task_lock(victim), despite *victim could be freed at this point.
> If someone knows OOM code is safe for some reason, hwpoison could potentially follow
> the approach.
> 
> Thanks,
> Naoya Horiguchi
> 
>>
>> Andrew, I think this patch is bad and needs to be dropped.
>>
>>
> .


* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-26  1:46         ` Tong Tiangen
@ 2023-08-26 20:28           ` Matthew Wilcox
  2023-08-28  2:36             ` Tong Tiangen
  0 siblings, 1 reply; 8+ messages in thread
From: Matthew Wilcox @ 2023-08-26 20:28 UTC (permalink / raw)
  To: Tong Tiangen
  Cc: Naoya Horiguchi, Paul E. McKenney, Andrew Morton,
	Naoya Horiguchi, Miaohe Lin, wangkefeng.wang, linux-mm,
	linux-kernel

On Sat, Aug 26, 2023 at 09:46:53AM +0800, Tong Tiangen wrote:
> " the ``task_struct`` object is freed only after one or more
> grace periods elapse, with the help of call_rcu(), which is invoked via
> put_task_struct_rcu_user(). "
> 
> Combined with the code,when the task exits:
> 
> release_task()
> 	__exit_signal()
> 		__unhash_process()
> 			list_del_rcu(&p->tasks)
> 	
> 	put_task_struct_rcu_user()
> 		call_rcu(&task->rcu, delayed_put_task_struct);
> 			
> delayed_put_task_struct()
> 	put_task_struct()
> 		if (refcount_sub_and_test(nr, &t->usage))
> 			__put_task_struct()
> 				free_task()
> 	
> The code is consistent with the description in the document.
> 
> According to this understanding, i think for_each_process() under the
> protection of rcu locl is safe, that is, task_struct in the list will not be
> destroyed, and get_task_struct() is also safe.

Aha!  This is different from the usual pattern.  What I'm used to seeing
is:

if (refcount_sub_and_test()) {
	list_del_rcu();
	rcu_free();
}

and then on the read side you need a refcount_inc_not_zero(), which we
didn't have here.  Given this new information you've found, I withdraw
my objection.  It'd be nice to include some of this analysis in an
updated changelog (and maybe improved documentation for tasklist?).
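
Spelling out the difference (a rough sketch, not real kernel code; obj and
its list are made up):

/*
 * Usual pattern: the object may be freed as soon as its refcount hits
 * zero, so an RCU reader has to use refcount_inc_not_zero().
 */
rcu_read_lock();
list_for_each_entry_rcu(obj, &obj_list, node) {
	if (!refcount_inc_not_zero(&obj->ref))
		continue;	/* lost the race with the final put */
	/* ... use obj, then drop the reference ... */
}
rcu_read_unlock();

/*
 * task_struct pattern: release_task() does list_del_rcu() first and only
 * then schedules the final put via call_rcu(), so a task still reachable
 * from the tasklist inside rcu_read_lock() has a non-zero ->usage and a
 * plain get_task_struct() is enough.
 */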


* Re: [PATCH v2] mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()
  2023-08-26 20:28           ` Matthew Wilcox
@ 2023-08-28  2:36             ` Tong Tiangen
  0 siblings, 0 replies; 8+ messages in thread
From: Tong Tiangen @ 2023-08-28  2:36 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Naoya Horiguchi, Paul E. McKenney, Andrew Morton,
	Naoya Horiguchi, Miaohe Lin, wangkefeng.wang, linux-mm,
	linux-kernel



On 2023/8/27 4:28, Matthew Wilcox wrote:
> On Sat, Aug 26, 2023 at 09:46:53AM +0800, Tong Tiangen wrote:
>> " the ``task_struct`` object is freed only after one or more
>> grace periods elapse, with the help of call_rcu(), which is invoked via
>> put_task_struct_rcu_user(). "
>>
>> Combined with the code,when the task exits:
>>
>> release_task()
>> 	__exit_signal()
>> 		__unhash_process()
>> 			list_del_rcu(&p->tasks)
>> 	
>> 	put_task_struct_rcu_user()
>> 		call_rcu(&task->rcu, delayed_put_task_struct);
>> 			
>> delayed_put_task_struct()
>> 	put_task_struct()
>> 		if (refcount_sub_and_test(nr, &t->usage))
>> 			__put_task_struct()
>> 				free_task()
>> 	
>> The code is consistent with the description in the document.
>>
>> According to this understanding, i think for_each_process() under the
>> protection of rcu locl is safe, that is, task_struct in the list will not be
>> destroyed, and get_task_struct() is also safe.
> 
> Aha!  This is different from the usual pattern.  What I'm used to seeing
> is:
> 
> if (refcount_sub_and_test()) {
> 	list_del_rcu();
> 	rcu_free();
> }
> 
> and then on the read side you need a refcount_inc_not_zero(), which we
> didn't have here.  Given this new information you've found, I withdraw
> my objection.  It'd be nice to include some of this analysis in an
> updated changelog (and maybe improved documentation for tasklist?).

OK, the commit message and changelog have been updated, and a new version
of the patch (v3) has been sent.

Thanks,
Tong.

> 
> .

