linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
@ 2019-12-11 19:46 Waiman Long
  2019-12-11 22:04 ` Mike Kravetz
  2019-12-11 23:05 ` Andi Kleen
  0 siblings, 2 replies; 27+ messages in thread
From: Waiman Long @ 2019-12-11 19:46 UTC (permalink / raw)
  To: Mike Kravetz, Andrew Morton
  Cc: linux-kernel, linux-mm, Matthew Wilcox, Waiman Long

The following lockdep splat was observed when a certain hugetlbfs test
was run:

[  612.388273] ================================
[  612.411273] WARNING: inconsistent lock state
[  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
[  612.469273] --------------------------------
[  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
[  612.576273] {SOFTIRQ-ON-W} state was registered at:
[  612.598273]   lock_acquire+0x14f/0x3b0
[  612.616273]   _raw_spin_lock+0x30/0x70
[  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
[  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
[  612.681273]   proc_sys_call_handler+0x37f/0x450
[  612.703273]   vfs_write+0x157/0x460
[  612.719273]   ksys_write+0xb8/0x170
[  612.736273]   do_syscall_64+0xa5/0x4d0
[  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[  612.777273] irq event stamp: 691296
[  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
[  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
[  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
[  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
[  612.962273]
[  612.962273] other info that might help us debug this:
[  612.993273]  Possible unsafe locking scenario:
[  612.993273]
[  613.020273]        CPU0
[  613.031273]        ----
[  613.042273]   lock(hugetlb_lock);
[  613.057273]   <Interrupt>
[  613.069273]     lock(hugetlb_lock);
[  613.085273]
[  613.085273]  *** DEADLOCK ***
      :
[  613.245273] Call Trace:
[  613.256273]  <IRQ>
[  613.265273]  dump_stack+0x9a/0xf0
[  613.281273]  mark_lock+0xd0c/0x12f0
[  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
[  613.322273]  ? sched_clock_cpu+0x18/0x1e0
[  613.341273]  __lock_acquire+0x146b/0x48c0
[  613.360273]  ? trace_hardirqs_on+0x10/0x10
[  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
[  613.401273]  lock_acquire+0x14f/0x3b0
[  613.419273]  ? free_huge_page+0x36f/0xaa0
[  613.440273]  _raw_spin_lock+0x30/0x70
[  613.458273]  ? free_huge_page+0x36f/0xaa0
[  613.477273]  free_huge_page+0x36f/0xaa0
[  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
[  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
[  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
[  613.558273]  ? bio_endio+0x4ba/0x930
[  613.575273]  ? blk_account_io_completion+0x400/0x530
[  613.598273]  blk_update_request+0x276/0xe50
[  613.617273]  scsi_end_request+0x7b/0x6a0
[  613.636273]  ? lock_downgrade+0x6f0/0x6f0
[  613.654273]  scsi_io_completion+0x1c6/0x1570
[  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
[  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
[  613.718273]  blk_done_softirq+0x22e/0x350
[  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
[  613.758273]  __do_softirq+0x23d/0xad8
[  613.776273]  irq_exit+0x23e/0x2b0
[  613.792273]  do_IRQ+0x11a/0x200
[  613.806273]  common_interrupt+0xf/0xf
[  613.823273]  </IRQ>

Since hugetlb_lock can be taken from both process and softIRQ contexts,
we need to prevent deadlock from nested lock acquisition by disabling
softIRQs with spin_lock_bh() when taking the lock.

Currently, only free_huge_page() is known to be called from softIRQ
context.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 mm/hugetlb.c | 100 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 45 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..5426f59683d9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -59,7 +59,9 @@ static bool __initdata parsed_valid_hugepagesz = true;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
- * free_huge_pages, and surplus_huge_pages.
+ * free_huge_pages, and surplus_huge_pages. These protected data can be
+ * accessed from both process and softIRQ contexts. Therefore, the spinlock
+ * needs to be acquired with softIRQ disabled to prevent nested locking.
  */
 DEFINE_SPINLOCK(hugetlb_lock);
 
@@ -1175,7 +1177,7 @@ void free_huge_page(struct page *page)
 			restore_reserve = true;
 	}
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	clear_page_huge_active(page);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
 				     pages_per_huge_page(h), page);
@@ -1196,18 +1198,18 @@ void free_huge_page(struct page *page)
 		arch_clear_hugepage_flags(page);
 		enqueue_huge_page(h, page);
 	}
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1438,7 +1440,7 @@ int dissolve_free_huge_page(struct page *page)
 	if (!PageHuge(page))
 		return 0;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	if (!PageHuge(page)) {
 		rc = 0;
 		goto out;
@@ -1466,7 +1468,7 @@ int dissolve_free_huge_page(struct page *page)
 		rc = 0;
 	}
 out:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 	return rc;
 }
 
@@ -1508,16 +1510,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	if (hstate_is_gigantic(h))
 		return NULL;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
 		goto out_unlock;
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
 	if (!page)
 		return NULL;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	/*
 	 * We could have raced with the pool size change.
 	 * Double check that and simply deallocate the new page
@@ -1527,7 +1529,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	 */
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 		SetPageHugeTemporary(page);
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_bh(&hugetlb_lock);
 		put_page(page);
 		return NULL;
 	} else {
@@ -1536,7 +1538,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	}
 
 out_unlock:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	return page;
 }
@@ -1591,10 +1593,10 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
 	if (nid != NUMA_NO_NODE)
 		gfp_mask |= __GFP_THISNODE;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0)
 		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	if (!page)
 		page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
@@ -1608,17 +1610,17 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h);
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0) {
 		struct page *page;
 
 		page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
 		if (page) {
-			spin_unlock(&hugetlb_lock);
+			spin_unlock_bh(&hugetlb_lock);
 			return page;
 		}
 	}
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
@@ -1664,7 +1666,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 
 	ret = -ENOMEM;
 retry:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
 		page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
 				NUMA_NO_NODE, NULL);
@@ -1681,7 +1683,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
 	 * because either resv_huge_pages or free_huge_pages may have changed.
 	 */
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	needed = (h->resv_huge_pages + delta) -
 			(h->free_huge_pages + allocated);
 	if (needed > 0) {
@@ -1719,12 +1721,12 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 		enqueue_huge_page(h, page);
 	}
 free:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	/* Free unnecessary surplus pages to the buddy allocator */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
 		put_page(page);
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 
 	return ret;
 }
@@ -1738,7 +1740,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
  *    the reservation.  As many as unused_resv_pages may be freed.
  *
  * Called with hugetlb_lock held.  However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
+ * reacquired) around calls to cond_resched().  Whenever dropping the lock,
  * we must make sure nobody else can claim pages we are in the process of
  * freeing.  Do this by ensuring resv_huge_page always is greater than the
  * number of huge pages we plan to free when dropping the lock.
@@ -1775,7 +1777,11 @@ static void return_unused_surplus_pages(struct hstate *h,
 		unused_resv_pages--;
 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
 			goto out;
-		cond_resched_lock(&hugetlb_lock);
+		if (need_resched()) {
+			spin_unlock_bh(&hugetlb_lock);
+			cond_resched();
+			spin_lock_bh(&hugetlb_lock);
+		}
 	}
 
 out:
@@ -1994,7 +2000,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (ret)
 		goto out_subpool_put;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	/*
 	 * glb_chg is passed to indicate whether or not a page must be taken
 	 * from the global free pool (global change).  gbl_chg == 0 indicates
@@ -2002,7 +2008,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
 	if (!page) {
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_bh(&hugetlb_lock);
 		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
 		if (!page)
 			goto out_uncharge_cgroup;
@@ -2010,12 +2016,12 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 			SetPagePrivate(page);
 			h->resv_huge_pages--;
 		}
-		spin_lock(&hugetlb_lock);
+		spin_lock_bh(&hugetlb_lock);
 		list_move(&page->lru, &h->hugepage_activelist);
 		/* Fall through */
 	}
 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	set_page_private(page, (unsigned long)spool);
 
@@ -2269,7 +2275,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	else
 		return -ENOMEM;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 
 	/*
 	 * Check for a node specific request.
@@ -2300,7 +2306,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	 */
 	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
 		if (count > persistent_huge_pages(h)) {
-			spin_unlock(&hugetlb_lock);
+			spin_unlock_bh(&hugetlb_lock);
 			NODEMASK_FREE(node_alloc_noretry);
 			return -EINVAL;
 		}
@@ -2329,14 +2335,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 		 * page, free_huge_page will handle it by freeing the page
 		 * and reducing the surplus.
 		 */
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_bh(&hugetlb_lock);
 
 		/* yield cpu to avoid soft lockup */
 		cond_resched();
 
 		ret = alloc_pool_huge_page(h, nodes_allowed,
 						node_alloc_noretry);
-		spin_lock(&hugetlb_lock);
+		spin_lock_bh(&hugetlb_lock);
 		if (!ret)
 			goto out;
 
@@ -2366,7 +2372,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	while (min_count < persistent_huge_pages(h)) {
 		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
-		cond_resched_lock(&hugetlb_lock);
+		if (need_resched()) {
+			spin_unlock_bh(&hugetlb_lock);
+			cond_resched();
+			spin_lock_bh(&hugetlb_lock);
+		}
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2374,7 +2384,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	}
 out:
 	h->max_huge_pages = persistent_huge_pages(h);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	NODEMASK_FREE(node_alloc_noretry);
 
@@ -2528,9 +2538,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 	if (err)
 		return err;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	h->nr_overcommit_huge_pages = input;
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 
 	return count;
 }
@@ -2978,9 +2988,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 		goto out;
 
 	if (write) {
-		spin_lock(&hugetlb_lock);
+		spin_lock_bh(&hugetlb_lock);
 		h->nr_overcommit_huge_pages = tmp;
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_bh(&hugetlb_lock);
 	}
 out:
 	return ret;
@@ -3071,7 +3081,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
 {
 	int ret = -ENOMEM;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	/*
 	 * When cpuset is configured, it breaks the strict hugetlb page
 	 * reservation as the accounting is done on a global variable. Such
@@ -3104,7 +3114,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
 		return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 	return ret;
 }
 
@@ -4970,7 +4980,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 	bool ret = true;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
 		ret = false;
 		goto unlock;
@@ -4978,17 +4988,17 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
 	clear_page_huge_active(page);
 	list_move_tail(&page->lru, list);
 unlock:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 	return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	spin_lock(&hugetlb_lock);
+	spin_lock_bh(&hugetlb_lock);
 	set_page_huge_active(page);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_bh(&hugetlb_lock);
 	put_page(page);
 }
 
@@ -5016,11 +5026,11 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 		SetPageHugeTemporary(oldpage);
 		ClearPageHugeTemporary(newpage);
 
-		spin_lock(&hugetlb_lock);
+		spin_lock_bh(&hugetlb_lock);
 		if (h->surplus_huge_pages_node[old_nid]) {
 			h->surplus_huge_pages_node[old_nid]--;
 			h->surplus_huge_pages_node[new_nid]++;
 		}
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_bh(&hugetlb_lock);
 	}
 }
-- 
2.18.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-11 19:46 [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Waiman Long
@ 2019-12-11 22:04 ` Mike Kravetz
  2019-12-11 22:19   ` Waiman Long
  2019-12-11 23:05 ` Andi Kleen
  1 sibling, 1 reply; 27+ messages in thread
From: Mike Kravetz @ 2019-12-11 22:04 UTC (permalink / raw)
  To: Waiman Long, Matthew Wilcox
  Cc: Andrew Morton, linux-kernel, linux-mm, Michal Hocko

Cc: Michal

Sorry for the late reply on this effort.

On 12/11/19 11:46 AM, Waiman Long wrote:
> The following lockdep splat was observed when a certain hugetlbfs test
> was run:
> 
> [  612.388273] ================================
> [  612.411273] WARNING: inconsistent lock state
> [  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
> [  612.469273] --------------------------------
> [  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
> [  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
> [  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
> [  612.576273] {SOFTIRQ-ON-W} state was registered at:
> [  612.598273]   lock_acquire+0x14f/0x3b0
> [  612.616273]   _raw_spin_lock+0x30/0x70
> [  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
> [  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
> [  612.681273]   proc_sys_call_handler+0x37f/0x450
> [  612.703273]   vfs_write+0x157/0x460
> [  612.719273]   ksys_write+0xb8/0x170
> [  612.736273]   do_syscall_64+0xa5/0x4d0
> [  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
> [  612.777273] irq event stamp: 691296
> [  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
> [  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
> [  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
> [  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
> [  612.962273]
> [  612.962273] other info that might help us debug this:
> [  612.993273]  Possible unsafe locking scenario:
> [  612.993273]
> [  613.020273]        CPU0
> [  613.031273]        ----
> [  613.042273]   lock(hugetlb_lock);
> [  613.057273]   <Interrupt>
> [  613.069273]     lock(hugetlb_lock);
> [  613.085273]
> [  613.085273]  *** DEADLOCK ***
>       :
> [  613.245273] Call Trace:
> [  613.256273]  <IRQ>
> [  613.265273]  dump_stack+0x9a/0xf0
> [  613.281273]  mark_lock+0xd0c/0x12f0
> [  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
> [  613.322273]  ? sched_clock_cpu+0x18/0x1e0
> [  613.341273]  __lock_acquire+0x146b/0x48c0
> [  613.360273]  ? trace_hardirqs_on+0x10/0x10
> [  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
> [  613.401273]  lock_acquire+0x14f/0x3b0
> [  613.419273]  ? free_huge_page+0x36f/0xaa0
> [  613.440273]  _raw_spin_lock+0x30/0x70
> [  613.458273]  ? free_huge_page+0x36f/0xaa0
> [  613.477273]  free_huge_page+0x36f/0xaa0
> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
> [  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
> [  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
> [  613.558273]  ? bio_endio+0x4ba/0x930
> [  613.575273]  ? blk_account_io_completion+0x400/0x530
> [  613.598273]  blk_update_request+0x276/0xe50
> [  613.617273]  scsi_end_request+0x7b/0x6a0
> [  613.636273]  ? lock_downgrade+0x6f0/0x6f0
> [  613.654273]  scsi_io_completion+0x1c6/0x1570
> [  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
> [  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
> [  613.718273]  blk_done_softirq+0x22e/0x350
> [  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
> [  613.758273]  __do_softirq+0x23d/0xad8
> [  613.776273]  irq_exit+0x23e/0x2b0
> [  613.792273]  do_IRQ+0x11a/0x200
> [  613.806273]  common_interrupt+0xf/0xf
> [  613.823273]  </IRQ>

This is interesting.  I'm trying to wrap my head around how we ended up
with a BIO pointing to a hugetlbfs page.  My 'guess' is that user space
code passed an address to some system call or driver.  And, that system
call or driver set up the IO.  For the purpose of addressing this issue,
it does not matter.  I am just a little confused/curious.

> Since hugetlb_lock can be taken from both process and softIRQ contexts,
> we need to protect the lock from nested locking by disabling softIRQ
> using spin_lock_bh() before taking it.
> 
> Currently, only free_huge_page() is known to be called from softIRQ
> context.

We discussed this exact same issue more than a year ago.  See,
https://lkml.org/lkml/2018/9/5/398

At that time, the only 'known' caller of put_page for a hugetlbfs page from
softirq context was in powerpc specific code.  IIRC, Aneesh addressed the
issue last year by modifying the powerpc specific code.  The more general
issue in the hugetlbfs code was never addressed. :(

As part of the discussion in the previous e-mail thread, the issue of
whether we should address put_page for hugetlbfs pages for only softirq
or extend to hardirq context was discussed.  The conclusion (or at least
suggestion from Andrew and Michal) was that we should modify code to allow
for calls from hardirq context.  The reasoning IIRC, was that put_page of
other pages was allowed from hardirq context, so hugetlbfs pages should be
no different.

Matthew, do you think that reasoning from last year is still valid?  Should
we be targeting soft or hard irq calls?

One other thing.  free_huge_page may also take a subpool specific lock via
spin_lock().  See hugepage_subpool_put_pages.  This would also need to take
irq context into account.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-11 22:04 ` Mike Kravetz
@ 2019-12-11 22:19   ` Waiman Long
  2019-12-12  1:11     ` Mike Kravetz
  0 siblings, 1 reply; 27+ messages in thread
From: Waiman Long @ 2019-12-11 22:19 UTC (permalink / raw)
  To: Mike Kravetz, Matthew Wilcox
  Cc: Andrew Morton, linux-kernel, linux-mm, Michal Hocko

On 12/11/19 5:04 PM, Mike Kravetz wrote:
> Cc: Michal
>
> Sorry for the late reply on this effort.
>
> On 12/11/19 11:46 AM, Waiman Long wrote:
>> The following lockdep splat was observed when a certain hugetlbfs test
>> was run:
>>
>> [  612.388273] ================================
>> [  612.411273] WARNING: inconsistent lock state
>> [  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
>> [  612.469273] --------------------------------
>> [  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
>> [  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
>> [  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
>> [  612.576273] {SOFTIRQ-ON-W} state was registered at:
>> [  612.598273]   lock_acquire+0x14f/0x3b0
>> [  612.616273]   _raw_spin_lock+0x30/0x70
>> [  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
>> [  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
>> [  612.681273]   proc_sys_call_handler+0x37f/0x450
>> [  612.703273]   vfs_write+0x157/0x460
>> [  612.719273]   ksys_write+0xb8/0x170
>> [  612.736273]   do_syscall_64+0xa5/0x4d0
>> [  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
>> [  612.777273] irq event stamp: 691296
>> [  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
>> [  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
>> [  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
>> [  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
>> [  612.962273]
>> [  612.962273] other info that might help us debug this:
>> [  612.993273]  Possible unsafe locking scenario:
>> [  612.993273]
>> [  613.020273]        CPU0
>> [  613.031273]        ----
>> [  613.042273]   lock(hugetlb_lock);
>> [  613.057273]   <Interrupt>
>> [  613.069273]     lock(hugetlb_lock);
>> [  613.085273]
>> [  613.085273]  *** DEADLOCK ***
>>       :
>> [  613.245273] Call Trace:
>> [  613.256273]  <IRQ>
>> [  613.265273]  dump_stack+0x9a/0xf0
>> [  613.281273]  mark_lock+0xd0c/0x12f0
>> [  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
>> [  613.322273]  ? sched_clock_cpu+0x18/0x1e0
>> [  613.341273]  __lock_acquire+0x146b/0x48c0
>> [  613.360273]  ? trace_hardirqs_on+0x10/0x10
>> [  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
>> [  613.401273]  lock_acquire+0x14f/0x3b0
>> [  613.419273]  ? free_huge_page+0x36f/0xaa0
>> [  613.440273]  _raw_spin_lock+0x30/0x70
>> [  613.458273]  ? free_huge_page+0x36f/0xaa0
>> [  613.477273]  free_huge_page+0x36f/0xaa0
>> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
>> [  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
>> [  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
>> [  613.558273]  ? bio_endio+0x4ba/0x930
>> [  613.575273]  ? blk_account_io_completion+0x400/0x530
>> [  613.598273]  blk_update_request+0x276/0xe50
>> [  613.617273]  scsi_end_request+0x7b/0x6a0
>> [  613.636273]  ? lock_downgrade+0x6f0/0x6f0
>> [  613.654273]  scsi_io_completion+0x1c6/0x1570
>> [  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
>> [  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
>> [  613.718273]  blk_done_softirq+0x22e/0x350
>> [  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
>> [  613.758273]  __do_softirq+0x23d/0xad8
>> [  613.776273]  irq_exit+0x23e/0x2b0
>> [  613.792273]  do_IRQ+0x11a/0x200
>> [  613.806273]  common_interrupt+0xf/0xf
>> [  613.823273]  </IRQ>
> This is interesting.  I'm trying to wrap my head around how we ended up
> with a BIO pointing to a hugetlbfs page.  My 'guess' is that user space
> code passed an address to some system call or driver.  And, that system
> call or driver set up the IO.  For the purpose of addressing this issue,
> it does not matter.  I am just a little confused/curious.
>
>> Since hugetlb_lock can be taken from both process and softIRQ contexts,
>> we need to protect the lock from nested locking by disabling softIRQ
>> using spin_lock_bh() before taking it.
>>
>> Currently, only free_huge_page() is known to be called from softIRQ
>> context.
> We discussed this exact same issue more than a year ago.  See,
> https://lkml.org/lkml/2018/9/5/398
>
> At that time, the only 'known' caller of put_page for a hugetlbfs page from
> softirq context was in powerpc specific code.  IIRC, Aneesh addressed the
> issue last year by modifying the powerpc specific code.  The more general
> issue in the hugetlbfs code was never addressed. :(
>
> As part of the discussion in the previous e-mail thread, the issue of
> whether we should address put_page for hugetlbfs pages for only softirq
> or extend to hardirq context was discussed.  The conclusion (or at least
> suggestion from Andrew and Michal) was that we should modify code to allow
> for calls from hardirq context.  The reasoning IIRC, was that put_page of
> other pages was allowed from hardirq context, so hugetlbfs pages should be
> no different.
>
> Matthew, do you think that reasoning from last year is still valid?  Should
> we be targeting soft or hard irq calls?
>
> One other thing.  free_huge_page may also take a subpool specific lock via
> spin_lock().  See hugepage_subpool_put_pages.  This would also need to take
> irq context into account.

Thanks for the background information.

We will need to use spin_lock_irq() or spin_lock_irqsave() for allowing
hardirq context calls like what is in the v1 patch. I will look further
into the subpool specific lock also.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-11 19:46 [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Waiman Long
  2019-12-11 22:04 ` Mike Kravetz
@ 2019-12-11 23:05 ` Andi Kleen
  1 sibling, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2019-12-11 23:05 UTC (permalink / raw)
  To: Waiman Long
  Cc: Mike Kravetz, Andrew Morton, linux-kernel, linux-mm, Matthew Wilcox

Waiman Long <longman@redhat.com> writes:
>
> Currently, only free_huge_page() is known to be called from softIRQ
> context.

Called from a timer?

Perhaps just move that to a work queue instead.

That seems preferable than to increase softirq latency.

-Andi

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-11 22:19   ` Waiman Long
@ 2019-12-12  1:11     ` Mike Kravetz
  2019-12-12  6:06       ` Davidlohr Bueso
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Kravetz @ 2019-12-12  1:11 UTC (permalink / raw)
  To: Waiman Long, Matthew Wilcox
  Cc: Andrew Morton, linux-kernel, linux-mm, Michal Hocko

On 12/11/19 2:19 PM, Waiman Long wrote:
> On 12/11/19 5:04 PM, Mike Kravetz wrote:
>> Cc: Michal
>>
>> Sorry for the late reply on this effort.
>>
>> On 12/11/19 11:46 AM, Waiman Long wrote:
>>> The following lockdep splat was observed when a certain hugetlbfs test
>>> was run:
>>>
>>> [  612.388273] ================================
>>> [  612.411273] WARNING: inconsistent lock state
>>> [  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
>>> [  612.469273] --------------------------------
>>> [  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
>>> [  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
>>> [  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
>>> [  612.576273] {SOFTIRQ-ON-W} state was registered at:
>>> [  612.598273]   lock_acquire+0x14f/0x3b0
>>> [  612.616273]   _raw_spin_lock+0x30/0x70
>>> [  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
>>> [  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
>>> [  612.681273]   proc_sys_call_handler+0x37f/0x450
>>> [  612.703273]   vfs_write+0x157/0x460
>>> [  612.719273]   ksys_write+0xb8/0x170
>>> [  612.736273]   do_syscall_64+0xa5/0x4d0
>>> [  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
>>> [  612.777273] irq event stamp: 691296
>>> [  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
>>> [  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
>>> [  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
>>> [  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
>>> [  612.962273]
>>> [  612.962273] other info that might help us debug this:
>>> [  612.993273]  Possible unsafe locking scenario:
>>> [  612.993273]
>>> [  613.020273]        CPU0
>>> [  613.031273]        ----
>>> [  613.042273]   lock(hugetlb_lock);
>>> [  613.057273]   <Interrupt>
>>> [  613.069273]     lock(hugetlb_lock);
>>> [  613.085273]
>>> [  613.085273]  *** DEADLOCK ***
>>>       :
>>> [  613.245273] Call Trace:
>>> [  613.256273]  <IRQ>
>>> [  613.265273]  dump_stack+0x9a/0xf0
>>> [  613.281273]  mark_lock+0xd0c/0x12f0
>>> [  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
>>> [  613.322273]  ? sched_clock_cpu+0x18/0x1e0
>>> [  613.341273]  __lock_acquire+0x146b/0x48c0
>>> [  613.360273]  ? trace_hardirqs_on+0x10/0x10
>>> [  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
>>> [  613.401273]  lock_acquire+0x14f/0x3b0
>>> [  613.419273]  ? free_huge_page+0x36f/0xaa0
>>> [  613.440273]  _raw_spin_lock+0x30/0x70
>>> [  613.458273]  ? free_huge_page+0x36f/0xaa0
>>> [  613.477273]  free_huge_page+0x36f/0xaa0
>>> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
>>> [  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
>>> [  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
>>> [  613.558273]  ? bio_endio+0x4ba/0x930
>>> [  613.575273]  ? blk_account_io_completion+0x400/0x530
>>> [  613.598273]  blk_update_request+0x276/0xe50
>>> [  613.617273]  scsi_end_request+0x7b/0x6a0
>>> [  613.636273]  ? lock_downgrade+0x6f0/0x6f0
>>> [  613.654273]  scsi_io_completion+0x1c6/0x1570
>>> [  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
>>> [  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
>>> [  613.718273]  blk_done_softirq+0x22e/0x350
>>> [  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
>>> [  613.758273]  __do_softirq+0x23d/0xad8
>>> [  613.776273]  irq_exit+0x23e/0x2b0
>>> [  613.792273]  do_IRQ+0x11a/0x200
>>> [  613.806273]  common_interrupt+0xf/0xf
>>> [  613.823273]  </IRQ>
>> This is interesting.  I'm trying to wrap my head around how we ended up
>> with a BIO pointing to a hugetlbfs page.  My 'guess' is that user space
>> code passed an address to some system call or driver.  And, that system
>> call or driver set up the IO.  For the purpose of addressing this issue,
>> it does not matter.  I am just a little confused/curious.
>>
>>> Since hugetlb_lock can be taken from both process and softIRQ contexts,
>>> we need to protect the lock from nested locking by disabling softIRQ
>>> using spin_lock_bh() before taking it.
>>>
>>> Currently, only free_huge_page() is known to be called from softIRQ
>>> context.
>> We discussed this exact same issue more than a year ago.  See,
>> https://lkml.org/lkml/2018/9/5/398
>>
>> At that time, the only 'known' caller of put_page for a hugetlbfs page from
>> softirq context was in powerpc specific code.  IIRC, Aneesh addressed the
>> issue last year by modifying the powerpc specific code.  The more general
>> issue in the hugetlbfs code was never addressed. :(
>>
>> As part of the discussion in the previous e-mail thread, the issue of
>> whether we should address put_page for hugetlbfs pages for only softirq
>> or extend to hardirq context was discussed.  The conclusion (or at least
>> suggestion from Andrew and Michal) was that we should modify code to allow
>> for calls from hardirq context.  The reasoning IIRC, was that put_page of
>> other pages was allowed from hardirq context, so hugetlbfs pages should be
>> no different.
>>
>> Matthew, do you think that reasoning from last year is still valid?  Should
>> we be targeting soft or hard irq calls?
>>
>> One other thing.  free_huge_page may also take a subpool specific lock via
>> spin_lock().  See hugepage_subpool_put_pages.  This would also need to take
>> irq context into account.
> 
> Thanks for the background information.
> 
> We will need to use spin_lock_irq() or spin_lock_irqsave() for allowing
> hardirq context calls like what is in the v1 patch. I will look further
> into the subpool specific lock also.

Sorry,

I did not fully read all of Matthew's comments/suggestions on the original
patch.  His initial suggestion was for a workqueue approach that you did
start implementing, but thought was too complex.  Andi also suggested this
approach.

The workqueue approach would address both soft and hard irq context issues.
As a result, I too think this is the approach we should explore.  Since there
is more than one lock involved, that is also a reason for a workqueue approach.

I'll take a look at initial workqueue implementation.  However, I have not
dealt with workqueues in some time so it may take few days to evaluate.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-12  1:11     ` Mike Kravetz
@ 2019-12-12  6:06       ` Davidlohr Bueso
  2019-12-12  6:30         ` Davidlohr Bueso
  2019-12-12 21:32         ` [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Andi Kleen
  0 siblings, 2 replies; 27+ messages in thread
From: Davidlohr Bueso @ 2019-12-12  6:06 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Waiman Long, Matthew Wilcox, Andrew Morton, linux-kernel,
	linux-mm, Michal Hocko

On Wed, 11 Dec 2019, Mike Kravetz wrote:
>The workqueue approach would address both soft and hard irq context issues.
>As a result, I too think this is the approach we should explore.  Since there
>is more than one lock involved, this also is reason for a work queue approach.
>
>I'll take a look at initial workqueue implementation.  However, I have not
>dealt with workqueues in some time so it may take few days to evaluate.

I'm thinking of something like the following; it at least passes all ltp
hugetlb related testcases.

Thanks,
Davidlohr

----8<------------------------------------------------------------------
[PATCH] mm/hugetlb: defer free_huge_page() to a workqueue

There have been deadlock reports[1, 2] where put_page is called
from softirq context and this causes trouble with the hugetlb_lock,
as well as potentially the subpool lock.

For such an unlikely scenario, let's not add irq dancing overhead
to the lock+unlock operations, which could incur expensive
instruction dependencies, particularly when considering hard-irq
safety. For example PUSHF+POPF on x86.

Instead, just use a workqueue and do the free_huge_page() in regular
task context.

[1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
[2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 mm/hugetlb.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..737108d8d637 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1136,8 +1136,17 @@ static inline void ClearPageHugeTemporary(struct page *page)
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static struct workqueue_struct *hugetlb_free_page_wq;
+struct hugetlb_free_page_work {
+	struct page *page;
+	struct work_struct work;
+};
+
+static void free_huge_page_workfn(struct work_struct *work)
 {
+	struct page *page = container_of(work,
+					 struct hugetlb_free_page_work,
+					 work)->page;
 	/*
 	 * Can't pass hstate in here because it is called from the
 	 * compound page destructor.
@@ -1197,6 +1206,27 @@ void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
+
+}
+
+/*
+ * While unlikely, free_huge_page() can be at least called from
+ * softirq context, defer freeing such that the hugetlb_lock and
+ * spool->lock need not have to deal with irq dances just for this.
+ */
+void free_huge_page(struct page *page)
+{
+	struct hugetlb_free_page_work work;
+
+	work.page = page;
+	INIT_WORK_ONSTACK(&work.work, free_huge_page_workfn);
+	queue_work(hugetlb_free_page_wq, &work.work);
+
+	/*
+	 * Wait until free_huge_page is done.
+	 */
+	flush_work(&work.work);
+	destroy_work_on_stack(&work.work);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -2816,6 +2846,12 @@ static int __init hugetlb_init(void)
 
 	for (i = 0; i < num_fault_mutexes; i++)
 		mutex_init(&hugetlb_fault_mutex_table[i]);
+
+	hugetlb_free_page_wq = alloc_workqueue("hugetlb_free_page_wq",
+					       WQ_MEM_RECLAIM, 0);
+	if (!hugetlb_free_page_wq)
+		return -ENOMEM;
+
 	return 0;
 }
 subsys_initcall(hugetlb_init);
-- 
2.16.4

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-12  6:06       ` Davidlohr Bueso
@ 2019-12-12  6:30         ` Davidlohr Bueso
  2019-12-12 19:04           ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Davidlohr Bueso
  2019-12-12 21:32         ` [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Andi Kleen
  1 sibling, 1 reply; 27+ messages in thread
From: Davidlohr Bueso @ 2019-12-12  6:30 UTC (permalink / raw)
  To: Mike Kravetz, Waiman Long, Matthew Wilcox, Andrew Morton,
	linux-kernel, linux-mm, Michal Hocko

On Wed, 11 Dec 2019, Davidlohr Bueso wrote:
>Instead, just use a workqueue and do the free_huge_page() in regular
>task context.

Hmm so this is done unconditionally, perhaps we want to do this _only_
under irq just to avoid any workqueue overhead in the common case?


      if (unlikely(in_interrupt())) {
	    workqueue free_huge_page
      } else {
	    normal free_huge_page
      }

Thanks,
Davidlohr

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12  6:30         ` Davidlohr Bueso
@ 2019-12-12 19:04           ` Davidlohr Bueso
  2019-12-12 19:22             ` Mike Kravetz
                               ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Davidlohr Bueso @ 2019-12-12 19:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mike Kravetz, Waiman Long, Matthew Wilcox, linux-kernel,
	linux-mm, Michal Hocko, aneesh.kumar

There have been deadlock reports[1, 2] where put_page is called
from softirq context and this causes trouble with the hugetlb_lock,
as well as potentially the subpool lock.

For such an unlikely scenario, let's not add irq dancing overhead
to the lock+unlock operations, which could incur expensive
instruction dependencies, particularly when considering hard-irq
safety. For example PUSHF+POPF on x86.

Instead, just use a workqueue and do the free_huge_page() in regular
task context.

[1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
[2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/

Reported-by: Waiman Long <longman@redhat.com>
Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---

- Changes from v1: Only use wq when in_interrupt(), otherwise business
    as usual. Also include the proper header file.

- While I have not reproduced this issue, the v1 using wq passes all hugetlb
    related tests in ltp.

 mm/hugetlb.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..f28cf601938d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,7 @@
 #include <linux/swapops.h>
 #include <linux/jhash.h>
 #include <linux/numa.h>
+#include <linux/workqueue.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -1136,7 +1137,13 @@ static inline void ClearPageHugeTemporary(struct page *page)
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static struct workqueue_struct *hugetlb_free_page_wq;
+struct hugetlb_free_page_work {
+	struct page *page;
+	struct work_struct work;
+};
+
+static void __free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
@@ -1199,6 +1206,36 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
+static void free_huge_page_workfn(struct work_struct *work)
+{
+	struct page *page;
+
+	page = container_of(work, struct hugetlb_free_page_work, work)->page;
+	__free_huge_page(page);
+}
+
+void free_huge_page(struct page *page)
+{
+	if (unlikely(in_interrupt())) {
+		/*
+		 * While uncommon, free_huge_page() can be at least
+		 * called from softirq context, defer freeing such
+		 * that the hugetlb_lock and spool->lock need not have
+		 * to deal with irq dances just for this.
+		 */
+		struct hugetlb_free_page_work work;
+
+		work.page = page;
+		INIT_WORK_ONSTACK(&work.work, free_huge_page_workfn);
+		queue_work(hugetlb_free_page_wq, &work.work);
+
+		/* wait until the huge page freeing is done */
+		flush_work(&work.work);
+		destroy_work_on_stack(&work.work);
+	} else
+		__free_huge_page(page);
+}
+
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
@@ -2816,6 +2853,12 @@ static int __init hugetlb_init(void)
 
 	for (i = 0; i < num_fault_mutexes; i++)
 		mutex_init(&hugetlb_fault_mutex_table[i]);
+
+	hugetlb_free_page_wq = alloc_workqueue("hugetlb_free_page_wq",
+					       WQ_MEM_RECLAIM, 0);
+	if (!hugetlb_free_page_wq)
+		return -ENOMEM;
+
 	return 0;
 }
 subsys_initcall(hugetlb_init);
--
2.16.4

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 19:04           ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Davidlohr Bueso
@ 2019-12-12 19:22             ` Mike Kravetz
  2019-12-12 19:36               ` Davidlohr Bueso
  2019-12-12 20:52               ` Waiman Long
  2019-12-12 21:01             ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Waiman Long
  2019-12-16 13:37             ` Michal Hocko
  2 siblings, 2 replies; 27+ messages in thread
From: Mike Kravetz @ 2019-12-12 19:22 UTC (permalink / raw)
  To: Andrew Morton, Waiman Long, Matthew Wilcox, linux-kernel,
	linux-mm, Michal Hocko, aneesh.kumar

On 12/12/19 11:04 AM, Davidlohr Bueso wrote:
> There have been deadlock reports[1, 2] where put_page is called
> from softirq context and this causes trouble with the hugetlb_lock,
> as well as potentially the subpool lock.
> 
> For such an unlikely scenario, lets not add irq dancing overhead
> to the lock+unlock operations, which could incur in expensive
> instruction dependencies, particularly when considering hard-irq
> safety. For example PUSHF+POPF on x86.
> 
> Instead, just use a workqueue and do the free_huge_page() in regular
> task context.
> 
> [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
> [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
> 
> Reported-by: Waiman Long <longman@redhat.com>
> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>

Thank you Davidlohr.

The patch does seem fairly simple and straightforward.  I need to brush up
on my workqueue knowledge to provide a full review.

Longman,
Do you have a test to reproduce the issue?  If so, can you try running with
this patch.
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 19:22             ` Mike Kravetz
@ 2019-12-12 19:36               ` Davidlohr Bueso
  2019-12-12 20:52               ` Waiman Long
  1 sibling, 0 replies; 27+ messages in thread
From: Davidlohr Bueso @ 2019-12-12 19:36 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Andrew Morton, Waiman Long, Matthew Wilcox, linux-kernel,
	linux-mm, Michal Hocko, aneesh.kumar

On Thu, 12 Dec 2019, Mike Kravetz wrote:

>The patch does seem fairly simple and straight forward.  I need to brush up
>on my workqueue knowledge to provide a full review.

The flush_work() call is actually bogus as we obviously cannot block in
softirq...

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 19:22             ` Mike Kravetz
  2019-12-12 19:36               ` Davidlohr Bueso
@ 2019-12-12 20:52               ` Waiman Long
  2019-12-12 21:04                 ` Waiman Long
  2019-12-16 13:26                 ` Michal Hocko
  1 sibling, 2 replies; 27+ messages in thread
From: Waiman Long @ 2019-12-12 20:52 UTC (permalink / raw)
  To: Mike Kravetz, Andrew Morton, Matthew Wilcox, linux-kernel,
	linux-mm, Michal Hocko, aneesh.kumar

On 12/12/19 2:22 PM, Mike Kravetz wrote:
> On 12/12/19 11:04 AM, Davidlohr Bueso wrote:
>> There have been deadlock reports[1, 2] where put_page is called
>> from softirq context and this causes trouble with the hugetlb_lock,
>> as well as potentially the subpool lock.
>>
>> For such an unlikely scenario, lets not add irq dancing overhead
>> to the lock+unlock operations, which could incur in expensive
>> instruction dependencies, particularly when considering hard-irq
>> safety. For example PUSHF+POPF on x86.
>>
>> Instead, just use a workqueue and do the free_huge_page() in regular
>> task context.
>>
>> [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
>> [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
>>
>> Reported-by: Waiman Long <longman@redhat.com>
>> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> Thank you Davidlohr.
>
> The patch does seem fairly simple and straight forward.  I need to brush up
> on my workqueue knowledge to provide a full review.
>
> Longman,
> Do you have a test to reproduce the issue?  If so, can you try running with
> this patch.

Yes, I do have a test that can reproduce the issue. I will run it with
the patch and report the status tomorrow.

-Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 19:04           ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Davidlohr Bueso
  2019-12-12 19:22             ` Mike Kravetz
@ 2019-12-12 21:01             ` Waiman Long
  2019-12-16 13:37             ` Michal Hocko
  2 siblings, 0 replies; 27+ messages in thread
From: Waiman Long @ 2019-12-12 21:01 UTC (permalink / raw)
  To: Andrew Morton, Mike Kravetz, Matthew Wilcox, linux-kernel,
	linux-mm, Michal Hocko, aneesh.kumar

On 12/12/19 2:04 PM, Davidlohr Bueso wrote:
> There have been deadlock reports[1, 2] where put_page is called
> from softirq context and this causes trouble with the hugetlb_lock,
> as well as potentially the subpool lock.
>
> For such an unlikely scenario, lets not add irq dancing overhead
> to the lock+unlock operations, which could incur in expensive
> instruction dependencies, particularly when considering hard-irq
> safety. For example PUSHF+POPF on x86.
>
> Instead, just use a workqueue and do the free_huge_page() in regular
> task context.
>
> [1]
> https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
> [2]
> https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
>
> Reported-by: Waiman Long <longman@redhat.com>
> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> ---
>
> - Changes from v1: Only use wq when in_interrupt(), otherwise business
>    as usual. Also include the proper header file.
>
> - While I have not reproduced this issue, the v1 using wq passes all
> hugetlb
>    related tests in ltp.
>
> mm/hugetlb.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ac65bb5e38ac..f28cf601938d 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -27,6 +27,7 @@
> #include <linux/swapops.h>
> #include <linux/jhash.h>
> #include <linux/numa.h>
> +#include <linux/workqueue.h>
>
> #include <asm/page.h>
> #include <asm/pgtable.h>
> @@ -1136,7 +1137,13 @@ static inline void
> ClearPageHugeTemporary(struct page *page)
>     page[2].mapping = NULL;
> }
>
> -void free_huge_page(struct page *page)
> +static struct workqueue_struct *hugetlb_free_page_wq;
> +struct hugetlb_free_page_work {
> +    struct page *page;
> +    struct work_struct work;
> +};
> +
> +static void __free_huge_page(struct page *page)
> {
>     /*
>      * Can't pass hstate in here because it is called from the
> @@ -1199,6 +1206,36 @@ void free_huge_page(struct page *page)
>     spin_unlock(&hugetlb_lock);
> }
>
> +static void free_huge_page_workfn(struct work_struct *work)
> +{
> +    struct page *page;
> +
> +    page = container_of(work, struct hugetlb_free_page_work,
> work)->page;
> +    __free_huge_page(page);
> +}
> +
> +void free_huge_page(struct page *page)
> +{
> +    if (unlikely(in_interrupt())) {

in_interrupt() also include context where softIRQ is disabled. So maybe
!in_task() is a better fit here.


> +        /*
> +         * While uncommon, free_huge_page() can be at least
> +         * called from softirq context, defer freeing such
> +         * that the hugetlb_lock and spool->lock need not have
> +         * to deal with irq dances just for this.
> +         */
> +        struct hugetlb_free_page_work work;
> +
> +        work.page = page;
> +        INIT_WORK_ONSTACK(&work.work, free_huge_page_workfn);
> +        queue_work(hugetlb_free_page_wq, &work.work);
> +
> +        /* wait until the huge page freeing is done */
> +        flush_work(&work.work);
> +        destroy_work_on_stack(&work.work);

The problem I see is that you don't want to wait too long while in the
hardirq context. However, the latency for the work to finish is
indeterminate.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 20:52               ` Waiman Long
@ 2019-12-12 21:04                 ` Waiman Long
  2019-12-16 13:26                 ` Michal Hocko
  1 sibling, 0 replies; 27+ messages in thread
From: Waiman Long @ 2019-12-12 21:04 UTC (permalink / raw)
  To: Mike Kravetz, Andrew Morton, Matthew Wilcox, linux-kernel,
	linux-mm, Michal Hocko, aneesh.kumar

On 12/12/19 3:52 PM, Waiman Long wrote:
> On 12/12/19 2:22 PM, Mike Kravetz wrote:
>> On 12/12/19 11:04 AM, Davidlohr Bueso wrote:
>>> There have been deadlock reports[1, 2] where put_page is called
>>> from softirq context and this causes trouble with the hugetlb_lock,
>>> as well as potentially the subpool lock.
>>>
>>> For such an unlikely scenario, lets not add irq dancing overhead
>>> to the lock+unlock operations, which could incur in expensive
>>> instruction dependencies, particularly when considering hard-irq
>>> safety. For example PUSHF+POPF on x86.
>>>
>>> Instead, just use a workqueue and do the free_huge_page() in regular
>>> task context.
>>>
>>> [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
>>> [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
>>>
>>> Reported-by: Waiman Long <longman@redhat.com>
>>> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
>> Thank you Davidlohr.
>>
>> The patch does seem fairly simple and straight forward.  I need to brush up
>> on my workqueue knowledge to provide a full review.
>>
>> Longman,
>> Do you have a test to reproduce the issue?  If so, can you try running with
>> this patch.
> Yes, I do have a test that can reproduce the issue. I will run it with
> the patch and report the status tomorrow.

I don't think Davidlohr's patch is ready for prime time yet. So I will
wait until a better version is available.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-12  6:06       ` Davidlohr Bueso
  2019-12-12  6:30         ` Davidlohr Bueso
@ 2019-12-12 21:32         ` Andi Kleen
  2019-12-12 22:42           ` Davidlohr Bueso
  1 sibling, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2019-12-12 21:32 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Waiman Long, Matthew Wilcox, Andrew Morton, linux-kernel,
	linux-mm, Michal Hocko

Davidlohr Bueso <dave@stgolabs.net> writes:
> +void free_huge_page(struct page *page)
> +{
> +	struct hugetlb_free_page_work work;
> +
> +	work.page = page;
> +	INIT_WORK_ONSTACK(&work.work, free_huge_page_workfn);
> +	queue_work(hugetlb_free_page_wq, &work.work);
> +
> +	/*
> +	 * Wait until free_huge_page is done.
> +	 */
> +	flush_work(&work.work);
> +	destroy_work_on_stack(&work.work);

Does flushing really work in softirq context?

Anyways, waiting seems inefficient over fire'n'forget

You'll need a per cpu pre allocated work item and a queue.
Then take a lock on the the queue and link the page into
it and trigger the work item if it's not already pending.

And add a in_interrupt() check of course.


-Andi

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock
  2019-12-12 21:32         ` [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Andi Kleen
@ 2019-12-12 22:42           ` Davidlohr Bueso
  0 siblings, 0 replies; 27+ messages in thread
From: Davidlohr Bueso @ 2019-12-12 22:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Mike Kravetz, Waiman Long, Matthew Wilcox, Andrew Morton,
	linux-kernel, linux-mm, Michal Hocko

On Thu, 12 Dec 2019, Andi Kleen wrote:

>Davidlohr Bueso <dave@stgolabs.net> writes:
>> +void free_huge_page(struct page *page)
>> +{
>> +	struct hugetlb_free_page_work work;
>> +
>> +	work.page = page;
>> +	INIT_WORK_ONSTACK(&work.work, free_huge_page_workfn);
>> +	queue_work(hugetlb_free_page_wq, &work.work);
>> +
>> +	/*
>> +	 * Wait until free_huge_page is done.
>> +	 */
>> +	flush_work(&work.work);
>> +	destroy_work_on_stack(&work.work);
>
>Does flushing really work in softirq context?
>
>Anyways, waiting seems inefficient over fire'n'forget

Yep. I was only thinking about the workerfn not blocking
and therefore we could wait safely, but flush_work has no
such guarantees.

Thanks,
Davidlohr

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 20:52               ` Waiman Long
  2019-12-12 21:04                 ` Waiman Long
@ 2019-12-16 13:26                 ` Michal Hocko
  2019-12-16 15:38                   ` Waiman Long
  1 sibling, 1 reply; 27+ messages in thread
From: Michal Hocko @ 2019-12-16 13:26 UTC (permalink / raw)
  To: Waiman Long
  Cc: Mike Kravetz, Andrew Morton, Matthew Wilcox, linux-kernel,
	linux-mm, aneesh.kumar

On Thu 12-12-19 15:52:20, Waiman Long wrote:
> On 12/12/19 2:22 PM, Mike Kravetz wrote:
> > On 12/12/19 11:04 AM, Davidlohr Bueso wrote:
> >> There have been deadlock reports[1, 2] where put_page is called
> >> from softirq context and this causes trouble with the hugetlb_lock,
> >> as well as potentially the subpool lock.
> >>
> >> For such an unlikely scenario, lets not add irq dancing overhead
> >> to the lock+unlock operations, which could incur in expensive
> >> instruction dependencies, particularly when considering hard-irq
> >> safety. For example PUSHF+POPF on x86.
> >>
> >> Instead, just use a workqueue and do the free_huge_page() in regular
> >> task context.
> >>
> >> [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
> >> [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
> >>
> >> Reported-by: Waiman Long <longman@redhat.com>
> >> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> > Thank you Davidlohr.
> >
> > The patch does seem fairly simple and straight forward.  I need to brush up
> > on my workqueue knowledge to provide a full review.
> >
> > Longman,
> > Do you have a test to reproduce the issue?  If so, can you try running with
> > this patch.
> 
> Yes, I do have a test that can reproduce the issue. I will run it with
> the patch and report the status tomorrow.

Can you extract guts of the testcase and integrate them into hugetlb
test suite?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-12 19:04           ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Davidlohr Bueso
  2019-12-12 19:22             ` Mike Kravetz
  2019-12-12 21:01             ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Waiman Long
@ 2019-12-16 13:37             ` Michal Hocko
  2019-12-16 16:17               ` Waiman Long
  2019-12-16 16:17               ` Davidlohr Bueso
  2 siblings, 2 replies; 27+ messages in thread
From: Michal Hocko @ 2019-12-16 13:37 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Mike Kravetz, Waiman Long, Matthew Wilcox,
	linux-kernel, linux-mm, aneesh.kumar

On Thu 12-12-19 11:04:27, Davidlohr Bueso wrote:
> There have been deadlock reports[1, 2] where put_page is called
> from softirq context and this causes trouble with the hugetlb_lock,
> as well as potentially the subpool lock.
> 
> For such an unlikely scenario, lets not add irq dancing overhead
> to the lock+unlock operations, which could incur in expensive
> instruction dependencies, particularly when considering hard-irq
> safety. For example PUSHF+POPF on x86.
> 
> Instead, just use a workqueue and do the free_huge_page() in regular
> task context.

I am afraid that work_struct is too large to be stuffed into the struct
page array (because of the lockdep part).

I think that it would be just safer to make hugetlb_lock irq safe. Are
there any other locks that would require the same?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 13:26                 ` Michal Hocko
@ 2019-12-16 15:38                   ` Waiman Long
  2019-12-16 18:44                     ` Waiman Long
  0 siblings, 1 reply; 27+ messages in thread
From: Waiman Long @ 2019-12-16 15:38 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Mike Kravetz, Andrew Morton, Matthew Wilcox, linux-kernel,
	linux-mm, aneesh.kumar, Jarod Wilson

On 12/16/19 8:26 AM, Michal Hocko wrote:
> On Thu 12-12-19 15:52:20, Waiman Long wrote:
>> On 12/12/19 2:22 PM, Mike Kravetz wrote:
>>> On 12/12/19 11:04 AM, Davidlohr Bueso wrote:
>>>> There have been deadlock reports[1, 2] where put_page is called
>>>> from softirq context and this causes trouble with the hugetlb_lock,
>>>> as well as potentially the subpool lock.
>>>>
>>>> For such an unlikely scenario, lets not add irq dancing overhead
>>>> to the lock+unlock operations, which could incur in expensive
>>>> instruction dependencies, particularly when considering hard-irq
>>>> safety. For example PUSHF+POPF on x86.
>>>>
>>>> Instead, just use a workqueue and do the free_huge_page() in regular
>>>> task context.
>>>>
>>>> [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
>>>> [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
>>>>
>>>> Reported-by: Waiman Long <longman@redhat.com>
>>>> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
>>> Thank you Davidlohr.
>>>
>>> The patch does seem fairly simple and straight forward.  I need to brush up
>>> on my workqueue knowledge to provide a full review.
>>>
>>> Longman,
>>> Do you have a test to reproduce the issue?  If so, can you try running with
>>> this patch.
>> Yes, I do have a test that can reproduce the issue. I will run it with
>> the patch and report the status tomorrow.
> Can you extract guts of the testcase and integrate them into hugetlb
> test suite?

The test case that I used is the Red Hat internal "Fork vs. fast GUP
race test" written by Jarod Wilson. I would have to ask him if he is OK
with that.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 13:37             ` Michal Hocko
@ 2019-12-16 16:17               ` Waiman Long
  2019-12-16 16:17               ` Davidlohr Bueso
  1 sibling, 0 replies; 27+ messages in thread
From: Waiman Long @ 2019-12-16 16:17 UTC (permalink / raw)
  To: Michal Hocko, Davidlohr Bueso
  Cc: Andrew Morton, Mike Kravetz, Matthew Wilcox, linux-kernel,
	linux-mm, aneesh.kumar

On 12/16/19 8:37 AM, Michal Hocko wrote:
> On Thu 12-12-19 11:04:27, Davidlohr Bueso wrote:
>> There have been deadlock reports[1, 2] where put_page is called
>> from softirq context and this causes trouble with the hugetlb_lock,
>> as well as potentially the subpool lock.
>>
>> For such an unlikely scenario, lets not add irq dancing overhead
>> to the lock+unlock operations, which could incur in expensive
>> instruction dependencies, particularly when considering hard-irq
>> safety. For example PUSHF+POPF on x86.
>>
>> Instead, just use a workqueue and do the free_huge_page() in regular
>> task context.
> I am afraid that work_struct is too large to be stuffed into the struct
> page array (because of the lockdep part).
>
> I think that it would be just safer to make hugetlb_lock irq safe. Are
> there any other locks that would require the same?

Currently, free_huge_page() can be called from the softIRQ context. The
hugetlb_lock will be acquired during that call. The subpool lock may
conditionally be acquired as well.

I am still torn between converting both locks to be irq-safe or
deferring the freeing to a workqueue.

Cheers,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 13:37             ` Michal Hocko
  2019-12-16 16:17               ` Waiman Long
@ 2019-12-16 16:17               ` Davidlohr Bueso
  2019-12-16 17:18                 ` Matthew Wilcox
  2019-12-16 19:08                 ` Mike Kravetz
  1 sibling, 2 replies; 27+ messages in thread
From: Davidlohr Bueso @ 2019-12-16 16:17 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Mike Kravetz, Waiman Long, Matthew Wilcox,
	linux-kernel, linux-mm, aneesh.kumar

On Mon, 16 Dec 2019, Michal Hocko wrote:
>I am afraid that work_struct is too large to be stuffed into the struct
>page array (because of the lockdep part).

Yeah, this needs to be done without touching struct page.

Which is why I had done the stack allocated way in this patch, but we
cannot wait for it to complete in irq, so that's out the window. Andi
had suggested percpu allocated work items, but having played with the
idea over the weekend, I don't see how we can prevent another page being
freed on the same cpu before previous work on the same cpu is complete
(cpu0 wants to free pageA, schedules the work, in the mean time cpu0
wants to free pageB and workerfn for pageA still hasn't been called).

>I think that it would be just safer to make hugetlb_lock irq safe. Are
>there any other locks that would require the same?

It would be simpler. Any performance issues that arise would probably
be only seen in microbenchmarks, assuming we want to have full irq safety.
If we don't need to worry about hardirq, then even better.

The subpool lock would also need to be irq safe.

Thanks,
Davidlohr

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 16:17               ` Davidlohr Bueso
@ 2019-12-16 17:18                 ` Matthew Wilcox
  2019-12-16 19:08                 ` Mike Kravetz
  1 sibling, 0 replies; 27+ messages in thread
From: Matthew Wilcox @ 2019-12-16 17:18 UTC (permalink / raw)
  To: Michal Hocko, Andrew Morton, Mike Kravetz, Waiman Long,
	linux-kernel, linux-mm, aneesh.kumar

On Mon, Dec 16, 2019 at 08:17:48AM -0800, Davidlohr Bueso wrote:
> On Mon, 16 Dec 2019, Michal Hocko wrote:
> > I am afraid that work_struct is too large to be stuffed into the struct
> > page array (because of the lockdep part).
> 
> Yeah, this needs to be done without touching struct page.
> 
> Which is why I had done the stack allocated way in this patch, but we
> cannot wait for it to complete in irq, so that's out the window. Andi
> had suggested percpu allocated work items, but having played with the
> idea over the weekend, I don't see how we can prevent another page being
> freed on the same cpu before previous work on the same cpu is complete
> (cpu0 wants to free pageA, schedules the work, in the mean time cpu0
> wants to free pageB and workerfn for pageA still hasn't been called).

Why is it that we can call functions after-an-RCU-period-has-elapsed time,
at returning-to-userspace time and after-exiting-hardirq-handler
time easily, but the mechanism for calling a function
after-we've-finished-handling-softirqs is so bloody hard to use?

That's surely the major problem we need to fix.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 15:38                   ` Waiman Long
@ 2019-12-16 18:44                     ` Waiman Long
  2019-12-17  9:00                       ` Michal Hocko
  0 siblings, 1 reply; 27+ messages in thread
From: Waiman Long @ 2019-12-16 18:44 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Mike Kravetz, Andrew Morton, Matthew Wilcox, linux-kernel,
	linux-mm, aneesh.kumar, Jarod Wilson

On 12/16/19 10:38 AM, Waiman Long wrote:
> On 12/16/19 8:26 AM, Michal Hocko wrote:
>> On Thu 12-12-19 15:52:20, Waiman Long wrote:
>>> On 12/12/19 2:22 PM, Mike Kravetz wrote:
>>>> On 12/12/19 11:04 AM, Davidlohr Bueso wrote:
>>>>> There have been deadlock reports[1, 2] where put_page is called
>>>>> from softirq context and this causes trouble with the hugetlb_lock,
>>>>> as well as potentially the subpool lock.
>>>>>
>>>>> For such an unlikely scenario, lets not add irq dancing overhead
>>>>> to the lock+unlock operations, which could incur in expensive
>>>>> instruction dependencies, particularly when considering hard-irq
>>>>> safety. For example PUSHF+POPF on x86.
>>>>>
>>>>> Instead, just use a workqueue and do the free_huge_page() in regular
>>>>> task context.
>>>>>
>>>>> [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@redhat.com/
>>>>> [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@linux.ibm.com/
>>>>>
>>>>> Reported-by: Waiman Long <longman@redhat.com>
>>>>> Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
>>>> Thank you Davidlohr.
>>>>
>>>> The patch does seem fairly simple and straight forward.  I need to brush up
>>>> on my workqueue knowledge to provide a full review.
>>>>
>>>> Longman,
>>>> Do you have a test to reproduce the issue?  If so, can you try running with
>>>> this patch.
>>> Yes, I do have a test that can reproduce the issue. I will run it with
>>> the patch and report the status tomorrow.
>> Can you extract guts of the testcase and integrate them into hugetlb
>> test suite?

BTW, what hugetlb test suite are you talking about?

Cheers,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 16:17               ` Davidlohr Bueso
  2019-12-16 17:18                 ` Matthew Wilcox
@ 2019-12-16 19:08                 ` Mike Kravetz
  2019-12-16 19:13                   ` Waiman Long
  1 sibling, 1 reply; 27+ messages in thread
From: Mike Kravetz @ 2019-12-16 19:08 UTC (permalink / raw)
  To: Michal Hocko, Andrew Morton, Waiman Long, Matthew Wilcox,
	linux-kernel, linux-mm, aneesh.kumar

On 12/16/19 8:17 AM, Davidlohr Bueso wrote:
> On Mon, 16 Dec 2019, Michal Hocko wrote:
>> I am afraid that work_struct is too large to be stuffed into the struct
>> page array (because of the lockdep part).
> 
> Yeah, this needs to be done without touching struct page.
> 
> Which is why I had done the stack allocated way in this patch, but we
> cannot wait for it to complete in irq, so that's out the window. Andi
> had suggested percpu allocated work items, but having played with the
> idea over the weekend, I don't see how we can prevent another page being
> freed on the same cpu before previous work on the same cpu is complete
> (cpu0 wants to free pageA, schedules the work, in the mean time cpu0
> wants to free pageB and workerfn for pageA still hasn't been called).
> 
>> I think that it would be just safer to make hugetlb_lock irq safe. Are
>> there any other locks that would require the same?
> 
> It would be simpler. Any performance issues that arise would probably
> be only seen in microbenchmarks, assuming we want to have full irq safety.
> If we don't need to worry about hardirq, then even better.
> 
> The subpool lock would also need to be irq safe.

I do think we need to worry about hardirq.  There are no restrictions that
prevent put_page from being called from hardirq context. 

I am concerned about the latency of making hugetlb_lock (and potentially
subpool lock) hardirq safe.  When these locks were introduced (before my
time) the concept of making them irq safe was not considered.  Recently,
I learned that the hugetlb_lock is held for a linear scan of ALL hugetlb
pages during a cgroup reparenting operation.  That is just too long.

If there is no viable work queue solution, then I think we would like to
restructure the hugetlb locking before a change to just make hugetlb_lock
irq safe.  The idea would be to split the scope of what is done under
hugetlb_lock.  Most of it would never be executed in irq context.  Then
have a small/limited set of functionality that really needs to be irq
safe protected by an irq safe lock.

-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 19:08                 ` Mike Kravetz
@ 2019-12-16 19:13                   ` Waiman Long
  0 siblings, 0 replies; 27+ messages in thread
From: Waiman Long @ 2019-12-16 19:13 UTC (permalink / raw)
  To: Mike Kravetz, Michal Hocko, Andrew Morton, Matthew Wilcox,
	linux-kernel, linux-mm, aneesh.kumar

On 12/16/19 2:08 PM, Mike Kravetz wrote:
> On 12/16/19 8:17 AM, Davidlohr Bueso wrote:
>> On Mon, 16 Dec 2019, Michal Hocko wrote:
>>> I am afraid that work_struct is too large to be stuffed into the struct
>>> page array (because of the lockdep part).
>> Yeah, this needs to be done without touching struct page.
>>
>> Which is why I had done the stack allocated way in this patch, but we
>> cannot wait for it to complete in irq, so that's out the window. Andi
>> had suggested percpu allocated work items, but having played with the
>> idea over the weekend, I don't see how we can prevent another page being
>> freed on the same cpu before previous work on the same cpu is complete
>> (cpu0 wants to free pageA, schedules the work, in the mean time cpu0
>> wants to free pageB and workerfn for pageA still hasn't been called).
>>
>>> I think that it would be just safer to make hugetlb_lock irq safe. Are
>>> there any other locks that would require the same?
>> It would be simpler. Any performance issues that arise would probably
>> be only seen in microbenchmarks, assuming we want to have full irq safety.
>> If we don't need to worry about hardirq, then even better.
>>
>> The subpool lock would also need to be irq safe.
> I do think we need to worry about hardirq.  There are no restrictions that
> prevent put_page from being called from hardirq context. 
>
> I am concerned about the latency of making hugetlb_lock (and potentially
> subpool lock) hardirq safe.  When these locks were introduced (before my
> time) the concept of making them irq safe was not considered.  Recently,
> I learned that the hugetlb_lock is held for a linear scan of ALL hugetlb
> pages during a cgroup reparenting operation.  That is just too long.
>
> If there is no viable work queue solution, then I think we would like to
> restructure the hugetlb locking before a change to just make hugetlb_lock
> irq safe.  The idea would be to split the scope of what is done under
> hugetlb_lock.  Most of it would never be executed in irq context.  Then
> have a small/limited set of functionality that really needs to be irq
> safe protected by an irq safe lock.
>
Please take a look at my recently posted patch to see if that is an
acceptable workqueue based solution.

Thanks,
Longman


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-16 18:44                     ` Waiman Long
@ 2019-12-17  9:00                       ` Michal Hocko
  2019-12-17 18:05                         ` Mike Kravetz
  0 siblings, 1 reply; 27+ messages in thread
From: Michal Hocko @ 2019-12-17  9:00 UTC (permalink / raw)
  To: Mike Kravetz, Waiman Long
  Cc: Andrew Morton, Matthew Wilcox, linux-kernel, linux-mm,
	aneesh.kumar, Jarod Wilson

On Mon 16-12-19 13:44:53, Waiman Long wrote:
> On 12/16/19 10:38 AM, Waiman Long wrote:
[...]
> >> Can you extract guts of the testcase and integrate them into hugetlb
> >> test suite?
> 
> BTW, what hugetlb test suite are you talking about?

I was using tests from libhugetlbfs package in the past. There are few
tests in LTP project but the libhugetlbfs coverage used to cover the
largest part of the functionality.

Is there any newer home for the package than [1], Mike? Btw. would it
make sense to migrate those tests to a more common place, LTP or kernel
selftests?

[1] https://github.com/libhugetlbfs/libhugetlbfs/tree/master/tests
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue
  2019-12-17  9:00                       ` Michal Hocko
@ 2019-12-17 18:05                         ` Mike Kravetz
  2019-12-18 12:18                           ` hugetlbfs testing coverage (was: Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue) Michal Hocko
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Kravetz @ 2019-12-17 18:05 UTC (permalink / raw)
  To: Michal Hocko, Waiman Long, Eric B Munson
  Cc: Andrew Morton, Matthew Wilcox, linux-kernel, linux-mm,
	aneesh.kumar, Jarod Wilson

Cc: Eric

On 12/17/19 1:00 AM, Michal Hocko wrote:
> On Mon 16-12-19 13:44:53, Waiman Long wrote:
>> On 12/16/19 10:38 AM, Waiman Long wrote:
> [...]
>>>> Can you extract guts of the testcase and integrate them into hugetlb
>>>> test suite?
>>
>> BTW, what hugetlb test suite are you talking about?
> 
> I was using tests from libhugetlbfs package in the past. There are few
> tests in LTP project but the libhugetlbfs coverage used to cover the
> largest part of the functionality.
> 
> Is there any newer home for the package than [1], Mike? Btw. would it
> make sense to migrate those tests to a more common place, LTP or kernel
> selftests?

That is the latest home/release for libhugetlbfs.

The libhugetlbfs test suite is somewhat strange in that I suspect it started
as testing for libhugetlbfs itself.  When it was written, the thought may have
been that people would use libhugetlfs as the primary interface to hugetlb
pages.  That is not the case today.  Over time, hugetlbfs tests not associated
with libhugetlbfs were added.

If we want to migrate libhugetlbfs tests, then I think we would only want to
migrate the non-libhugetlbfs test cases.  Although, the libhugetlbfs specific
tests are useful as they 'could' point out regressions.

Added Eric (libhugetlbfs maintainer) on Cc for another opinion.

> 
> [1] https://github.com/libhugetlbfs/libhugetlbfs/tree/master/tests
> 


-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 27+ messages in thread

* hugetlbfs testing coverage (was: Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue)
  2019-12-17 18:05                         ` Mike Kravetz
@ 2019-12-18 12:18                           ` Michal Hocko
  0 siblings, 0 replies; 27+ messages in thread
From: Michal Hocko @ 2019-12-18 12:18 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: Waiman Long, Eric B Munson, Andrew Morton, Matthew Wilcox,
	linux-kernel, linux-mm, aneesh.kumar, Jarod Wilson

On Tue 17-12-19 10:05:06, Mike Kravetz wrote:
> Cc: Eric
> 
> On 12/17/19 1:00 AM, Michal Hocko wrote:
> > On Mon 16-12-19 13:44:53, Waiman Long wrote:
> >> On 12/16/19 10:38 AM, Waiman Long wrote:
> > [...]
> >>>> Can you extract guts of the testcase and integrate them into hugetlb
> >>>> test suite?
> >>
> >> BTW, what hugetlb test suite are you talking about?
> > 
> > I was using tests from libhugetlbfs package in the past. There are few
> > tests in LTP project but the libhugetlbfs coverage used to cover the
> > largest part of the functionality.
> > 
> > Is there any newer home for the package than [1], Mike? Btw. would it
> > make sense to migrate those tests to a more common place, LTP or kernel
> > selftests?
> 
> That is the latest home/release for libhugetlbfs.
> 
> The libhugetlbfs test suite is somewhat strange in that I suspect it started
> as testing for libhugetlbfs itself.  When it was written, the thought may have
> been that people would use libhugetlfs as the primary interface to hugetlb
> pages.  That is not the case today.  Over time, hugetlbfs tests not associated
> with libhugetlbfs were added.
> 
> If we want to migrate libhugetlbfs tests, then I think we would only want to
> migrate the non-libhugetlbfs test cases.  Although, the libhugetlbfs specific
> tests are useful as they 'could' point out regressions.

Yeah, I can second that. I remember using the suite and it pointed to
real issues when I was touching the area in the past. So if we can get
as many tests to be independent on the library and integrate it to some
existing testing project - be it kernel selftest or LTP - then it would
be really great and I assume the testing coverage of the hugetlb
functionality would increase dramatically.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2019-12-18 12:18 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-11 19:46 [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Waiman Long
2019-12-11 22:04 ` Mike Kravetz
2019-12-11 22:19   ` Waiman Long
2019-12-12  1:11     ` Mike Kravetz
2019-12-12  6:06       ` Davidlohr Bueso
2019-12-12  6:30         ` Davidlohr Bueso
2019-12-12 19:04           ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Davidlohr Bueso
2019-12-12 19:22             ` Mike Kravetz
2019-12-12 19:36               ` Davidlohr Bueso
2019-12-12 20:52               ` Waiman Long
2019-12-12 21:04                 ` Waiman Long
2019-12-16 13:26                 ` Michal Hocko
2019-12-16 15:38                   ` Waiman Long
2019-12-16 18:44                     ` Waiman Long
2019-12-17  9:00                       ` Michal Hocko
2019-12-17 18:05                         ` Mike Kravetz
2019-12-18 12:18                           ` hugetlbfs testing coverage (was: Re: [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue) Michal Hocko
2019-12-12 21:01             ` [PATCH v2] mm/hugetlb: defer free_huge_page() to a workqueue Waiman Long
2019-12-16 13:37             ` Michal Hocko
2019-12-16 16:17               ` Waiman Long
2019-12-16 16:17               ` Davidlohr Bueso
2019-12-16 17:18                 ` Matthew Wilcox
2019-12-16 19:08                 ` Mike Kravetz
2019-12-16 19:13                   ` Waiman Long
2019-12-12 21:32         ` [PATCH v2] hugetlbfs: Disable softIRQ when taking hugetlb_lock Andi Kleen
2019-12-12 22:42           ` Davidlohr Bueso
2019-12-11 23:05 ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).