* [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
@ 2023-08-17 18:18 Sidhartha Kumar
  2023-08-18 18:03 ` Andrew Morton
                   ` (2 more replies)
From: Sidhartha Kumar @ 2023-08-17 18:18 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: akpm, songmuchun, mike.kravetz, willy, Sidhartha Kumar

Remove the special-cased hugetlb handling code within the page cache by
changing the granularity of each index to the base page size rather than
the huge page size. Add new wrappers for hugetlb code to interact with
the page cache; these convert a huge-page-sized index to a linear index.
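
As an illustrative sketch (hypothetical values, not part of the patch):
with a 2MB hstate, huge_page_order(h) is 9, so the wrappers turn a
huge-page-sized index into a base-page (linear) index with a shift:

	/* Sketch: hugetlb index -> base-page (linear) index.
	 * Assumes a 2MB hstate, i.e. huge_page_order(h) == 9. */
	pgoff_t hugetlb_idx = 3;                /* third huge page in the file */
	pgoff_t linear_idx = hugetlb_idx << 9;  /* 1536: first 4KB slot of that page */

This is the same shift filemap_lock_hugetlb_folio() below applies before
calling the generic filemap_lock_folio().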

========================= PERFORMANCE ======================================

Perf was used to check the performance differences after the patch. Overall
the performance is similar to mainline, with a very small additional overhead
in __filemap_add_folio() and hugetlb_add_to_page_cache(). This comes from the
extra cost of xa_load() and xa_store(), as the xarray now uses more entries
to store hugetlb folios in the page cache.
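
To make the xarray cost concrete, a rough sketch (kernel-style pseudocode,
not from the patch; assumes a 2MB folio whose first base-page index is
512-aligned): the folio is stored as an order-9 multi-index entry, so all
512 base-page indices it spans resolve to the same folio, but the tree now
has to cover a 512x larger index space for the same file:

	XA_STATE(xas, &mapping->i_pages, index);
	xas_set_order(&xas, index, 9);	/* one entry spanning 512 slots */
	xas_store(&xas, folio);
	/* Any index in [index, index + 511] finds the same folio: */
	struct folio *f = xa_load(&mapping->i_pages, index + 511);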

Timing

aarch64
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt
            real    1m49.568s
            user    0m0.000s
            sys     1m49.461s

        6.5-rc3:
            [root]# time fallocate -l 700GB test.txt
            real    1m47.495s
            user    0m0.000s
            sys     1m47.370s  
    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m47.024s
            user    0m0.000s
            sys     1m46.921s

        6.5-rc3:
            [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt
            real    1m44.551s
            user    0m0.000s
            sys     1m44.438s

x86
    2MB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt
            real    0m22.383s
            user    0m0.000s
            sys     0m22.255s

        6.5-rc3:
            [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt
            real    0m22.735s
            user    0m0.038s
            sys     0m22.567s

    1GB Page Size
        6.5-rc3 + this patch:
            [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt
            real    0m25.786s
            user    0m0.001s
            sys     0m25.589s

        6.5-rc3:
            [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt
            real    0m33.454s
            user    0m0.001s
            sys     0m33.193s


aarch64:
    workload - fallocate a 700GB file backed by huge pages 
    
    6.5-rc3 + this patch:
        2MB Page Size:
            --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |          
                          |--95.04%--__pi_clear_page
                          |          
                          |--3.57%--clear_huge_page
                          |          |          
                          |          |--2.63%--rcu_all_qs
                          |          |          
                          |           --0.91%--__cond_resched
                          |          
                           --0.67%--__cond_resched
            0.17%     0.00%             0  fallocate  [kernel.vmlinux]       [k] hugetlb_add_to_page_cache
            0.14%     0.10%            11  fallocate  [kernel.vmlinux]       [k] __filemap_add_folio

    6.5-rc3
        2MB Page Size:
                --100.00%--__arm64_sys_fallocate
                          ksys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |          
                          |--94.91%--__pi_clear_page
                          |          
                          |--4.11%--clear_huge_page
                          |          |          
                          |          |--3.00%--rcu_all_qs
                          |          |          
                          |           --1.10%--__cond_resched
                          |          
                           --0.59%--__cond_resched
            0.08%     0.01%             1  fallocate  [kernel.kallsyms]  [k] hugetlb_add_to_page_cache
            0.05%     0.03%             3  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

x86
    workload - fallocate a 100GB file backed by huge pages 
    
    6.5-rc3 + this patch:
        2MB Page Size:
            hugetlbfs_fallocate
            |          
            --99.57%--clear_huge_page
                |          
                --98.47%--clear_page_erms
                    |          
                    --0.53%--asm_sysvec_apic_timer_interrupt
                
            0.04%     0.04%             1  fallocate  [kernel.kallsyms]     [k] xa_load
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] hugetlb_add_to_page_cache
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] __filemap_add_folio
            0.04%     0.00%             0  fallocate  [kernel.kallsyms]     [k] xas_store

    6.5-rc3
        2MB Page Size:
                --99.93%--__x64_sys_fallocate
                          vfs_fallocate
                          hugetlbfs_fallocate
                          |          
                           --99.38%--clear_huge_page
                                     |          
                                     |--98.40%--clear_page_erms
                                     |          
                                      --0.59%--__cond_resched
            0.03%     0.03%             1  fallocate  [kernel.kallsyms]  [k] __filemap_add_folio

========================= TESTING ======================================

This patch passes the libhugetlbfs tests and the LTP hugetlb tests.

********** TEST SUMMARY
*                      2M
*                      32-bit 64-bit
*     Total testcases:   110    113
*             Skipped:     0      0
*                PASS:   107    113
*                FAIL:     0      0
*    Killed by signal:     3      0
*   Bad configuration:     0      0
*       Expected FAIL:     0      0
*     Unexpected PASS:     0      0
*    Test not present:     0      0
* Strange test result:     0      0
**********

###############################################################
    Done executing testcases.
    LTP Version:  20220527-178-g2761a81c4
###############################################################

Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>

---

rebased on mm-unstable 08/14/23

RFC v2[1] -> v1[2]
    -change direction of the series to maintain both huge and base page
     size indexes rather than trying to get rid of all references to a
     huge page sized index.

v1 -> v2[3]
    - squash separate filemap and hugetlb changes into one patch to allow
      for bisection per Matthew
    - get rid of page_to_index()
    - fix errors in hugetlb_fallocate() and remove_inode_hugepages()

v2 -> v3[4]
    - gather performance data per Mike Kravetz
    - remove start variable in remove_inode_hugepages() per Mike Kravetz
    - remove hugetlb special case within folio_file_page()

v3 -> v4[5]
    - rebase to current mm-unstable
    - include time data per Mike Kravetz

v4 -> v5[6]
	- fix build issue by removing hugetlb_basepage_index() definition
	  per the Intel test robot

v5 -> v6
	- remove folio_more_pages() change resulting from an incorrect rebase

[1]: https://lore.kernel.org/linux-mm/20230519220142.212051-1-sidhartha.kumar@oracle.com/T/
[2]: https://lore.kernel.org/lkml/20230609194947.37196-1-sidhartha.kumar@oracle.com/
[3]: https://lore.kernel.org/lkml/ZLtVlJA+V2+2yjxc@casper.infradead.org/T/
[4]: https://lore.kernel.org/lkml/20230811233939.GA105247@monkey/T/
[5]: https://lore.kernel.org/linux-mm/202308151231.L2pfzOmu-lkp@intel.com/T/#t
[6]: https://lore.kernel.org/linux-mm/20230815191932.273054-1-sidhartha.kumar@oracle.com/T/

 fs/hugetlbfs/inode.c    | 15 ++++++++-------
 include/linux/hugetlb.h | 12 ++++++++++++
 include/linux/pagemap.h | 29 ++---------------------------
 mm/filemap.c            | 34 ++++++++++------------------------
 mm/hugetlb.c            | 25 ++++++-------------------
 5 files changed, 38 insertions(+), 77 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e7611ae1e612..ec0f856a1228 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -661,21 +661,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 {
 	struct hstate *h = hstate_inode(inode);
 	struct address_space *mapping = &inode->i_data;
-	const pgoff_t start = lstart >> huge_page_shift(h);
-	const pgoff_t end = lend >> huge_page_shift(h);
+	const pgoff_t end = lend >> PAGE_SHIFT;
 	struct folio_batch fbatch;
 	pgoff_t next, index;
 	int i, freed = 0;
 	bool truncate_op = (lend == LLONG_MAX);
 
 	folio_batch_init(&fbatch);
-	next = start;
+	next = lstart >> PAGE_SHIFT;
 	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
 		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
 			struct folio *folio = fbatch.folios[i];
 			u32 hash = 0;
 
-			index = folio->index;
+			index = folio->index >> huge_page_order(h);
 			hash = hugetlb_fault_mutex_hash(mapping, index);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -693,7 +692,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	}
 
 	if (truncate_op)
-		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+		(void)hugetlb_unreserve_pages(inode,
+				lstart >> huge_page_shift(h),
+				LONG_MAX, freed);
 }
 
 static void hugetlbfs_evict_inode(struct inode *inode)
@@ -741,7 +742,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
 	pgoff_t idx = start >> huge_page_shift(h);
 	struct folio *folio;
 
-	folio = filemap_lock_folio(mapping, idx);
+	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 	if (IS_ERR(folio))
 		return;
 
@@ -886,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
-		folio = filemap_get_folio(mapping, index);
+		folio = filemap_get_folio(mapping, index << huge_page_order(h));
 		if (!IS_ERR(folio)) {
 			folio_put(folio);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0a393bc02f25..1bb3fcacdcdd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -811,6 +811,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
 	return huge_page_size(h) / 512;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return filemap_lock_folio(mapping, idx << huge_page_order(h));
+}
+
 #include <asm/hugetlb.h>
 
 #ifndef is_hugepage_only_range
@@ -1005,6 +1011,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
 	return NULL;
 }
 
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+				struct address_space *mapping, pgoff_t idx)
+{
+	return NULL;
+}
+
 static inline int isolate_or_dissolve_huge_page(struct page *page,
 						struct list_head *list)
 {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 71dd79b4ae0a..6816d3ccbd7e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -729,9 +729,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)
  */
 static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return &folio->page;
 	return folio_page(folio, index & (folio_nr_pages(folio) - 1));
 }
 
@@ -747,9 +744,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
  */
 static inline bool folio_contains(struct folio *folio, pgoff_t index)
 {
-	/* HugeTLBfs indexes the page cache in units of hpage_size */
-	if (folio_test_hugetlb(folio))
-		return folio->index == index;
 	return index - folio_index(folio) < folio_nr_pages(folio);
 }
 
@@ -807,10 +801,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
 }
 
 /*
- * Get index of the page within radix-tree (but not for hugetlb pages).
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
  */
-static inline pgoff_t page_to_index(struct page *page)
+static inline pgoff_t page_to_pgoff(struct page *page)
 {
 	struct page *head;
 
@@ -825,19 +818,6 @@ static inline pgoff_t page_to_index(struct page *page)
 	return head->index + page - head;
 }
 
-extern pgoff_t hugetlb_basepage_index(struct page *page);
-
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
- * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
- */
-static inline pgoff_t page_to_pgoff(struct page *page)
-{
-	if (unlikely(PageHuge(page)))
-		return hugetlb_basepage_index(page);
-	return page_to_index(page);
-}
-
 /*
  * Return byte-offset into filesystem object for page.
  */
@@ -874,12 +854,9 @@ static inline loff_t folio_file_pos(struct folio *folio)
 
 /*
  * Get the offset in PAGE_SIZE (even for hugetlb folios).
- * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
  */
 static inline pgoff_t folio_pgoff(struct folio *folio)
 {
-	if (unlikely(folio_test_hugetlb(folio)))
-		return hugetlb_basepage_index(&folio->page);
 	return folio->index;
 }
 
@@ -890,8 +867,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
 	pgoff_t pgoff;
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		return linear_hugepage_index(vma, address);
 	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff;
diff --git a/mm/filemap.c b/mm/filemap.c
index 014b73eb96a1..94e8d96159bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -134,11 +134,8 @@ static void page_cache_delete(struct address_space *mapping,
 
 	mapping_set_update(&xas, mapping);
 
-	/* hugetlb pages are represented by a single entry in the xarray */
-	if (!folio_test_hugetlb(folio)) {
-		xas_set_order(&xas, folio->index, folio_order(folio));
-		nr = folio_nr_pages(folio);
-	}
+	xas_set_order(&xas, folio->index, folio_order(folio));
+	nr = folio_nr_pages(folio);
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
@@ -237,7 +234,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
 	if (free_folio)
 		free_folio(folio);
 
-	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+	if (folio_test_large(folio))
 		refs = folio_nr_pages(folio);
 	folio_put_refs(folio, refs);
 }
@@ -858,14 +855,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 
 	if (!huge) {
 		int error = mem_cgroup_charge(folio, NULL, gfp);
-		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
 		if (error)
 			return error;
 		charged = true;
-		xas_set_order(&xas, index, folio_order(folio));
-		nr = folio_nr_pages(folio);
 	}
 
+	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
+	xas_set_order(&xas, index, folio_order(folio));
+	nr = folio_nr_pages(folio);
+
 	gfp &= GFP_RECLAIM_MASK;
 	folio_ref_add(folio, nr);
 	folio->mapping = mapping;
@@ -2038,7 +2036,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
 		int idx = folio_batch_count(fbatch) - 1;
 
 		folio = fbatch->folios[idx];
-		if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+		if (!xa_is_value(folio))
 			nr = folio_nr_pages(folio);
 		*start = indices[idx] + nr;
 	}
@@ -2102,7 +2100,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
 		int idx = folio_batch_count(fbatch) - 1;
 
 		folio = fbatch->folios[idx];
-		if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+		if (!xa_is_value(folio))
 			nr = folio_nr_pages(folio);
 		*start = indices[idx] + nr;
 	}
@@ -2143,9 +2141,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
 			continue;
 		if (!folio_batch_add(fbatch, folio)) {
 			unsigned long nr = folio_nr_pages(folio);
-
-			if (folio_test_hugetlb(folio))
-				nr = 1;
 			*start = folio->index + nr;
 			goto out;
 		}
@@ -2211,9 +2206,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
 
 		if (!folio_batch_add(fbatch, folio)) {
 			nr = folio_nr_pages(folio);
-
-			if (folio_test_hugetlb(folio))
-				nr = 1;
 			*start = folio->index + nr;
 			goto out;
 		}
@@ -2230,10 +2222,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
 
 	if (nr) {
 		folio = fbatch->folios[nr - 1];
-		if (folio_test_hugetlb(folio))
-			*start = folio->index + 1;
-		else
-			*start = folio_next_index(folio);
+		*start = folio->index + folio_nr_pages(folio);
 	}
 out:
 	rcu_read_unlock();
@@ -2271,9 +2260,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
 			continue;
 		if (!folio_batch_add(fbatch, folio)) {
 			unsigned long nr = folio_nr_pages(folio);
-
-			if (folio_test_hugetlb(folio))
-				nr = 1;
 			*start = folio->index + nr;
 			goto out;
 		}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e327a5a7602c..dfd5bc63e61a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t)
 
 /*
  * Convert the address within this vma to the page offset within
- * the mapping, in pagecache page units; huge pages here.
+ * the mapping, huge page units here.
  */
 static pgoff_t vma_hugecache_offset(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
@@ -2113,20 +2113,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
 	return NULL;
 }
 
-pgoff_t hugetlb_basepage_index(struct page *page)
-{
-	struct page *page_head = compound_head(page);
-	pgoff_t index = page_index(page_head);
-	unsigned long compound_idx;
-
-	if (compound_order(page_head) > MAX_ORDER)
-		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
-	else
-		compound_idx = page - page_head;
-
-	return (index << compound_order(page_head)) + compound_idx;
-}
-
 static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
 		gfp_t gfp_mask, int nid, nodemask_t *nmask,
 		nodemask_t *node_alloc_noretry)
@@ -5750,7 +5736,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
-	pgoff_t idx = vma_hugecache_offset(h, vma, address);
+	pgoff_t idx = linear_page_index(vma, address);
 	struct folio *folio;
 
 	folio = filemap_get_folio(mapping, idx);
@@ -5767,6 +5753,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
 	struct hstate *h = hstate_inode(inode);
 	int err;
 
+	idx <<= huge_page_order(h);
 	__folio_set_locked(folio);
 	err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
 
@@ -5874,7 +5861,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	 * before we get page_table_lock.
 	 */
 	new_folio = false;
-	folio = filemap_lock_folio(mapping, idx);
+	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 	if (IS_ERR(folio)) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
@@ -6183,7 +6170,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Just decrements count, does not deallocate */
 		vma_end_reservation(h, vma, haddr);
 
-		pagecache_folio = filemap_lock_folio(mapping, idx);
+		pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 		if (IS_ERR(pagecache_folio))
 			pagecache_folio = NULL;
 	}
@@ -6315,7 +6302,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 
 	if (is_continue) {
 		ret = -EFAULT;
-		folio = filemap_lock_folio(mapping, idx);
+		folio = filemap_lock_hugetlb_folio(h, mapping, idx);
 		if (IS_ERR(folio))
 			goto out;
 		folio_in_pagecache = true;
-- 
2.41.0



* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-17 18:18 [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c Sidhartha Kumar
@ 2023-08-18 18:03 ` Andrew Morton
  2023-08-18 18:09   ` Matthew Wilcox
  2023-08-21 18:33 ` Mike Kravetz
  2023-08-22 17:15 ` Matthew Wilcox
From: Andrew Morton @ 2023-08-18 18:03 UTC (permalink / raw)
  To: Sidhartha Kumar; +Cc: linux-kernel, linux-mm, songmuchun, mike.kravetz, willy

On Thu, 17 Aug 2023 11:18:36 -0700 Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:

> Perf was used to check the performance differences after the patch. Overall
> the performance is similar to mainline, with a very small additional overhead
> in __filemap_add_folio() and hugetlb_add_to_page_cache(). This comes from the
> extra cost of xa_load() and xa_store(), as the xarray now uses more entries
> to store hugetlb folios in the page cache.

So... why merge the patch?  To save 40 lines of code?

I mean, if a patch which added 40 lines yielded a "very small"
reduction in overhead, we'd probably merge it!

Or is there some wider reason for this which the changelog omitted?


* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-18 18:03 ` Andrew Morton
@ 2023-08-18 18:09   ` Matthew Wilcox
  2023-08-18 18:34     ` Mike Kravetz
From: Matthew Wilcox @ 2023-08-18 18:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Sidhartha Kumar, linux-kernel, linux-mm, songmuchun, mike.kravetz

On Fri, Aug 18, 2023 at 11:03:09AM -0700, Andrew Morton wrote:
> On Thu, 17 Aug 2023 11:18:36 -0700 Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
> 
> > Perf was used to check the performance differences after the patch. Overall
> > the performance is similar to mainline, with a very small additional overhead
> > in __filemap_add_folio() and hugetlb_add_to_page_cache(). This comes from the
> > extra cost of xa_load() and xa_store(), as the xarray now uses more entries
> > to store hugetlb folios in the page cache.
> 
> So... why merge the patch?  To save 40 lines of code?
> 
> I mean, if a patch which added 40 lines yielded a "very small"
> reduction in overhead, we'd probably merge it!
> 
> Or is there some wider reason for this which the changelog omitted?

Sidhartha's benchmarks are for hugetlbfs which is where we're likely
to see performance regressions.  What's not shown are any performance
numbers for !hugetlbfs.  The functions where this patch removes code
are used on _every_ page cache lookup, so given the typical difference
in number of lookups performed against hugetlb vs non-hugetlb files,
even saving 0.1% performance on non-hugetlb files will lead to fewer
instructions executed overall.

There's also a conceptual reduction in complexity.  We no longer need to
think about whether the inode is hugetlb or not before doing the lookup
and scaling the byte offset differently.


* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-18 18:09   ` Matthew Wilcox
@ 2023-08-18 18:34     ` Mike Kravetz
  2023-08-18 18:54       ` Sidhartha Kumar
From: Mike Kravetz @ 2023-08-18 18:34 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, Sidhartha Kumar, linux-kernel, linux-mm, songmuchun

On 08/18/23 19:09, Matthew Wilcox wrote:
> 
> There's also a conceptual reduction in complexity.  We no longer need to
> think about whether the inode is hugetlb or not before doing the lookup
> and scaling the byte offset differently.

I 'think' this was the primary motivation for this patch.

Recent discussions about HGM always came back to hugetlb special case code
in the core mm.  HGM would add a little more, and most people thought there
was too much already.  Most everyone in those discussions agreed there
should be some effort to unify hugetlb and core mm code as much as possible
to reduce these special cases.  This is one step in that direction.

As a hugetlb maintainer, I don't like any decrease in performance even
if it is not readily observable.  However, as a member of the mm community
I like the fact that there is less special case hugetlb code.  Overall,
I consider this a win.
-- 
Mike Kravetz


* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-18 18:34     ` Mike Kravetz
@ 2023-08-18 18:54       ` Sidhartha Kumar
  2023-08-18 19:24         ` Andrew Morton
From: Sidhartha Kumar @ 2023-08-18 18:54 UTC (permalink / raw)
  To: Mike Kravetz, Matthew Wilcox
  Cc: Andrew Morton, linux-kernel, linux-mm, songmuchun

On 8/18/23 11:34 AM, Mike Kravetz wrote:
> On 08/18/23 19:09, Matthew Wilcox wrote:
>>
>> There's also a conceptual reduction in complexity.  We no longer need to
>> think about whether the inode is hugetlb or not before doing the lookup
>> and scaling the byte offset differently.
> 
> I 'think' this was the primary motivation for this patch.
> 
> Recent discussions about HGM always came back to hugetlb special case code
> in the core mm.  HGM would add a little more, and most people thought there
> was too much already.  Most everyone in those discussions agreed there
> should be some effort to unify hugetlb and core mm code as much as possible
> to reduce these special cases.  This is one step in that direction.
> 

Yes, the motivation for this patch came from the discussion about the
various places in mm where hugetlb is special-cased and how there should
be an effort to unify these cases. As Matthew mentions, this series gets
rid of branches that would occur on every page cache lookup. The
comments in pagemap.h also mention:

- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)

so this has been a goal for the page cache.

Thanks
Sidhartha Kumar

> As a hugetlb maintainer, I don't like any decrease in performance even
> if it is not readily observable.  However, as a member of the mm community
> I like the fact that there is less special case hugetlb code.  Overall,
> I consider this a win.



* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-18 18:54       ` Sidhartha Kumar
@ 2023-08-18 19:24         ` Andrew Morton
From: Andrew Morton @ 2023-08-18 19:24 UTC (permalink / raw)
  To: Sidhartha Kumar
  Cc: Mike Kravetz, Matthew Wilcox, linux-kernel, linux-mm, songmuchun

On Fri, 18 Aug 2023 11:54:24 -0700 Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:

> yes, the motivation for this patch came from the discussion about the 
> various places in mm where hugetlb is special cased and how there should 
> be an effort to unify these cases. As Matthew mentions, this series gets 
> rid of branches that would occur on every page cache lookup. 

Thanks.  Could we please get all this spelled out in the changelog?

Probably in a few days, give people time to review the patch itself.


* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-17 18:18 [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c Sidhartha Kumar
  2023-08-18 18:03 ` Andrew Morton
@ 2023-08-21 18:33 ` Mike Kravetz
  2023-09-05  4:05   ` Sidhartha Kumar
  2023-08-22 17:15 ` Matthew Wilcox
From: Mike Kravetz @ 2023-08-21 18:33 UTC (permalink / raw)
  To: Sidhartha Kumar; +Cc: linux-kernel, linux-mm, akpm, songmuchun, willy

On 08/17/23 11:18, Sidhartha Kumar wrote:
> Remove the special-cased hugetlb handling code within the page cache by
> changing the granularity of each index to the base page size rather than
> the huge page size. Add new wrappers for hugetlb code to interact with
> the page cache; these convert a huge-page-sized index to a linear index.
<snip>
> @@ -237,7 +234,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
>  	if (free_folio)
>  		free_folio(folio);
>  
> -	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
> +	if (folio_test_large(folio))
>  		refs = folio_nr_pages(folio);
>  	folio_put_refs(folio, refs);
>  }
> @@ -858,14 +855,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
>  
>  	if (!huge) {
>  		int error = mem_cgroup_charge(folio, NULL, gfp);
> -		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
>  		if (error)
>  			return error;
>  		charged = true;
> -		xas_set_order(&xas, index, folio_order(folio));
> -		nr = folio_nr_pages(folio);
>  	}

When a hugetlb page is added to the page cache, the ref count will now
be increased by folio_nr_pages.  So, the ref count for a 2MB hugetlb page
on x86 will be increased by 512.

We will need a corresponding change to migrate_huge_page_move_mapping().
For migration, the ref count is checked as follows:

	xas_lock_irq(&xas);
	expected_count = 2 + folio_has_private(src);
	if (!folio_ref_freeze(src, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}

So, this patch will break hugetlb migration of hugetlb pages in the page
cache.

Sorry for not noticing this earlier.
-- 
Mike Kravetz


* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-17 18:18 [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c Sidhartha Kumar
  2023-08-18 18:03 ` Andrew Morton
  2023-08-21 18:33 ` Mike Kravetz
@ 2023-08-22 17:15 ` Matthew Wilcox
From: Matthew Wilcox @ 2023-08-22 17:15 UTC (permalink / raw)
  To: Sidhartha Kumar; +Cc: linux-kernel, linux-mm, akpm, songmuchun, mike.kravetz

On Thu, Aug 17, 2023 at 11:18:36AM -0700, Sidhartha Kumar wrote:
> @@ -890,8 +867,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
>  					unsigned long address)
>  {
>  	pgoff_t pgoff;
> -	if (unlikely(is_vm_hugetlb_page(vma)))
> -		return linear_hugepage_index(vma, address);
>  	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
>  	pgoff += vma->vm_pgoff;
>  	return pgoff;

This is the last use of linear_hugepage_index(), so please remove the
function and its declaration too.


* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-08-21 18:33 ` Mike Kravetz
@ 2023-09-05  4:05   ` Sidhartha Kumar
  2023-09-07  0:18     ` Mike Kravetz
From: Sidhartha Kumar @ 2023-09-05  4:05 UTC (permalink / raw)
  To: Mike Kravetz; +Cc: linux-kernel, linux-mm, akpm, songmuchun, willy

On 8/21/23 11:33 AM, Mike Kravetz wrote:
> On 08/17/23 11:18, Sidhartha Kumar wrote:
>> Remove the special-cased hugetlb handling code within the page cache by
>> changing the granularity of each index to the base page size rather than
>> the huge page size. Add new wrappers for hugetlb code to interact with
>> the page cache; these convert a huge-page-sized index to a linear index.
> <snip>
>> @@ -237,7 +234,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
>>   	if (free_folio)
>>   		free_folio(folio);
>>   
>> -	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
>> +	if (folio_test_large(folio))
>>   		refs = folio_nr_pages(folio);
>>   	folio_put_refs(folio, refs);
>>   }
>> @@ -858,14 +855,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
>>   
>>   	if (!huge) {
>>   		int error = mem_cgroup_charge(folio, NULL, gfp);
>> -		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
>>   		if (error)
>>   			return error;
>>   		charged = true;
>> -		xas_set_order(&xas, index, folio_order(folio));
>> -		nr = folio_nr_pages(folio);
>>   	}
> 
> When a hugetlb page is added to the page cache, the ref count will now
> be increased by folio_nr_pages.  So, the ref count for a 2MB hugetlb page
> on x86 will be increased by 512.
> 
> We will need a corresponding change to migrate_huge_page_move_mapping().
> For migration, the ref count is checked as follows:
> 
> 	xas_lock_irq(&xas);
> 	expected_count = 2 + folio_has_private(src);
Hi Mike,

Thanks for catching this. Changing this line to:
+	expected_count = folio_expected_refs(mapping, src);
seems to fix migration from my testing. My test was inserting a sleep() 
in the hugepage-mmap.c selftest and running the migratepages command.

With this version of the patch:
migrate_pages(44906, 65, [0x0000000000000001], [0x0000000000000002]) = 75
which means 75 pages did not migrate and after the change to 
folio_expected_refs():
migrate_pages(7344, 65, [0x0000000000000001], [0x0000000000000002]) = 0
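
For context, the check in migrate_huge_page_move_mapping() would then read
roughly as below (a sketch; folio_expected_refs() is the helper mm/migrate.c
already uses on the !hugetlb path, counting one reference for the caller
plus one per base page held by the page cache):

	xas_lock_irq(&xas);
	/* Matches the folio_nr_pages() references now taken when a
	 * hugetlb folio is added to the page cache. */
	expected_count = folio_expected_refs(mapping, src);
	if (!folio_ref_freeze(src, expected_count)) {
		xas_unlock_irq(&xas);
		return -EAGAIN;
	}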

Does that change look correct to you?

Thanks,
Sid Kumar


> 	if (!folio_ref_freeze(src, expected_count)) {
> 		xas_unlock_irq(&xas);
> 		return -EAGAIN;
> 	}
> 
> So, this patch will break hugetlb migration of hugetlb pages in the page
> cache.
> 
> Sorry for not noticing this earlier.



* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
  2023-09-05  4:05   ` Sidhartha Kumar
@ 2023-09-07  0:18     ` Mike Kravetz
From: Mike Kravetz @ 2023-09-07  0:18 UTC (permalink / raw)
  To: Sidhartha Kumar; +Cc: linux-kernel, linux-mm, akpm, songmuchun, willy

[-- Attachment #1: Type: text/plain, Size: 2654 bytes --]

On 09/04/23 21:05, Sidhartha Kumar wrote:
> On 8/21/23 11:33 AM, Mike Kravetz wrote:
> > On 08/17/23 11:18, Sidhartha Kumar wrote:
> > > Remove the special-cased hugetlb handling code within the page cache by
> > > changing the granularity of each index to the base page size rather than
> > > the huge page size. Add new wrappers for hugetlb code to interact with
> > > the page cache; these convert a huge-page-sized index to a linear index.
> > <snip>
> > > @@ -237,7 +234,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio)
> > >   	if (free_folio)
> > >   		free_folio(folio);
> > > -	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
> > > +	if (folio_test_large(folio))
> > >   		refs = folio_nr_pages(folio);
> > >   	folio_put_refs(folio, refs);
> > >   }
> > > @@ -858,14 +855,15 @@ noinline int __filemap_add_folio(struct address_space *mapping,
> > >   	if (!huge) {
> > >   		int error = mem_cgroup_charge(folio, NULL, gfp);
> > > -		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
> > >   		if (error)
> > >   			return error;
> > >   		charged = true;
> > > -		xas_set_order(&xas, index, folio_order(folio));
> > > -		nr = folio_nr_pages(folio);
> > >   	}
> > 
> > When a hugetlb page is added to the page cache, the ref count will now
> > be increased by folio_nr_pages.  So, the ref count for a 2MB hugetlb page
> > on x86 will be increased by 512.
> > 
> > We will need a corresponding change to migrate_huge_page_move_mapping().
> > For migration, the ref count is checked as follows:
> > 
> > 	xas_lock_irq(&xas);
> > 	expected_count = 2 + folio_has_private(src);
> Hi Mike,
> 
> Thanks for catching this. Changing this line to:
> +	expected_count = folio_expected_refs(mapping, src);
> seems to fix migration from my testing. My test was inserting a sleep() in
> the hugepage-mmap.c selftest and running the migratepages command.
> 
> With this version of the patch:
> migrate_pages(44906, 65, [0x0000000000000001], [0x0000000000000002]) = 75
> which means 75 pages did not migrate and after the change to
> folio_expected_refs():
> migrate_pages(7344, 65, [0x0000000000000001], [0x0000000000000002]) = 0
> 
> Does that change look correct to you?

I just ran the simple attached test program (don't laugh) on the suggested
change.  Command line './move-pages 2 /var/opt/oracle/hugepool/foo'.
Unfortunately, migration is not working as expected.  The source pages of
the migration are not freed.

I have not taken a closer look at the code to get an idea about root cause.
Certainly, it has to do with the ref counts.  I can look closer in a day or
two if you have not resolved the issue.
-- 
Mike Kravetz

[-- Attachment #2: move-pages.c --]
[-- Type: text/plain, Size: 3777 bytes --]

/*
 * hugepage-mmap:
 *
 * Example of using huge page memory in a user application using the mmap
 * system call.  Before running this application, make sure that the
 * administrator has mounted the hugetlbfs filesystem (on some directory
 * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
 * example, the app is requesting memory of size 256MB that is backed by
 * huge pages.
 *
 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
 * huge pages.  That means that if one requires a fixed address, a huge page
 * aligned address starting with 0x800000... will be required.  If a fixed
 * address is not required, the kernel will select an address in the proper
 * range.
 * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
 */

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#define __USE_GNU
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <time.h>
#include <numa.h>
#include <numaif.h>

#define USAGE "USAGE: %s num_hpages hugepagefile_name"
#define H_PAGESIZE (2 * 1024 * 1024)
#define B_PAGESIZE (4096)

#define ITERATIONS 100000

#define PROTECTION (PROT_READ | PROT_WRITE)
#define ADDR (void *)(0x0UL)
#define FLAGS (MAP_SHARED)

int main(int argc, char ** argv)
{
	char *f_name;
	char *sep;
	char ch;
	int fd;
	long i;
	long long hpages, bpages;
	void *addr;
	char foo;
	long count = 0;
	void **pages;
	int *nodes;
	int *status;
	int flags;
	long m_ret;
	/*
	 * HARD CODED FOR TWO NODES: 0 and 1
	 */
	unsigned long node0_mask = 01L << 0;
	unsigned long node1_mask = 01L << 1;

	if (argc != 3) {
		printf(USAGE, argv[0]);
		exit (1);
	}

	errno = 0;	/* strtol() only sets errno on failure */
	hpages = strtol(argv[1], &sep, 0);
	if (errno || hpages < 0) {
		printf("Invalid number hpages (%s)\n", argv[1]);
		printf(USAGE, argv[0]);
		exit (1);
	}
	bpages = hpages * (H_PAGESIZE / B_PAGESIZE);

	f_name = argv[2];
	fd = open(f_name, O_CREAT | O_RDWR, 0755);
	if (fd < 0) {
		printf("Open of %s failed", argv[2]);
		exit(1);
	}

	addr = mmap(ADDR, hpages * H_PAGESIZE, PROTECTION, FLAGS, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		exit (1);
	}
	printf("%ld huge pages mapped at 0x%lx\n", hpages,
		( unsigned long)addr);
	printf("Faulting in all pages\n");
	for (i=0; i < hpages; i++)
		foo = *((char *)(addr + (i * H_PAGESIZE)));

	pages = malloc(bpages * sizeof(void *));
	nodes = malloc(bpages * sizeof(int));
	status = malloc(bpages * sizeof(int));
	if (!pages || !nodes || !status) {
		printf("error allocating memory for arrays\n");
		exit (1);
	}

while (1) {
	printf("Hit any key to move hugetlb pages to node 1\n");
	read(STDIN_FILENO, &ch, 1);

	for (i=0; i < hpages; i++) {
		pages[i] = addr + (i * H_PAGESIZE);
		// pages[i] = addr + (i * H_PAGESIZE) + B_PAGESIZE;
		nodes[i] = 1;
		status[i] = -1;
		flags = MPOL_MF_MOVE_ALL;
	}
	m_ret = numa_move_pages(0, hpages, pages, nodes, status, flags);
	if (m_ret) {
		perror("move_pages");
		if (m_ret > 0)
			printf("%ld pages not migrated\n", m_ret);
	} else {
		printf("Success!\n");
	}
	for (i=0; i < hpages; i++) {
		printf("\tstatus[%d] = %d\n", i, status[i]);
		status[i] = -1;
	}

	printf("Hit any key to move hugetlb pages to node 0\n");
	read(STDIN_FILENO, &ch, 1);
	for (i=0; i < hpages; i++) {
		pages[i] = addr + (i * H_PAGESIZE);
		// pages[i] = addr + (i * H_PAGESIZE) + B_PAGESIZE;
		nodes[i] = 0;
		status[i] = -1;
		flags = MPOL_MF_MOVE_ALL;
	}
	m_ret = numa_move_pages(0, hpages, pages, nodes, status, flags);
	if (m_ret) {
		perror("move_pages");
		if (m_ret > 0)
			printf("%ld pages not migrated\n", m_ret);
	} else {
		printf("Success!\n");
	}
	for (i=0; i < hpages; i++) {
		printf("\tstatus[%d] = %d\n", i, status[i]);
		status[i] = -1;
	}
}

	munmap(addr, hpages * H_PAGESIZE);
	close(fd);

	return 0;
}

