From: Roman Gushchin <guro@fb.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Zi Yan <ziy@nvidia.com>, Joonsoo Kim <iamjoonsoo.kim@lge.com>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	<saberlily.xia@hisilicon.com>, <linux-kernel@vger.kernel.org>,
	<linux-mm@kvack.org>, <kernel-team@fb.com>,
	Roman Gushchin <guro@fb.com>
Subject: [PATCH v1 1/2] mm: cma: introduce cma_release_nowait()
Date: Thu, 22 Oct 2020 15:53:07 -0700
Message-ID: <20201022225308.2927890-2-guro@fb.com>
In-Reply-To: <20201022225308.2927890-1-guro@fb.com>

cma_release() has to take the cma->lock mutex to clear the cma bitmap.
This makes it a blocking function, which complicates its use from
non-blocking contexts. For instance, the hugetlbfs code temporarily
drops the hugetlb_lock spinlock in order to call cma_release().
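
For illustration, a simplified sketch of the lock-dropping dance this
forces on callers (the function name is hypothetical, not the actual
hugetlb code; cma_release() is the real API declared in linux/cma.h):

	#include <linux/cma.h>
	#include <linux/spinlock.h>

	/* The caller must give up its spinlock around the blocking
	 * release and revalidate the protected state afterwards. */
	static void release_to_cma_blocking(spinlock_t *lock, struct cma *cma,
					    struct page *page, unsigned int count)
	{
		spin_unlock(lock);		/* cma_release() may sleep */
		cma_release(cma, page, count);
		spin_lock(lock);		/* state may have changed */
	}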

This patch introduces a non-blocking cma_release_nowait(), which
postpones the cma bitmap clearance: it is performed later from a work
context. The first page of the cma allocation is used to store the
work struct. Because CMA allocations and de-allocations are usually
not that frequent, a single global workqueue is used.
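
For reference, the deferred-release descriptor (defined below in
mm/cma.c) is a few dozen bytes, so it trivially fits into the page
that carries it; a compile-time check like the following would make
that assumption explicit (illustrative only, not part of the patch):

	#include <linux/build_bug.h>
	#include <linux/mm.h>
	#include <linux/workqueue.h>

	struct cma_clear_bitmap_work {
		struct work_struct work;	/* runs cma_clear_bitmap_fn() */
		struct cma *cma;		/* area the pages came from */
		unsigned long pfn;		/* first pfn of the allocation */
		unsigned int count;		/* number of pages released */
	};

	static inline void cma_work_fits_in_page(void)
	{
		/* the first freed page doubles as descriptor storage */
		BUILD_BUG_ON(sizeof(struct cma_clear_bitmap_work) > PAGE_SIZE);
	}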

To make sure that a subsequent cma_alloc() call will succeed,
cma_alloc() flushes the cma_release_wq workqueue. To avoid a
performance regression in the case when only cma_release() is used,
the flush is gated by a per-cma-area flag, which is set by the first
call to cma_release_nowait().
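
With the new helper, a caller in atomic context no longer has to drop
its lock; a hypothetical sketch of the resulting pattern (patch 2/2
applies this to the hugetlb code):

	#include <linux/cma.h>
	#include <linux/spinlock.h>

	static void release_to_cma_atomic(spinlock_t *lock, struct cma *cma,
					  struct page *page, unsigned int count)
	{
		spin_lock(lock);
		/* ... update accounting under the lock ... */
		cma_release_nowait(cma, page, count);	/* non-blocking */
		spin_unlock(lock);
	}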

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 include/linux/cma.h |  2 +
 mm/cma.c            | 93 +++++++++++++++++++++++++++++++++++++++++++++
 mm/cma.h            |  5 +++
 3 files changed, 100 insertions(+)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 217999c8a762..497eca478c2f 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -47,6 +47,8 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 			      bool no_warn);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
+extern bool cma_release_nowait(struct cma *cma, const struct page *pages,
+			       unsigned int count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
 #endif
diff --git a/mm/cma.c b/mm/cma.c
index 7f415d7cda9f..9fcfddcf1a6c 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -36,10 +36,19 @@
 
 #include "cma.h"
 
+struct cma_clear_bitmap_work {
+	struct work_struct work;
+	struct cma *cma;
+	unsigned long pfn;
+	unsigned int count;
+};
+
 struct cma cma_areas[MAX_CMA_AREAS];
 unsigned cma_area_count;
 static DEFINE_MUTEX(cma_mutex);
 
+struct workqueue_struct *cma_release_wq;
+
 phys_addr_t cma_get_base(const struct cma *cma)
 {
 	return PFN_PHYS(cma->base_pfn);
@@ -148,6 +157,10 @@ static int __init cma_init_reserved_areas(void)
 	for (i = 0; i < cma_area_count; i++)
 		cma_activate_area(&cma_areas[i]);
 
+	cma_release_wq = create_workqueue("cma_release");
+	if (!cma_release_wq)
+		return -ENOMEM;
+
 	return 0;
 }
 core_initcall(cma_init_reserved_areas);
@@ -205,6 +218,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 
 	cma->base_pfn = PFN_DOWN(base);
 	cma->count = size >> PAGE_SHIFT;
+	cma->flags = 0;
 	cma->order_per_bit = order_per_bit;
 	*res_cma = cma;
 	cma_area_count++;
@@ -437,6 +451,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 		return NULL;
 
 	for (;;) {
+		/*
+		 * If the CMA bitmap is cleared asynchronously after
+		 * cma_release_nowait(), the cma release workqueue has to be
+		 * flushed here in order to make the allocation succeed.
+		 */
+		if (test_bit(CMA_DELAYED_RELEASE, &cma->flags))
+			flush_workqueue(cma_release_wq);
+
 		mutex_lock(&cma->lock);
 		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
 				bitmap_maxno, start, bitmap_count, mask,
@@ -528,6 +550,77 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
 	return true;
 }
 
+static void cma_clear_bitmap_fn(struct work_struct *work)
+{
+	struct cma_clear_bitmap_work *w;
+
+	w = container_of(work, struct cma_clear_bitmap_work, work);
+
+	cma_clear_bitmap(w->cma, w->pfn, w->count);
+
+	__free_page(pfn_to_page(w->pfn));
+}
+
+/**
+ * cma_release_nowait() - release allocated pages without blocking
+ * @cma:   Contiguous memory region for which the allocation is performed.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * Similar to cma_release(), this function releases memory allocated
+ * by cma_alloc(), but unlike cma_release() is non-blocking and can be
+ * called from an atomic context.
+ * It returns false when the provided pages do not belong to the
+ * contiguous area, and true otherwise.
+ */
+bool cma_release_nowait(struct cma *cma, const struct page *pages,
+			unsigned int count)
+{
+	struct cma_clear_bitmap_work *work;
+	unsigned long pfn;
+
+	if (!cma || !pages)
+		return false;
+
+	pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+	pfn = page_to_pfn(pages);
+
+	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+		return false;
+
+	VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+	/*
+	 * Set CMA_DELAYED_RELEASE flag: subsequent cma_alloc()'s
+	 * will wait for the async part of cma_release_nowait() to
+	 * finish.
+	 */
+	if (unlikely(!test_bit(CMA_DELAYED_RELEASE, &cma->flags)))
+		set_bit(CMA_DELAYED_RELEASE, &cma->flags);
+
+	/*
+	 * To make cma_release_nowait() non-blocking, the cma bitmap is
+	 * cleared from a work context (see cma_clear_bitmap_fn()). The
+	 * first page in the cma allocation is used to store the work
+	 * structure, so it's released only after the cma bitmap clearance.
+	 * Other pages are released immediately, as before.
+	 */
+	if (count > 1)
+		free_contig_range(pfn + 1, count - 1);
+
+	work = (struct cma_clear_bitmap_work *)page_to_virt(pages);
+	INIT_WORK(&work->work, cma_clear_bitmap_fn);
+	work->cma = cma;
+	work->pfn = pfn;
+	work->count = count;
+	queue_work(cma_release_wq, &work->work);
+
+	trace_cma_release(pfn, pages, count);
+
+	return true;
+}
+
 int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
 {
 	int i;
diff --git a/mm/cma.h b/mm/cma.h
index 42ae082cb067..e9293871d122 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -7,6 +7,7 @@
 struct cma {
 	unsigned long   base_pfn;
 	unsigned long   count;
+	unsigned long   flags;
 	unsigned long   *bitmap;
 	unsigned int order_per_bit; /* Order of pages represented by one bit */
 	struct mutex    lock;
@@ -18,6 +19,10 @@ struct cma {
 	char name[CMA_MAX_NAME];
 };
 
+enum cma_flags {
+	CMA_DELAYED_RELEASE, /* cma bitmap is cleared asynchronously */
+};
+
 extern struct cma cma_areas[MAX_CMA_AREAS];
 extern unsigned cma_area_count;
 
-- 
2.26.2


Thread overview: 9+ messages
2020-10-22 22:53 [PATCH v1 0/2] mm: cma: introduce a non-blocking version of cma_release() Roman Gushchin
2020-10-22 22:53 ` Roman Gushchin [this message]
2020-10-24  7:59   ` [PATCH v1 1/2] mm: cma: introduce cma_release_nowait() Christoph Hellwig
2020-10-27 19:56     ` Roman Gushchin
2020-10-22 22:53 ` [PATCH v1 2/2] mm: hugetlb: don't drop hugetlb_lock around cma_release() call Roman Gushchin
2020-10-22 23:42 ` [PATCH v1 0/2] mm: cma: introduce a non-blocking version of cma_release() Zi Yan
2020-10-23  0:47   ` Roman Gushchin
2020-10-23  0:58     ` Zi Yan
2020-10-23 20:55       ` Roman Gushchin
