* [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jonathan Lemon @ 2019-11-12  5:32 UTC
  To: netdev, davem; +Cc: kernel-team, brouer, ilias.apalodimas

The page pool keeps track of the number of pages in flight, and
it isn't safe to remove the pool until all pages are returned.

Disallow removing the pool until all pages are back, so the pool
is always available for page producers.

Make the page pool responsible for its own delayed destruction
instead of relying on XDP, so the page pool can be used without
xdp.

When all pages are returned, free the pool and notify xdp if the
pool is registered with the xdp memory system.  Have the callback
perform a table walk since some drivers (cpsw) may share the pool
among multiple xdp_rxq_info.

Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning")
Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 .../net/ethernet/stmicro/stmmac/stmmac_main.c |   4 +-
 include/net/page_pool.h                       |  54 +++-----
 include/net/xdp_priv.h                        |   4 -
 include/trace/events/xdp.h                    |  19 +--
 net/core/page_pool.c                          | 115 ++++++++++------
 net/core/xdp.c                                | 125 +++++++-----------
 6 files changed, 137 insertions(+), 184 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 199c4f938bb2..cd0d8a4bbeb3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1502,10 +1502,8 @@ static void free_dma_rx_desc_resources(struct stmmac_priv *priv)
 					  rx_q->dma_erx, rx_q->dma_rx_phy);
 
 		kfree(rx_q->buf_pool);
-		if (rx_q->page_pool) {
-			page_pool_request_shutdown(rx_q->page_pool);
+		if (rx_q->page_pool)
 			page_pool_destroy(rx_q->page_pool);
-		}
 	}
 }
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 2cbcdbdec254..0a9eba9c682c 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -70,7 +70,12 @@ struct page_pool_params {
 struct page_pool {
 	struct page_pool_params p;
 
-        u32 pages_state_hold_cnt;
+	struct delayed_work release_dw;
+	void (*disconnect)(void *);
+	unsigned long defer_start;
+	unsigned long defer_warn;
+
+	u32 pages_state_hold_cnt;
 
 	/*
 	 * Data structure for allocation side
@@ -129,26 +134,20 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
-void __page_pool_free(struct page_pool *pool);
-static inline void page_pool_free(struct page_pool *pool)
-{
-	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
-	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
-	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_free(pool);
-#endif
-}
-
-/* Drivers use this instead of page_pool_free */
+void page_pool_destroy(struct page_pool *pool);
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *));
+#else
 static inline void page_pool_destroy(struct page_pool *pool)
 {
-	if (!pool)
-		return;
-
-	page_pool_free(pool);
 }
 
+static inline void page_pool_use_xdp_mem(struct page_pool *pool,
+					 void (*disconnect)(void *))
+{
+}
+#endif
+
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
@@ -170,24 +169,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
-/* API user MUST have disconnected alloc-side (not allowed to call
- * page_pool_alloc_pages()) before calling this.  The free-side can
- * still run concurrently, to handle in-flight packet-pages.
- *
- * A request to shutdown can fail (with false) if there are still
- * in-flight packet-pages.
- */
-bool __page_pool_request_shutdown(struct page_pool *pool);
-static inline bool page_pool_request_shutdown(struct page_pool *pool)
-{
-	bool safe_to_remove = false;
-
-#ifdef CONFIG_PAGE_POOL
-	safe_to_remove = __page_pool_request_shutdown(pool);
-#endif
-	return safe_to_remove;
-}
-
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
  * a regular page (that will eventually be returned to the normal
@@ -216,11 +197,6 @@ static inline bool is_page_pool_compiled_in(void)
 #endif
 }
 
-static inline void page_pool_get(struct page_pool *pool)
-{
-	refcount_inc(&pool->user_cnt);
-}
-
 static inline bool page_pool_put(struct page_pool *pool)
 {
 	return refcount_dec_and_test(&pool->user_cnt);
diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
index 6a8cba6ea79a..a9d5b7603b89 100644
--- a/include/net/xdp_priv.h
+++ b/include/net/xdp_priv.h
@@ -12,12 +12,8 @@ struct xdp_mem_allocator {
 		struct page_pool *page_pool;
 		struct zero_copy_allocator *zc_alloc;
 	};
-	int disconnect_cnt;
-	unsigned long defer_start;
 	struct rhash_head node;
 	struct rcu_head rcu;
-	struct delayed_work defer_wq;
-	unsigned long defer_warn;
 };
 
 #endif /* __LINUX_NET_XDP_PRIV_H__ */
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index c7e3c9c5bad3..a7378bcd9928 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -317,19 +317,15 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
 
 TRACE_EVENT(mem_disconnect,
 
-	TP_PROTO(const struct xdp_mem_allocator *xa,
-		 bool safe_to_remove, bool force),
+	TP_PROTO(const struct xdp_mem_allocator *xa),
 
-	TP_ARGS(xa, safe_to_remove, force),
+	TP_ARGS(xa),
 
 	TP_STRUCT__entry(
 		__field(const struct xdp_mem_allocator *,	xa)
 		__field(u32,		mem_id)
 		__field(u32,		mem_type)
 		__field(const void *,	allocator)
-		__field(bool,		safe_to_remove)
-		__field(bool,		force)
-		__field(int,		disconnect_cnt)
 	),
 
 	TP_fast_assign(
@@ -337,19 +333,12 @@ TRACE_EVENT(mem_disconnect,
 		__entry->mem_id		= xa->mem.id;
 		__entry->mem_type	= xa->mem.type;
 		__entry->allocator	= xa->allocator;
-		__entry->safe_to_remove	= safe_to_remove;
-		__entry->force		= force;
-		__entry->disconnect_cnt	= xa->disconnect_cnt;
 	),
 
-	TP_printk("mem_id=%d mem_type=%s allocator=%p"
-		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
+	TP_printk("mem_id=%d mem_type=%s allocator=%p",
 		  __entry->mem_id,
 		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
-		  __entry->allocator,
-		  __entry->safe_to_remove ? "true" : "false",
-		  __entry->force ? "true" : "false",
-		  __entry->disconnect_cnt
+		  __entry->allocator
 	)
 );
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5bc65587f1c4..bfe96326335d 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,9 @@
 
 #include <trace/events/page_pool.h>
 
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -193,22 +196,14 @@ static s32 page_pool_inflight(struct page_pool *pool)
 {
 	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 distance;
+	s32 inflight;
 
-	distance = _distance(hold_cnt, release_cnt);
+	inflight = _distance(hold_cnt, release_cnt);
 
-	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
-	return distance;
-}
-
-static bool __page_pool_safe_to_destroy(struct page_pool *pool)
-{
-	s32 inflight = page_pool_inflight(pool);
-
-	/* The distance should not be able to become negative */
+	trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt);
 	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
 
-	return (inflight == 0);
+	return inflight;
 }
 
 /* Cleanup page_pool state from page */
@@ -338,31 +333,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
-static void __warn_in_flight(struct page_pool *pool)
+static void page_pool_free(struct page_pool *pool)
 {
-	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
-	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 distance;
-
-	distance = _distance(hold_cnt, release_cnt);
-
-	/* Drivers should fix this, but only problematic when DMA is used */
-	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
-	     distance, hold_cnt, release_cnt);
-}
-
-void __page_pool_free(struct page_pool *pool)
-{
-	/* Only last user actually free/release resources */
-	if (!page_pool_put(pool))
-		return;
-
-	WARN(pool->alloc.count, "API usage violation");
-	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
-
-	/* Can happen due to forced shutdown */
-	if (!__page_pool_safe_to_destroy(pool))
-		__warn_in_flight(pool);
+	if (pool->disconnect)
+		pool->disconnect(pool);
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 
@@ -371,12 +345,8 @@ void __page_pool_free(struct page_pool *pool)
 
 	kfree(pool);
 }
-EXPORT_SYMBOL(__page_pool_free);
 
-/* Request to shutdown: release pages cached by page_pool, and check
- * for in-flight pages
- */
-bool __page_pool_request_shutdown(struct page_pool *pool)
+static void page_pool_scrub(struct page_pool *pool)
 {
 	struct page *page;
 
@@ -393,7 +363,64 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
 	 * be in-flight.
 	 */
 	__page_pool_empty_ring(pool);
-
-	return __page_pool_safe_to_destroy(pool);
 }
-EXPORT_SYMBOL(__page_pool_request_shutdown);
+
+static int page_pool_release(struct page_pool *pool)
+{
+	int inflight;
+
+	page_pool_scrub(pool);
+	inflight = page_pool_inflight(pool);
+	if (!inflight)
+		page_pool_free(pool);
+
+	return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+	struct delayed_work *dwq = to_delayed_work(wq);
+	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+	int inflight;
+
+	inflight = page_pool_release(pool);
+	if (!inflight)
+		return;
+
+	/* Periodic warning */
+	if (time_after_eq(jiffies, pool->defer_warn)) {
+		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+			__func__, inflight, sec);
+		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+	}
+
+	/* Still not ready to be disconnected, retry later */
+	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+	refcount_inc(&pool->user_cnt);
+	pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+	if (!pool)
+		return;
+
+	if (!page_pool_put(pool))
+		return;
+
+	if (!page_pool_release(pool))
+		return;
+
+	pool->defer_start = jiffies;
+	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
+
+	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+EXPORT_SYMBOL(page_pool_destroy);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 20781ad5f9c3..8e405abaf05a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 
 	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
 
-	/* Allocator have indicated safe to remove before this is called */
-	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
-		page_pool_free(xa->page_pool);
-
 	/* Allow this ID to be reused */
 	ida_simple_remove(&mem_id_pool, xa->mem.id);
 
@@ -85,10 +81,41 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
-static bool __mem_id_disconnect(int id, bool force)
+static void mem_xa_remove(struct xdp_mem_allocator *xa)
+{
+	trace_mem_disconnect(xa);
+
+	mutex_lock(&mem_id_lock);
+
+	if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+	mutex_unlock(&mem_id_lock);
+}
+
+static void mem_allocator_disconnect(void *allocator)
+{
+	struct xdp_mem_allocator *xa;
+	struct rhashtable_iter iter;
+
+	rhashtable_walk_enter(mem_id_ht, &iter);
+	do {
+		rhashtable_walk_start(&iter);
+
+		while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
+			if (xa->allocator == allocator)
+				mem_xa_remove(xa);
+		}
+
+		rhashtable_walk_stop(&iter);
+
+	} while (xa == ERR_PTR(-EAGAIN));
+	rhashtable_walk_exit(&iter);
+}
+
+static void mem_id_disconnect(int id)
 {
 	struct xdp_mem_allocator *xa;
-	bool safe_to_remove = true;
 
 	mutex_lock(&mem_id_lock);
 
@@ -96,51 +123,15 @@ static bool __mem_id_disconnect(int id, bool force)
 	if (!xa) {
 		mutex_unlock(&mem_id_lock);
 		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
-		return true;
+		return;
 	}
-	xa->disconnect_cnt++;
 
-	/* Detects in-flight packet-pages for page_pool */
-	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
-		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+	trace_mem_disconnect(xa);
 
-	trace_mem_disconnect(xa, safe_to_remove, force);
-
-	if ((safe_to_remove || force) &&
-	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+	if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
 		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
 
 	mutex_unlock(&mem_id_lock);
-	return (safe_to_remove|force);
-}
-
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (30 * HZ)
-#define DEFER_MAX_RETRIES 120
-
-static void mem_id_disconnect_defer_retry(struct work_struct *wq)
-{
-	struct delayed_work *dwq = to_delayed_work(wq);
-	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
-	bool force = false;
-
-	if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
-		force = true;
-
-	if (__mem_id_disconnect(xa->mem.id, force))
-		return;
-
-	/* Periodic warning */
-	if (time_after_eq(jiffies, xa->defer_warn)) {
-		int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
-
-		pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
-			__func__, xa->mem.id, xa->disconnect_cnt, sec);
-		xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-	}
-
-	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
@@ -153,38 +144,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 		return;
 	}
 
-	if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
-	    xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
-		return;
-	}
-
 	if (id == 0)
 		return;
 
-	if (__mem_id_disconnect(id, false))
-		return;
+	if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
+		return mem_id_disconnect(id);
 
-	/* Could not disconnect, defer new disconnect attempt to later */
-	mutex_lock(&mem_id_lock);
-
-	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-	if (!xa) {
-		mutex_unlock(&mem_id_lock);
-		return;
+	if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+		rcu_read_lock();
+		xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+		page_pool_destroy(xa->page_pool);
+		rcu_read_unlock();
 	}
-	xa->defer_start = jiffies;
-	xa->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
-
-	INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
-	mutex_unlock(&mem_id_lock);
-	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
-/* This unregister operation will also cleanup and destroy the
- * allocator. The page_pool_free() operation is first called when it's
- * safe to remove, possibly deferred to a workqueue.
- */
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -371,7 +345,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 	}
 
 	if (type == MEM_TYPE_PAGE_POOL)
-		page_pool_get(xdp_alloc->page_pool);
+		page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);
 
 	mutex_unlock(&mem_id_lock);
 
@@ -402,15 +376,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (likely(xa)) {
-			napi_direct &= !xdp_return_frame_no_direct();
-			page_pool_put_page(xa->page_pool, page, napi_direct);
-		} else {
-			/* Hopefully stack show who to blame for late return */
-			WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
-			trace_mem_return_failed(mem, page);
-			put_page(page);
-		}
+		napi_direct &= !xdp_return_frame_no_direct();
+		page_pool_put_page(xa->page_pool, page, napi_direct);
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-- 
2.17.1
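
For the driver-facing effect of this change, a minimal teardown sketch
(rxq->page_pool stands in for a hypothetical driver field, mirroring the
stmmac hunk above):

	/* before: drivers had to request shutdown themselves */
	if (rxq->page_pool) {
		page_pool_request_shutdown(rxq->page_pool);
		page_pool_destroy(rxq->page_pool);
	}

	/* after: page_pool_destroy() is NULL-safe and defers the actual
	 * free via delayed work until inflight == 0
	 */
	page_pool_destroy(rxq->page_pool);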



* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jesper Dangaard Brouer @ 2019-11-12 12:08 UTC
  To: Jonathan Lemon; +Cc: netdev, davem, kernel-team, ilias.apalodimas, brouer

On Mon, 11 Nov 2019 21:32:10 -0800
Jonathan Lemon <jonathan.lemon@gmail.com> wrote:

> The page pool keeps track of the number of pages in flight, and
> it isn't safe to remove the pool until all pages are returned.
> 
> Disallow removing the pool until all pages are back, so the pool
> is always available for page producers.
> 
> Make the page pool responsible for its own delayed destruction
> instead of relying on XDP, so the page pool can be used without
> xdp.

Can you please change this to:
 [... can be used without] xdp memory model.

 
> When all pages are returned, free the pool and notify xdp if the
> pool is registered with the xdp memory system.  Have the callback
> perform a table walk since some drivers (cpsw) may share the pool
> among multiple xdp_rxq_info.
> 
> Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning")
> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
> ---
[...]
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5bc65587f1c4..bfe96326335d 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
[...]

Found an issue, see below.

> @@ -338,31 +333,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
>  	}
>  }
>  
> -static void __warn_in_flight(struct page_pool *pool)
> +static void page_pool_free(struct page_pool *pool)
>  {
> -	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
> -	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
> -	s32 distance;
> -
> -	distance = _distance(hold_cnt, release_cnt);
> -
> -	/* Drivers should fix this, but only problematic when DMA is used */
> -	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
> -	     distance, hold_cnt, release_cnt);
> -}
> -
> -void __page_pool_free(struct page_pool *pool)
> -{
> -	/* Only last user actually free/release resources */
> -	if (!page_pool_put(pool))
> -		return;
> -
> -	WARN(pool->alloc.count, "API usage violation");
> -	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
> -
> -	/* Can happen due to forced shutdown */
> -	if (!__page_pool_safe_to_destroy(pool))
> -		__warn_in_flight(pool);
> +	if (pool->disconnect)
> +		pool->disconnect(pool);
>  	ptr_ring_cleanup(&pool->ring, NULL);
>  
> @@ -371,12 +345,8 @@ void __page_pool_free(struct page_pool *pool)
>  
>  	kfree(pool);
>  }
> -EXPORT_SYMBOL(__page_pool_free);

I don't think this is correct according to RCU.

Let me reproduce the resulting version of page_pool_free():

 static void page_pool_free(struct page_pool *pool)
 {
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
 }

The issue is that pool->disconnect() will call into
mem_allocator_disconnect() -> mem_xa_remove(), and mem_xa_remove() does
an RCU delayed free.  And page_pool_free() itself immediately does a
kfree(pool).

I do know that we can ONLY reach this page_pool_free() function when
inflight == 0, but that can happen as soon as __page_pool_clean_page()
does the decrement, and after this trace_page_pool_state_release()
can still access the page_pool object (thus, hard to catch use-after-free).
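
To make the window concrete, the interleaving I worry about looks like
this (a sketch; the CPU labels are illustrative, the calls are from the
patch):

	/* CPU A: returns the last in-flight page */
	atomic_inc(&pool->pages_state_release_cnt);	/* inflight -> 0 */

	/* CPU B: page_pool_destroy() / page_pool_release_retry() */
	inflight = page_pool_inflight(pool);		/* sees 0 */
	page_pool_free(pool);				/* kfree(pool) */

	/* CPU A: resumes */
	trace_page_pool_state_release(pool, page,
			atomic_read(&pool->pages_state_release_cnt));
	/* ^ passes, and may read, the already-freed pool */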

  
> -/* Request to shutdown: release pages cached by page_pool, and check
> - * for in-flight pages
> - */
> -bool __page_pool_request_shutdown(struct page_pool *pool)
> +static void page_pool_scrub(struct page_pool *pool)
>  {
>  	struct page *page;
>  
> @@ -393,7 +363,64 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
>  	 * be in-flight.
>  	 */
>  	__page_pool_empty_ring(pool);
> -
> -	return __page_pool_safe_to_destroy(pool);
>  }
> -EXPORT_SYMBOL(__page_pool_request_shutdown);
> +
> +static int page_pool_release(struct page_pool *pool)
> +{
> +	int inflight;
> +
> +	page_pool_scrub(pool);
> +	inflight = page_pool_inflight(pool);
> +	if (!inflight)
> +		page_pool_free(pool);
> +
> +	return inflight;
> +}
> +
> +static void page_pool_release_retry(struct work_struct *wq)
> +{
> +	struct delayed_work *dwq = to_delayed_work(wq);
> +	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
> +	int inflight;
> +
> +	inflight = page_pool_release(pool);
> +	if (!inflight)
> +		return;
> +
> +	/* Periodic warning */
> +	if (time_after_eq(jiffies, pool->defer_warn)) {
> +		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
> +
> +		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
> +			__func__, inflight, sec);
> +		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
> +	}
> +
> +	/* Still not ready to be disconnected, retry later */
> +	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +}
> +
> +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
> +{
> +	refcount_inc(&pool->user_cnt);
> +	pool->disconnect = disconnect;
> +}
> +
> +void page_pool_destroy(struct page_pool *pool)
> +{
> +	if (!pool)
> +		return;
> +
> +	if (!page_pool_put(pool))
> +		return;
> +
> +	if (!page_pool_release(pool))
> +		return;
> +
> +	pool->defer_start = jiffies;
> +	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
> +
> +	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
> +	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
> +}
> +EXPORT_SYMBOL(page_pool_destroy);
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 20781ad5f9c3..8e405abaf05a 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
>  
>  	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
>  
> -	/* Allocator have indicated safe to remove before this is called */
> -	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
> -		page_pool_free(xa->page_pool);
> -
>  	/* Allow this ID to be reused */
>  	ida_simple_remove(&mem_id_pool, xa->mem.id);
>  
> @@ -85,10 +81,41 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
>  	kfree(xa);
>  }
>  
> -static bool __mem_id_disconnect(int id, bool force)
> +static void mem_xa_remove(struct xdp_mem_allocator *xa)
> +{
> +	trace_mem_disconnect(xa);
> +
> +	mutex_lock(&mem_id_lock);
> +
> +	if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> +		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);

RCU delayed free.

> +
> +	mutex_unlock(&mem_id_lock);
> +}
> +
> +static void mem_allocator_disconnect(void *allocator)
> +{
> +	struct xdp_mem_allocator *xa;
> +	struct rhashtable_iter iter;
> +
> +	rhashtable_walk_enter(mem_id_ht, &iter);
> +	do {
> +		rhashtable_walk_start(&iter);
> +
> +		while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
> +			if (xa->allocator == allocator)
> +				mem_xa_remove(xa);
> +		}
> +
> +		rhashtable_walk_stop(&iter);
> +
> +	} while (xa == ERR_PTR(-EAGAIN));
> +	rhashtable_walk_exit(&iter);
> +}
>
> +static void mem_id_disconnect(int id)
>  {
>  	struct xdp_mem_allocator *xa;
> -	bool safe_to_remove = true;
>  
>  	mutex_lock(&mem_id_lock);
>  
> @@ -96,51 +123,15 @@ static bool __mem_id_disconnect(int id, bool force)
>  	if (!xa) {
>  		mutex_unlock(&mem_id_lock);
>  		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
> -		return true;
> +		return;
>  	}
> -	xa->disconnect_cnt++;
>  
> -	/* Detects in-flight packet-pages for page_pool */
> -	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
> -		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
> +	trace_mem_disconnect(xa);
>  
> -	trace_mem_disconnect(xa, safe_to_remove, force);
> -
> -	if ((safe_to_remove || force) &&
> -	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
> +	if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
>  		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
>  
>  	mutex_unlock(&mem_id_lock);
> -	return (safe_to_remove|force);
> -}
[...]

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jesper Dangaard Brouer @ 2019-11-12 15:11 UTC
  To: Jonathan Lemon
  Cc: netdev, davem, kernel-team, ilias.apalodimas, brouer, Brendan Gregg

On Mon, 11 Nov 2019 21:32:10 -0800
Jonathan Lemon <jonathan.lemon@gmail.com> wrote:

> The page pool keeps track of the number of pages in flight, and
> it isn't safe to remove the pool until all pages are returned.
> [...]

FYI: I'm using this work-in-progress bpftrace script to test your patch:

 https://github.com/xdp-project/xdp-project/blob/master/areas/mem/bpftrace/page_pool_track_shutdown01.bt
 

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jonathan Lemon @ 2019-11-12 16:33 UTC
  To: Jesper Dangaard Brouer; +Cc: netdev, davem, kernel-team, ilias.apalodimas

On 12 Nov 2019, at 4:08, Jesper Dangaard Brouer wrote:

> On Mon, 11 Nov 2019 21:32:10 -0800
> Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>
>> [...]
>>
>> Make the page pool responsible for its own delayed destruction
>> instead of relying on XDP, so the page pool can be used without
>> xdp.
>
> Can you please change this to:
>  [... can be used without] xdp memory model.

Okay.


> [...]
>
> Found an issue, see below.
>
>> [...]
>
> I don't think this is correct according to RCU.
>
> Let me reproduce the resulting version of page_pool_free():
>
>  static void page_pool_free(struct page_pool *pool)
>  {
> 	if (pool->disconnect)
> 		pool->disconnect(pool);
>
> 	ptr_ring_cleanup(&pool->ring, NULL);
>
> 	if (pool->p.flags & PP_FLAG_DMA_MAP)
> 		put_device(pool->p.dev);
>
> 	kfree(pool);
>  }
>
> The issue is that pool->disconnect() will call into
> mem_allocator_disconnect() -> mem_xa_remove(), and mem_xa_remove() does
> an RCU delayed free.  And page_pool_free() itself immediately does a
> kfree(pool).
>
> I do know that we can ONLY reach this page_pool_free() function when
> inflight == 0, but that can happen as soon as __page_pool_clean_page()
> does the decrement, and after this trace_page_pool_state_release()
> can still access the page_pool object (thus, hard to catch
> use-after-free).

Is this an issue?  The RCU delayed free is for the xa object; it is held
in an RCU-protected mem_id_ht, so it can't be freed until all the
readers are complete.

The change of &pool->pages_state_release_cnt can decrement the inflight
pages to 0, and another thread could see inflight == 0 and immediately
remove the pool.  The atomic manipulation should be the last use of
the pool - this should be documented; I'll add that as well:

skip_dma_unmap:
         /* This may be the last page returned, releasing the pool, so
          * it is not safe to reference pool afterwards.
          */
         count = atomic_inc_return(&pool->pages_state_release_cnt);
         trace_page_pool_state_release(pool, page, count);

The trace_page_pool_state_release() does not dereference pool; it just
reports the pointer value, so there shouldn't be any use-after-free.
-- 
Jonathan


* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jesper Dangaard Brouer @ 2019-11-12 16:48 UTC
  To: Jonathan Lemon; +Cc: netdev, davem, kernel-team, ilias.apalodimas, brouer

On Tue, 12 Nov 2019 08:33:58 -0800
"Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:

> On 12 Nov 2019, at 4:08, Jesper Dangaard Brouer wrote:
> 
> > [...]
> >
> > I do know that we can ONLY reach this page_pool_free() function when
> > inflight == 0, but that can happen as soon as __page_pool_clean_page()
> > does the decrement, and after this trace_page_pool_state_release()
> > can still access the page_pool object (thus, hard to catch
> > use-after-free).
> 
> Is this an issue?  The RCU delayed free is for the xa object; it is held
> in an RCU-protected mem_id_ht, so it can't be freed until all the
> readers are complete.
> 
> The change of &pool->pages_state_release_cnt can decrement the inflight
> pages to 0, and another thread could see inflight == 0 and immediately
> remove the pool.  The atomic manipulation should be the last use of
> the pool - this should be documented; I'll add that as well:
> 
> skip_dma_unmap:
>          /* This may be the last page returned, releasing the pool, so
>           * it is not safe to reference pool afterwards.
>           */
>          count = atomic_inc_return(&pool->pages_state_release_cnt);
>          trace_page_pool_state_release(pool, page, count);
> 
> The trace_page_pool_state_release() does not dereference pool; it just
> reports the pointer value, so there shouldn't be any use-after-free.

In the tracepoint we can still dereference the pool object pointer.
This is made easier by using bpftrace, for example see [1] (and with BTF
this will become more common).

[1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/bpftrace/test_tp__xdp_mem_disconnect.bt

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jonathan Lemon @ 2019-11-12 17:14 UTC
  To: Jesper Dangaard Brouer; +Cc: netdev, davem, kernel-team, ilias.apalodimas

On 12 Nov 2019, at 8:48, Jesper Dangaard Brouer wrote:

> On Tue, 12 Nov 2019 08:33:58 -0800
> "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
>
>> [...]
>>
>> The trace_page_pool_state_release() does not dereference pool; it just
>> reports the pointer value, so there shouldn't be any use-after-free.
>
> In the tracepoint we can still dereference the pool object pointer.
> This is made easier by using bpftrace, for example see [1] (and with BTF
> this will become more common).

This seems problematic today - essentially the current code is only
safe because it's borrowing the RCU protection from the xdp_mem info,
right?
-- 
Jonathan


* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Alexei Starovoitov @ 2019-11-12 17:23 UTC
  To: Jesper Dangaard Brouer, Jonathan Lemon
  Cc: netdev, davem, Kernel Team, ilias.apalodimas

On 11/12/19 8:48 AM, Jesper Dangaard Brouer wrote:
>> The trace_page_pool_state_release() does not dereference pool; it just
>> reports the pointer value, so there shouldn't be any use-after-free.
> In the tracepoint we can still dereference the pool object pointer.
> This is made easier by using bpftrace, for example see [1] (and with BTF
> this will become more common).

bpf tracing progs cannot assume that the pointer is valid.
The program can remember a kernel pointer in a map and then
access it days later.
Like kretprobe on kfree_skb(). The skb is freed. 100% use-after-free.
Such a bpf program is broken and won't be reading meaningful values,
but it won't crash the kernel.

On the other side we should not be passing pointers to freed objects
into tracepoints. That's just wrong.
Maybe simply move that questionable tracepoint?



* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jonathan Lemon @ 2019-11-12 17:32 UTC
  To: Alexei Starovoitov
  Cc: Jesper Dangaard Brouer, netdev, davem, Kernel Team, ilias.apalodimas



On 12 Nov 2019, at 9:23, Alexei Starovoitov wrote:

> On 11/12/19 8:48 AM, Jesper Dangaard Brouer wrote:
>>> The trace_page_pool_state_release() does not dereference pool; it just
>>> reports the pointer value, so there shouldn't be any use-after-free.
>> In the tracepoint we can still dereference the pool object pointer.
>> This is made easier by using bpftrace, for example see [1] (and with BTF
>> this will become more common).
>
> bpf tracing progs cannot assume that the pointer is valid.
> The program can remember a kernel pointer in a map and then
> access it days later.
> Like kretprobe on kfree_skb(). The skb is freed. 100% use-after-free.
> Such a bpf program is broken and won't be reading meaningful values,
> but it won't crash the kernel.
>
> On the other side we should not be passing pointers to freed objects
> into tracepoints. That's just wrong.
> Maybe simply move that questionable tracepoint?

Yes, move and simplify it.  I believe this patch should resolve the
issue; it just reports pages entering/exiting the pool, without trying
to access the counters - the counters are reported through the inflight
tracepoint.
-- 
Jonathan

diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index 47b5ee880aa9..0adf9aed9f5b 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -35,50 +35,46 @@ TRACE_EVENT(page_pool_inflight,
  	  __entry->pool, __entry->inflight, __entry->hold, __entry->release)
  );

-TRACE_EVENT(page_pool_state_release,
+TRACE_EVENT(page_pool_page_release,

  	TP_PROTO(const struct page_pool *pool,
-		 const struct page *page, u32 release),
+		 const struct page *page),

-	TP_ARGS(pool, page, release),
+	TP_ARGS(pool, page),

  	TP_STRUCT__entry(
  		__field(const struct page_pool *,	pool)
  		__field(const struct page *,		page)
-		__field(u32,				release)
  	),

  	TP_fast_assign(
  		__entry->pool		= pool;
  		__entry->page		= page;
-		__entry->release	= release;
  	),

-	TP_printk("page_pool=%p page=%p release=%u",
-		  __entry->pool, __entry->page, __entry->release)
+	TP_printk("page_pool=%p page=%p",
+		  __entry->pool, __entry->page)
  );

-TRACE_EVENT(page_pool_state_hold,
+TRACE_EVENT(page_pool_page_hold,

  	TP_PROTO(const struct page_pool *pool,
-		 const struct page *page, u32 hold),
+		 const struct page *page),

-	TP_ARGS(pool, page, hold),
+	TP_ARGS(pool, page),

  	TP_STRUCT__entry(
  		__field(const struct page_pool *,	pool)
  		__field(const struct page *,		page)
-		__field(u32,				hold)
  	),

  	TP_fast_assign(
  		__entry->pool	= pool;
  		__entry->page	= page;
-		__entry->hold	= hold;
  	),

-	TP_printk("page_pool=%p page=%p hold=%u",
-		  __entry->pool, __entry->page, __entry->hold)
+	TP_printk("page_pool=%p page=%p",
+		  __entry->pool, __entry->page)
  );

  #endif /* _TRACE_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index bfe96326335d..05b93ea67ac5 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -163,7 +163,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
  	/* Track how many pages are held 'in-flight' */
  	pool->pages_state_hold_cnt++;

-	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+	trace_page_pool_page_hold(pool, page);

  	/* When page just alloc'ed is should/must have refcnt 1. */
  	return page;
@@ -222,9 +223,11 @@ static void __page_pool_clean_page(struct page_pool *pool,
  			     DMA_ATTR_SKIP_CPU_SYNC);
  	page->dma_addr = 0;
  skip_dma_unmap:
+	trace_page_pool_page_release(pool, page);
+	/* This may be the last page returned, releasing the pool, so
+	 * it is not safe to reference pool afterwards.
+	 */
  	atomic_inc(&pool->pages_state_release_cnt);
-	trace_page_pool_state_release(pool, page,
-			      atomic_read(&pool->pages_state_release_cnt));
  }

  /* unmap the page and clean our state */


* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jesper Dangaard Brouer @ 2019-11-13 10:08 UTC
  To: Jonathan Lemon
  Cc: Alexei Starovoitov, netdev, davem, Kernel Team, ilias.apalodimas, brouer

On Tue, 12 Nov 2019 09:32:10 -0800
"Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:

> On 12 Nov 2019, at 9:23, Alexei Starovoitov wrote:
> 
> > [...]
> >
> > On the other side we should not be passing pointers to freed objects
> > into tracepoints. That's just wrong.
> > Maybe simply move that questionable tracepoint?
> 
> Yes, move and simplify it.  I believe this patch should resolve the
> issue; it just reports pages entering/exiting the pool, without trying
> to access the counters - the counters are reported through the
> inflight tracepoint.

Sorry, I don't like losing the counter.  I have a plan for using these
counters in a bpftrace script.  (Worst case I might be able to live
without the counters.)

The basic idea is to use these tracepoints to detect if we leak
DMA-mappings.  I'll try to write the bpftrace script today and see if
I can live without the counter.
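
As a rough model of that check (plain C with made-up handler names
standing in for the tracepoint probes - not the actual bpftrace script):

	#include <stdio.h>

	#define MAX_TRACKED 4096
	static const void *tracked[MAX_TRACKED];

	/* page_pool_state_hold: page (and DMA mapping) left the pool */
	static void on_hold(const void *page)
	{
		for (int i = 0; i < MAX_TRACKED; i++)
			if (!tracked[i]) {
				tracked[i] = page;
				return;
			}
	}

	/* page_pool_state_release: page was returned / unmapped */
	static void on_release(const void *page)
	{
		for (int i = 0; i < MAX_TRACKED; i++)
			if (tracked[i] == page) {
				tracked[i] = NULL;
				return;
			}
	}

	/* pool shutdown: anything still tracked is a suspected leak */
	static void on_destroy(void)
	{
		for (int i = 0; i < MAX_TRACKED; i++)
			if (tracked[i])
				printf("suspected leaked page: %p\n",
				       (void *)tracked[i]);
	}

	int main(void)
	{
		int a, b;	/* stand-ins for two page addresses */

		on_hold(&a);
		on_hold(&b);
		on_release(&a);
		on_destroy();	/* reports &b */
		return 0;
	}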

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



* Re: [net-next PATCH] page_pool: do not release pool until inflight == 0.
From: Jesper Dangaard Brouer @ 2019-11-13 16:37 UTC
  To: Jonathan Lemon
  Cc: Alexei Starovoitov, netdev, davem, Kernel Team, ilias.apalodimas,
	brouer, Brendan Gregg

On Wed, 13 Nov 2019 11:08:23 +0100
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> The basic idea is to use these tracepoints to detect if we leak
> DMA-mappings.  I'll try to write the bpftrace script today and see if
> I can live without the counter.

Didn't finish; here is how far I got:
 https://github.com/xdp-project/xdp-project/blob/master/areas/mem/bpftrace/page_pool_track_leaks01.bt

At the end of the bpftrace script I needed to iterate over a map,
which I don't think bpftrace supports (Brendan)?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

