* [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page
@ 2018-03-05  4:20 Sarah Newman
  2018-03-05 10:09 ` Tariq Toukan
  0 siblings, 1 reply; 14+ messages in thread
From: Sarah Newman @ 2018-03-05  4:20 UTC (permalink / raw)
  To: Tariq Toukan, Yishai Hadas; +Cc: netdev, Sarah Newman

Take an additional reference to a page whenever it is placed
into the rx ring and put the page again after running
dma_unmap_page.

When swiotlb is in use, calling dma_unmap_page means that
the original page mapped with dma_map_page must still be valid,
as swiotlb will copy data from its internal cache back to the
originally requested DMA location.

When GRO is enabled, before this patch all references to the
original frag may be put and the page freed before dma_unmap_page
in mlx4_en_free_frag is called.

It is possible there is a path where the use-after-free occurs
even with GRO disabled, but this has not been observed so far.

The bug can be trivially detected by doing the following:

* Compile the kernel with DEBUG_PAGEALLOC
* Run the kernel as a Xen Dom0
* Leave GRO enabled on the interface
* Run a 10 second or more test with iperf over the interface.

This bug was likely introduced in
commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
first part of v3.6.

It was incidentally fixed in
commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
first part of v4.12.

This version applies to the v4.9 series.

Signed-off-by: Sarah Newman <srn@prgmr.com>
Tested-by: Sarah Newman <srn@prgmr.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 39 +++++++++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  1 +
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index bcbb80f..d1fb087 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -80,10 +80,14 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 	page_alloc->page = page;
 	page_alloc->dma = dma;
 	page_alloc->page_offset = 0;
+	page_alloc->page_owner = true;
 	/* Not doing get_page() for each frag is a big win
 	 * on asymetric workloads. Note we can not use atomic_set().
 	 */
-	page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
+	/* Since the page must be valid until after dma_unmap_page is called,
+	 * take an additional reference we would not have otherwise.
+	 */
+	page_ref_add(page, page_alloc->page_size / frag_info->frag_stride);
 	return 0;
 }
 
@@ -105,9 +109,13 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
 		page_alloc[i].page_offset += frag_info->frag_stride;
 
 		if (page_alloc[i].page_offset + frag_info->frag_stride <=
-		    ring_alloc[i].page_size)
-			continue;
-
+		    ring_alloc[i].page_size) {
+			WARN_ON(!page_alloc[i].page);
+			WARN_ON(!page_alloc[i].page_owner);
+			if (likely(page_alloc[i].page &&
+				   page_alloc[i].page_owner))
+				continue;
+		}
 		if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
 					      frag_info, gfp)))
 			goto out;
@@ -131,7 +139,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
 			page = page_alloc[i].page;
 			/* Revert changes done by mlx4_alloc_pages */
 			page_ref_sub(page, page_alloc[i].page_size /
-					   priv->frag_info[i].frag_stride - 1);
+					   priv->frag_info[i].frag_stride);
 			put_page(page);
 		}
 	}
@@ -146,11 +154,13 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
 
 
-	if (next_frag_end > frags[i].page_size)
+	if (next_frag_end > frags[i].page_size) {
 		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
 			       frag_info->dma_dir);
+		put_page(frags[i].page);
+	}
 
-	if (frags[i].page)
+	if (frags[i].page_owner)
 		put_page(frags[i].page);
 }
 
@@ -184,9 +194,10 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 		page = page_alloc->page;
 		/* Revert changes done by mlx4_alloc_pages */
 		page_ref_sub(page, page_alloc->page_size /
-				   priv->frag_info[i].frag_stride - 1);
+				   priv->frag_info[i].frag_stride);
 		put_page(page);
 		page_alloc->page = NULL;
+		page_alloc->page_owner = false;
 	}
 	return -ENOMEM;
 }
@@ -206,12 +217,14 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
 
 		dma_unmap_page(priv->ddev, page_alloc->dma,
 				page_alloc->page_size, frag_info->dma_dir);
+		put_page(page_alloc->page);
 		while (page_alloc->page_offset + frag_info->frag_stride <
 		       page_alloc->page_size) {
 			put_page(page_alloc->page);
 			page_alloc->page_offset += frag_info->frag_stride;
 		}
 		page_alloc->page = NULL;
+		page_alloc->page_owner = false;
 	}
 }
 
@@ -251,6 +264,11 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
 	if (ring->page_cache.index > 0) {
 		frags[0] = ring->page_cache.buf[--ring->page_cache.index];
 		rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
+		WARN_ON(frags[0].page_owner);
+		if (likely(!frags[0].page_owner)) {
+			page_ref_inc(frags[0].page);
+			frags[0].page_owner = true;
+		}
 		return 0;
 	}
 
@@ -569,6 +587,7 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 
 		dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
 			       priv->frag_info[0].dma_dir);
+		WARN_ON(frame->page_owner);
 		put_page(frame->page);
 	}
 	ring->page_cache.index = 0;
@@ -595,7 +614,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 		frag_info = &priv->frag_info[nr];
 		if (length <= frag_info->frag_prefix_size)
 			break;
-		if (unlikely(!frags[nr].page))
+		if (unlikely(!frags[nr].page_owner))
 			goto fail;
 
 		dma = be64_to_cpu(rx_desc->data[nr].addr);
@@ -607,7 +626,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
 		skb_frags_rx[nr].page_offset = frags[nr].page_offset;
 		skb->truesize += frag_info->frag_stride;
-		frags[nr].page = NULL;
+		frags[nr].page_owner = false;
 	}
 	/* Adjust size of last fragment to match actual length */
 	if (nr > 0)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index e2509bb..25f7f9e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -356,6 +356,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
 		.dma = tx_info->map0_dma,
 		.page_offset = 0,
 		.page_size = PAGE_SIZE,
+		.page_owner = false,
 	};
 
 	if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
@@ -1128,7 +1129,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
 	dma = frame->dma;
 
 	tx_info->page = frame->page;
-	frame->page = NULL;
+	frame->page_owner = false;
 	tx_info->map0_dma = dma;
 	tx_info->map0_byte_count = length;
 	tx_info->nr_txbb = nr_txbb;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index df0f396..2c9d9a6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -261,6 +261,7 @@ struct mlx4_en_rx_alloc {
 	dma_addr_t	dma;
 	u32		page_offset;
 	u32		page_size;
+	bool		page_owner;
 };
 
 #define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
-- 
1.9.1
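
As a rough sketch (illustration only, not part of the patch; the function name
below is made up), the reference accounting this change sets up for one rx
page looks like this:

        /* One rx page under the patch above, carved into
         * N = page_size / frag_stride frags.
         */
        static void rx_page_refs_sketch(struct mlx4_en_priv *priv,
                                        struct mlx4_en_rx_alloc *page_alloc,
                                        const struct mlx4_en_frag_info *frag_info)
        {
                int nfrags = page_alloc->page_size / frag_info->frag_stride;

                /* alloc_pages() handed the page over with refcount == 1 */
                page_ref_add(page_alloc->page, nfrags);
                /* refcount == nfrags + 1: one reference per frag handed to
                 * the stack, plus one extra that outlives all frags and is
                 * only dropped once the DMA mapping has been torn down.
                 */

                /* ... later, when the ring is done with this page ... */
                dma_unmap_page(priv->ddev, page_alloc->dma,
                               page_alloc->page_size, frag_info->dma_dir);
                put_page(page_alloc->page);     /* drop the extra reference */
        }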

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-03-05  4:20 [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page Sarah Newman
@ 2018-03-05 10:09 ` Tariq Toukan
  2018-03-05 21:10   ` Sarah Newman
  0 siblings, 1 reply; 14+ messages in thread
From: Tariq Toukan @ 2018-03-05 10:09 UTC (permalink / raw)
  To: Sarah Newman, Tariq Toukan, Yishai Hadas; +Cc: netdev



On 05/03/2018 6:20 AM, Sarah Newman wrote:
> Take an additional reference to a page whenever it is placed
> into the rx ring and put the page again after running
> dma_unmap_page.
> 
> When swiotlb is in use, calling dma_unmap_page means that
> the original page mapped with dma_map_page must still be valid,
> as swiotlb will copy data from its internal cache back to the
> originally requested DMA location.
> 
> When GRO is enabled, before this patch all references to the
> original frag may be put and the page freed before dma_unmap_page
> in mlx4_en_free_frag is called.
> 
> It is possible there is a path where the use-after-free occurs
> even with GRO disabled, but this has not been observed so far.
> 
> The bug can be trivially detected by doing the following:
> 
> * Compile the kernel with DEBUG_PAGEALLOC
> * Run the kernel as a Xen Dom0
> * Leave GRO enabled on the interface
> * Run a 10 second or more test with iperf over the interface.
> 

Hi Sarah, thanks for your patch!

> This bug was likely introduced in
> commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
> first part of u3.6.
> 
> It was incidentally fixed in
> commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
> first part of v4.12.
> 
> This version applies to the v4.9 series.
> 
> Signed-off-by: Sarah Newman <srn@prgmr.com>
> Tested-by: Sarah Newman <srn@prgmr.com>
> ---
>   drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 39 +++++++++++++++++++++-------
>   drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  3 ++-
>   drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  1 +
>   3 files changed, 32 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index bcbb80f..d1fb087 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -80,10 +80,14 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
>   	page_alloc->page = page;
>   	page_alloc->dma = dma;
>   	page_alloc->page_offset = 0;
> +	page_alloc->page_owner = true;

Do we really need this boolean? I believe the issue can be fixed without 
it. We need to make sure we hold the correct refcnt at every stage, and 
maintain symmetry between a flow and its inverse.

Upon alloc, refcnt is 1. This alloc refcnt should be inverted by a call 
to put_page. We might want to introduce a page free API (symmetric to 
mlx4_alloc_pages), that does: dma unmap the page, call put_page, nullify 
pointer.
Once alloced, page refcnt is bumped up by the amount of possible frags 
populating it, which is (page_size / frag_stride), as you do here.
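
For concreteness, such a helper might look roughly like this (the name
mlx4_en_free_page is hypothetical, just for illustration):

        static void mlx4_en_free_page(struct mlx4_en_priv *priv,
                                      struct mlx4_en_rx_alloc *page_alloc,
                                      const struct mlx4_en_frag_info *frag_info)
        {
                /* unmap first: swiotlb may still copy back into the page */
                dma_unmap_page(priv->ddev, page_alloc->dma,
                               page_alloc->page_size, frag_info->dma_dir);
                /* then drop the alloc reference and clear the pointer */
                put_page(page_alloc->page);
                page_alloc->page = NULL;
        }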

>   	/* Not doing get_page() for each frag is a big win
>   	 * on asymetric workloads. Note we can not use atomic_set().
>   	 */
> -	page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
> +	/* Since the page must be valid until after dma_unmap_page is called,
> +	 * take an additional reference we would not have otherwise.
> +	 */
> +	page_ref_add(page, page_alloc->page_size / frag_info->frag_stride);
>   	return 0;
>   }
>   
> @@ -105,9 +109,13 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>   		page_alloc[i].page_offset += frag_info->frag_stride;
>   
>   		if (page_alloc[i].page_offset + frag_info->frag_stride <=
> -		    ring_alloc[i].page_size)
> -			continue;
> -
> +		    ring_alloc[i].page_size) {
> +			WARN_ON(!page_alloc[i].page);
> +			WARN_ON(!page_alloc[i].page_owner);

Why WARN before the likely() check?
Move it after the check, for better performance.

> +			if (likely(page_alloc[i].page &&
> +				   page_alloc[i].page_owner))
> +				continue;
> +		}
>   		if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
>   					      frag_info, gfp)))
>   			goto out;
> @@ -131,7 +139,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>   			page = page_alloc[i].page;
>   			/* Revert changes done by mlx4_alloc_pages */
>   			page_ref_sub(page, page_alloc[i].page_size /
> -					   priv->frag_info[i].frag_stride - 1);
> +					   priv->frag_info[i].frag_stride);
>   			put_page(page);
>   		}
>   	}
> @@ -146,11 +154,13 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
>   	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
>   
>   
> -	if (next_frag_end > frags[i].page_size)
> +	if (next_frag_end > frags[i].page_size) {
>   		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
>   			       frag_info->dma_dir);
> +		put_page(frags[i].page);
> +	}
>   
> -	if (frags[i].page)
> +	if (frags[i].page_owner)
>   		put_page(frags[i].page);
>   }
>   
> @@ -184,9 +194,10 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
>   		page = page_alloc->page;
>   		/* Revert changes done by mlx4_alloc_pages */
>   		page_ref_sub(page, page_alloc->page_size /
> -				   priv->frag_info[i].frag_stride - 1);
> +				   priv->frag_info[i].frag_stride);
>   		put_page(page);
>   		page_alloc->page = NULL;
> +		page_alloc->page_owner = false;
>   	}
>   	return -ENOMEM;
>   }
> @@ -206,12 +217,14 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
>   
>   		dma_unmap_page(priv->ddev, page_alloc->dma,
>   				page_alloc->page_size, frag_info->dma_dir);
> +		put_page(page_alloc->page);

for symmetry, i'd move this after the while loop.

>   		while (page_alloc->page_offset + frag_info->frag_stride <
>   		       page_alloc->page_size) {
>   			put_page(page_alloc->page);
>   			page_alloc->page_offset += frag_info->frag_stride;
>   		}
>   		page_alloc->page = NULL;
> +		page_alloc->page_owner = false;
>   	}
>   }
>   
> @@ -251,6 +264,11 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
>   	if (ring->page_cache.index > 0) {
>   		frags[0] = ring->page_cache.buf[--ring->page_cache.index];
>   		rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
> +		WARN_ON(frags[0].page_owner);
> +		if (likely(!frags[0].page_owner)) {
> +			page_ref_inc(frags[0].page);
> +			frags[0].page_owner = true;
> +		}

Why? If I'm not mistaken, the page is cached with refcnt == 2. No?

>   		return 0;
>   	}
>   
> @@ -569,6 +587,7 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
>   
>   		dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
>   			       priv->frag_info[0].dma_dir);
> +		WARN_ON(frame->page_owner);
>   		put_page(frame->page);
>   	}
>   	ring->page_cache.index = 0;
> @@ -595,7 +614,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>   		frag_info = &priv->frag_info[nr];
>   		if (length <= frag_info->frag_prefix_size)
>   			break;
> -		if (unlikely(!frags[nr].page))
> +		if (unlikely(!frags[nr].page_owner))
>   			goto fail;
>   
>   		dma = be64_to_cpu(rx_desc->data[nr].addr);
> @@ -607,7 +626,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>   		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
>   		skb_frags_rx[nr].page_offset = frags[nr].page_offset;
>   		skb->truesize += frag_info->frag_stride;
> -		frags[nr].page = NULL;
> +		frags[nr].page_owner = false;
>   	}
>   	/* Adjust size of last fragment to match actual length */
>   	if (nr > 0)
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> index e2509bb..25f7f9e 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> @@ -356,6 +356,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
>   		.dma = tx_info->map0_dma,
>   		.page_offset = 0,
>   		.page_size = PAGE_SIZE,
> +		.page_owner = false,

I don't understand why this is needed.

>   	};
>   
>   	if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
> @@ -1128,7 +1129,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
>   	dma = frame->dma;
>   
>   	tx_info->page = frame->page;
> -	frame->page = NULL;
> +	frame->page_owner = false;
>   	tx_info->map0_dma = dma;
>   	tx_info->map0_byte_count = length;
>   	tx_info->nr_txbb = nr_txbb;
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> index df0f396..2c9d9a6 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> @@ -261,6 +261,7 @@ struct mlx4_en_rx_alloc {
>   	dma_addr_t	dma;
>   	u32		page_offset;
>   	u32		page_size;
> +	bool		page_owner;
>   };
>   
>   #define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
> 

Thanks,
Tariq

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-03-05 10:09 ` Tariq Toukan
@ 2018-03-05 21:10   ` Sarah Newman
  2018-03-06 16:13     ` Tariq Toukan
  0 siblings, 1 reply; 14+ messages in thread
From: Sarah Newman @ 2018-03-05 21:10 UTC (permalink / raw)
  To: Tariq Toukan, Yishai Hadas; +Cc: netdev

On 03/05/2018 02:09 AM, Tariq Toukan wrote:
> 
> 
> On 05/03/2018 6:20 AM, Sarah Newman wrote:
>> Take an additional reference to a page whenever it is placed
>> into the rx ring and put the page again after running
>> dma_unmap_page.
>>
>> When swiotlb is in use, calling dma_unmap_page means that
>> the original page mapped with dma_map_page must still be valid,
>> as swiotlb will copy data from its internal cache back to the
>> originally requested DMA location.
>>
>> When GRO is enabled, before this patch all references to the
>> original frag may be put and the page freed before dma_unmap_page
>> in mlx4_en_free_frag is called.
>>
>> It is possible there is a path where the use-after-free occurs
>> even with GRO disabled, but this has not been observed so far.
>>
>> The bug can be trivially detected by doing the following:
>>
>> * Compile the kernel with DEBUG_PAGEALLOC
>> * Run the kernel as a Xen Dom0
>> * Leave GRO enabled on the interface
>> * Run a 10 second or more test with iperf over the interface.
>>
> 
> Hi Sarah, thanks for your patch!
> 
>> This bug was likely introduced in
>> commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
>> first part of u3.6.
>>
>> It was incidentally fixed in
>> commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
>> first part of v4.12.
>>
>> This version applies to the v4.9 series.
>>
>> Signed-off-by: Sarah Newman <srn@prgmr.com>
>> Tested-by: Sarah Newman <srn@prgmr.com>
>> ---
>>   drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 39 +++++++++++++++++++++-------
>>   drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  3 ++-
>>   drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  1 +
>>   3 files changed, 32 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> index bcbb80f..d1fb087 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>> @@ -80,10 +80,14 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
>>       page_alloc->page = page;
>>       page_alloc->dma = dma;
>>       page_alloc->page_offset = 0;
>> +    page_alloc->page_owner = true;
> 
> Do we really need this boolean? I believe the issue can be fixed without it. We need to make sure we hold the correct refcnt at every stage, and
> maintain symmetry between a flow and its inverse.

This was added because the page address needs to stay around until after dma_unmap_page is called, and right now setting page to NULL is
used to indicate that put_page should not be called when frags are freed in mlx4_en_free_frag. So either the code needs to be rearranged so that
dma_unmap_page is called while page is still set, or some variable needs to indicate whether put_page should be called when the frags are freed.

If dma_unmap_page was called before page was set to NULL, then this variable doesn't need to be added, yes. Then the call to dma_unmap_page in
mlx4_en_free_frag would also be contingent on frags[i].page being set.

There are two places where page is set to NULL without calling dma_unmap_page first, mlx4_en_complete_rx_desc and mlx4_en_xmit_frame.

Is mlx4_en_complete_rx_desc the only place where a call to dma_unmap_page would need to be added? The other place page is set to NULL without a call
to dma_unmap_page first is in mlx4_en_xmit_frame, and I believe there is no call to mlx4_en_free_frag if mlx4_en_xmit_frame executes.
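
A rough sketch of that rearrangement of mlx4_en_free_frag, only to make the
ordering concrete (the v2 patch later in this thread takes essentially this
shape):

        static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
                                      struct mlx4_en_rx_alloc *frags, int i)
        {
                if (frags[i].page) {
                        const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
                        u32 next_frag_end = frags[i].page_offset +
                                        2 * frag_info->frag_stride;

                        /* unmap while the page pointer is still valid ... */
                        if (next_frag_end > frags[i].page_size)
                                dma_unmap_page(priv->ddev, frags[i].dma,
                                               frags[i].page_size,
                                               frag_info->dma_dir);
                        /* ... and only then drop the reference */
                        put_page(frags[i].page);
                }
        }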

> 
> Upon alloc, refcnt is 1. This alloc refcnt should be inverted by a call to put_page. We might want to introduce a page free API (symmetric to
> mlx4_alloc_pages), that does: dma unmap the page, call put_page, nullify pointer.

That seems reasonable.

> Once alloced, page refcnt is bumped up by the amount of possible frags populating it, which is (page_size / frag_stride), as you do here.
> 
>>       /* Not doing get_page() for each frag is a big win
>>        * on asymetric workloads. Note we can not use atomic_set().
>>        */
>> -    page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
>> +    /* Since the page must be valid until after dma_unmap_page is called,
>> +     * take an additional reference we would not have otherwise.
>> +     */
>> +    page_ref_add(page, page_alloc->page_size / frag_info->frag_stride);
>>       return 0;
>>   }
>>   @@ -105,9 +109,13 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>>           page_alloc[i].page_offset += frag_info->frag_stride;
>>             if (page_alloc[i].page_offset + frag_info->frag_stride <=
>> -            ring_alloc[i].page_size)
>> -            continue;
>> -
>> +            ring_alloc[i].page_size) {
>> +            WARN_ON(!page_alloc[i].page);
>> +            WARN_ON(!page_alloc[i].page_owner);
> 
> Why WARN before the likely() check?
> Move after the check, for a better performance.

No particular reason.

> 
>> +            if (likely(page_alloc[i].page &&
>> +                   page_alloc[i].page_owner))
>> +                continue;
>> +        }
>>           if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
>>                             frag_info, gfp)))
>>               goto out;
>> @@ -131,7 +139,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>>               page = page_alloc[i].page;
>>               /* Revert changes done by mlx4_alloc_pages */
>>               page_ref_sub(page, page_alloc[i].page_size /
>> -                       priv->frag_info[i].frag_stride - 1);
>> +                       priv->frag_info[i].frag_stride);
>>               put_page(page);
>>           }
>>       }
>> @@ -146,11 +154,13 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
>>       u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
>>     -    if (next_frag_end > frags[i].page_size)
>> +    if (next_frag_end > frags[i].page_size) {
>>           dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
>>                      frag_info->dma_dir);
>> +        put_page(frags[i].page);
>> +    }
>>   -    if (frags[i].page)
>> +    if (frags[i].page_owner)
>>           put_page(frags[i].page);
>>   }
>>   @@ -184,9 +194,10 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
>>           page = page_alloc->page;
>>           /* Revert changes done by mlx4_alloc_pages */
>>           page_ref_sub(page, page_alloc->page_size /
>> -                   priv->frag_info[i].frag_stride - 1);
>> +                   priv->frag_info[i].frag_stride);
>>           put_page(page);
>>           page_alloc->page = NULL;
>> +        page_alloc->page_owner = false;
>>       }
>>       return -ENOMEM;
>>   }
>> @@ -206,12 +217,14 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
>>             dma_unmap_page(priv->ddev, page_alloc->dma,
>>                   page_alloc->page_size, frag_info->dma_dir);
>> +        put_page(page_alloc->page);
> 
> for symmetry, i'd move this after the while loop.

Or use the wrapper function you suggested for dma_unmap_page?

> 
>>           while (page_alloc->page_offset + frag_info->frag_stride <
>>                  page_alloc->page_size) {
>>               put_page(page_alloc->page);
>>               page_alloc->page_offset += frag_info->frag_stride;
>>           }
>>           page_alloc->page = NULL;
>> +        page_alloc->page_owner = false;
>>       }
>>   }
>>   @@ -251,6 +264,11 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
>>       if (ring->page_cache.index > 0) {
>>           frags[0] = ring->page_cache.buf[--ring->page_cache.index];
>>           rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
>> +        WARN_ON(frags[0].page_owner);
>> +        if (likely(!frags[0].page_owner)) {
>> +            page_ref_inc(frags[0].page);
>> +            frags[0].page_owner = true;
>> +        }
> 
> Why? If I'm not mistaken, the page is cached with refcnt == 2. No?

In mlx4_en_deactivate_rx_ring, pages assigned to frames in the page_cache are only put once. If refcnt == 2 when it's inserted, isn't that a memory
leak? I can confirm one way or another if you haven't already.
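
Sketch of the concern, assuming (not verified) that a frame really does enter
the cache holding two references:

        /* mlx4_en_rx_recycle() stores the frame; suppose refcount == 2 here.
         * mlx4_en_deactivate_rx_ring() then drains the cache with:
         */
        dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
                       priv->frag_info[0].dma_dir);
        put_page(frame->page);  /* 2 -> 1: one reference is never dropped */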


> 
>>           return 0;
>>       }
>>   @@ -569,6 +587,7 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
>>             dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
>>                      priv->frag_info[0].dma_dir);
>> +        WARN_ON(frame->page_owner);
>>           put_page(frame->page);
>>       }
>>       ring->page_cache.index = 0;
>> @@ -595,7 +614,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>>           frag_info = &priv->frag_info[nr];
>>           if (length <= frag_info->frag_prefix_size)
>>               break;
>> -        if (unlikely(!frags[nr].page))
>> +        if (unlikely(!frags[nr].page_owner))
>>               goto fail;
>>             dma = be64_to_cpu(rx_desc->data[nr].addr);
>> @@ -607,7 +626,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>>           skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
>>           skb_frags_rx[nr].page_offset = frags[nr].page_offset;
>>           skb->truesize += frag_info->frag_stride;
>> -        frags[nr].page = NULL;
>> +        frags[nr].page_owner = false;
>>       }
>>       /* Adjust size of last fragment to match actual length */
>>       if (nr > 0)
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> index e2509bb..25f7f9e 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>> @@ -356,6 +356,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
>>           .dma = tx_info->map0_dma,
>>           .page_offset = 0,
>>           .page_size = PAGE_SIZE,
>> +        .page_owner = false,
> 
> I don't understand why this is needed.

Not strictly needed but there for clarity.

> 
>>       };
>>         if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
>> @@ -1128,7 +1129,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
>>       dma = frame->dma;
>>         tx_info->page = frame->page;
>> -    frame->page = NULL;
>> +    frame->page_owner = false;
>>       tx_info->map0_dma = dma;
>>       tx_info->map0_byte_count = length;
>>       tx_info->nr_txbb = nr_txbb;
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> index df0f396..2c9d9a6 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>> @@ -261,6 +261,7 @@ struct mlx4_en_rx_alloc {
>>       dma_addr_t    dma;
>>       u32        page_offset;
>>       u32        page_size;
>> +    bool        page_owner;
>>   };
>>     #define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
>>
> 
> Thanks,
> Tariq

Thanks, Sarah

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-03-05 21:10   ` Sarah Newman
@ 2018-03-06 16:13     ` Tariq Toukan
  2018-03-06 20:16       ` Sarah Newman
  0 siblings, 1 reply; 14+ messages in thread
From: Tariq Toukan @ 2018-03-06 16:13 UTC (permalink / raw)
  To: Sarah Newman, Tariq Toukan, Yishai Hadas; +Cc: netdev



On 05/03/2018 11:10 PM, Sarah Newman wrote:
> On 03/05/2018 02:09 AM, Tariq Toukan wrote:
>>
>>
>> On 05/03/2018 6:20 AM, Sarah Newman wrote:
>>> Take an additional reference to a page whenever it is placed
>>> into the rx ring and put the page again after running
>>> dma_unmap_page.
>>>
>>> When swiotlb is in use, calling dma_unmap_page means that
>>> the original page mapped with dma_map_page must still be valid,
>>> as swiotlb will copy data from its internal cache back to the
>>> originally requested DMA location.
>>>
>>> When GRO is enabled, before this patch all references to the
>>> original frag may be put and the page freed before dma_unmap_page
>>> in mlx4_en_free_frag is called.
>>>
>>> It is possible there is a path where the use-after-free occurs
>>> even with GRO disabled, but this has not been observed so far.
>>>
>>> The bug can be trivially detected by doing the following:
>>>
>>> * Compile the kernel with DEBUG_PAGEALLOC
>>> * Run the kernel as a Xen Dom0
>>> * Leave GRO enabled on the interface
>>> * Run a 10 second or more test with iperf over the interface.
>>>
>>
>> Hi Sarah, thanks for your patch!
>>
>>> This bug was likely introduced in
>>> commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
>>> first part of u3.6.
>>>
>>> It was incidentally fixed in
>>> commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
>>> first part of v4.12.
>>>
>>> This version applies to the v4.9 series.
>>>
>>> Signed-off-by: Sarah Newman <srn@prgmr.com>
>>> Tested-by: Sarah Newman <srn@prgmr.com>
>>> ---
>>>    drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 39 +++++++++++++++++++++-------
>>>    drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  3 ++-
>>>    drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  1 +
>>>    3 files changed, 32 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>>> index bcbb80f..d1fb087 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
>>> @@ -80,10 +80,14 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
>>>        page_alloc->page = page;
>>>        page_alloc->dma = dma;
>>>        page_alloc->page_offset = 0;
>>> +    page_alloc->page_owner = true;
>>
>> Do we really need this boolean? I believe the issue can be fixed without it. We need to make sure we hold the correct refcnt at every stage, and
>> maintain symmetry between a flow and its inverse.
> 
> The reason this was added is because the page address needs to stay around until after dma unmap_page is called, and right now setting page to NULL is
> used to indicate that put_page should not be called when frags are freed in mlx4_en_free_frag. So either the code needs to be rearranged so that
> dma_unmap_page while page is still set, or some variable needed to be used to indicate whether put_page should be called when the frags are freed.
> 

rearranging sounds better.

> If dma_unmap_page was called before page was set to NULL, then this variable doesn't need to be added, yes. Then the call to dma_unmap_page in
> mlx4_en_free_frag would also be contingent on frags[i].page being set.
> 
> There are two places where page is set to NULL without calling dma_unmap_page first, mlx4_en_complete_rx_desc and mlx4_en_xmit_frame.
> 

mlx4_en_xmit_frame should not unmap; the unmap should be done only upon 
completion, and that is done in mlx4_en_recycle_tx_desc.
In mlx4_en_complete_rx_desc I think it was just a bug.

 >
> Is mlx4_en_complete_rx_desc the only place where a call to dma_unmap_page would need to be added? The other place page is set to NULL without a call
> to dma_unmap_page first is in mlx4_en_xmit_frame, and I believe there is no call to mlx4_en_free_frag if mlx4_en_xmit_frame executes.
> 

Yes, only in mlx4_en_complete_rx_desc, see above.

>>
>> Upon alloc, refcnt is 1. This alloc refcnt should be inverted by a call to put_page. We might want to introduce a page free API (symmetric to
>> mlx4_alloc_pages), that does: dma unmap the page, call put_page, nullify pointer.
> 
> That seems reasonable.
> 
Yes, let's use it in mlx4_en_free_frag.

>> Once alloced, page refcnt is bumped up by the amount of possible frags populating it, which is (page_size / frag_stride), as you do here.
>>
>>>        /* Not doing get_page() for each frag is a big win
>>>         * on asymetric workloads. Note we can not use atomic_set().
>>>         */
>>> -    page_ref_add(page, page_alloc->page_size / frag_info->frag_stride - 1);
>>> +    /* Since the page must be valid until after dma_unmap_page is called,
>>> +     * take an additional reference we would not have otherwise.
>>> +     */
>>> +    page_ref_add(page, page_alloc->page_size / frag_info->frag_stride);
>>>        return 0;
>>>    }
>>>    @@ -105,9 +109,13 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>>>            page_alloc[i].page_offset += frag_info->frag_stride;
>>>              if (page_alloc[i].page_offset + frag_info->frag_stride <=
>>> -            ring_alloc[i].page_size)
>>> -            continue;
>>> -
>>> +            ring_alloc[i].page_size) {
>>> +            WARN_ON(!page_alloc[i].page);
>>> +            WARN_ON(!page_alloc[i].page_owner);
>>
>> Why WARN before the likely() check?
>> Move after the check, for a better performance.
> 
> No particular reason.
> 

please move them.

>>
>>> +            if (likely(page_alloc[i].page &&
>>> +                   page_alloc[i].page_owner))
>>> +                continue;
>>> +        }
>>>            if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
>>>                              frag_info, gfp)))
>>>                goto out;
>>> @@ -131,7 +139,7 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
>>>                page = page_alloc[i].page;
>>>                /* Revert changes done by mlx4_alloc_pages */
>>>                page_ref_sub(page, page_alloc[i].page_size /
>>> -                       priv->frag_info[i].frag_stride - 1);
>>> +                       priv->frag_info[i].frag_stride);
>>>                put_page(page);
>>>            }
>>>        }
>>> @@ -146,11 +154,13 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
>>>        u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
>>>      -    if (next_frag_end > frags[i].page_size)
>>> +    if (next_frag_end > frags[i].page_size) {
>>>            dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
>>>                       frag_info->dma_dir);
>>> +        put_page(frags[i].page);
>>> +    }
>>>    -    if (frags[i].page)
>>> +    if (frags[i].page_owner)
>>>            put_page(frags[i].page);
>>>    }
>>>    @@ -184,9 +194,10 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
>>>            page = page_alloc->page;
>>>            /* Revert changes done by mlx4_alloc_pages */
>>>            page_ref_sub(page, page_alloc->page_size /
>>> -                   priv->frag_info[i].frag_stride - 1);
>>> +                   priv->frag_info[i].frag_stride);
>>>            put_page(page);
>>>            page_alloc->page = NULL;
>>> +        page_alloc->page_owner = false;
>>>        }
>>>        return -ENOMEM;
>>>    }
>>> @@ -206,12 +217,14 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
>>>              dma_unmap_page(priv->ddev, page_alloc->dma,
>>>                    page_alloc->page_size, frag_info->dma_dir);
>>> +        put_page(page_alloc->page);
>>
>> for symmetry, i'd move this after the while loop.
> 
> Or use the wrapper function you suggested for dma_unmap_page?
> 

yes.

>>
>>>            while (page_alloc->page_offset + frag_info->frag_stride <
>>>                   page_alloc->page_size) {
>>>                put_page(page_alloc->page);
>>>                page_alloc->page_offset += frag_info->frag_stride;
>>>            }
>>>            page_alloc->page = NULL;
>>> +        page_alloc->page_owner = false;
>>>        }
>>>    }
>>>    @@ -251,6 +264,11 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
>>>        if (ring->page_cache.index > 0) {
>>>            frags[0] = ring->page_cache.buf[--ring->page_cache.index];
>>>            rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
>>> +        WARN_ON(frags[0].page_owner);
>>> +        if (likely(!frags[0].page_owner)) {
>>> +            page_ref_inc(frags[0].page);
>>> +            frags[0].page_owner = true;
>>> +        }
>>
>> Why? If I'm not mistaken, the page is cached with refcnt == 2. No?
> 
> In mlx4_en_deactivate_rx_ring, pages assigned to frames in the page_cache are only put once. If refcnt == 2 when it's inserted, isn't that a memory
> leak? I can confirm one way or another if you haven't already.
> 

I think you're right. But I didn't check this yet.

> 
>>
>>>            return 0;
>>>        }
>>>    @@ -569,6 +587,7 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
>>>              dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
>>>                       priv->frag_info[0].dma_dir);
>>> +        WARN_ON(frame->page_owner);
>>>            put_page(frame->page);
>>>        }
>>>        ring->page_cache.index = 0;
>>> @@ -595,7 +614,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>>>            frag_info = &priv->frag_info[nr];
>>>            if (length <= frag_info->frag_prefix_size)
>>>                break;
>>> -        if (unlikely(!frags[nr].page))
>>> +        if (unlikely(!frags[nr].page_owner))
>>>                goto fail;
>>>              dma = be64_to_cpu(rx_desc->data[nr].addr);
>>> @@ -607,7 +626,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
>>>            skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
>>>            skb_frags_rx[nr].page_offset = frags[nr].page_offset;
>>>            skb->truesize += frag_info->frag_stride;
>>> -        frags[nr].page = NULL;
>>> +        frags[nr].page_owner = false;
>>>        }
>>>        /* Adjust size of last fragment to match actual length */
>>>        if (nr > 0)
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>>> index e2509bb..25f7f9e 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
>>> @@ -356,6 +356,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
>>>            .dma = tx_info->map0_dma,
>>>            .page_offset = 0,
>>>            .page_size = PAGE_SIZE,
>>> +        .page_owner = false,
>>
>> I don't understand why this is needed.
> 
> Not strictly needed but there for clarity.
> 

Let's obsolete it.

>>
>>>        };
>>>          if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
>>> @@ -1128,7 +1129,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
>>>        dma = frame->dma;
>>>          tx_info->page = frame->page;
>>> -    frame->page = NULL;
>>> +    frame->page_owner = false;
>>>        tx_info->map0_dma = dma;
>>>        tx_info->map0_byte_count = length;
>>>        tx_info->nr_txbb = nr_txbb;
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>>> index df0f396..2c9d9a6 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
>>> @@ -261,6 +261,7 @@ struct mlx4_en_rx_alloc {
>>>        dma_addr_t    dma;
>>>        u32        page_offset;
>>>        u32        page_size;
>>> +    bool        page_owner;
>>>    };
>>>      #define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
>>>
>>
>> Thanks,
>> Tariq
> 
> Thanks, Sarah
> 

I have a general question about the process.
I don't totally get what branch this patch is targeted to.
It touches critical areas in datapath and should go through regression 
tests before it is accepted to any branch.

Thanks,
Tariq

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-03-06 16:13     ` Tariq Toukan
@ 2018-03-06 20:16       ` Sarah Newman
  2018-03-11 15:15         ` Tariq Toukan
  0 siblings, 1 reply; 14+ messages in thread
From: Sarah Newman @ 2018-03-06 20:16 UTC (permalink / raw)
  To: Tariq Toukan, Yishai Hadas; +Cc: netdev

On 03/06/2018 08:13 AM, Tariq Toukan wrote:

> I have a general question about the process.
> I don't totally get what branch this patch is targeted to.
> It touches critical areas in datapath and should go through regression tests before it is accepted to any branch.
> 

This one is against 4.9 -
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/log/?h=linux-4.9.y

I assume you'd be interested in applying a fix to all currently maintained stable branches between 3.6 and 4.11, per
https://www.kernel.org/category/releases.html .

I know of at least one other person whose workaround has been to disable GRO, but I don't understand the networking code well enough to guarantee that
disabling GRO means the problem will never occur under all possible error conditions. If it's guaranteed that disabling GRO will definitely mitigate
the bug, probably it's better to stop supporting GRO for these versions of the driver instead of trying to implement some other fix.

--Sarah

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-03-06 20:16       ` Sarah Newman
@ 2018-03-11 15:15         ` Tariq Toukan
  2018-04-26  4:00           ` [PATCH v2] " Sarah Newman
  0 siblings, 1 reply; 14+ messages in thread
From: Tariq Toukan @ 2018-03-11 15:15 UTC (permalink / raw)
  To: Sarah Newman, Tariq Toukan, Yishai Hadas; +Cc: netdev



On 06/03/2018 10:16 PM, Sarah Newman wrote:
> On 03/06/2018 08:13 AM, Tariq Toukan wrote:
> 
>> I have a general question about the process.
>> I don't totally get what branch this patch is targeted to.
>> It touches critical areas in datapath and should go through regression tests before it is accepted to any branch.
>>
> 
> This one is against 4.9 -
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/log/?h=linux-4.9.y
> 
> I assume you'd be interested in applying a fix to all currently maintained stable branches between 3.6 to 4.11, per
> https://www.kernel.org/category/releases.html .
> 

Yes.

> I know of at least one other person whose workaround has been to disable GRO, but I don't understand the networking code well enough to guarantee that
> disabling GRO means the problem will never occur under all possible error conditions. If it's guaranteed that disabling GRO will definitely mitigate
> the bug, probably it's better to stop supporting GRO for these versions of the driver instead of trying to implement some other fix.

I don't think we have a guarantee, it just happens to work.
Also, disabling GRO will cause dramatic performance degradation.
We should fix this, not work around it.

> 
> --Sarah
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-03-11 15:15         ` Tariq Toukan
@ 2018-04-26  4:00           ` Sarah Newman
  2018-04-27 23:48             ` David Miller
  0 siblings, 1 reply; 14+ messages in thread
From: Sarah Newman @ 2018-04-26  4:00 UTC (permalink / raw)
  To: tariqt, yishaih; +Cc: netdev, Sarah Newman

When swiotlb is in use, calling dma_unmap_page means that
the original page mapped with dma_map_page must still be valid
as swiotlb will copy data from its internal cache back to the
originally requested DMA location. When GRO is enabled,
all references to the original frag may be put before
mlx4_en_free_frag is called, meaning the page has been freed
before the call to dma_unmap_page in mlx4_en_free_frag.

To fix, unmap the page as soon as possible.

This can be trivially detected by doing the following:

Compile the kernel with DEBUG_PAGEALLOC
Run the kernel as a Xen Dom0
Leave GRO enabled on the interface
Run a 10 second or more test with iperf over the interface.

Signed-off-by: Sarah Newman <srn@prgmr.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 32 +++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 844f5ad..abe2b43 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -142,16 +142,17 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 			      struct mlx4_en_rx_alloc *frags,
 			      int i)
 {
-	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
-
-
-	if (next_frag_end > frags[i].page_size)
-		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
-			       frag_info->dma_dir);
+	if (frags[i].page) {
+		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+		u32 next_frag_end = frags[i].page_offset +
+				2 * frag_info->frag_stride;
 
-	if (frags[i].page)
+		if (next_frag_end > frags[i].page_size) {
+			dma_unmap_page(priv->ddev, frags[i].dma,
+				       frags[i].page_size, frag_info->dma_dir);
+		}
 		put_page(frags[i].page);
+	}
 }
 
 static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
@@ -586,21 +587,28 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 				    int length)
 {
 	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
-	struct mlx4_en_frag_info *frag_info;
 	int nr;
 	dma_addr_t dma;
 
 	/* Collect used fragments while replacing them in the HW descriptors */
 	for (nr = 0; nr < priv->num_frags; nr++) {
-		frag_info = &priv->frag_info[nr];
+		struct mlx4_en_frag_info *frag_info = &priv->frag_info[nr];
+		u32 next_frag_end = frags[nr].page_offset +
+				2 * frag_info->frag_stride;
+
 		if (length <= frag_info->frag_prefix_size)
 			break;
 		if (unlikely(!frags[nr].page))
 			goto fail;
 
 		dma = be64_to_cpu(rx_desc->data[nr].addr);
-		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
-					DMA_FROM_DEVICE);
+		if (next_frag_end > frags[nr].page_size)
+			dma_unmap_page(priv->ddev, frags[nr].dma,
+				       frags[nr].page_size, frag_info->dma_dir);
+		else
+			dma_sync_single_for_cpu(priv->ddev, dma,
+						frag_info->frag_size,
+						DMA_FROM_DEVICE);
 
 		/* Save page reference in skb */
 		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-04-26  4:00           ` [PATCH v2] " Sarah Newman
@ 2018-04-27 23:48             ` David Miller
  2018-05-02 13:50               ` Tariq Toukan
  0 siblings, 1 reply; 14+ messages in thread
From: David Miller @ 2018-04-27 23:48 UTC (permalink / raw)
  To: srn; +Cc: tariqt, yishaih, netdev

From: Sarah Newman <srn@prgmr.com>
Date: Wed, 25 Apr 2018 21:00:34 -0700

> When swiotlb is in use, calling dma_unmap_page means that
> the original page mapped with dma_map_page must still be valid
> as swiotlb will copy data from its internal cache back to the
> originally requested DMA location. When GRO is enabled,
> all references to the original frag may be put before
> mlx4_en_free_frag is called, meaning the page has been freed
> before the call to dma_unmap_page in mlx4_en_free_frag.
> 
> To fix, unmap the page as soon as possible.
> 
> This can be trivially detected by doing the following:
> 
> Compile the kernel with DEBUG_PAGEALLOC
> Run the kernel as a Xen Dom0
> Leave GRO enabled on the interface
> Run a 10 second or more test with iperf over the interface.
> 
> Signed-off-by: Sarah Newman <srn@prgmr.com>

Tariq, I assume I will get this from you in the next set of
changes you submit to me.

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-04-27 23:48             ` David Miller
@ 2018-05-02 13:50               ` Tariq Toukan
  2018-05-02 14:26                 ` David Miller
  0 siblings, 1 reply; 14+ messages in thread
From: Tariq Toukan @ 2018-05-02 13:50 UTC (permalink / raw)
  To: David Miller, srn; +Cc: yishaih, netdev



On 28/04/2018 2:48 AM, David Miller wrote:
> From: Sarah Newman <srn@prgmr.com>
> Date: Wed, 25 Apr 2018 21:00:34 -0700
> 
>> When swiotlb is in use, calling dma_unmap_page means that
>> the original page mapped with dma_map_page must still be valid
>> as swiotlb will copy data from its internal cache back to the
>> originally requested DMA location. When GRO is enabled,
>> all references to the original frag may be put before
>> mlx4_en_free_frag is called, meaning the page has been freed
>> before the call to dma_unmap_page in mlx4_en_free_frag.
>>
>> To fix, unmap the page as soon as possible.
>>
>> This can be trivially detected by doing the following:
>>
>> Compile the kernel with DEBUG_PAGEALLOC
>> Run the kernel as a Xen Dom0
>> Leave GRO enabled on the interface
>> Run a 10 second or more test with iperf over the interface.
>>
>> Signed-off-by: Sarah Newman <srn@prgmr.com>
> 
> Tariq, I assume I will get this from you in the next set of
> changes you submit to me.
> 
> Thanks.
> 

This patch fixes an issue existing in old kernels. It is not relevant 
per latest code.

So I'm not sure about the process. After I review it, do I just submit 
it again for -stable?

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-05-02 13:50               ` Tariq Toukan
@ 2018-05-02 14:26                 ` David Miller
  2018-05-31  1:04                   ` [PATCH v3] " Sarah Newman
  0 siblings, 1 reply; 14+ messages in thread
From: David Miller @ 2018-05-02 14:26 UTC (permalink / raw)
  To: tariqt; +Cc: srn, yishaih, netdev

From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 2 May 2018 16:50:28 +0300

> This patch fixes an issue existing in old kernels. It is not relevant
> per latest code.
> 
> So I'm not sure about the process. After I review it, do I just submit
> it again for -stable?

If that's the case, yes that is what you do, just submit it for -stable
and add a note to the top of the commit message body that says something
like:

	[ Not relevant upstream, therefore no upstream commit. ]

As -stable submissions have to say what their upstream commit ID is
otherwise.

CC: me on the submission, thank you.

Thanks.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-05-02 14:26                 ` David Miller
@ 2018-05-31  1:04                   ` Sarah Newman
  2018-05-31  5:36                     ` Greg KH
  0 siblings, 1 reply; 14+ messages in thread
From: Sarah Newman @ 2018-05-31  1:04 UTC (permalink / raw)
  To: stable; +Cc: tariqt, davem, Sarah Newman

[ Not relevant upstream, therefore no upstream commit. ]

To fix, unmap the page as soon as possible.

When swiotlb is in use, calling dma_unmap_page means that
the original page mapped with dma_map_page must still be valid,
as swiotlb will copy data from its internal cache back to the
originally requested DMA location.

When GRO is enabled, before this patch all references to the
original frag may be put and the page freed before dma_unmap_page
in mlx4_en_free_frag is called.

It is possible there is a path where the use-after-free occurs
even with GRO disabled, but this has not been observed so far.

The bug can be trivially detected by doing the following:

* Compile the kernel with DEBUG_PAGEALLOC
* Run the kernel as a Xen Dom0
* Leave GRO enabled on the interface
* Run a 10 second or more test with iperf over the interface.

This bug was likely introduced in
commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
first part of v3.6.

It was incidentally fixed in
commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
first part of v4.12.

This version applies to the v4.9 series.

Signed-off-by: Sarah Newman <srn@prgmr.com>
Tested-by: Sarah Newman <srn@prgmr.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 32 +++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 844f5ad..abe2b43 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -142,16 +142,17 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 			      struct mlx4_en_rx_alloc *frags,
 			      int i)
 {
-	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
-
-
-	if (next_frag_end > frags[i].page_size)
-		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
-			       frag_info->dma_dir);
+	if (frags[i].page) {
+		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+		u32 next_frag_end = frags[i].page_offset +
+				2 * frag_info->frag_stride;
 
-	if (frags[i].page)
+		if (next_frag_end > frags[i].page_size) {
+			dma_unmap_page(priv->ddev, frags[i].dma,
+				       frags[i].page_size, frag_info->dma_dir);
+		}
 		put_page(frags[i].page);
+	}
 }
 
 static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
@@ -586,21 +587,28 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 				    int length)
 {
 	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
-	struct mlx4_en_frag_info *frag_info;
 	int nr;
 	dma_addr_t dma;
 
 	/* Collect used fragments while replacing them in the HW descriptors */
 	for (nr = 0; nr < priv->num_frags; nr++) {
-		frag_info = &priv->frag_info[nr];
+		struct mlx4_en_frag_info *frag_info = &priv->frag_info[nr];
+		u32 next_frag_end = frags[nr].page_offset +
+				2 * frag_info->frag_stride;
+
 		if (length <= frag_info->frag_prefix_size)
 			break;
 		if (unlikely(!frags[nr].page))
 			goto fail;
 
 		dma = be64_to_cpu(rx_desc->data[nr].addr);
-		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
-					DMA_FROM_DEVICE);
+		if (next_frag_end > frags[nr].page_size)
+			dma_unmap_page(priv->ddev, frags[nr].dma,
+				       frags[nr].page_size, frag_info->dma_dir);
+		else
+			dma_sync_single_for_cpu(priv->ddev, dma,
+						frag_info->frag_size,
+						DMA_FROM_DEVICE);
 
 		/* Save page reference in skb */
 		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-05-31  1:04                   ` [PATCH v3] " Sarah Newman
@ 2018-05-31  5:36                     ` Greg KH
  2018-05-31 17:15                       ` Sarah Newman
  0 siblings, 1 reply; 14+ messages in thread
From: Greg KH @ 2018-05-31  5:36 UTC (permalink / raw)
  To: Sarah Newman; +Cc: stable, tariqt, davem

On Wed, May 30, 2018 at 06:04:05PM -0700, Sarah Newman wrote:
> [ Not relevant upstream, therefore no upstream commit. ]
> 
> To fix, unmap the page as soon as possible.
> 
> When swiotlb is in use, calling dma_unmap_page means that
> the original page mapped with dma_map_page must still be valid,
> as swiotlb will copy data from its internal cache back to the
> originally requested DMA location.
> 
> When GRO is enabled, before this patch all references to the
> original frag may be put and the page freed before dma_unmap_page
> in mlx4_en_free_frag is called.
> 
> It is possible there is a path where the use-after-free occurs
> even with GRO disabled, but this has not been observed so far.
> 
> The bug can be trivially detected by doing the following:
> 
> * Compile the kernel with DEBUG_PAGEALLOC
> * Run the kernel as a Xen Dom0
> * Leave GRO enabled on the interface
> * Run a 10 second or more test with iperf over the interface.
> 
> This bug was likely introduced in
> commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
> first part of u3.6.
> 
> It was incidentally fixed in
> commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
> first part of v4.12.

Why not just apply this patch instead?

> 
> This version applies to the v4.9 series.

What about 4.4?  Why not just use 4.14 for this hardware?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-05-31  5:36                     ` Greg KH
@ 2018-05-31 17:15                       ` Sarah Newman
  2018-06-02 13:33                         ` Greg KH
  0 siblings, 1 reply; 14+ messages in thread
From: Sarah Newman @ 2018-05-31 17:15 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, tariqt, davem

On 05/30/2018 10:36 PM, Greg KH wrote:
> On Wed, May 30, 2018 at 06:04:05PM -0700, Sarah Newman wrote:
>> [ Not relevant upstream, therefore no upstream commit. ]
>>
>> To fix, unmap the page as soon as possible.
>>
>> When swiotlb is in use, calling dma_unmap_page means that
>> the original page mapped with dma_map_page must still be valid,
>> as swiotlb will copy data from its internal cache back to the
>> originally requested DMA location.
>>
>> When GRO is enabled, before this patch all references to the
>> original frag may be put and the page freed before dma_unmap_page
>> in mlx4_en_free_frag is called.
>>
>> It is possible there is a path where the use-after-free occurs
>> even with GRO disabled, but this has not been observed so far.
>>
>> The bug can be trivially detected by doing the following:
>>
>> * Compile the kernel with DEBUG_PAGEALLOC
>> * Run the kernel as a Xen Dom0
>> * Leave GRO enabled on the interface
>> * Run a 10 second or more test with iperf over the interface.
>>
>> This bug was likely introduced in
>> commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
>> first part of u3.6.
>>
>> It was incidentally fixed in
>> commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
>> first part of v4.12.
> 
> Why not just apply this patch instead?

That patch was part of a major rewrite. There was a 13-patch series, and not even the first patch of the series, 69ba943151b2e ("mlx4: dma_dir is a
mlx4_en_priv attribute"), applies cleanly to 4.9. I didn't believe that was appropriate to backport.

> 
>>
>> This version applies to the v4.9 series.
> 
> What about 4.4?  Why not just use 4.14 for this hardware?

I can also submit a patch for 4.4 if that's desired. The differences are minor.

We don't use 4.14 because we want to use a kernel version more widely tested for the majority of features we use. Currently our distribution ships 4.9.

Thanks, Sarah

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] net/mlx4_en: fix potential use-after-free with dma_unmap_page
  2018-05-31 17:15                       ` Sarah Newman
@ 2018-06-02 13:33                         ` Greg KH
  0 siblings, 0 replies; 14+ messages in thread
From: Greg KH @ 2018-06-02 13:33 UTC (permalink / raw)
  To: Sarah Newman; +Cc: stable, tariqt, davem

On Thu, May 31, 2018 at 10:15:21AM -0700, Sarah Newman wrote:
> On 05/30/2018 10:36 PM, Greg KH wrote:
> > On Wed, May 30, 2018 at 06:04:05PM -0700, Sarah Newman wrote:
> >> [ Not relevant upstream, therefore no upstream commit. ]
> >>
> >> To fix, unmap the page as soon as possible.
> >>
> >> When swiotlb is in use, calling dma_unmap_page means that
> >> the original page mapped with dma_map_page must still be valid,
> >> as swiotlb will copy data from its internal cache back to the
> >> originally requested DMA location.
> >>
> >> When GRO is enabled, before this patch all references to the
> >> original frag may be put and the page freed before dma_unmap_page
> >> in mlx4_en_free_frag is called.
> >>
> >> It is possible there is a path where the use-after-free occurs
> >> even with GRO disabled, but this has not been observed so far.
> >>
> >> The bug can be trivially detected by doing the following:
> >>
> >> * Compile the kernel with DEBUG_PAGEALLOC
> >> * Run the kernel as a Xen Dom0
> >> * Leave GRO enabled on the interface
> >> * Run a 10 second or more test with iperf over the interface.
> >>
> >> This bug was likely introduced in
> >> commit 4cce66cdd14a ("mlx4_en: map entire pages to increase throughput"),
> >> first part of u3.6.
> >>
> >> It was incidentally fixed in
> >> commit 34db548bfb95 ("mlx4: add page recycling in receive path"),
> >> first part of v4.12.
> > 
> > Why not just apply this patch instead?
> 
> That patch was part of a major rewrite. There was a 13 patch series and not even the first patch of the series 69ba943151b2e "mlx4: dma_dir is a
> mlx4_en_priv attribute" applies cleanly to 4.9. I didn't believe that was appropriate to backport.
> 
> > 
> >>
> >> This version applies to the v4.9 series.
> > 
> > What about 4.4?  Why not just use 4.14 for this hardware?
> 
> I can also submit a patch for 4.4 if that's desired. The differences are minor.
> 
> We don't use 4.14 because we want to use a kernel version more widely
> tested for the majority of features we use. Currently our distribution
> ships 4.9.

Well, I would move to 4.14 as soon as possible, 4.9 is getting "long in
the tooth", especially when it comes to some of the recent
spectre/meltdown issues....

Anyway, now queued up, thanks.

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

Thread overview: 14+ messages
2018-03-05  4:20 [PATCH] net/mlx4_en: fix potential use-after-free with dma_unmap_page Sarah Newman
2018-03-05 10:09 ` Tariq Toukan
2018-03-05 21:10   ` Sarah Newman
2018-03-06 16:13     ` Tariq Toukan
2018-03-06 20:16       ` Sarah Newman
2018-03-11 15:15         ` Tariq Toukan
2018-04-26  4:00           ` [PATCH v2] " Sarah Newman
2018-04-27 23:48             ` David Miller
2018-05-02 13:50               ` Tariq Toukan
2018-05-02 14:26                 ` David Miller
2018-05-31  1:04                   ` [PATCH v3] " Sarah Newman
2018-05-31  5:36                     ` Greg KH
2018-05-31 17:15                       ` Sarah Newman
2018-06-02 13:33                         ` Greg KH
