* [PATCH] jme: Fix DMA unmap warning
@ 2014-05-05 18:51 Neil Horman
  2014-05-06 20:25 ` Govindarajulu Varadarajan
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-05 18:51 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Guo-Fu Tseng, David S. Miller

The jme driver forgot to check the return status from pci_map_page in its tx
path, causing a dma api warning on unmap.  Easy fix, just do the check and
augment the tx path to tell the stack that the driver is busy so we re-queue the
frame.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Guo-Fu Tseng <cooldavid@cooldavid.org>
CC: "David S. Miller" <davem@davemloft.net>
---
 drivers/net/ethernet/jme.c | 53 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index b0c6050..6e664d9 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -1988,7 +1988,7 @@ jme_alloc_txdesc(struct jme_adapter *jme,
 	return idx;
 }
 
-static void
+static int
 jme_fill_tx_map(struct pci_dev *pdev,
 		struct txdesc *txdesc,
 		struct jme_buffer_info *txbi,
@@ -2005,6 +2005,9 @@ jme_fill_tx_map(struct pci_dev *pdev,
 				len,
 				PCI_DMA_TODEVICE);
 
+	if (unlikely(pci_dma_mapping_error(pdev, dmaaddr)))
+		return -EINVAL;
+
 	pci_dma_sync_single_for_device(pdev,
 				       dmaaddr,
 				       len,
@@ -2021,9 +2024,30 @@ jme_fill_tx_map(struct pci_dev *pdev,
 
 	txbi->mapping = dmaaddr;
 	txbi->len = len;
+	return 0;
 }
 
-static void
+static void jme_drop_tx_map(struct jme_adapter *jme, int startidx, int endidx)
+{
+	struct jme_ring *txring = &(jme->txring[0]);
+	struct jme_buffer_info *txbi = txring->bufinf, *ctxbi;
+	int mask = jme->tx_ring_mask;
+	int j;
+
+	for (j = startidx ; j < endidx ; ++j) {
+		ctxbi = txbi + ((startidx + j + 2) & (mask));
+		pci_unmap_page(jme->pdev,
+				ctxbi->mapping,
+				ctxbi->len,
+				PCI_DMA_TODEVICE);
+
+				ctxbi->mapping = 0;
+				ctxbi->len = 0;
+	}
+
+}
+
+static int
 jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
 {
 	struct jme_ring *txring = &(jme->txring[0]);
@@ -2034,25 +2058,37 @@ jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
 	int mask = jme->tx_ring_mask;
 	const struct skb_frag_struct *frag;
 	u32 len;
+	int ret = 0;
 
 	for (i = 0 ; i < nr_frags ; ++i) {
 		frag = &skb_shinfo(skb)->frags[i];
 		ctxdesc = txdesc + ((idx + i + 2) & (mask));
 		ctxbi = txbi + ((idx + i + 2) & (mask));
 
-		jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi,
+		ret = jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi,
 				skb_frag_page(frag),
 				frag->page_offset, skb_frag_size(frag), hidma);
+		if (ret) {
+			jme_drop_tx_map(jme, idx, idx+i);
+			goto out;
+		}
+
 	}
 
 	len = skb_is_nonlinear(skb) ? skb_headlen(skb) : skb->len;
 	ctxdesc = txdesc + ((idx + 1) & (mask));
 	ctxbi = txbi + ((idx + 1) & (mask));
-	jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi, virt_to_page(skb->data),
+	ret = jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi, virt_to_page(skb->data),
 			offset_in_page(skb->data), len, hidma);
+	if (ret)
+		jme_drop_tx_map(jme, idx, idx+i);
+
+out:
+	return ret;
 
 }
 
+
 static int
 jme_tx_tso(struct sk_buff *skb, __le16 *mss, u8 *flags)
 {
@@ -2131,6 +2167,7 @@ jme_fill_tx_desc(struct jme_adapter *jme, struct sk_buff *skb, int idx)
 	struct txdesc *txdesc;
 	struct jme_buffer_info *txbi;
 	u8 flags;
+	int ret = 0;
 
 	txdesc = (struct txdesc *)txring->desc + idx;
 	txbi = txring->bufinf + idx;
@@ -2155,7 +2192,10 @@ jme_fill_tx_desc(struct jme_adapter *jme, struct sk_buff *skb, int idx)
 	if (jme_tx_tso(skb, &txdesc->desc1.mss, &flags))
 		jme_tx_csum(jme, skb, &flags);
 	jme_tx_vlan(skb, &txdesc->desc1.vlan, &flags);
-	jme_map_tx_skb(jme, skb, idx);
+	ret = jme_map_tx_skb(jme, skb, idx);
+	if (ret)
+		return ret;
+
 	txdesc->desc1.flags = flags;
 	/*
 	 * Set tx buffer info after telling NIC to send
@@ -2228,7 +2268,8 @@ jme_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 		return NETDEV_TX_BUSY;
 	}
 
-	jme_fill_tx_desc(jme, skb, idx);
+	if (jme_fill_tx_desc(jme, skb, idx))
+		return NETDEV_TX_BUSY;
 
 	jwrite32(jme, JME_TXCS, jme->reg_txcs |
 				TXCS_SELECT_QUEUE0 |
-- 
1.8.3.1


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-05 18:51 [PATCH] jme: Fix DMA unmap warning Neil Horman
@ 2014-05-06 20:25 ` Govindarajulu Varadarajan
  2014-05-06 20:50   ` Neil Horman
  2014-05-07 19:56 ` David Miller
  2014-05-11 12:49 ` [PATCH] jme: Fix DMA unmap warning Ben Hutchings
  2 siblings, 1 reply; 26+ messages in thread
From: Govindarajulu Varadarajan @ 2014-05-06 20:25 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Guo-Fu Tseng, David S. Miller



On Mon, 5 May 2014, Neil Horman wrote:

> The jme driver forgot to check the return status from pci_map_page in its tx
> path, causing a dma api warning on unmap.  Easy fix, just do the check and
> augment the tx path to tell the stack that the driver is busy so we re-queue the
> frame.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Guo-Fu Tseng <cooldavid@cooldavid.org>
> CC: "David S. Miller" <davem@davemloft.net>
> ---
> drivers/net/ethernet/jme.c | 53 ++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 47 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
> index b0c6050..6e664d9 100644
> --- a/drivers/net/ethernet/jme.c
> +++ b/drivers/net/ethernet/jme.c
> @@ -1988,7 +1988,7 @@ jme_alloc_txdesc(struct jme_adapter *jme,
> 	return idx;
> }
>
> -static void
> +static int
> jme_fill_tx_map(struct pci_dev *pdev,
> 		struct txdesc *txdesc,
> 		struct jme_buffer_info *txbi,
> @@ -2005,6 +2005,9 @@ jme_fill_tx_map(struct pci_dev *pdev,
> 				len,
> 				PCI_DMA_TODEVICE);
>
> +	if (unlikely(pci_dma_mapping_error(pdev, dmaaddr)))
> +		return -EINVAL;
> +
> 	pci_dma_sync_single_for_device(pdev,
> 				       dmaaddr,
> 				       len,
> @@ -2021,9 +2024,30 @@ jme_fill_tx_map(struct pci_dev *pdev,
>
> 	txbi->mapping = dmaaddr;
> 	txbi->len = len;
> +	return 0;
> }
>
> -static void
> +static void jme_drop_tx_map(struct jme_adapter *jme, int startidx, int endidx)
> +{
> +	struct jme_ring *txring = &(jme->txring[0]);
> +	struct jme_buffer_info *txbi = txring->bufinf, *ctxbi;
> +	int mask = jme->tx_ring_mask;
> +	int j;
> +
> +	for (j = startidx ; j < endidx ; ++j) {
> +		ctxbi = txbi + ((startidx + j + 2) & (mask));
> +		pci_unmap_page(jme->pdev,
> +				ctxbi->mapping,
> +				ctxbi->len,
> +				PCI_DMA_TODEVICE);
> +
> +				ctxbi->mapping = 0;
> +				ctxbi->len = 0;

Alignment, a tab from 'for'
> +	}
> +
> +}
> +
> +static int
> jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> {
> 	struct jme_ring *txring = &(jme->txring[0]);
> @@ -2034,25 +2058,37 @@ jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> 	int mask = jme->tx_ring_mask;
> 	const struct skb_frag_struct *frag;
> 	u32 len;
> +	int ret = 0;
>
> 	for (i = 0 ; i < nr_frags ; ++i) {
> 		frag = &skb_shinfo(skb)->frags[i];
> 		ctxdesc = txdesc + ((idx + i + 2) & (mask));
> 		ctxbi = txbi + ((idx + i + 2) & (mask));
>
> -		jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi,
> +		ret = jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi,
> 				skb_frag_page(frag),
> 				frag->page_offset, skb_frag_size(frag), hidma);
> +		if (ret) {
> +			jme_drop_tx_map(jme, idx, idx+i);
> +			goto out;

Why not just return here? You are using out: only once.

Govind
> +		}
> +
> 	}
>
> 	len = skb_is_nonlinear(skb) ? skb_headlen(skb) : skb->len;
> 	ctxdesc = txdesc + ((idx + 1) & (mask));
> 	ctxbi = txbi + ((idx + 1) & (mask));
> -	jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi, virt_to_page(skb->data),
> +	ret = jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi, virt_to_page(skb->data),
> 			offset_in_page(skb->data), len, hidma);
> +	if (ret)
> +		jme_drop_tx_map(jme, idx, idx+i);
> +
> +out:
> +	return ret;
>
> }
>
> +
> static int
> jme_tx_tso(struct sk_buff *skb, __le16 *mss, u8 *flags)
> {
> @@ -2131,6 +2167,7 @@ jme_fill_tx_desc(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> 	struct txdesc *txdesc;
> 	struct jme_buffer_info *txbi;
> 	u8 flags;
> +	int ret = 0;
>
> 	txdesc = (struct txdesc *)txring->desc + idx;
> 	txbi = txring->bufinf + idx;
> @@ -2155,7 +2192,10 @@ jme_fill_tx_desc(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> 	if (jme_tx_tso(skb, &txdesc->desc1.mss, &flags))
> 		jme_tx_csum(jme, skb, &flags);
> 	jme_tx_vlan(skb, &txdesc->desc1.vlan, &flags);
> -	jme_map_tx_skb(jme, skb, idx);
> +	ret = jme_map_tx_skb(jme, skb, idx);
> +	if (ret)
> +		return ret;
> +
> 	txdesc->desc1.flags = flags;
> 	/*
> 	 * Set tx buffer info after telling NIC to send
> @@ -2228,7 +2268,8 @@ jme_start_xmit(struct sk_buff *skb, struct net_device *netdev)
> 		return NETDEV_TX_BUSY;
> 	}
>
> -	jme_fill_tx_desc(jme, skb, idx);
> +	if (jme_fill_tx_desc(jme, skb, idx))
> +		return NETDEV_TX_BUSY;
>
> 	jwrite32(jme, JME_TXCS, jme->reg_txcs |
> 				TXCS_SELECT_QUEUE0 |
> -- 
> 1.8.3.1
>


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-06 20:25 ` Govindarajulu Varadarajan
@ 2014-05-06 20:50   ` Neil Horman
  2014-05-06 20:53     ` David Miller
  0 siblings, 1 reply; 26+ messages in thread
From: Neil Horman @ 2014-05-06 20:50 UTC (permalink / raw)
  To: Govindarajulu Varadarajan; +Cc: netdev, Guo-Fu Tseng, David S. Miller

On Wed, May 07, 2014 at 01:55:00AM +0530, Govindarajulu Varadarajan wrote:
> 
> 
> On Mon, 5 May 2014, Neil Horman wrote:
> 
> >The jme driver forgot to check the return status from pci_map_page in its tx
> >path, causing a dma api warning on unmap.  Easy fix, just do the check and
> >augment the tx path to tell the stack that the driver is busy so we re-queue the
> >frame.
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Guo-Fu Tseng <cooldavid@cooldavid.org>
> >CC: "David S. Miller" <davem@davemloft.net>
> >---
> >drivers/net/ethernet/jme.c | 53 ++++++++++++++++++++++++++++++++++++++++------
> >1 file changed, 47 insertions(+), 6 deletions(-)
> >
> >diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
> >index b0c6050..6e664d9 100644
> >--- a/drivers/net/ethernet/jme.c
> >+++ b/drivers/net/ethernet/jme.c
> >@@ -1988,7 +1988,7 @@ jme_alloc_txdesc(struct jme_adapter *jme,
> >	return idx;
> >}
> >
> >-static void
> >+static int
> >jme_fill_tx_map(struct pci_dev *pdev,
> >		struct txdesc *txdesc,
> >		struct jme_buffer_info *txbi,
> >@@ -2005,6 +2005,9 @@ jme_fill_tx_map(struct pci_dev *pdev,
> >				len,
> >				PCI_DMA_TODEVICE);
> >
> >+	if (unlikely(pci_dma_mapping_error(pdev, dmaaddr)))
> >+		return -EINVAL;
> >+
> >	pci_dma_sync_single_for_device(pdev,
> >				       dmaaddr,
> >				       len,
> >@@ -2021,9 +2024,30 @@ jme_fill_tx_map(struct pci_dev *pdev,
> >
> >	txbi->mapping = dmaaddr;
> >	txbi->len = len;
> >+	return 0;
> >}
> >
> >-static void
> >+static void jme_drop_tx_map(struct jme_adapter *jme, int startidx, int endidx)
> >+{
> >+	struct jme_ring *txring = &(jme->txring[0]);
> >+	struct jme_buffer_info *txbi = txring->bufinf, *ctxbi;
> >+	int mask = jme->tx_ring_mask;
> >+	int j;
> >+
> >+	for (j = startidx ; j < endidx ; ++j) {
> >+		ctxbi = txbi + ((startidx + j + 2) & (mask));
> >+		pci_unmap_page(jme->pdev,
> >+				ctxbi->mapping,
> >+				ctxbi->len,
> >+				PCI_DMA_TODEVICE);
> >+
> >+				ctxbi->mapping = 0;
> >+				ctxbi->len = 0;
> 
> Alignment, a tab from 'for'
Not sure what you mean by this; I don't see any alignment errors above, or in my
tree.  Checkpatch also claims it's clean.

> >+	}
> >+
> >+}
> >+
> >+static int
> >jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> >{
> >	struct jme_ring *txring = &(jme->txring[0]);
> >@@ -2034,25 +2058,37 @@ jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> >	int mask = jme->tx_ring_mask;
> >	const struct skb_frag_struct *frag;
> >	u32 len;
> >+	int ret = 0;
> >
> >	for (i = 0 ; i < nr_frags ; ++i) {
> >		frag = &skb_shinfo(skb)->frags[i];
> >		ctxdesc = txdesc + ((idx + i + 2) & (mask));
> >		ctxbi = txbi + ((idx + i + 2) & (mask));
> >
> >-		jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi,
> >+		ret = jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi,
> >				skb_frag_page(frag),
> >				frag->page_offset, skb_frag_size(frag), hidma);
> >+		if (ret) {
> >+			jme_drop_tx_map(jme, idx, idx+i);
> >+			goto out;
> 
> Why not just return here, you are using out:, only once.
> 
Because that's in keeping with the coding style of the file, and the kernel at
large.  I could just return, sure, but nominally that's only done at the head of a
function when it's very clear no cleanup is needed.  This way, we keep a single
return point in the function.  It's done in several other places within this
driver already (see jme_check_link for an example).

Regards
Neil


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-06 20:50   ` Neil Horman
@ 2014-05-06 20:53     ` David Miller
  2014-05-06 21:04       ` Govindarajulu Varadarajan
  0 siblings, 1 reply; 26+ messages in thread
From: David Miller @ 2014-05-06 20:53 UTC (permalink / raw)
  To: nhorman; +Cc: _govind, netdev, cooldavid

From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 6 May 2014 16:50:46 -0400

>> >+		ctxbi = txbi + ((startidx + j + 2) & (mask));
>> >+		pci_unmap_page(jme->pdev,
>> >+				ctxbi->mapping,
>> >+				ctxbi->len,
>> >+				PCI_DMA_TODEVICE);
>> >+
>> >+				ctxbi->mapping = 0;
>> >+				ctxbi->len = 0;
>> 
>> Alignment, a tab from 'for'
> Not sure what you mean by this, I don't see any alignment errors above, or in my
> tree.  Checkpatch also claims its clean.

Indeed it is indented correctly; the "+" at the beginning of the line in
the patch makes it look like it's not, but it is.


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-06 20:53     ` David Miller
@ 2014-05-06 21:04       ` Govindarajulu Varadarajan
  2014-05-06 21:10         ` David Miller
  0 siblings, 1 reply; 26+ messages in thread
From: Govindarajulu Varadarajan @ 2014-05-06 21:04 UTC (permalink / raw)
  To: David Miller; +Cc: nhorman, _govind, netdev, cooldavid



On Tue, 6 May 2014, David Miller wrote:

> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Tue, 6 May 2014 16:50:46 -0400
>
>>>> +		ctxbi = txbi + ((startidx + j + 2) & (mask));
>>>> +		pci_unmap_page(jme->pdev,
>>>> +				ctxbi->mapping,
>>>> +				ctxbi->len,
>>>> +				PCI_DMA_TODEVICE);
>>>> +
>>>> +				ctxbi->mapping = 0;
>>>> +				ctxbi->len = 0;
>>>
>>> Alignment, a tab from 'for'
>> Not sure what you mean by this, I don't see any alignment errors above, or in my
>> tree.  Checkpatch also claims its clean.
>
> Indeed it is indented correctly, the "+" at the beginning of the line in
> the patch makes it look like it's not, but it is.
>

Hmm. This is how it looked when I applied the patch.
http://i.imgur.com/ZdjqJ3d.png

ctxbi->mapping = 0;
ctxbi->len = 0;

is aligned to the '(' of pci_unmap_page. But these two statements are not
arguments of the function. Shouldn't they be aligned with pci_unmap_page and not
the '('?

Govind


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-06 21:04       ` Govindarajulu Varadarajan
@ 2014-05-06 21:10         ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2014-05-06 21:10 UTC (permalink / raw)
  To: _govind; +Cc: nhorman, netdev, cooldavid

From: Govindarajulu Varadarajan <_govind@gmx.com>
Date: Wed, 7 May 2014 02:34:55 +0530 (IST)

> 
> 
> On Tue, 6 May 2014, David Miller wrote:
> 
>> From: Neil Horman <nhorman@tuxdriver.com>
>> Date: Tue, 6 May 2014 16:50:46 -0400
>>
>>>>> +		ctxbi = txbi + ((startidx + j + 2) & (mask));
>>>>> +		pci_unmap_page(jme->pdev,
>>>>> +				ctxbi->mapping,
>>>>> +				ctxbi->len,
>>>>> +				PCI_DMA_TODEVICE);
>>>>> +
>>>>> +				ctxbi->mapping = 0;
>>>>> +				ctxbi->len = 0;
>>>>
>>>> Alignment, a tab from 'for'
>>> Not sure what you mean by this, I don't see any alignment errors
>>> above, or in my
>>> tree.  Checkpatch also claims its clean.
>>
>> Indeed it is indented correctly, the "+" at the beginning of the line
>> in
>> the patch makes it look like it's not, but it is.
>>
> 
> Hmm. This is how it looked when I applied the patch.
> http://i.imgur.com/ZdjqJ3d.png
> 
> ctxbi->mapping = 0;
> ctxbi->len = 0;
> 
> is aligned to '(' of pci_unmap_page. But these two statements are not
> arg of
> the function. Shouldn't it be aligned along pci_unmap_page and not '('

It should be aligned to the first column after the opening "(" of the
function call.


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-05 18:51 [PATCH] jme: Fix DMA unmap warning Neil Horman
  2014-05-06 20:25 ` Govindarajulu Varadarajan
@ 2014-05-07 19:56 ` David Miller
  2014-05-07 20:33   ` Neil Horman
  2014-05-08 19:29   ` [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver Neil Horman
  2014-05-11 12:49 ` [PATCH] jme: Fix DMA unmap warning Ben Hutchings
  2 siblings, 2 replies; 26+ messages in thread
From: David Miller @ 2014-05-07 19:56 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, cooldavid

From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon,  5 May 2014 14:51:47 -0400

> The jme driver forgot to check the return status from pci_map_page in its tx
> path, causing a dma api warning on unmap.  Easy fix, just do the check and
> augment the tx path to tell the stack that the driver is busy so we re-queue the
> frame.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

Applied thanks Neil.

Probably we should eventually come up with a more suitable recovery
scheme in the generic netdev queuing layer for these situations.

If a mapping fails, usually it's because of resource exhaustion, and that's
a "wait and try again" type situation, in the hope that whatever resources
are keeping the allocation from happening will get released.

But that's not exactly what we do right now when NETDEV_TX_BUSY is
signalled.  We'll just loop over and over in the TX software interrupt
handler for the qdisc.

Perhaps an hrtimer or similar would be more appropriate.
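
By way of illustration, such a timer-driven retry could look roughly like
this (a hypothetical sketch only; the retry_timer field does not exist in
struct netdev_queue today):

	/* Hypothetical: instead of re-running the qdisc immediately on
	 * NETDEV_TX_BUSY, arm a short hrtimer and reschedule the queue
	 * when it fires.  Assumes a struct hrtimer retry_timer member,
	 * initialized elsewhere with hrtimer_init(). */
	static enum hrtimer_restart txq_retry_expired(struct hrtimer *t)
	{
		struct netdev_queue *txq =
			container_of(t, struct netdev_queue, retry_timer);

		netif_schedule_queue(txq);	/* re-run the qdisc later */
		return HRTIMER_NORESTART;
	}

	static void txq_retry_later(struct netdev_queue *txq)
	{
		hrtimer_start(&txq->retry_timer, ms_to_ktime(1),
			      HRTIMER_MODE_REL);
	}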


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-07 19:56 ` David Miller
@ 2014-05-07 20:33   ` Neil Horman
  2014-05-07 20:44     ` David Miller
  2014-05-08  9:02     ` David Laight
  2014-05-08 19:29   ` [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver Neil Horman
  1 sibling, 2 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-07 20:33 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, cooldavid

On Wed, May 07, 2014 at 03:56:13PM -0400, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Mon,  5 May 2014 14:51:47 -0400
> 
> > The jme driver forgot to check the return status from pci_map_page in its tx
> > path, causing a dma api warning on unmap.  Easy fix, just do the check and
> > augment the tx path to tell the stack that the driver is busy so we re-queue the
> > frame.
> > 
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> 
> Applied thanks Neil.
> 
No problem.

> Probably we should eventually come up with a perhaps more suitable recovery
> scheme in the generic netdev queuing layer for these situations.
> 
> If a mapping fails, usually it's because of resource exhaustion and that's
> a "wait and try again" type situation in hoping that whatever releases
> are keeping the allocation from happening will get released.
> 
> But that's not exactly what we do right now when NETDEV_TX_BUSY is
> signalled.  We'll just loop over and over in the TX software interrupt
> handler for the qdisc.
> 
> Perhaps an hrtimer or similar would be more appropriate.
> 
Hm, timers seem like they might be a bit too 'dead reckoned' I think.  I.e.
there's no correlation or interlock between how long we set the timer to expire,
and the likelihood that the resource limitation is cleared (or was cleared and
recurred from a subsequent glut of traffic).

Perhaps a solution is a signalling mechanism tied to completion interrupts?
I.e. a mapping failure gets reported to the stack, which causes the
corresponding queue to be stopped, until such time as the driver signals a safe
restart by the reception of a tx completion interrupt?  I'm actually tinkering
right now with a mechanism that provides guidance to the stack as to how many
dma descriptors are available in a given net_device that might come in handy
here.
Neil
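
As a concrete illustration of that kind of interlock (a hypothetical driver
sketch using the generic stop/wake queue API; the foo_* names and priv
fields are made up):

	/* Hypothetical xmit path: on a mapping failure, stop the queue so
	 * the stack holds the frame instead of respinning on it. */
	static netdev_tx_t foo_start_xmit(struct sk_buff *skb,
					  struct net_device *dev)
	{
		struct foo_priv *priv = netdev_priv(dev);
		dma_addr_t mapping;

		mapping = dma_map_single(priv->dmadev, skb->data, skb->len,
					 DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(priv->dmadev, mapping))) {
			netif_stop_queue(dev);	/* wait for a completion */
			return NETDEV_TX_BUSY;	/* stack requeues the skb */
		}
		/* ... post the descriptor and kick the hardware ... */
		return NETDEV_TX_OK;
	}

	/* Hypothetical tx-completion handler: the unmap here is what frees
	 * up iommu space, so it is a safe point to restart the queue. */
	static void foo_tx_complete(struct foo_priv *priv)
	{
		/* ... unmap and free the completed skbs ... */
		if (netif_queue_stopped(priv->netdev))
			netif_wake_queue(priv->netdev);
	}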


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-07 20:33   ` Neil Horman
@ 2014-05-07 20:44     ` David Miller
  2014-05-07 21:54       ` Neil Horman
  2014-05-08  9:02     ` David Laight
  1 sibling, 1 reply; 26+ messages in thread
From: David Miller @ 2014-05-07 20:44 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, cooldavid

From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 7 May 2014 16:33:17 -0400

> Perhaps a solution is a signalling mechanism tied to completion interrupts?
> I.e. a mapping failure gets reported to the stack, which causes the
> correspondnig queue to be stopped, until such time a the driver signals a safe
> restart by the reception of a tx completion interrupt?  I'm actually tinkering
> right now with a mechanism that provides guidance to the stack as to how many
> dma descriptors are available in a given net_device that might come in handy
> here

Another idea is to signal it from the IOMMU unmap itself, since that is the
point where the resource becomes available.

Actually either approach is fine with me.


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-07 20:44     ` David Miller
@ 2014-05-07 21:54       ` Neil Horman
  0 siblings, 0 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-07 21:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, cooldavid

On Wed, May 07, 2014 at 04:44:34PM -0400, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Wed, 7 May 2014 16:33:17 -0400
> 
> > Perhaps a solution is a signalling mechanism tied to completion interrupts?
> > I.e. a mapping failure gets reported to the stack, which causes the
> > correspondnig queue to be stopped, until such time a the driver signals a safe
> > restart by the reception of a tx completion interrupt?  I'm actually tinkering
> > right now with a mechanism that provides guidance to the stack as to how many
> > dma descriptors are available in a given net_device that might come in handy
> > here
> 
> Another idea is to signal it from the IOMMU unmap itself, since that is the
> point where the resource becomes available.
> 
> Actually either approach is fine with me.
> 
I'll look into it shortly.  Thanks for the idea!
Neil


* RE: [PATCH] jme: Fix DMA unmap warning
  2014-05-07 20:33   ` Neil Horman
  2014-05-07 20:44     ` David Miller
@ 2014-05-08  9:02     ` David Laight
  2014-05-08 11:00       ` Neil Horman
  2014-05-08 17:24       ` David Miller
  1 sibling, 2 replies; 26+ messages in thread
From: David Laight @ 2014-05-08  9:02 UTC (permalink / raw)
  To: 'Neil Horman', David Miller; +Cc: netdev, cooldavid

From: Neil Horman
...
> Perhaps a solution is a signalling mechanism tied to completion interrupts?
> I.e. a mapping failure gets reported to the stack, which causes the
> correspondnig queue to be stopped, until such time a the driver signals a safe
> restart by the reception of a tx completion interrupt?  I'm actually tinkering
> right now with a mechanism that provides guidance to the stack as to how many
> dma descriptors are available in a given net_device that might come in handy

Is there any mileage in the driver pre-allocating a block of iommu entries
and then allocating them to the tx and rx buffers itself?
This might need some 'claw back' mechanism to get 'fair' (ok working)
allocations when there aren't enough entries for all the drivers.

I remember some old systems where the cost of setting up the iommu
entries was such that the breakeven point for copying data was
measured as about 1k bytes. I've no idea what it is for these systems.

	David


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-08  9:02     ` David Laight
@ 2014-05-08 11:00       ` Neil Horman
  2014-05-08 11:16         ` David Laight
  2014-05-08 17:24       ` David Miller
  1 sibling, 1 reply; 26+ messages in thread
From: Neil Horman @ 2014-05-08 11:00 UTC (permalink / raw)
  To: David Laight; +Cc: David Miller, netdev, cooldavid

On Thu, May 08, 2014 at 09:02:04AM +0000, David Laight wrote:
> From: Neil Horman
> ...
> > Perhaps a solution is a signalling mechanism tied to completion interrupts?
> > I.e. a mapping failure gets reported to the stack, which causes the
> > correspondnig queue to be stopped, until such time a the driver signals a safe
> > restart by the reception of a tx completion interrupt?  I'm actually tinkering
> > right now with a mechanism that provides guidance to the stack as to how many
> > dma descriptors are available in a given net_device that might come in handy
> 
> Is there any mileage in the driver pre-allocating a block of iommu entries
> and then allocating them to the tx and rx buffers itself?
> This might need some 'claw back' mechanism to get 'fair' (ok working)
> allocations when there aren't enough entries for all the drivers.
> 
I don't think that will work (or more specifically, it won't work in a wide
enough range of cases).  We can't reasonably predict how many devices will need
to use dma and how much space each device will need.  A common desktop system
will likely have no issue with reserving enough room for each nic, but a system
with 5+ nics/infiniband/fcoe on board is going to have a lot more trouble.  And
if you happen to be using swiotlb for whatever reason, you may not be able to
reserve enough space at all.  

What we really need is just a way for the dma_* api to put backpressure on
callers for a very brief period of time.  That's why I was thinking something
involving completion interrupts or unmapping events.

> I remember some old systems where the cost of setting up the iommu
> entries was such that the breakeven point for copying data was
> measured as about 1k bytes. I've no idea what it is for these systems.
> 
> 	David
> 
> 
> 
> 


* RE: [PATCH] jme: Fix DMA unmap warning
  2014-05-08 11:00       ` Neil Horman
@ 2014-05-08 11:16         ` David Laight
  0 siblings, 0 replies; 26+ messages in thread
From: David Laight @ 2014-05-08 11:16 UTC (permalink / raw)
  To: 'Neil Horman'; +Cc: David Miller, netdev, cooldavid

From: Neil Horman
> On Thu, May 08, 2014 at 09:02:04AM +0000, David Laight wrote:
> > From: Neil Horman
> > ...
> > > Perhaps a solution is a signalling mechanism tied to completion interrupts?
> > > I.e. a mapping failure gets reported to the stack, which causes the
> > > correspondnig queue to be stopped, until such time a the driver signals a safe
> > > restart by the reception of a tx completion interrupt?  I'm actually tinkering
> > > right now with a mechanism that provides guidance to the stack as to how many
> > > dma descriptors are available in a given net_device that might come in handy
> >
> > Is there any mileage in the driver pre-allocating a block of iommu entries
> > and then allocating them to the tx and rx buffers itself?
> > This might need some 'claw back' mechanism to get 'fair' (ok working)
> > allocations when there aren't enough entries for all the drivers.
> >
> I don't think that will work (or more specifically, it won't work in a wide
> enough range of cases).  We can't reasonably predict how many devices will need
> to use dma and how much space each device will need.  A common desktop system
> will likely have no issue with reserving enough room for each nic, but a system
> with 5+ nics/infiniband/fcoe on board is going to have a lot more trouble.  And
> if you happen to be using swiotlb for whatever reason, you may not be able to
> reserve enough space at all.

Which is why I suggested a 'claw back' mechanism to reclaim some entries.
NFI how it could be made to work though.

> What we really need is just a way for the dma_* api to put backpressure on
> callers for a very brief period of time.  Thats why I was thinking something
> involving completion interupts or unmapping events.

A problem I see is that a large number of iommu entries can get used
by network drivers for receive buffers.
If the system is short of iommu entries, then the number of rx buffers
will need to be limited.

Maybe there should be 'low' and 'high' priority requests?
Ethernet drivers using low priority requests for most requests
- except the first few rx buffers, and the first tx buffer?

Your wakeup mechanism would still be needed.

	David


* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-08  9:02     ` David Laight
  2014-05-08 11:00       ` Neil Horman
@ 2014-05-08 17:24       ` David Miller
  1 sibling, 0 replies; 26+ messages in thread
From: David Miller @ 2014-05-08 17:24 UTC (permalink / raw)
  To: David.Laight; +Cc: nhorman, netdev, cooldavid

From: David Laight <David.Laight@ACULAB.COM>
Date: Thu, 8 May 2014 09:02:04 +0000

> From: Neil Horman
> ...
>> Perhaps a solution is a signalling mechanism tied to completion interrupts?
>> I.e. a mapping failure gets reported to the stack, which causes the
>> correspondnig queue to be stopped, until such time a the driver signals a safe
>> restart by the reception of a tx completion interrupt?  I'm actually tinkering
>> right now with a mechanism that provides guidance to the stack as to how many
>> dma descriptors are available in a given net_device that might come in handy
> 
> Is there any mileage in the driver pre-allocating a block of iommu entries
> and then allocating them to the tx and rx buffers itself?
> This might need some 'claw back' mechanism to get 'fair' (ok working)
> allocations when there aren't enough entries for all the drivers.

The idea of preallocation has been explored before, but those efforts
never went very far.

In the case that we're mapping SKBs into the TX or RX ring, there is
little benefit cost wise.  As described much of the cost is installing
the translation, and that can't be done until we have the SKB itself.

Would it help with resource exhaustion?  I'm not so sure, because I'd
rather have everything that isn't currently in use available to those
entities that have an immediate need rather than holding onto space
"just in case".

> I remember some old systems where the cost of setting up the iommu
> entries was such that the breakeven point for copying data was
> measured as about 1k bytes. I've no idea what it is for these systems.

There are usually two costs associated with that: the first is the
spinlock that protects the iommu allocation data structures, and the
second is programming the IOMMU hardware to flush the I/O TLB when
mappings change.

There isn't much you can do about the spinlock, but for the other
problem I experimented and implemented a scheme where the allocations
are done sequentially and therefore the I/O TLB flush only happens
once each time we wrap around, thus mitigating that cost.

See arch/sparc/kernel/iommu.c:iommu_range_alloc()
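
In outline, the trick looks something like this (a simplified sketch of the
idea, not the actual sparc64 code; the table layout and flush hook are
stand-ins):

	/* Simplified stand-in for the real iommu allocator state */
	struct iommu_table {
		unsigned long *map;	/* allocation bitmap */
		unsigned long size;	/* total number of entries */
		unsigned long hint;	/* where the next search starts */
	};

	static void iommu_flush_tlb(struct iommu_table *tbl);	/* hw hook */

	/* Allocate ranges sequentially from a moving hint so the I/O TLB
	 * only needs flushing when the allocator wraps back to the start. */
	static unsigned long iommu_alloc_range(struct iommu_table *tbl,
					       unsigned int npages)
	{
		unsigned long entry;

		entry = bitmap_find_next_zero_area(tbl->map, tbl->size,
						   tbl->hint, npages, 0);
		if (entry >= tbl->size) {
			/* Wrapped: flush once, then retry from the start */
			iommu_flush_tlb(tbl);
			entry = bitmap_find_next_zero_area(tbl->map,
							   tbl->size, 0,
							   npages, 0);
			if (entry >= tbl->size)
				return -1UL;	/* address space exhausted */
		}
		bitmap_set(tbl->map, entry, npages);
		tbl->hint = entry + npages;
		return entry;
	}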

Unfortunately, on newer sparc64 systems the IOMMU PTE updates are done
with hypervisor calls over which I have no control, and they
unconditionally do an IOMMU TLB flush, and therefore this mitigation
trick is no longer possible.


* [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-07 19:56 ` David Miller
  2014-05-07 20:33   ` Neil Horman
@ 2014-05-08 19:29   ` Neil Horman
  2014-05-09  8:55     ` David Laight
                       ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-08 19:29 UTC (permalink / raw)
  To: netdev; +Cc: davem, Neil Horman

What about something like this?  It's not even compile tested, but let me know
what you think of the idea.  The reasoning behind it is that transient resources
like dma address ranges in the iommu or swiotlb have the following attributes:

1) they are quickly allocated and freed

2) Usually handled by simply trying again at some later point in time

3) Likely to fail again if tried again immediately.

4) Not conducive to interlocked signaling mechanisms, as the time it takes to
find and signal a waiting tx queue that resources are now available may take
longer than needed, and may still result in failure, as competing allocators
may race in and claim said resources during the signaling period.

As such, what if we try something simpler, like a linear backoff? In this
example we add a TX return code that indicates that the driver is not busy, but
unable to transmit due to resource constraints.  This signals the qdisc layer to
skip trying to drain this transmit queue for a short period of time, with
subsequent similar errors causing increased backoff.  Once the condition is
cleared, the backoff delay is removed and operation returns to normal.

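For illustration, a driver would report the condition roughly like so (a
hypothetical fragment showing how a driver might use the proposed code; not
part of the patch below):

	/* Hypothetical driver usage: distinguish a constrained mapping from
	 * a genuinely busy queue so the qdisc can back off. */
	if (jme_fill_tx_desc(jme, skb, idx))
		return NETDEV_TX_BACKOFF;
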
Thoughts?

Neil

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
---
 include/linux/netdevice.h |  4 ++++
 net/sched/sch_generic.c   | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a803d79..2b88ccd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -105,6 +105,7 @@ enum netdev_tx {
 	NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
 	NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy*/
 	NETDEV_TX_LOCKED = 0x20,	/* driver tx lock was already taken */
+	NETDEV_TX_BACKOFF = 0x40,	/* driver resource constrained */
 };
 typedef enum netdev_tx netdev_tx_t;
 
@@ -572,6 +573,9 @@ struct netdev_queue {
 
 	unsigned long		state;
 
+	unsigned long		backoff_count;
+	unsigned long		skip_count;
+
 #ifdef CONFIG_BQL
 	struct dql		dql;
 #endif
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e1543b0..64bf6fe 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -30,6 +30,8 @@
 #include <net/pkt_sched.h>
 #include <net/dst.h>
 
+#define MAX_BACKOFF_COUNT 4
+
 /* Qdisc to use by default */
 const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
 EXPORT_SYMBOL(default_qdisc_ops);
@@ -121,6 +123,15 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 {
 	int ret = NETDEV_TX_BUSY;
 
+	/* Only attempt transmit modulo the backoff_count */
+	if (txq->backoff_count) {
+		if (txq->skip_count % txq->backoff_count) {
+			txq->skip_count++;
+			return ret;
+		}
+		txq->skip_count = 0;
+	}
+
 	/* And release qdisc */
 	spin_unlock(root_lock);
 
@@ -135,9 +146,17 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
 		ret = qdisc_qlen(q);
+		txq->backoff_count = 0;
 	} else if (ret == NETDEV_TX_LOCKED) {
 		/* Driver try lock failed */
 		ret = handle_dev_cpu_collision(skb, txq, q);
+		txq->backoff_count = 0;
+	} else if (ret == NETDEV_TX_BACKOFF) {
+		/* The driver claims to be resource constrained */
+		if (txq->backoff_count != MAX_BACKOFF_COUNT)
+			txq->backoff_count++;
+		txq->skip_count++;
+		ret = dev_requeue_skb(skb, q);
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
@@ -145,6 +164,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 					     dev->name, ret, q->q.qlen);
 
 		ret = dev_requeue_skb(skb, q);
+		txq->backoff_count = 0;
 	}
 
 	if (ret && netif_xmit_frozen_or_stopped(txq))
-- 
1.8.3.1


* RE: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-08 19:29   ` [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver Neil Horman
@ 2014-05-09  8:55     ` David Laight
  2014-05-09 11:17       ` Neil Horman
  2014-05-18 16:19     ` Neil Horman
  2014-05-21 18:05     ` Cong Wang
  2 siblings, 1 reply; 26+ messages in thread
From: David Laight @ 2014-05-09  8:55 UTC (permalink / raw)
  To: 'Neil Horman', netdev; +Cc: davem

From: Neil Horman
> What about something like this?  Its not even compile tested, but let me know
> what you think of the idea.  The reasoning behind it is that transient resources
> like dma address ranges in the iommu or swiotlb have the following attributes
> 
> 1) they are quickly allocated and freed

I'm not sure that is true for iommu entries.
The ones allocated for ethernet receive are effectively permanently allocated.

Imagine a system with 512 iommu entries.
An ethernet driver allocates 128 RX ring entries using one iommu entry each.
There are now no iommu entries left for anything else.
That system will only work if the ethernet driver reduces the number of
active rx buffers.

It is also possible (but less likely) that ethernet transmit will
use so many iommu entries that none are left for more important things.
The network will work with only one active transmit, but you may
have to do disc and/or usb transfers even when resource limited.

	David


* Re: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-09  8:55     ` David Laight
@ 2014-05-09 11:17       ` Neil Horman
  2014-05-09 12:53         ` David Laight
  0 siblings, 1 reply; 26+ messages in thread
From: Neil Horman @ 2014-05-09 11:17 UTC (permalink / raw)
  To: David Laight; +Cc: netdev, davem

On Fri, May 09, 2014 at 08:55:10AM +0000, David Laight wrote:
> From: Neil Horman
> > What about something like this?  Its not even compile tested, but let me know
> > what you think of the idea.  The reasoning behind it is that transient resources
> > like dma address ranges in the iommu or swiotlb have the following attributes
> > 
> > 1) they are quickly allocated and freed
> 
> I'm not sure that is true for iommu entries.
> The ones allocated for ethernet receive are effectively permanently allocated.
> 
I disagree.  A review of several NIC drivers shows the pseudocode for the RX
path to be:

For each SKB1 on the RX ring:
	If LENGTH(SKB1) < COPYBREAK
		SKB2 = ALLOCATE_SKB
		COPY_DATA(SKB2, SKB1)
		RECEIVE(SKB2)
	Else
		UNMAP(SKB1)
		RECEIVE(SKB1)
		SKB1 = ALLOCATE_SKB
		MAP(SKB1)
Done

The value of COPYBREAK is configurable, but is never more than 256 bytes, and is
often 128 or fewer bytes (sometimes zero).  This will cause some small udp
traffic to get handled as copies, but not more reasonably sized udp packets, and
no well-behaved tcp traffic will ever get copied.  Those iommu entries will come
and go very quickly.

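Fleshed out, that copybreak pattern looks roughly like the following (a
hedged composite sketch, not any one driver's code; the foo_* names and the
priv/desc fields are illustrative):

	#define COPYBREAK 256	/* illustrative threshold */

	static void foo_receive(struct foo_priv *priv,
				struct foo_rx_desc *desc, unsigned int len)
	{
		struct sk_buff *skb = desc->skb;

		if (len < COPYBREAK) {
			/* Small frame: copy into a fresh skb and leave the
			 * mapped buffer (and its iommu entry) on the ring. */
			struct sk_buff *copy =
				netdev_alloc_skb(priv->netdev, len);

			if (copy) {
				dma_sync_single_for_cpu(priv->dmadev,
							desc->mapping, len,
							DMA_FROM_DEVICE);
				skb_copy_to_linear_data(copy, skb->data, len);
				skb_put(copy, len);
				netif_receive_skb(copy);
				return;
			}
			/* allocation failed: fall through, hand up skb */
		}
		/* Larger frame: unmap (releasing the iommu entry), hand the
		 * skb up, then map a replacement buffer for the ring slot.
		 * Error handling elided. */
		dma_unmap_single(priv->dmadev, desc->mapping,
				 priv->rx_buf_len, DMA_FROM_DEVICE);
		skb_put(skb, len);
		netif_receive_skb(skb);
		desc->skb = netdev_alloc_skb(priv->netdev, priv->rx_buf_len);
		desc->mapping = dma_map_single(priv->dmadev, desc->skb->data,
					       priv->rx_buf_len,
					       DMA_FROM_DEVICE);
	}
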
> Imagine a system with 512 iommu entries.
> An ethernet driver allocates 128 RX ring entries using one iommu entry each.
> There are now no iommu entries left for anything else.
That actually leaves 384 entries remaining, but that's neither here nor there :).
  iommus work like tlbs, in that they don't have a fixed number of entries.
Each iommu has a set of page tables, wherein a set of pages can be mapped.  If
a packet fits within a single page, then yes, it takes up a single 'pte'; if a
packet takes multiple pages (say a gso packet for example), or if some other
device is doing lots of high volume dma (FCoE/iscsi/roce/infiniband), then
multiple pte's will be taken up handling the larger data buffer.  It's a
limited resource shared unevenly between all dma-ing devices.  That's why we
can't reserve entries, because you don't have a lot of space to begin with, and
you don't know how much you'll need until you have the data to send, which can
vary wildly depending on the device.
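(For example, a 64KB gso buffer spans sixteen 4KB pages and so consumes
sixteen ptes, while a single 1500 byte frame typically consumes one.)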
 
> That system will only work if the ethernet driver reduces the number of
> active rx buffers.
> 
Reducing the number of active rx buffers is tantamount to reducing the ring size
of a NIC, which is already a tunable feature, and not one to be received overly
well by people trying to maximize their network throughput.

> It is also possible (but less likely) that ethernet transmit will
> use so many iommu entries that none are left for more important things.
This is possible in all cases, not just transmit.

> The network will work with only one active transmit, but you may
> have to do disc and/or usb transfers even when resource limited.
> 
Hence my RFC patch in my prior note.  If we're resource constrained, push back
on the qdisc such that we try not to use as many mappings for a short time without
causing too much overhead.  It doesn't affect receive of course, but it's very
hard to deal with managing mapping use when the producer is not directly
controllable by us.

Neil

> 	David
> 
> 
> 
> 


* RE: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-09 11:17       ` Neil Horman
@ 2014-05-09 12:53         ` David Laight
  2014-05-09 15:33           ` Neil Horman
  0 siblings, 1 reply; 26+ messages in thread
From: David Laight @ 2014-05-09 12:53 UTC (permalink / raw)
  To: 'Neil Horman'; +Cc: netdev, davem

From: Neil Horman 
> On Fri, May 09, 2014 at 08:55:10AM +0000, David Laight wrote:
> > From: Neil Horman
> > > What about something like this?  Its not even compile tested, but let me know
> > > what you think of the idea.  The reasoning behind it is that transient resources
> > > like dma address ranges in the iommu or swiotlb have the following attributes
> > >
> > > 1) they are quickly allocated and freed
> >
> > I'm not sure that is true for iommu entries.
> > The ones allocated for ethernet receive are effectively permanently allocated.
> >
> I disagree.  A review of several NIC drivers shows the pseudocode for the RX
> patch to be:
> 
> For SKB X on the RX ring:
> 	If LENGTH(SKB) < COPYBREAK
> 		SKB2 = ALLOCATE_SKB
> 		COPY_DATA(SKB2, SKB1)
> 		RECEIVE(SKB2)
> 	Else
> 		UNMAP(SKB1)
> 		RECEIVE(SKB1)
> 		SKB1 = ALLOCATE_SKB
> 		MAP(SKB1)
> Done
> 
> The value of COPYBREAK is configurable, but is never more than 256 bytes, and is
> often 128 or fewer bytes (sometimes zero).  This will cause some udp traffic to
> get handled as copies, but never more reasonably sized udp packets, and no well
> behaved tcp traffic will ever get copied.  Those iommu entries will come and go
> very quickly.

If I understand correctly, the iommu entries are needed for the ethernet
chip to access main memory. The usual state is that the RX ring is full of
buffers - all of which are mapped for dma.

> > Imagine a system with 512 iommu entries.
> > An ethernet driver allocates 128 RX ring entries using one iommu entry each.
> > There are now no iommu entries left for anything else.

> That actually leaves 384 entries remaiing, but thats neither here nor there :).
It seems I failed to write 'include 4 such interfaces and' ...

>   iommus work like tlbs, in that they don't have a fixed number of entries.
> Each iommu has a set of page tables, wherein a set of pages can be mapped.
> ...

Yes I realise that what is actually being allocated is io virtual address space.
But the '1 buffer' == '1 slot' simplification is reasonably appropriate for
some 'thought designs'.


> Its a
> limited resource shared unevenly between all dma-ing devices.  Thats why we
> can't reserve entries, because you don't have alot of space to begin with, and
> you don't know how much you'll need until you have the data to send, which can
> vary wildly depending on the device.

You do know how much resource is left though.

> > That system will only work if the ethernet driver reduces the number of
> > active rx buffers.
> >
> Reducing the number of active rx buffers is tantamount to reducing the ring size
> of a NIC, which is already a tunable feature, and not one to be received overly
> well by people trying to maximize their network througput.

Except that it needs to be done automatically if there is a global constraint
(be it iommu space or dma-able memory).
In isolation a large rx ring is almost always an advantage.

> > It is also possible (but less likely) that ethernet transmit will
> > use so many iommu entries that none are left for more important things.
> This is possible in all cases, not just transmit.
> 
> > The network will work with only one active transmit, but you may
> > have to do disc and/or usb transfers even when resource limited.
> >
> Hence my RFC patch in my prior note.  If we're resource constrained, push back
> on the qdisc such that we try not to use as many mappings for short time without
> causing too much overhead.  It doesn't affect receive of course, but its very
> hard to deal with managing mapping use when the producer is not directly
> controllable by us.

But ethernet receive is likely to be the big user of iommu entries.
If you constrain it, then there probably won't be allocation failures
elsewhere.

	David


* Re: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-09 12:53         ` David Laight
@ 2014-05-09 15:33           ` Neil Horman
  2014-05-09 15:46             ` David Laight
  0 siblings, 1 reply; 26+ messages in thread
From: Neil Horman @ 2014-05-09 15:33 UTC (permalink / raw)
  To: David Laight; +Cc: netdev, davem

On Fri, May 09, 2014 at 12:53:29PM +0000, David Laight wrote:
> From: Neil Horman 
> > On Fri, May 09, 2014 at 08:55:10AM +0000, David Laight wrote:
> > > From: Neil Horman
> > > > What about something like this?  Its not even compile tested, but let me know
> > > > what you think of the idea.  The reasoning behind it is that transient resources
> > > > like dma address ranges in the iommu or swiotlb have the following attributes
> > > >
> > > > 1) they are quickly allocated and freed
> > >
> > > I'm not sure that is true for iommu entries.
> > > The ones allocated for ethernet receive are effectively permanently allocated.
> > >
> > I disagree.  A review of several NIC drivers shows the pseudocode for the RX
> > patch to be:
> > 
> > For SKB X on the RX ring:
> > 	If LENGTH(SKB) < COPYBREAK
> > 		SKB2 = ALLOCATE_SKB
> > 		COPY_DATA(SKB2, SKB1)
> > 		RECEIVE(SKB2)
> > 	Else
> > 		UNMAP(SKB1)
> > 		RECEIVE(SKB1)
> > 		SKB1 = ALLOCATE_SKB
> > 		MAP(SKB1)
> > Done
> > 
> > The value of COPYBREAK is configurable, but is never more than 256 bytes, and is
> > often 128 or fewer bytes (sometimes zero).  This will cause some udp traffic to
> > get handled as copies, but never more reasonably sized udp packets, and no well
> > behaved tcp traffic will ever get copied.  Those iommu entries will come and go
> > very quickly.
> 
> If I understand correctly the iommu entries are needed for the ethernet
> chip to access main memory. The usual state is that RX ring is full
> buffers - all of which are mapped for dma.
> 
This is true, but those buffers are not static; they are unmapped, and new ones
are mapped in their place on receive, so there's an opportunity for those
unmapped buffers to get used by other entities/hardware during the receive
process.  We could do something as you suggest, in which we create an api to
reserve the address space for a buffer, and just continually reuse them, but as
Dave points out, it's a limited resource and reserving them may be unfair to
other transient users, especially given that the RX path has to 'over-reserve',
allocating space for the largest packet size receivable, even if we get less
than that.

> > > Imagine a system with 512 iommu entries.
> > > An ethernet driver allocates 128 RX ring entries using one iommu entry each.
> > > There are now no iommu entries left for anything else.
> 
> > That actually leaves 384 entries remaiing, but thats neither here nor there :).
> I seem to fail to write 'include 4 such interfaces and' ...
> 
Ah, that makes more sense

> >   iommus work like tlbs, in that they don't have a fixed number of entries.
> > Each iommu has a set of page tables, wherein a set of pages can be mapped.
> > ...
> 
> Yes I realise that what is actually being allocated is io virtual address space.
> But the '1 buffer' == '1 slot' simplification is reasonable appropriate for
> some 'thought designs'.
> 
Only partly.  Contiguity is also a factor here.  You might have 1000 slots
remaining, but under heavy use, your largest contiguous slot could be
significantly smaller, which is what a dma-using bit of code is actually
interested in.  Keeping track of the largest contiguous slot is significantly
more difficult, and it's all still a bit dicey, because the largest slot might
change between the time a device asks what it is, and actually allocates it.

> 
> > Its a
> > limited resource shared unevenly between all dma-ing devices.  Thats why we
> > can't reserve entries, because you don't have alot of space to begin with, and
> > you don't know how much you'll need until you have the data to send, which can
> > vary wildly depending on the device.
> 
> You do know how much resource is left though.
> 
See above, you can certainly tell in aggregate how much free space is available,
but not what your largest free chunk is.

> > > That system will only work if the ethernet driver reduces the number of
> > > active rx buffers.
> > >
> > Reducing the number of active rx buffers is tantamount to reducing the ring size
> > of a NIC, which is already a tunable feature, and not one to be received overly
> > well by people trying to maximize their network througput.
> 
> Except that it needs to done automatically if there is a global constraint
> (be it iommu space or dma-able memory).
2 Things:

1) Need is really a strong term here.  The penalty for failing a dma mapping is
to drop the frame.  That's not unacceptable in many use cases.

2) It seems to me that global constraint here implies a static, well known
number.  While it's true we can interrogate an iommu, and compare its mapping
size to the ring size of all the NICS/devices on a system to see if we're likely
to exceed the iommu space available, we shouldn't do that.  If a given NIC
doesn't produce much traffic, its ring sizes aren't relevant to the computation.

We're not trying to address a static allocation scheme here.  If a system boots,
it implies that all the receive rings on all the devices were able to reserve the
amount of space they needed in the iommu (as you note earlier, they populate
their rings on init, effectively doing an iommu reservation).  The problem we're
addressing is the periodic lack of space that arises from temporary exhaustion
of iommu space under heavy I/O loads.  We won't know if that happens until it
happens, and we can't just allocate for the worst case, because then we're sure
to run out of space as devices scale up.  Sharing is the way to do this whenever
possible.

> In isolation a large rx ring is almost always an advantage.
> 
No argument there.  But requiring a user to size a ring based on expected
traffic patterns seems like it won't be well received.

> > > It is also possible (but less likely) that ethernet transmit will
> > > use so many iommu entries that none are left for more important things.
> > This is possible in all cases, not just transmit.
> > 
> > > The network will work with only one active transmit, but you may
> > > have to do disc and/or usb transfers even when resource limited.
> > >
> > Hence my RFC patch in my prior note.  If we're resource constrained, push back
> > on the qdisc such that we try not to use as many mappings for short time without
> > causing too much overhead.  It doesn't affect receive of course, but its very
> > hard to deal with managing mapping use when the producer is not directly
> > controllable by us.
> 
> But ethernet receive is likely to be the big user of iommu entries.
> If you constrain it, then there probably won't be allocation failures
> elsewhere.
> 
What makes you say that?  There's no reason a tx ring can't be just as full as a
receive ring under heavy traffic load.  If you want to constrain receive
allocations, do so; we have a knob for that already.

Neil

> 	David
> 
> 
> 
> 


* RE: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-09 15:33           ` Neil Horman
@ 2014-05-09 15:46             ` David Laight
  2014-05-09 17:02               ` Neil Horman
  0 siblings, 1 reply; 26+ messages in thread
From: David Laight @ 2014-05-09 15:46 UTC (permalink / raw)
  To: 'Neil Horman'; +Cc: netdev, davem

From: Neil Horman [mailto:nhorman@tuxdriver.com]
...
> 2 Things:
> 
> 1) Need is really a strong term here.  The penalty for failing a dma mapping is
> to drop the frame.  Thats not unacceptible in many use cases.

Indeed, but a dropped ethernet frame will be recovered by the higher layers.
While not ideal, it is a 'last resort' action.
Note that I'm not suggesting that your deferred retry of the transmit isn't
a good idea, just that it is probably papering over the cracks.

> 2) It seems to me that global constraint here implies a static, well known
> number.  While its true we can interrogate an iommu, and compare its mapping
> size to the ring size of all the NICS/devices on a system to see if we're likely
> to exceed the iommu space available, we shouldn't do that.  If a given NIC
> doesn't produce much traffic, its ring sizes aren't relevant to the computation.

An idle NIC will be using a lot of iommu entries for its receive buffers.

> We're not trying to address a static allocation scheme here.  If a system boots,
> it implies that all the recive rings on all the devices were able to reserve the
> amount of space they needed in the iommu (as you note earlier, they populate
> their rings on init, effectively doing a iommu reservation).  The problem we're
> addressing is the periodic lack of space that arises from temporary exhaustion
> of iommu space under heavy I/O loads.  We won't know if that happens, until it
> happens, and we can't just allocate for the worst case, because then we're sure
> to run out of space as devices scale up.  Sharing is the way to do this whenever
> possible.

Do you have any data for which drivers have active iommu entries when an
allocation fails?

I can imagine systems where almost all the iommu entries are being used
for ethernet rx buffers, and everything else is fighting for the last
few entries.

	David


* Re: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-09 15:46             ` David Laight
@ 2014-05-09 17:02               ` Neil Horman
  0 siblings, 0 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-09 17:02 UTC (permalink / raw)
  To: David Laight; +Cc: netdev, davem

On Fri, May 09, 2014 at 03:46:14PM +0000, David Laight wrote:
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> ...
> > 2 Things:
> > 
> > 1) Need is really a strong term here.  The penalty for failing a dma mapping is
> > to drop the frame.  Thats not unacceptible in many use cases.
> 
> Indeed, but dropping an ethernet frame will be recovered by the higher layers.
> While not ideal, it is a 'last resort' action.
No argument there, but it's the best we have at the moment.

> Note that I'm not suggesting that your deferred retry of the transmit isn't
> a good idea, just that it is probably papering over the cracks.
> 
Except that forcing everyone to a lower throughput based on worst-case scenarios
doesn't seem like a better solution to me.

> > 2) It seems to me that global constraint here implies a static, well known
> > number.  While its true we can interrogate an iommu, and compare its mapping
> > size to the ring size of all the NICS/devices on a system to see if we're likely
> > to exceed the iommu space available, we shouldn't do that.  If a given NIC
> > doesn't produce much traffic, its ring sizes aren't relevant to the computation.
> 
> An idle NIC will be using a lot of iommu entries for its receive buffers.
> 
Yes, and that's really the point!  Only an idle NIC will be (mis)using a lot of
extra iommu entries.  But we have no way to know if a NIC will be idle; we have
to reserve space for it in the iommu, because that's how the hardware is
designed.  The only recourse we have here is to reserve less space for each NIC
RX ring, which punishes the NICs that are active, which is the opposite of
what we really want to do here.
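
To make that concrete, the usual rx refill pattern looks something like
this (a generic, untested sketch -- fill_rx_ring and struct rx_buf are
made-up names, not from any particular driver):

	struct rx_buf { struct page *page; dma_addr_t dma; };

	static int fill_rx_ring(struct pci_dev *pdev, struct rx_buf *ring,
				int rx_ring_size)
	{
		int i;

		for (i = 0; i < rx_ring_size; i++) {
			ring[i].page = alloc_page(GFP_KERNEL);
			if (!ring[i].page)
				return -ENOMEM;
			/* each mapped descriptor pins an iommu entry until
			 * a frame arrives and the buffer is unmapped */
			ring[i].dma = pci_map_page(pdev, ring[i].page, 0,
						   PAGE_SIZE,
						   PCI_DMA_FROMDEVICE);
		}
		return 0;
	}

So a 512-entry RX ring holds 512 live mappings from probe time on, idle
or not.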

> > We're not trying to address a static allocation scheme here.  If a system boots,
> > it implies that all the receive rings on all the devices were able to reserve the
> > amount of space they needed in the iommu (as you note earlier, they populate
> > their rings on init, effectively doing an iommu reservation).  The problem we're
> > addressing is the periodic lack of space that arises from temporary exhaustion
> > of iommu space under heavy I/O loads.  We won't know if that happens, until it
> > happens, and we can't just allocate for the worst case, because then we're sure
> > to run out of space as devices scale up.  Sharing is the way to do this whenever
> > possible.
> 
> Do you have any data on which drivers have active iommu entries when an
> allocation fails?
> 
No, but I'm not sure it matters which driver holds DMA descriptors when an
allocation failure occurs.  I'm operating under the assumption here that drivers
are attempting to allocate a reasonable number of buffers for the work they need
to do.  I'm not arguing that we can't reclaim space by forcing any given driver
to allocate less, only that doing so isn't helpful, in that it will just decrease
receive throughput.  We're trading one problem for another.

> I can imagine systems where almost all the iommu entries are being used
> for ethernet rx buffers, and everything else is fighting for the last
> few entries.
> 
I would argue that it's not quite as unbalanced as all/few, but yes, your point is
sound in that the tx path is fighting for a reduced pool of shared buffers
because of the nature of the RX path's pre-allocation needs.  We could reduce
the number of those buffers statically, but that's really an administrative job,
because the kernel never really knows when a NIC is going to be a high-volume
producer or consumer.

Hmm, that actually makes me wonder: is this a job for something like tuned up in
user space?  Or a combination of something like my backoff patch and tuned?  The
backoff patch helps the tx path in case of an actual exhaustion, and tuned can
be administratively loaded with a profile to indicate which NICs are likely to
be low volume, and can therefore have their ring sizes reduced (e.g. via
ethtool -G), giving the iommu the maximum free pool to service all the other
dma users.

What do you think?
Neil

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-05 18:51 [PATCH] jme: Fix DMA unmap warning Neil Horman
  2014-05-06 20:25 ` Govindarajulu Varadarajan
  2014-05-07 19:56 ` David Miller
@ 2014-05-11 12:49 ` Ben Hutchings
  2014-05-11 13:04   ` Neil Horman
  2 siblings, 1 reply; 26+ messages in thread
From: Ben Hutchings @ 2014-05-11 12:49 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Guo-Fu Tseng, David S. Miller

On Mon, 2014-05-05 at 14:51 -0400, Neil Horman wrote:
[...] 
> -static void
> +static void jme_drop_tx_map(struct jme_adapter *jme, int startidx, int endidx)
> +{
> +	struct jme_ring *txring = &(jme->txring[0]);
> +	struct jme_buffer_info *txbi = txring->bufinf, *ctxbi;
> +	int mask = jme->tx_ring_mask;
> +	int j;
> +
> +	for (j = startidx ; j < endidx ; ++j) {
> +		ctxbi = txbi + ((startidx + j + 2) & (mask));

So you're starting at startidx * 2 + 2; that can't be right.

[...]
> @@ -2228,7 +2268,8 @@ jme_start_xmit(struct sk_buff *skb, struct net_device *netdev)
>  		return NETDEV_TX_BUSY;
>  	}
>  
> -	jme_fill_tx_desc(jme, skb, idx);
> +	if (jme_fill_tx_desc(jme, skb, idx))
> +		return NETDEV_TX_BUSY;

NETDEV_TX_OK

Ben.

>  	jwrite32(jme, JME_TXCS, jme->reg_txcs |
>  				TXCS_SELECT_QUEUE0 |

-- 
Ben Hutchings
Sturgeon's Law: Ninety percent of everything is crap.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] jme: Fix DMA unmap warning
  2014-05-11 12:49 ` [PATCH] jme: Fix DMA unmap warning Ben Hutchings
@ 2014-05-11 13:04   ` Neil Horman
  0 siblings, 0 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-11 13:04 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Guo-Fu Tseng, David S. Miller

On Sun, May 11, 2014 at 01:49:29PM +0100, Ben Hutchings wrote:
> On Mon, 2014-05-05 at 14:51 -0400, Neil Horman wrote:
> [...] 
> > -static void
> > +static void jme_drop_tx_map(struct jme_adapter *jme, int startidx, int endidx)
> > +{
> > +	struct jme_ring *txring = &(jme->txring[0]);
> > +	struct jme_buffer_info *txbi = txring->bufinf, *ctxbi;
> > +	int mask = jme->tx_ring_mask;
> > +	int j;
> > +
> > +	for (j = startidx ; j < endidx ; ++j) {
> > +		ctxbi = txbi + ((startidx + j + 2) & (mask));
> 
> So you're starting at startidx * 2 + 2; that can't be right.
> 
Shoot, I think j needs to be initialized to 0, i.e. something like the sketch below.
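
Untested, but the loop in jme_drop_tx_map would become:

	int count = endidx - startidx;

	for (j = 0 ; j < count ; ++j) {
		ctxbi = txbi + ((startidx + j + 2) & (mask));
		pci_unmap_page(jme->pdev,
				ctxbi->mapping,
				ctxbi->len,
				PCI_DMA_TODEVICE);
		ctxbi->mapping = 0;
		ctxbi->len = 0;
	}

That unmaps exactly the (idx + i + 2) slots that jme_map_tx_skb filled
before the failure.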

> [...]
> > @@ -2228,7 +2268,8 @@ jme_start_xmit(struct sk_buff *skb, struct net_device *netdev)
> >  		return NETDEV_TX_BUSY;
> >  	}
> >  
> > -	jme_fill_tx_desc(jme, skb, idx);
> > +	if (jme_fill_tx_desc(jme, skb, idx))
> > +		return NETDEV_TX_BUSY;
> 
> NETDEV_TX_OK
> 
Yup, I'll send a subsequent patch to fix it, roughly per the sketch below.  Thanks
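
Untested sketch (assuming the right behavior on a mapping failure is to
drop the frame and report success, rather than requeue an skb whose
mappings have already been unwound):

	if (jme_fill_tx_desc(jme, skb, idx)) {
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}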
Neil

> Ben.
> 
> >  	jwrite32(jme, JME_TXCS, jme->reg_txcs |
> >  				TXCS_SELECT_QUEUE0 |
> 
> -- 
> Ben Hutchings
> Sturgeon's Law: Ninety percent of everything is crap.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-08 19:29   ` [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver Neil Horman
  2014-05-09  8:55     ` David Laight
@ 2014-05-18 16:19     ` Neil Horman
  2014-05-21 18:05     ` Cong Wang
  2 siblings, 0 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-18 16:19 UTC (permalink / raw)
  To: netdev; +Cc: davem

On Thu, May 08, 2014 at 03:29:52PM -0400, Neil Horman wrote:
> What about something like this?  It's not even compile-tested, but let me know
> what you think of the idea.  The reasoning behind it is that transient resources
> like dma address ranges in the iommu or swiotlb have the following attributes:
> 
> 1) they are quickly allocated and freed
> 
> 2) Usually handled by simply trying again at some later point in time
> 
> 3) Likely to fail again if tried again immediately.
> 
> 4) Not conducive to interlocked signaling mechanisms as the time it takes to
> find and signal a waiting tx queue that resources are now available may take
> longer than needed, and may still result in failure, as competing allocators
> may race in and claim said resources during the signaling period.
> 
> As such, what if we try something simpler, like a linear backoff? In this
> example we add a TX return code that indicates that the driver is not busy, but
> unable to transmit due to resource constraints.  This signals the qdisc layer to
> skip trying to drain this transmit queue for a short period of time, with
> subsequent similar errors causing increased backoff.  Once the condition is
> cleared, the backoff delay is removed and operation returns to normal.
> 
> Thoughts?
> 
> Neil
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>  include/linux/netdevice.h |  4 ++++
>  net/sched/sch_generic.c   | 20 ++++++++++++++++++++
>  2 files changed, 24 insertions(+)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index a803d79..2b88ccd 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -105,6 +105,7 @@ enum netdev_tx {
>  	NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
>  	NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy*/
>  	NETDEV_TX_LOCKED = 0x20,	/* driver tx lock was already taken */
> +	NETDEV_TX_BACKOFF = 0x40,	/* driver resource constrained */
>  };
>  typedef enum netdev_tx netdev_tx_t;
>  
> @@ -572,6 +573,9 @@ struct netdev_queue {
>  
>  	unsigned long		state;
>  
> +	unsigned long		backoff_count;
> +	unsigned long		skip_count;
> +
>  #ifdef CONFIG_BQL
>  	struct dql		dql;
>  #endif
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index e1543b0..64bf6fe 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -30,6 +30,8 @@
>  #include <net/pkt_sched.h>
>  #include <net/dst.h>
>  
> +#define MAX_BACKOFF_COUNT 4
> +
>  /* Qdisc to use by default */
>  const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
>  EXPORT_SYMBOL(default_qdisc_ops);
> @@ -121,6 +123,15 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
>  {
>  	int ret = NETDEV_TX_BUSY;
>  
> +	/* Only attempt transmit modulo the backoff_count */
> +	if (txq->backoff_count) {
> +		if (txq->skip_count % txq->backoff_count) {
> +			txq->skip_count++;
> +			return ret;
> +		}
> +		txq->skip_count = 0;
> +	}
> +		
>  	/* And release qdisc */
>  	spin_unlock(root_lock);
>  
> @@ -135,9 +146,17 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
>  	if (dev_xmit_complete(ret)) {
>  		/* Driver sent out skb successfully or skb was consumed */
>  		ret = qdisc_qlen(q);
> +		txq->backoff_count = 0;
>  	} else if (ret == NETDEV_TX_LOCKED) {
>  		/* Driver try lock failed */
>  		ret = handle_dev_cpu_collision(skb, txq, q);
> +		txq->backoff_count = 0;
> +	} else if (ret == NETDEV_TX_BACKOFF) {
> +		/* The driver claims to be resource constrained */
> +		if (txq->backoff_count != MAX_BACKOFF_COUNT)
> +			txq->backoff_count++;
> +		txq->skip_count++;
> +		ret = dev_requeue_skb(skb, q);
>  	} else {
>  		/* Driver returned NETDEV_TX_BUSY - requeue skb */
>  		if (unlikely(ret != NETDEV_TX_BUSY))
> @@ -145,6 +164,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
>  					     dev->name, ret, q->q.qlen);
>  
>  		ret = dev_requeue_skb(skb, q);
> +		txq->backoff_count = 0;
>  	}
>  
>  	if (ret && netif_xmit_frozen_or_stopped(txq))

Ping, Dave, do you have any thoughts on this approach?  Is it worth proposing
formally?
Neil

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-08 19:29   ` [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver Neil Horman
  2014-05-09  8:55     ` David Laight
  2014-05-18 16:19     ` Neil Horman
@ 2014-05-21 18:05     ` Cong Wang
  2014-05-21 18:49       ` Neil Horman
  2 siblings, 1 reply; 26+ messages in thread
From: Cong Wang @ 2014-05-21 18:05 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, David Miller

On Thu, May 8, 2014 at 12:29 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> What about something like this?  It's not even compile-tested, but let me know
> what you think of the idea.  The reasoning behind it is that transient resources
> like dma address ranges in the iommu or swiotlb have the following attributes:
>
> 1) they are quickly allocated and freed
>
> 2) Usually handled by simply trying again at some later point in time
>
> 3) Likely to fail again if tried again immediately.
>
> 4) Not conducive to interlocked signaling mechanisms as the time it takes to
> find and signal a waiting tx queue that resources are now available may take
> longer than needed, and may still result in failure, as competing allocators
> may race in and claim said resources during the signaling period.
>
> As such, what if we try something simpler, like a linear backoff? In this
> example we add a TX return code that indicates that the driver is not busy, but
> unable to transmit due to resource constraints.  This signals the qdisc layer to
> skip trying to drain this transmit queue for a short period of time, with
> subsequent similar errors causing increased backoff.  Once the condition is
> cleared, the backoff delay is removed and operation returns to normal.

Looks like this is a more general issue, which should be solved for every
spinlock:

http://thread.gmane.org/gmane.linux.kernel/1437186

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver
  2014-05-21 18:05     ` Cong Wang
@ 2014-05-21 18:49       ` Neil Horman
  0 siblings, 0 replies; 26+ messages in thread
From: Neil Horman @ 2014-05-21 18:49 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, David Miller

On Wed, May 21, 2014 at 11:05:28AM -0700, Cong Wang wrote:
> On Thu, May 8, 2014 at 12:29 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> > What about something like this?  It's not even compile-tested, but let me know
> > what you think of the idea.  The reasoning behind it is that transient resources
> > like dma address ranges in the iommu or swiotlb have the following attributes:
> >
> > 1) they are quickly allocated and freed
> >
> > 2) Usually handled by simply trying again at some later point in time
> >
> > 3) Likely to fail again if tried again immediately.
> >
> > 4) Not conducive to interlocked signaling mechanisms as the time it takes to
> > find and signal a waiting tx queue that resources are now available may take
> > longer than needed, and may still result in failure, as competing allocators
> > may race in and claim said resources during the signaling period.
> >
> > As such, what if we try something simpler, like a linear backoff? In this
> > example we add a TX return code that indicates that the driver is not busy, but
> > unable to transmit due to resource constraints.  This signals the qdisc layer to
> > skip trying to drain this transmit queue for a short period of time, with
> > subsequent similar errors causing increased backoff.  Once the condition is
> > cleared, the backoff delay is removed and operation returns to normal.
> 
> Loos like this is a more general issue which should be solved for every
> spinlock:
> 
> http://thread.gmane.org/gmane.linux.kernel/1437186
> 
Good point, though this isn't a locking situation so much as a failed
transmission situation.  The same mechanism applies, though, and this seems to be
similar to what the ticketed spinlocks do.
Neil

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2014-05-21 18:49 UTC | newest]

Thread overview: 26+ messages
2014-05-05 18:51 [PATCH] jme: Fix DMA unmap warning Neil Horman
2014-05-06 20:25 ` Govindarajulu Varadarajan
2014-05-06 20:50   ` Neil Horman
2014-05-06 20:53     ` David Miller
2014-05-06 21:04       ` Govindarajulu Varadarajan
2014-05-06 21:10         ` David Miller
2014-05-07 19:56 ` David Miller
2014-05-07 20:33   ` Neil Horman
2014-05-07 20:44     ` David Miller
2014-05-07 21:54       ` Neil Horman
2014-05-08  9:02     ` David Laight
2014-05-08 11:00       ` Neil Horman
2014-05-08 11:16         ` David Laight
2014-05-08 17:24       ` David Miller
2014-05-08 19:29   ` [RFC PATCH] net: Provide linear backoff mechanism for constrained resources at the driver Neil Horman
2014-05-09  8:55     ` David Laight
2014-05-09 11:17       ` Neil Horman
2014-05-09 12:53         ` David Laight
2014-05-09 15:33           ` Neil Horman
2014-05-09 15:46             ` David Laight
2014-05-09 17:02               ` Neil Horman
2014-05-18 16:19     ` Neil Horman
2014-05-21 18:05     ` Cong Wang
2014-05-21 18:49       ` Neil Horman
2014-05-11 12:49 ` [PATCH] jme: Fix DMA unmap warning Ben Hutchings
2014-05-11 13:04   ` Neil Horman
