netdev.vger.kernel.org archive mirror
* [PATCH v2] ravb: minimize TX data copying
@ 2015-07-25 20:42 Sergei Shtylyov
  2015-07-27  8:22 ` David Miller
  2015-07-27  8:47 ` David Laight
From: Sergei Shtylyov @ 2015-07-25 20:42 UTC (permalink / raw)
  To: netdev; +Cc: linux-sh

The Renesas Ethernet AVB controller requires that all data be aligned on a
4-byte boundary. While this is easily achievable for the RX data with the
help of skb_reserve() (we even align on a 128-byte boundary, as recommended
by the manual), we can't do the same with the TX data, which always comes
unaligned from the networking core. Originally we solved this the easy way,
copying the whole packet to a preallocated aligned buffer; however, it's
enough to copy only up to the first 3 bytes of each packet, doing the
transfer with 2 TX descriptors instead of just 1. Here's an implementation
of the new TX algorithm that significantly reduces the driver's memory
requirements.

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

---
The patch is against Dave Miller's 'net-next.git' repo.
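
For illustration only (this standalone sketch is not part of the patch),
here is the copy-length computation in isolation: with DPTR_ALIGN == 4, at
most 3 leading bytes of each packet go through the small bounce buffer, and
the remainder of the frame, which then starts on a 4-byte boundary, is
DMA-mapped in place.

	#include <stdint.h>
	#include <stdio.h>

	#define DPTR_ALIGN 4	/* required descriptor pointer alignment */

	int main(void)
	{
		/* Simulate skb->data addresses with each possible
		 * misalignment; "head" matches the driver's
		 * PTR_ALIGN(skb->data, DPTR_ALIGN) - skb->data.
		 */
		uintptr_t data;

		for (data = 0x1000; data < 0x1004; data++) {
			size_t head = -data & (DPTR_ALIGN - 1);

			printf("data=%#lx: copy %zu byte(s), map the rest\n",
			       (unsigned long)data, head);
		}
		return 0;
	}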

Changes in version 2:
- fixed terminology in the comment to #define DPTR_ALIGN;
- fixed a typo in the comment to #define NUM_TX_DESC.

 drivers/net/ethernet/renesas/ravb.h      |    5 +
 drivers/net/ethernet/renesas/ravb_main.c |  104 +++++++++++++++++--------------
 2 files changed, 64 insertions(+), 45 deletions(-)

Index: net-next/drivers/net/ethernet/renesas/ravb.h
===================================================================
--- net-next.orig/drivers/net/ethernet/renesas/ravb.h
+++ net-next/drivers/net/ethernet/renesas/ravb.h
@@ -658,6 +658,8 @@ struct ravb_desc {
 	__le32 dptr;	/* Descriptor pointer */
 };
 
+#define DPTR_ALIGN	4	/* Required descriptor pointer alignment */
+
 enum DIE_DT {
 	/* Frame data */
 	DT_FMID		= 0x40,
@@ -739,6 +741,7 @@ enum RAVB_QUEUE {
 #define RX_QUEUE_OFFSET	4
 #define NUM_RX_QUEUE	2
 #define NUM_TX_QUEUE	2
+#define NUM_TX_DESC	2	/* TX descriptors per packet */
 
 struct ravb_tstamp_skb {
 	struct list_head list;
@@ -777,9 +780,9 @@ struct ravb_private {
 	dma_addr_t tx_desc_dma[NUM_TX_QUEUE];
 	struct ravb_ex_rx_desc *rx_ring[NUM_RX_QUEUE];
 	struct ravb_tx_desc *tx_ring[NUM_TX_QUEUE];
+	void *tx_align[NUM_TX_QUEUE];
 	struct sk_buff **rx_skb[NUM_RX_QUEUE];
 	struct sk_buff **tx_skb[NUM_TX_QUEUE];
-	void **tx_buffers[NUM_TX_QUEUE];
 	u32 rx_over_errors;
 	u32 rx_fifo_errors;
 	struct net_device_stats stats[NUM_RX_QUEUE];
Index: net-next/drivers/net/ethernet/renesas/ravb_main.c
===================================================================
--- net-next.orig/drivers/net/ethernet/renesas/ravb_main.c
+++ net-next/drivers/net/ethernet/renesas/ravb_main.c
@@ -195,12 +195,8 @@ static void ravb_ring_free(struct net_de
 	priv->tx_skb[q] = NULL;
 
 	/* Free aligned TX buffers */
-	if (priv->tx_buffers[q]) {
-		for (i = 0; i < priv->num_tx_ring[q]; i++)
-			kfree(priv->tx_buffers[q][i]);
-	}
-	kfree(priv->tx_buffers[q]);
-	priv->tx_buffers[q] = NULL;
+	kfree(priv->tx_align[q]);
+	priv->tx_align[q] = NULL;
 
 	if (priv->rx_ring[q]) {
 		ring_size = sizeof(struct ravb_ex_rx_desc) *
@@ -212,7 +208,7 @@ static void ravb_ring_free(struct net_de
 
 	if (priv->tx_ring[q]) {
 		ring_size = sizeof(struct ravb_tx_desc) *
-			    (priv->num_tx_ring[q] + 1);
+			    (priv->num_tx_ring[q] * NUM_TX_DESC + 1);
 		dma_free_coherent(NULL, ring_size, priv->tx_ring[q],
 				  priv->tx_desc_dma[q]);
 		priv->tx_ring[q] = NULL;
@@ -227,7 +223,8 @@ static void ravb_ring_format(struct net_
 	struct ravb_tx_desc *tx_desc;
 	struct ravb_desc *desc;
 	int rx_ring_size = sizeof(*rx_desc) * priv->num_rx_ring[q];
-	int tx_ring_size = sizeof(*tx_desc) * priv->num_tx_ring[q];
+	int tx_ring_size = sizeof(*tx_desc) * priv->num_tx_ring[q] *
+			   NUM_TX_DESC;
 	dma_addr_t dma_addr;
 	int i;
 
@@ -260,11 +257,12 @@ static void ravb_ring_format(struct net_
 
 	memset(priv->tx_ring[q], 0, tx_ring_size);
 	/* Build TX ring buffer */
-	for (i = 0; i < priv->num_tx_ring[q]; i++) {
-		tx_desc = &priv->tx_ring[q][i];
+	for (i = 0, tx_desc = priv->tx_ring[q]; i < priv->num_tx_ring[q];
+	     i++, tx_desc++) {
+		tx_desc->die_dt = DT_EEMPTY;
+		tx_desc++;
 		tx_desc->die_dt = DT_EEMPTY;
 	}
-	tx_desc = &priv->tx_ring[q][i];
 	tx_desc->dptr = cpu_to_le32((u32)priv->tx_desc_dma[q]);
 	tx_desc->die_dt = DT_LINKFIX; /* type */
 
@@ -285,7 +283,6 @@ static int ravb_ring_init(struct net_dev
 	struct ravb_private *priv = netdev_priv(ndev);
 	struct sk_buff *skb;
 	int ring_size;
-	void *buffer;
 	int i;
 
 	/* Allocate RX and TX skb rings */
@@ -305,19 +302,11 @@ static int ravb_ring_init(struct net_dev
 	}
 
 	/* Allocate rings for the aligned buffers */
-	priv->tx_buffers[q] = kcalloc(priv->num_tx_ring[q],
-				      sizeof(*priv->tx_buffers[q]), GFP_KERNEL);
-	if (!priv->tx_buffers[q])
+	priv->tx_align[q] = kmalloc(DPTR_ALIGN * priv->num_tx_ring[q] +
+				    DPTR_ALIGN - 1, GFP_KERNEL);
+	if (!priv->tx_align[q])
 		goto error;
 
-	for (i = 0; i < priv->num_tx_ring[q]; i++) {
-		buffer = kmalloc(PKT_BUF_SZ + RAVB_ALIGN - 1, GFP_KERNEL);
-		if (!buffer)
-			goto error;
-		/* Aligned TX buffer */
-		priv->tx_buffers[q][i] = buffer;
-	}
-
 	/* Allocate all RX descriptors. */
 	ring_size = sizeof(struct ravb_ex_rx_desc) * (priv->num_rx_ring[q] + 1);
 	priv->rx_ring[q] = dma_alloc_coherent(NULL, ring_size,
@@ -329,7 +318,8 @@ static int ravb_ring_init(struct net_dev
 	priv->dirty_rx[q] = 0;
 
 	/* Allocate all TX descriptors. */
-	ring_size = sizeof(struct ravb_tx_desc) * (priv->num_tx_ring[q] + 1);
+	ring_size = sizeof(struct ravb_tx_desc) *
+		    (priv->num_tx_ring[q] * NUM_TX_DESC + 1);
 	priv->tx_ring[q] = dma_alloc_coherent(NULL, ring_size,
 					      &priv->tx_desc_dma[q],
 					      GFP_KERNEL);
@@ -443,7 +433,8 @@ static int ravb_tx_free(struct net_devic
 	u32 size;
 
 	for (; priv->cur_tx[q] - priv->dirty_tx[q] > 0; priv->dirty_tx[q]++) {
-		entry = priv->dirty_tx[q] % priv->num_tx_ring[q];
+		entry = priv->dirty_tx[q] % (priv->num_tx_ring[q] *
+					     NUM_TX_DESC);
 		desc = &priv->tx_ring[q][entry];
 		if (desc->die_dt != DT_FEMPTY)
 			break;
@@ -451,14 +442,18 @@ static int ravb_tx_free(struct net_devic
 		dma_rmb();
 		size = le16_to_cpu(desc->ds_tagl) & TX_DS;
 		/* Free the original skb. */
-		if (priv->tx_skb[q][entry]) {
+		if (priv->tx_skb[q][entry / NUM_TX_DESC]) {
 			dma_unmap_single(&ndev->dev, le32_to_cpu(desc->dptr),
 					 size, DMA_TO_DEVICE);
-			dev_kfree_skb_any(priv->tx_skb[q][entry]);
-			priv->tx_skb[q][entry] = NULL;
+			/* Last packet descriptor? */
+			if (entry % NUM_TX_DESC == NUM_TX_DESC - 1) {
+				entry /= NUM_TX_DESC;
+				dev_kfree_skb_any(priv->tx_skb[q][entry]);
+				priv->tx_skb[q][entry] = NULL;
+				stats->tx_packets++;
+			}
 			free_num++;
 		}
-		stats->tx_packets++;
 		stats->tx_bytes += size;
 		desc->die_dt = DT_EEMPTY;
 	}
@@ -1284,37 +1279,53 @@ static netdev_tx_t ravb_start_xmit(struc
 	u32 dma_addr;
 	void *buffer;
 	u32 entry;
+	u32 len;
 
 	spin_lock_irqsave(&priv->lock, flags);
-	if (priv->cur_tx[q] - priv->dirty_tx[q] >= priv->num_tx_ring[q]) {
+	if (priv->cur_tx[q] - priv->dirty_tx[q] > (priv->num_tx_ring[q] - 1) *
+	    NUM_TX_DESC) {
 		netif_err(priv, tx_queued, ndev,
 			  "still transmitting with the full ring!\n");
 		netif_stop_subqueue(ndev, q);
 		spin_unlock_irqrestore(&priv->lock, flags);
 		return NETDEV_TX_BUSY;
 	}
-	entry = priv->cur_tx[q] % priv->num_tx_ring[q];
-	priv->tx_skb[q][entry] = skb;
+	entry = priv->cur_tx[q] % (priv->num_tx_ring[q] * NUM_TX_DESC);
+	priv->tx_skb[q][entry / NUM_TX_DESC] = skb;
 
 	if (skb_put_padto(skb, ETH_ZLEN))
 		goto drop;
 
-	buffer = PTR_ALIGN(priv->tx_buffers[q][entry], RAVB_ALIGN);
-	memcpy(buffer, skb->data, skb->len);
-	desc = &priv->tx_ring[q][entry];
-	desc->ds_tagl = cpu_to_le16(skb->len);
-	dma_addr = dma_map_single(&ndev->dev, buffer, skb->len, DMA_TO_DEVICE);
+	buffer = PTR_ALIGN(priv->tx_align[q], DPTR_ALIGN) +
+		 entry / NUM_TX_DESC * DPTR_ALIGN;
+	len = PTR_ALIGN(skb->data, DPTR_ALIGN) - skb->data;
+	memcpy(buffer, skb->data, len);
+	dma_addr = dma_map_single(&ndev->dev, buffer, len, DMA_TO_DEVICE);
 	if (dma_mapping_error(&ndev->dev, dma_addr))
 		goto drop;
+
+	desc = &priv->tx_ring[q][entry];
+	desc->ds_tagl = cpu_to_le16(len);
+	desc->dptr = cpu_to_le32(dma_addr);
+
+	buffer = skb->data + len;
+	len = skb->len - len;
+	dma_addr = dma_map_single(&ndev->dev, buffer, len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&ndev->dev, dma_addr))
+		goto unmap;
+
+	desc++;
+	desc->ds_tagl = cpu_to_le16(len);
 	desc->dptr = cpu_to_le32(dma_addr);
 
 	/* TX timestamp required */
 	if (q == RAVB_NC) {
 		ts_skb = kmalloc(sizeof(*ts_skb), GFP_ATOMIC);
 		if (!ts_skb) {
-			dma_unmap_single(&ndev->dev, dma_addr, skb->len,
+			desc--;
+			dma_unmap_single(&ndev->dev, dma_addr, len,
 					 DMA_TO_DEVICE);
-			goto drop;
+			goto unmap;
 		}
 		ts_skb->skb = skb;
 		ts_skb->tag = priv->ts_skb_tag++;
@@ -1330,13 +1341,15 @@ static netdev_tx_t ravb_start_xmit(struc
 
 	/* Descriptor type must be set after all the above writes */
 	dma_wmb();
-	desc->die_dt = DT_FSINGLE;
+	desc->die_dt = DT_FEND;
+	desc--;
+	desc->die_dt = DT_FSTART;
 
 	ravb_write(ndev, ravb_read(ndev, TCCR) | (TCCR_TSRQ0 << q), TCCR);
 
-	priv->cur_tx[q]++;
-	if (priv->cur_tx[q] - priv->dirty_tx[q] >= priv->num_tx_ring[q] &&
-	    !ravb_tx_free(ndev, q))
+	priv->cur_tx[q] += NUM_TX_DESC;
+	if (priv->cur_tx[q] - priv->dirty_tx[q] >
+	    (priv->num_tx_ring[q] - 1) * NUM_TX_DESC && !ravb_tx_free(ndev, q))
 		netif_stop_subqueue(ndev, q);
 
 exit:
@@ -1344,9 +1357,12 @@ exit:
 	spin_unlock_irqrestore(&priv->lock, flags);
 	return NETDEV_TX_OK;
 
+unmap:
+	dma_unmap_single(&ndev->dev, le32_to_cpu(desc->dptr),
+			 le16_to_cpu(desc->ds_tagl), DMA_TO_DEVICE);
 drop:
 	dev_kfree_skb_any(skb);
-	priv->tx_skb[q][entry] = NULL;
+	priv->tx_skb[q][entry / NUM_TX_DESC] = NULL;
 	goto exit;
 }
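
For illustration only (not from the patch), the new ring accounting in
isolation: each packet now occupies NUM_TX_DESC consecutive descriptors, so
a descriptor slot maps to its skb slot by division, and the remainder tells
whether it is the first or the last descriptor of the pair.

	#include <stdio.h>

	#define NUM_TX_DESC	2	/* TX descriptors per packet */

	int main(void)
	{
		unsigned int num_tx_ring = 64;	/* example ring size, packets */
		unsigned int cur_tx = 131;	/* example free-running counter */
		unsigned int entry = cur_tx % (num_tx_ring * NUM_TX_DESC);

		printf("descriptor slot %u -> skb slot %u (%s descriptor)\n",
		       entry, entry / NUM_TX_DESC,
		       entry % NUM_TX_DESC == NUM_TX_DESC - 1 ? "last"
							       : "first");
		return 0;
	}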
 



* Re: [PATCH v2] ravb: minimize TX data copying
  2015-07-25 20:42 [PATCH v2] ravb: minimize TX data copying Sergei Shtylyov
@ 2015-07-27  8:22 ` David Miller
  2015-07-27  8:47 ` David Laight
From: David Miller @ 2015-07-27  8:22 UTC (permalink / raw)
  To: sergei.shtylyov; +Cc: netdev, linux-sh

From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sat, 25 Jul 2015 23:42:01 +0300

> The Renesas Ethernet AVB controller requires that all data be aligned on a
> 4-byte boundary. While this is easily achievable for the RX data with the
> help of skb_reserve() (we even align on a 128-byte boundary, as recommended
> by the manual), we can't do the same with the TX data, which always comes
> unaligned from the networking core. Originally we solved this the easy way,
> copying the whole packet to a preallocated aligned buffer; however, it's
> enough to copy only up to the first 3 bytes of each packet, doing the
> transfer with 2 TX descriptors instead of just 1. Here's an implementation
> of the new TX algorithm that significantly reduces the driver's memory
> requirements.
> 
> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

Applied, thanks.


* RE: [PATCH v2] ravb: minimize TX data copying
  2015-07-25 20:42 [PATCH v2] ravb: minimize TX data copying Sergei Shtylyov
  2015-07-27  8:22 ` David Miller
@ 2015-07-27  8:47 ` David Laight
  2015-07-27 11:57   ` Sergei Shtylyov
From: David Laight @ 2015-07-27  8:47 UTC (permalink / raw)
  To: 'Sergei Shtylyov', netdev; +Cc: linux-sh

From: Sergei Shtylyov
> Sent: 25 July 2015 21:42
> The Renesas Ethernet AVB controller requires that all data be aligned on a
> 4-byte boundary. While this is easily achievable for the RX data with the
> help of skb_reserve() (we even align on a 128-byte boundary, as recommended
> by the manual), we can't do the same with the TX data, which always comes
> unaligned from the networking core. Originally we solved this the easy way,
> copying the whole packet to a preallocated aligned buffer; however, it's
> enough to copy only up to the first 3 bytes of each packet, doing the
> transfer with 2 TX descriptors instead of just 1. Here's an implementation
> of the new TX algorithm that significantly reduces the driver's memory
> requirements.
> 
...
> -	buffer = PTR_ALIGN(priv->tx_buffers[q][entry], RAVB_ALIGN);
> -	memcpy(buffer, skb->data, skb->len);
> -	desc = &priv->tx_ring[q][entry];
> -	desc->ds_tagl = cpu_to_le16(skb->len);
> -	dma_addr = dma_map_single(&ndev->dev, buffer, skb->len, DMA_TO_DEVICE);
> +	buffer = PTR_ALIGN(priv->tx_align[q], DPTR_ALIGN) +
> +		 entry / NUM_TX_DESC * DPTR_ALIGN;

The above would be clearer if tx_align was char[DPTR_ALIGN][].
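
As a hypothetical sketch of that suggestion (this is not code from the
driver), a pointer-to-array type would make the per-entry stride come from
the type itself instead of the hand-written
"entry / NUM_TX_DESC * DPTR_ALIGN" arithmetic:

	#define DPTR_ALIGN	4
	#define NUM_TX_DESC	2

	/* aligned DPTR_ALIGN-byte chunks, one per ring entry */
	static char (*tx_align)[DPTR_ALIGN];

	static void *bounce_buffer(unsigned int entry)
	{
		/* stride implied by the element type, no "* DPTR_ALIGN" */
		return tx_align[entry / NUM_TX_DESC];
	}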

> +	len = PTR_ALIGN(skb->data, DPTR_ALIGN) - skb->data;
> +	memcpy(buffer, skb->data, len);

Does this imply there has been an skb_linearize() ???
The old version didn't really need it (it was doing a copy anyway).

> +	dma_addr = dma_map_single(&ndev->dev, buffer, len, DMA_TO_DEVICE);
>  	if (dma_mapping_error(&ndev->dev, dma_addr))
>  		goto drop;
> +
> +	desc = &priv->tx_ring[q][entry];
> +	desc->ds_tagl = cpu_to_le16(len);
> +	desc->dptr = cpu_to_le32(dma_addr);
> +
> +	buffer = skb->data + len;
> +	len = skb->len - len;
> +	dma_addr = dma_map_single(&ndev->dev, buffer, len, DMA_TO_DEVICE);
> +	if (dma_mapping_error(&ndev->dev, dma_addr))
> +		goto unmap;
> +
> +	desc++;
> +	desc->ds_tagl = cpu_to_le16(len);

What happens if a fragment is less than DPTR_ALIGN bytes ???
Actually it looks like you're relying on having a linear skb.

On systems with expensive dma_map it might be worth copying
small fragments.

	David


* Re: [PATCH v2] ravb: minimize TX data copying
  2015-07-27  8:47 ` David Laight
@ 2015-07-27 11:57   ` Sergei Shtylyov
From: Sergei Shtylyov @ 2015-07-27 11:57 UTC (permalink / raw)
  To: David Laight, netdev; +Cc: linux-sh

On 7/27/2015 11:47 AM, David Laight wrote:

>> The Renesas Ethernet AVB controller requires that all data be aligned on a
>> 4-byte boundary. While this is easily achievable for the RX data with the
>> help of skb_reserve() (we even align on a 128-byte boundary, as recommended
>> by the manual), we can't do the same with the TX data, which always comes
>> unaligned from the networking core. Originally we solved this the easy way,
>> copying the whole packet to a preallocated aligned buffer; however, it's
>> enough to copy only up to the first 3 bytes of each packet, doing the
>> transfer with 2 TX descriptors instead of just 1. Here's an implementation
>> of the new TX algorithm that significantly reduces the driver's memory
>> requirements.

> ...
>> -	buffer = PTR_ALIGN(priv->tx_buffers[q][entry], RAVB_ALIGN);
>> -	memcpy(buffer, skb->data, skb->len);
>> -	desc = &priv->tx_ring[q][entry];
>> -	desc->ds_tagl = cpu_to_le16(skb->len);
>> -	dma_addr = dma_map_single(&ndev->dev, buffer, skb->len, DMA_TO_DEVICE);
>> +	buffer = PTR_ALIGN(priv->tx_align[q], DPTR_ALIGN) +
>> +		 entry / NUM_TX_DESC * DPTR_ALIGN;

> The above would be clearer if tx_align was char[DPTR_ALIGN][].

    tx_align is a pointer, not an array.

>> +	len = PTR_ALIGN(skb->data, DPTR_ALIGN) - skb->data;
>> +	memcpy(buffer, skb->data, len);

> Does this imply there has been an skb_linearize() ???

    Sure, I don't support S/G (and it seems problematic given how the DMA 
descriptors are handled by the h/w).

> The old version didn't really need it (it was doing a copy anyway).

    It did, since it copied the whole packet.

>> +	dma_addr = dma_map_single(&ndev->dev, buffer, len, DMA_TO_DEVICE);
>>   	if (dma_mapping_error(&ndev->dev, dma_addr))
>>   		goto drop;
>> +
>> +	desc = &priv->tx_ring[q][entry];
>> +	desc->ds_tagl = cpu_to_le16(len);
>> +	desc->dptr = cpu_to_le32(dma_addr);
>> +
>> +	buffer = skb->data + len;
>> +	len = skb->len - len;
>> +	dma_addr = dma_map_single(&ndev->dev, buffer, len, DMA_TO_DEVICE);
>> +	if (dma_mapping_error(&ndev->dev, dma_addr))
>> +		goto unmap;
>> +
>> +	desc++;
>> +	desc->ds_tagl = cpu_to_le16(len);

> What happens if a fragment is less than DPTR_ALIGN bytes ???

    It's always the case. If you mean a packet shorter than DPTR_ALIGN, it
can't happen, due to the call to skb_put_padto(skb, ETH_ZLEN).
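
    As a worked example of the arithmetic in question, assuming
DPTR_ALIGN == 4: if skb->data ends in ...0x02, the first descriptor
carries the 2 copied bytes and the second carries the remaining
skb->len - 2 bytes; if skb->data happens to be 4-byte aligned already,
the head copy is 0 bytes.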

> Actually it looks like you're relying on having a linear skb.

    Yes, and I was relying on it even before this patch.

> 	David

WBR, Sergei


