* [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
@ 2016-12-05 17:07 ` Tushar Dave
From: Tushar Dave @ 2016-12-05 17:07 UTC (permalink / raw)
  To: jeffrey.t.kirsher, intel-wired-lan; +Cc: netdev

Unlike previous generation NICs (e.g. ixgbe), i40e doesn't seem to have
a standard CSR where PCIe relaxed ordering can be set. Without PCIe
relaxed ordering enabled, i40e performance is significantly lower on SPARC.

This patch sets PCIe relaxed ordering for the SPARC arch by setting the
DMA attribute DMA_ATTR_WEAK_ORDERING on every tx and rx DMA map/unmap.
This has shown a 10x increase in performance numbers.

e.g.
iperf TCP test with 10 threads on SPARC S7

Test 1: Without this patch

[root@brm-snt1-03 net]# iperf -s
------------------------------------------------------------
Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)
------------------------------------------------------------
[  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
[  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
[  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
[  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
[  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
[  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
[ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
[ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
[ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
[ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
[  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
[  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
[  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
[ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
[  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
[  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
[ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
[ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
[ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
[SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec

Test 2: With this patch

[root@brm-snt1-03 net]# iperf -s
------------------------------------------------------------
Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)
------------------------------------------------------------
TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
cookies.  Check SNMP counters.
[  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
[  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
[  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
[  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
[  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
[  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
[ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
[ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
[ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
[ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
[  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
[  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
[  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
[  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
[ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
[ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
[  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
[ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
[ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
[SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec

NOTE: In my testing, this patch does _not_ show any harm to i40e
performance numbers on x86.

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
 2 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 6287bf6..800dca7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
-			dma_unmap_single(ring->dev,
-					 dma_unmap_addr(tx_buffer, dma),
-					 dma_unmap_len(tx_buffer, len),
-					 DMA_TO_DEVICE);
+			dma_unmap_single_attrs(ring->dev,
+					       dma_unmap_addr(tx_buffer, dma),
+					       dma_unmap_len(tx_buffer, len),
+					       DMA_TO_DEVICE,
+					       ring->dma_attrs);
 	} else if (dma_unmap_len(tx_buffer, len)) {
-		dma_unmap_page(ring->dev,
-			       dma_unmap_addr(tx_buffer, dma),
-			       dma_unmap_len(tx_buffer, len),
-			       DMA_TO_DEVICE);
+		dma_unmap_single_attrs(ring->dev,
+				       dma_unmap_addr(tx_buffer, dma),
+				       dma_unmap_len(tx_buffer, len),
+				       DMA_TO_DEVICE,
+				       ring->dma_attrs);
 	}
 
 	tx_buffer->next_to_watch = NULL;
@@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 	struct i40e_tx_buffer *tx_buf;
 	struct i40e_tx_desc *tx_head;
 	struct i40e_tx_desc *tx_desc;
+	dma_addr_t addr;
+	size_t size;
 	unsigned int total_bytes = 0, total_packets = 0;
 	unsigned int budget = vsi->work_limit;
 
@@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 		napi_consume_skb(tx_buf->skb, napi_budget);
 
 		/* unmap skb header data */
-		dma_unmap_single(tx_ring->dev,
-				 dma_unmap_addr(tx_buf, dma),
-				 dma_unmap_len(tx_buf, len),
-				 DMA_TO_DEVICE);
+		dma_unmap_single_attrs(tx_ring->dev,
+				       dma_unmap_addr(tx_buf, dma),
+				       dma_unmap_len(tx_buf, len),
+				       DMA_TO_DEVICE,
+				       tx_ring->dma_attrs);
 
 		/* clear tx_buffer data */
 		tx_buf->skb = NULL;
@@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 				tx_desc = I40E_TX_DESC(tx_ring, 0);
 			}
 
+			addr = dma_unmap_addr(tx_buf, dma);
+			size = dma_unmap_len(tx_buf, len);
 			/* unmap any remaining paged data */
 			if (dma_unmap_len(tx_buf, len)) {
-				dma_unmap_page(tx_ring->dev,
-					       dma_unmap_addr(tx_buf, dma),
-					       dma_unmap_len(tx_buf, len),
-					       DMA_TO_DEVICE);
+				dma_unmap_single_attrs(tx_ring->dev,
+						       addr,
+						       size,
+						       DMA_TO_DEVICE,
+						       tx_ring->dma_attrs);
 				dma_unmap_len_set(tx_buf, len, 0);
 			}
 		}
@@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	 */
 	tx_ring->size += sizeof(u32);
 	tx_ring->size = ALIGN(tx_ring->size, 4096);
+#ifdef CONFIG_SPARC
+	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
+#else
+	tx_ring->dma_attrs = 0;
+#endif
 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
 					   &tx_ring->dma, GFP_KERNEL);
 	if (!tx_ring->desc) {
@@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 		if (!rx_bi->page)
 			continue;
 
-		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
+		dma_unmap_single_attrs(dev,
+				       rx_bi->dma,
+				       PAGE_SIZE,
+				       DMA_FROM_DEVICE,
+				       rx_ring->dma_attrs);
 		__free_pages(rx_bi->page, 0);
 
 		rx_bi->page = NULL;
@@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
 	/* Round up to nearest 4K */
 	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
 	rx_ring->size = ALIGN(rx_ring->size, 4096);
+#ifdef CONFIG_SPARC
+	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
+#else
+	rx_ring->dma_attrs = 0;
+#endif
 	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
 					   &rx_ring->dma, GFP_KERNEL);
 
@@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
 	}
 
 	/* map page for use */
-	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
+				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
 
 	/* if mapping failed free memory back to system since
 	 * there isn't much point in holding memory we can't use
@@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
 		rx_ring->rx_stats.page_reuse_count++;
 	} else {
 		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
-			       DMA_FROM_DEVICE);
+		dma_unmap_single_attrs(rx_ring->dev,
+				       rx_buffer->dma,
+				       PAGE_SIZE,
+				       DMA_FROM_DEVICE,
+				       rx_ring->dma_attrs);
 	}
 
 	/* clear contents of buffer_info */
@@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 	first->skb = skb;
 	first->tx_flags = tx_flags;
 
-	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
+	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
+				   DMA_TO_DEVICE, tx_ring->dma_attrs);
 
 	tx_desc = I40E_TX_DESC(tx_ring, i);
 	tx_bi = first;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 5088405..9a86212 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -327,6 +327,7 @@ struct i40e_ring {
 
 	unsigned int size;		/* length of descriptor ring in bytes */
 	dma_addr_t dma;			/* physical address of ring */
+	unsigned long dma_attrs;	/* DMA attributes */
 
 	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
 	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
-- 
1.9.1

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-05 17:07 ` [Intel-wired-lan] " Tushar Dave
@ 2016-12-05 21:54   ` Alexander Duyck
From: Alexander Duyck @ 2016-12-05 21:54 UTC (permalink / raw)
  To: Tushar Dave; +Cc: Jeff Kirsher, intel-wired-lan, Netdev

On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com> wrote:
> Unlike previous generation NICs (e.g. ixgbe), i40e doesn't seem to have
> a standard CSR where PCIe relaxed ordering can be set. Without PCIe
> relaxed ordering enabled, i40e performance is significantly lower on SPARC.
>
> This patch sets PCIe relaxed ordering for the SPARC arch by setting the
> DMA attribute DMA_ATTR_WEAK_ORDERING on every tx and rx DMA map/unmap.
> This has shown a 10x increase in performance numbers.
>
> e.g.
> iperf TCP test with 10 threads on SPARC S7
>
> Test 1: Without this patch
>
> [root@brm-snt1-03 net]# iperf -s
> ------------------------------------------------------------
> Server listening on TCP port 5001
> TCP window size: 85.3 KByte (default)
> ------------------------------------------------------------
> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>
> Test 2: With this patch
>
> [root@brm-snt1-03 net]# iperf -s
> ------------------------------------------------------------
> Server listening on TCP port 5001
> TCP window size: 85.3 KByte (default)
> ------------------------------------------------------------
> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
> cookies.  Check SNMP counters.
> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>
> NOTE: In my testing, this patch does _not_ show any harm to i40e
> performance numbers on x86.
>
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>

You went through and replaced all of the dma_unmap/map_page calls with
dma_map/unmap_single_attrs; I would prefer you didn't do that. I have
patches to add the ability to map and unmap pages with attributes that
should be available for 4.10-rc1, so if you could wait on this patch
until then, it would be preferred.
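
For reference, if the page-mapping variants end up mirroring the
existing *_single_attrs() helpers, the rx path above could keep its
page-based form; the names and signatures below are an assumption based
on that pattern, not a confirmed API:

	/* hypothetical sketch: page map/unmap that also take DMA attrs,
	 * mirroring dma_map_single_attrs()/dma_unmap_single_attrs()
	 */
	dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
				 DMA_FROM_DEVICE, rx_ring->dma_attrs);

	dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
			     DMA_FROM_DEVICE, rx_ring->dma_attrs);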

> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>  2 files changed, 49 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 6287bf6..800dca7 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>                 else
>                         dev_kfree_skb_any(tx_buffer->skb);
>                 if (dma_unmap_len(tx_buffer, len))
> -                       dma_unmap_single(ring->dev,
> -                                        dma_unmap_addr(tx_buffer, dma),
> -                                        dma_unmap_len(tx_buffer, len),
> -                                        DMA_TO_DEVICE);
> +                       dma_unmap_single_attrs(ring->dev,
> +                                              dma_unmap_addr(tx_buffer, dma),
> +                                              dma_unmap_len(tx_buffer, len),
> +                                              DMA_TO_DEVICE,
> +                                              ring->dma_attrs);
>         } else if (dma_unmap_len(tx_buffer, len)) {
> -               dma_unmap_page(ring->dev,
> -                              dma_unmap_addr(tx_buffer, dma),
> -                              dma_unmap_len(tx_buffer, len),
> -                              DMA_TO_DEVICE);
> +               dma_unmap_single_attrs(ring->dev,
> +                                      dma_unmap_addr(tx_buffer, dma),
> +                                      dma_unmap_len(tx_buffer, len),
> +                                      DMA_TO_DEVICE,
> +                                      ring->dma_attrs);
>         }
>
>         tx_buffer->next_to_watch = NULL;
> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>         struct i40e_tx_buffer *tx_buf;
>         struct i40e_tx_desc *tx_head;
>         struct i40e_tx_desc *tx_desc;
> +       dma_addr_t addr;
> +       size_t size;
>         unsigned int total_bytes = 0, total_packets = 0;
>         unsigned int budget = vsi->work_limit;
>
> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>                 napi_consume_skb(tx_buf->skb, napi_budget);
>
>                 /* unmap skb header data */
> -               dma_unmap_single(tx_ring->dev,
> -                                dma_unmap_addr(tx_buf, dma),
> -                                dma_unmap_len(tx_buf, len),
> -                                DMA_TO_DEVICE);
> +               dma_unmap_single_attrs(tx_ring->dev,
> +                                      dma_unmap_addr(tx_buf, dma),
> +                                      dma_unmap_len(tx_buf, len),
> +                                      DMA_TO_DEVICE,
> +                                      tx_ring->dma_attrs);
>
>                 /* clear tx_buffer data */
>                 tx_buf->skb = NULL;
> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
>                         }
>
> +                       addr = dma_unmap_addr(tx_buf, dma);
> +                       size = dma_unmap_len(tx_buf, len);

On some architectures this change could lead to issues, since
dma_unmap_len() could be 0, meaning that addr would never be used.
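
A minimal way to avoid that, assuming the _attrs variant is kept, is to
read the unmap address and length only inside the existing length
check, as the pre-patch code did, e.g.:

	/* sketch: only read the unmap address/length once len is known
	 * to be non-zero, since a zero len means no mapping was set up
	 */
	if (dma_unmap_len(tx_buf, len)) {
		dma_unmap_single_attrs(tx_ring->dev,
				       dma_unmap_addr(tx_buf, dma),
				       dma_unmap_len(tx_buf, len),
				       DMA_TO_DEVICE,
				       tx_ring->dma_attrs);
		dma_unmap_len_set(tx_buf, len, 0);
	}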

>                         /* unmap any remaining paged data */
>                         if (dma_unmap_len(tx_buf, len)) {
> -                               dma_unmap_page(tx_ring->dev,
> -                                              dma_unmap_addr(tx_buf, dma),
> -                                              dma_unmap_len(tx_buf, len),
> -                                              DMA_TO_DEVICE);
> +                               dma_unmap_single_attrs(tx_ring->dev,
> +                                                      addr,
> +                                                      size,
> +                                                      DMA_TO_DEVICE,
> +                                                      tx_ring->dma_attrs);
>                                 dma_unmap_len_set(tx_buf, len, 0);
>                         }
>                 }
> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>          */
>         tx_ring->size += sizeof(u32);
>         tx_ring->size = ALIGN(tx_ring->size, 4096);
> +#ifdef CONFIG_SPARC
> +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> +#else
> +       tx_ring->dma_attrs = 0;
> +#endif
>         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>                                            &tx_ring->dma, GFP_KERNEL);
>         if (!tx_ring->desc) {

Also not a fan of adding yet another ring attribute. Is there any
reason why you couldn't simply add a set of inline functions at the
start of i40e_txrx.c that could replace the DMA map/unmap operations in
this code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed? Then
the x86 code doesn't have to change, while the SPARC code will be able
to be passed the attribute.
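
Roughly, that suggestion could look like the sketch below; the helper
names are hypothetical, and the attribute is chosen at compile time per
arch instead of being stored per ring:

	/* hypothetical wrappers at the top of i40e_txrx.c */
	static inline unsigned long i40e_dma_attrs(void)
	{
	#ifdef CONFIG_SPARC
		return DMA_ATTR_WEAK_ORDERING;
	#else
		return 0;
	#endif
	}

	static inline dma_addr_t i40e_map_single(struct device *dev,
						 void *ptr, size_t size,
						 enum dma_data_direction dir)
	{
		return dma_map_single_attrs(dev, ptr, size, dir,
					    i40e_dma_attrs());
	}

	static inline void i40e_unmap_single(struct device *dev,
					     dma_addr_t addr, size_t size,
					     enum dma_data_direction dir)
	{
		dma_unmap_single_attrs(dev, addr, size, dir,
				       i40e_dma_attrs());
	}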

> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>                 if (!rx_bi->page)
>                         continue;
>
> -               dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
> +               dma_unmap_single_attrs(dev,
> +                                      rx_bi->dma,
> +                                      PAGE_SIZE,
> +                                      DMA_FROM_DEVICE,
> +                                      rx_ring->dma_attrs);
>                 __free_pages(rx_bi->page, 0);
>
>                 rx_bi->page = NULL;
> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>         /* Round up to nearest 4K */
>         rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>         rx_ring->size = ALIGN(rx_ring->size, 4096);
> +#ifdef CONFIG_SPARC
> +       rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> +#else
> +       rx_ring->dma_attrs = 0;
> +#endif
>         rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>                                            &rx_ring->dma, GFP_KERNEL);
>
> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>         }
>
>         /* map page for use */
> -       dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
> +       dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
> +                                  DMA_FROM_DEVICE, rx_ring->dma_attrs);
>
>         /* if mapping failed free memory back to system since
>          * there isn't much point in holding memory we can't use
> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>                 rx_ring->rx_stats.page_reuse_count++;
>         } else {
>                 /* we are not reusing the buffer so unmap it */
> -               dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
> -                              DMA_FROM_DEVICE);
> +               dma_unmap_single_attrs(rx_ring->dev,
> +                                      rx_buffer->dma,
> +                                      PAGE_SIZE,
> +                                      DMA_FROM_DEVICE,
> +                                      rx_ring->dma_attrs);
>         }
>
>         /* clear contents of buffer_info */
> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>         first->skb = skb;
>         first->tx_flags = tx_flags;
>
> -       dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
> +       dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
> +                                  DMA_TO_DEVICE, tx_ring->dma_attrs);
>
>         tx_desc = I40E_TX_DESC(tx_ring, i);
>         tx_bi = first;
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 5088405..9a86212 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -327,6 +327,7 @@ struct i40e_ring {
>
>         unsigned int size;              /* length of descriptor ring in bytes */
>         dma_addr_t dma;                 /* physical address of ring */
> +       unsigned long dma_attrs;        /* DMA attributes */
>
>         struct i40e_vsi *vsi;           /* Backreference to associated VSI */
>         struct i40e_q_vector *q_vector; /* Backreference to associated vector */
> --
> 1.9.1
>

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-05 21:54   ` Alexander Duyck
@ 2016-12-05 22:23     ` tndave
From: tndave @ 2016-12-05 22:23 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Jeff Kirsher, intel-wired-lan, Netdev



On 12/05/2016 01:54 PM, Alexander Duyck wrote:
> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com> wrote:
>> Unlike previous generation NICs (e.g. ixgbe), i40e doesn't seem to have
>> a standard CSR where PCIe relaxed ordering can be set. Without PCIe
>> relaxed ordering enabled, i40e performance is significantly lower on SPARC.
>>
>> This patch sets PCIe relaxed ordering for the SPARC arch by setting the
>> DMA attribute DMA_ATTR_WEAK_ORDERING on every tx and rx DMA map/unmap.
>> This has shown a 10x increase in performance numbers.
>>
>> e.g.
>> iperf TCP test with 10 threads on SPARC S7
>>
>> Test 1: Without this patch
>>
>> # iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>
>> Test 2: With this patch
>>
>> # iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
>> cookies.  Check SNMP counters.
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>
>> NOTE: In my testing, this patch does _not_ show any harm to i40e
>> performance numbers on x86.
>>
>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>
> You went through and replaced all of the dma_unmap/map_page calls with
> dma_map/unmap_single_attrs; I would prefer you didn't do that.

Yes, because currently there is no DMA API for dma_map/unmap_page with
dma attrs.

> I have patches to add the ability to map and unmap pages with
> attributes that should be available for 4.10-rc1, so if you could wait
> on this patch until then, it would be preferred.

:-) Thanks. I will wait until your patches are out.
>
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> index 6287bf6..800dca7 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>                 else
>>                         dev_kfree_skb_any(tx_buffer->skb);
>>                 if (dma_unmap_len(tx_buffer, len))
>> -                       dma_unmap_single(ring->dev,
>> -                                        dma_unmap_addr(tx_buffer, dma),
>> -                                        dma_unmap_len(tx_buffer, len),
>> -                                        DMA_TO_DEVICE);
>> +                       dma_unmap_single_attrs(ring->dev,
>> +                                              dma_unmap_addr(tx_buffer, dma),
>> +                                              dma_unmap_len(tx_buffer, len),
>> +                                              DMA_TO_DEVICE,
>> +                                              ring->dma_attrs);
>>         } else if (dma_unmap_len(tx_buffer, len)) {
>> -               dma_unmap_page(ring->dev,
>> -                              dma_unmap_addr(tx_buffer, dma),
>> -                              dma_unmap_len(tx_buffer, len),
>> -                              DMA_TO_DEVICE);
>> +               dma_unmap_single_attrs(ring->dev,
>> +                                      dma_unmap_addr(tx_buffer, dma),
>> +                                      dma_unmap_len(tx_buffer, len),
>> +                                      DMA_TO_DEVICE,
>> +                                      ring->dma_attrs);
>>         }
>>
>>         tx_buffer->next_to_watch = NULL;
>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>         struct i40e_tx_buffer *tx_buf;
>>         struct i40e_tx_desc *tx_head;
>>         struct i40e_tx_desc *tx_desc;
>> +       dma_addr_t addr;
>> +       size_t size;
>>         unsigned int total_bytes = 0, total_packets = 0;
>>         unsigned int budget = vsi->work_limit;
>>
>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>                 napi_consume_skb(tx_buf->skb, napi_budget);
>>
>>                 /* unmap skb header data */
>> -               dma_unmap_single(tx_ring->dev,
>> -                                dma_unmap_addr(tx_buf, dma),
>> -                                dma_unmap_len(tx_buf, len),
>> -                                DMA_TO_DEVICE);
>> +               dma_unmap_single_attrs(tx_ring->dev,
>> +                                      dma_unmap_addr(tx_buf, dma),
>> +                                      dma_unmap_len(tx_buf, len),
>> +                                      DMA_TO_DEVICE,
>> +                                      tx_ring->dma_attrs);
>>
>>                 /* clear tx_buffer data */
>>                 tx_buf->skb = NULL;
>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
>>                         }
>>
>> +                       addr = dma_unmap_addr(tx_buf, dma);
>> +                       size = dma_unmap_len(tx_buf, len);
>
> On some architectures this change could lead to issues, since
> dma_unmap_len() could be 0, meaning that addr would never be used.
I see. Thanks.
>
>>                         /* unmap any remaining paged data */
>>                         if (dma_unmap_len(tx_buf, len)) {
>> -                               dma_unmap_page(tx_ring->dev,
>> -                                              dma_unmap_addr(tx_buf, dma),
>> -                                              dma_unmap_len(tx_buf, len),
>> -                                              DMA_TO_DEVICE);
>> +                               dma_unmap_single_attrs(tx_ring->dev,
>> +                                                      addr,
>> +                                                      size,
>> +                                                      DMA_TO_DEVICE,
>> +                                                      tx_ring->dma_attrs);
>>                                 dma_unmap_len_set(tx_buf, len, 0);
>>                         }
>>                 }
>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>          */
>>         tx_ring->size += sizeof(u32);
>>         tx_ring->size = ALIGN(tx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +       tx_ring->dma_attrs = 0;
>> +#endif
>>         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>                                            &tx_ring->dma, GFP_KERNEL);
>>         if (!tx_ring->desc) {
>
> Also not a fan of adding yet another ring attribute. Is there any
> reason why you couldn't simply add a set of inline functions at the
> start of i40e_txrx.c that could replace the DMA map/unmap operations in
> this code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed? Then
> the x86 code doesn't have to change, while the SPARC code will be able
> to be passed the attribute.

Sure, I can do that.

I will follow up with a patch after your patches for map/unmap page
with dma attrs are out.

Thanks.

-Tushar
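
A sketch of the suggested helper approach: hypothetical static inlines local
to i40e_txrx.c (the I40E_DMA_ATTRS macro and the helper names are
illustrative, not from the driver). Only this one spot knows about the
arch-specific attribute, and struct i40e_ring needs no new field:

#ifdef CONFIG_SPARC
#define I40E_DMA_ATTRS  DMA_ATTR_WEAK_ORDERING
#else
#define I40E_DMA_ATTRS  0
#endif

static inline dma_addr_t i40e_map_single(struct device *dev, void *ptr,
                                         size_t size,
                                         enum dma_data_direction dir)
{
        /* every Tx/Rx map site funnels through these wrappers */
        return dma_map_single_attrs(dev, ptr, size, dir, I40E_DMA_ATTRS);
}

static inline void i40e_unmap_single(struct device *dev, dma_addr_t addr,
                                     size_t size, enum dma_data_direction dir)
{
        dma_unmap_single_attrs(dev, addr, size, dir, I40E_DMA_ATTRS);
}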
>
>> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>>                 if (!rx_bi->page)
>>                         continue;
>>
>> -               dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
>> +               dma_unmap_single_attrs(dev,
>> +                                      rx_bi->dma,
>> +                                      PAGE_SIZE,
>> +                                      DMA_FROM_DEVICE,
>> +                                      rx_ring->dma_attrs);
>>                 __free_pages(rx_bi->page, 0);
>>
>>                 rx_bi->page = NULL;
>> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>>         /* Round up to nearest 4K */
>>         rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>>         rx_ring->size = ALIGN(rx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +       rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +       rx_ring->dma_attrs = 0;
>> +#endif
>>         rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>>                                            &rx_ring->dma, GFP_KERNEL);
>>
>> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>>         }
>>
>>         /* map page for use */
>> -       dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
>> +       dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
>> +                                  DMA_FROM_DEVICE, rx_ring->dma_attrs);
>>
>>         /* if mapping failed free memory back to system since
>>          * there isn't much point in holding memory we can't use
>> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>>                 rx_ring->rx_stats.page_reuse_count++;
>>         } else {
>>                 /* we are not reusing the buffer so unmap it */
>> -               dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
>> -                              DMA_FROM_DEVICE);
>> +               dma_unmap_single_attrs(rx_ring->dev,
>> +                                      rx_buffer->dma,
>> +                                      PAGE_SIZE,
>> +                                      DMA_FROM_DEVICE,
>> +                                      rx_ring->dma_attrs);
>>         }
>>
>>         /* clear contents of buffer_info */
>> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>>         first->skb = skb;
>>         first->tx_flags = tx_flags;
>>
>> -       dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
>> +       dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
>> +                                  DMA_TO_DEVICE, tx_ring->dma_attrs);
>>
>>         tx_desc = I40E_TX_DESC(tx_ring, i);
>>         tx_bi = first;
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> index 5088405..9a86212 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> @@ -327,6 +327,7 @@ struct i40e_ring {
>>
>>         unsigned int size;              /* length of descriptor ring in bytes */
>>         dma_addr_t dma;                 /* physical address of ring */
>> +       unsigned long dma_attrs;        /* DMA attributes */
>>
>>         struct i40e_vsi *vsi;           /* Backreference to associated VSI */
>>         struct i40e_q_vector *q_vector; /* Backreference to associated vector */
>> --
>> 1.9.1
>>
>> _______________________________________________
>> Intel-wired-lan mailing list
>> Intel-wired-lan@lists.osuosl.org
>> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-05 22:23     ` tndave
@ 2016-12-06 17:10       ` Alexander Duyck
  -1 siblings, 0 replies; 33+ messages in thread
From: Alexander Duyck @ 2016-12-06 17:10 UTC (permalink / raw)
  To: tndave; +Cc: Jeff Kirsher, intel-wired-lan, Netdev

On Mon, Dec 5, 2016 at 2:23 PM, tndave <tushar.n.dave@oracle.com> wrote:
>
>
> On 12/05/2016 01:54 PM, Alexander Duyck wrote:
>>
>> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com>
>> wrote:
>>>
>>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
>>> standard CSR where PCIe relaxed ordering can be set. Without PCIe relax
>>> ordering enabled, i40e performance is significantly low on SPARC.
>>>
>>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>>> This has shown 10x increase in performance numbers.
>>>
>>> e.g.
>>> iperf TCP test with 10 threads on SPARC S7
>>>
>>> Test 1: Without this patch
>>>
>>> # iperf -s
>>> ------------------------------------------------------------
>>> Server listening on TCP port 5001
>>> TCP window size: 85.3 KByte (default)
>>> ------------------------------------------------------------
>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>>> [ ID] Interval       Transfer     Bandwidth
>>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>>
>>> Test 2: with this patch:
>>>
>>> # iperf -s
>>> ------------------------------------------------------------
>>> Server listening on TCP port 5001
>>> TCP window size: 85.3 KByte (default)
>>> ------------------------------------------------------------
>>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
>>> cookies.  Check SNMP counters.
>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>>> [ ID] Interval       Transfer     Bandwidth
>>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>>
>>> NOTE: In my testing, this patch does _not_ show any harm to i40e
>>> performance numbers on x86.
>>>
>>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>>
>>
>> You went through and replaced all of the dma_unmap/map_page calls with
>> dma_map/unmap_single_attrs  I would prefer you didn't do that.  I have
>
> Yes, because currently there is no DMA API for dma_map/unmap_page with dma
> attr*
>>
>> patches to add the ability to map and unmap pages with attributes that
>> should be available for 4.10-rc1 so if you could wait on this patch
>> until then it would be preferred.
>
> :-) thanks. I will wait until your patches are out.
>
>>
>>> ---
>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>> index 6287bf6..800dca7 100644
>>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>>                 else
>>>                         dev_kfree_skb_any(tx_buffer->skb);
>>>                 if (dma_unmap_len(tx_buffer, len))
>>> -                       dma_unmap_single(ring->dev,
>>> -                                        dma_unmap_addr(tx_buffer, dma),
>>> -                                        dma_unmap_len(tx_buffer, len),
>>> -                                        DMA_TO_DEVICE);
>>> +                       dma_unmap_single_attrs(ring->dev,
>>> +                                              dma_unmap_addr(tx_buffer, dma),
>>> +                                              dma_unmap_len(tx_buffer, len),
>>> +                                              DMA_TO_DEVICE,
>>> +                                              ring->dma_attrs);
>>>         } else if (dma_unmap_len(tx_buffer, len)) {
>>> -               dma_unmap_page(ring->dev,
>>> -                              dma_unmap_addr(tx_buffer, dma),
>>> -                              dma_unmap_len(tx_buffer, len),
>>> -                              DMA_TO_DEVICE);
>>> +               dma_unmap_single_attrs(ring->dev,
>>> +                                      dma_unmap_addr(tx_buffer, dma),
>>> +                                      dma_unmap_len(tx_buffer, len),
>>> +                                      DMA_TO_DEVICE,
>>> +                                      ring->dma_attrs);
>>>         }
>>>
>>>         tx_buffer->next_to_watch = NULL;
>>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>         struct i40e_tx_buffer *tx_buf;
>>>         struct i40e_tx_desc *tx_head;
>>>         struct i40e_tx_desc *tx_desc;
>>> +       dma_addr_t addr;
>>> +       size_t size;
>>>         unsigned int total_bytes = 0, total_packets = 0;
>>>         unsigned int budget = vsi->work_limit;
>>>
>>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>                 napi_consume_skb(tx_buf->skb, napi_budget);
>>>
>>>                 /* unmap skb header data */
>>> -               dma_unmap_single(tx_ring->dev,
>>> -                                dma_unmap_addr(tx_buf, dma),
>>> -                                dma_unmap_len(tx_buf, len),
>>> -                                DMA_TO_DEVICE);
>>> +               dma_unmap_single_attrs(tx_ring->dev,
>>> +                                      dma_unmap_addr(tx_buf, dma),
>>> +                                      dma_unmap_len(tx_buf, len),
>>> +                                      DMA_TO_DEVICE,
>>> +                                      tx_ring->dma_attrs);
>>>
>>>                 /* clear tx_buffer data */
>>>                 tx_buf->skb = NULL;
>>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
>>>                         }
>>>
>>> +                       addr = dma_unmap_addr(tx_buf, dma);
>>> +                       size = dma_unmap_len(tx_buf, len);
>>
>>
>> On some architectures this change could lead to issues since
>> dma_unmap_len could be 0 meaning that addr would never be used.
>
> I see. Thanks.
>
>>
>>>                         /* unmap any remaining paged data */
>>>                         if (dma_unmap_len(tx_buf, len)) {
>>> -                               dma_unmap_page(tx_ring->dev,
>>> -                                              dma_unmap_addr(tx_buf, dma),
>>> -                                              dma_unmap_len(tx_buf, len),
>>> -                                              DMA_TO_DEVICE);
>>> +                               dma_unmap_single_attrs(tx_ring->dev,
>>> +                                                      addr,
>>> +                                                      size,
>>> +                                                      DMA_TO_DEVICE,
>>> +                                                      tx_ring->dma_attrs);
>>>                                 dma_unmap_len_set(tx_buf, len, 0);
>>>                         }
>>>                 }
>>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>>          */
>>>         tx_ring->size += sizeof(u32);
>>>         tx_ring->size = ALIGN(tx_ring->size, 4096);
>>> +#ifdef CONFIG_SPARC
>>> +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>>> +#else
>>> +       tx_ring->dma_attrs = 0;
>>> +#endif
>>>         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>>                                            &tx_ring->dma, GFP_KERNEL);
>>>         if (!tx_ring->desc) {
>>
>>
>> Also not a fan of adding yet another ring attribute.  Is there any reason why
>> you couldn't simply add a set of inline functions at the start of
>> i40e_txrx.c that could replace the DMA map/unmap operations in this
>> code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed for the
>> drivers?  Then the x86 code doesn't have to change while the SPARC
>> code will be able to be passed the attribute.
>
> Sure I can do that.
>
> I will follow up with a patch once your patches for map/unmap page with dma
> attrs are out.
>
> Thanks.
>
> -Tushar
>

I was thinking about it and I realized we can probably simplify this
even further.  In the case of most other architectures the
DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can tell
there is probably no reason not to just always pass that attribute
with the DMA mappings.  As far as I can tell the only other
architecture that uses this is the PowerPC Cell architecture.
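
A sketch of that simplification, assuming the dma_{map,unmap}_page_attrs
helpers discussed earlier in this thread land as described: the attribute is
passed unconditionally, with no CONFIG_SPARC ifdef and no new ring field:

        /* map page for use; DMA_ATTR_WEAK_ORDERING is simply ignored on
         * architectures whose DMA code does not implement it
         */
        dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
                                 DMA_FROM_DEVICE, DMA_ATTR_WEAK_ORDERING);

        /* matching unmap when the buffer is not reused */
        dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
                             DMA_FROM_DEVICE, DMA_ATTR_WEAK_ORDERING);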

Also I was wondering if you actually needed to enable this attribute
for both Rx and Tx buffers or just Rx buffers?  The patch that enabled
DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes, but I
didn't see anything about reads.  I'm just wondering if changing the
code for Tx has any effect?  If not you could probably drop those
changes and just focus on Rx.

Thanks.

- Alex

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-06 17:10       ` Alexander Duyck
@ 2016-12-06 22:04         ` tndave
  -1 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-06 22:04 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Jeff Kirsher, intel-wired-lan, Netdev



On 12/06/2016 09:10 AM, Alexander Duyck wrote:
> On Mon, Dec 5, 2016 at 2:23 PM, tndave <tushar.n.dave@oracle.com> wrote:
>>
>>
>> On 12/05/2016 01:54 PM, Alexander Duyck wrote:
>>>
>>> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com>
>>> wrote:
>>>>
>>>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
>>>> standard CSR where PCIe relaxed ordering can be set. Without PCIe relax
>>>> ordering enabled, i40e performance is significantly low on SPARC.
>>>>
>>>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>>>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>>>> This has shown 10x increase in performance numbers.
>>>>
>>>> e.g.
>>>> iperf TCP test with 10 threads on SPARC S7
>>>>
>>>> Test 1: Without this patch
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>>>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>>>> [ ID] Interval       Transfer     Bandwidth
>>>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>>>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>>>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>>>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>>>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>>>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>>>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>>>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>>>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>>>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>>>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>>>
>>>> Test 2: with this patch:
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
>>>> cookies.  Check SNMP counters.
>>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>>>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>>>> [ ID] Interval       Transfer     Bandwidth
>>>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>>>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>>>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>>>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>>>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>>>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>>>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>>>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>>>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>>>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>>>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>>>
>>>> NOTE: In my testing, this patch does _not_ show any harm to i40e
>>>> performance numbers on x86.
>>>>
>>>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>>>
>>>
>>> You went through and replaced all of the dma_unmap/map_page calls with
>>> dma_map/unmap_single_attrs  I would prefer you didn't do that.  I have
>>
>> Yes, because currently there is no DMA API for dma_map/unmap_page with dma
>> attr*
>>>
>>> patches to add the ability to map and unmap pages with attributes that
>>> should be available for 4.10-rc1 so if you could wait on this patch
>>> until then it would be preferred.
>>
>> :-) thanks. I will wait until your patches are out.
>>
>>>
>>>> ---
>>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> index 6287bf6..800dca7 100644
>>>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>>>                 else
>>>>                         dev_kfree_skb_any(tx_buffer->skb);
>>>>                 if (dma_unmap_len(tx_buffer, len))
>>>> -                       dma_unmap_single(ring->dev,
>>>> -                                        dma_unmap_addr(tx_buffer, dma),
>>>> -                                        dma_unmap_len(tx_buffer, len),
>>>> -                                        DMA_TO_DEVICE);
>>>> +                       dma_unmap_single_attrs(ring->dev,
>>>> +                                              dma_unmap_addr(tx_buffer, dma),
>>>> +                                              dma_unmap_len(tx_buffer, len),
>>>> +                                              DMA_TO_DEVICE,
>>>> +                                              ring->dma_attrs);
>>>>         } else if (dma_unmap_len(tx_buffer, len)) {
>>>> -               dma_unmap_page(ring->dev,
>>>> -                              dma_unmap_addr(tx_buffer, dma),
>>>> -                              dma_unmap_len(tx_buffer, len),
>>>> -                              DMA_TO_DEVICE);
>>>> +               dma_unmap_single_attrs(ring->dev,
>>>> +                                      dma_unmap_addr(tx_buffer, dma),
>>>> +                                      dma_unmap_len(tx_buffer, len),
>>>> +                                      DMA_TO_DEVICE,
>>>> +                                      ring->dma_attrs);
>>>>         }
>>>>
>>>>         tx_buffer->next_to_watch = NULL;
>>>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>         struct i40e_tx_buffer *tx_buf;
>>>>         struct i40e_tx_desc *tx_head;
>>>>         struct i40e_tx_desc *tx_desc;
>>>> +       dma_addr_t addr;
>>>> +       size_t size;
>>>>         unsigned int total_bytes = 0, total_packets = 0;
>>>>         unsigned int budget = vsi->work_limit;
>>>>
>>>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>                 napi_consume_skb(tx_buf->skb, napi_budget);
>>>>
>>>>                 /* unmap skb header data */
>>>> -               dma_unmap_single(tx_ring->dev,
>>>> -                                dma_unmap_addr(tx_buf, dma),
>>>> -                                dma_unmap_len(tx_buf, len),
>>>> -                                DMA_TO_DEVICE);
>>>> +               dma_unmap_single_attrs(tx_ring->dev,
>>>> +                                      dma_unmap_addr(tx_buf, dma),
>>>> +                                      dma_unmap_len(tx_buf, len),
>>>> +                                      DMA_TO_DEVICE,
>>>> +                                      tx_ring->dma_attrs);
>>>>
>>>>                 /* clear tx_buffer data */
>>>>                 tx_buf->skb = NULL;
>>>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
>>>>                         }
>>>>
>>>> +                       addr = dma_unmap_addr(tx_buf, dma);
>>>> +                       size = dma_unmap_len(tx_buf, len);
>>>
>>>
>>> On some architectures this change could lead to issues since
>>> dma_unmap_len could be 0 meaning that addr would never be used.
>>
>> I see. Thanks.
>>
>>>
>>>>                         /* unmap any remaining paged data */
>>>>                         if (dma_unmap_len(tx_buf, len)) {
>>>> -                               dma_unmap_page(tx_ring->dev,
>>>> -                                              dma_unmap_addr(tx_buf, dma),
>>>> -                                              dma_unmap_len(tx_buf, len),
>>>> -                                              DMA_TO_DEVICE);
>>>> +                               dma_unmap_single_attrs(tx_ring->dev,
>>>> +                                                      addr,
>>>> +                                                      size,
>>>> +                                                      DMA_TO_DEVICE,
>>>> +                                                      tx_ring->dma_attrs);
>>>>                                 dma_unmap_len_set(tx_buf, len, 0);
>>>>                         }
>>>>                 }
>>>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>>>          */
>>>>         tx_ring->size += sizeof(u32);
>>>>         tx_ring->size = ALIGN(tx_ring->size, 4096);
>>>> +#ifdef CONFIG_SPARC
>>>> +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>>>> +#else
>>>> +       tx_ring->dma_attrs = 0;
>>>> +#endif
>>>>         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>>>                                            &tx_ring->dma, GFP_KERNEL);
>>>>         if (!tx_ring->desc) {
>>>
>>>
>>> Also not a fan of adding yet another ring attribute.  Is there any reason why
>>> you couldn't simply add a set of inline functions at the start of
>>> i40e_txrx.c that could replace the DMA map/unmap operations in this
>>> code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed for the
>>> drivers?  Then the x86 code doesn't have to change while the SPARC
>>> code will be able to be passed the attribute.
>>
>> Sure I can do that.
>>
>> I will follow up with a patch once your patches for map/unmap page with dma
>> attrs are out.
>>
>> Thanks.
>>
>> -Tushar
>>
>
> I was thinking about it and I realized we can probably simplify this
> even further.  In the case of most other architectures the
> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can
> tell there is probably no reason not to just always pass that
> attribute with the DMA mappings.  From what I can tell the only
> other architecture that uses this is the PowerPC Cell architecture.
Yes, besides SPARC64, only the PowerPC Cell architecture uses
DMA_ATTR_WEAK_ORDERING; I guess it should be okay to always pass
DMA_ATTR_WEAK_ORDERING.
>
> Also I was wondering if you actually needed to enable this attribute
> for both Rx and Tx buffers or just Rx buffers?  The patch that
> enabled DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes,
> but I didn't see anything about reads.  I'm just wondering if
> changing the code for Tx has any effect?  If not you could probably
> drop those changes and just focus on Rx.
The patch I sent enabled DMA_ATTR_WEAK_ORDERING for sparc64 so that
writes to and reads from both rx and tx dma buffers can use relaxed
ordering.

Passing DMA_ATTR_WEAK_ORDERING for tx dma buffers doesn't have the same
impact as it does for rx dma buffers. However, I can only confirm
whether DMA_ATTR_WEAK_ORDERING is needed for tx dma buffers at all
after collecting some more data.
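
If that data shows no Tx benefit, the change could shrink to the Rx paths
alone. A sketch of that split, again assuming the page-attrs helpers land
as discussed:

        /* Tx keeps the plain mapping if relaxed ordering only helps
         * device-to-memory writes (Rx):
         */
        dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);

        /* Rx mapping in i40e_alloc_mapped_page() carries the attribute */
        dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
                                 DMA_FROM_DEVICE, DMA_ATTR_WEAK_ORDERING);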

Thanks.

-Tushar

>
> Thanks.
>
> - Alex
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
@ 2016-12-06 22:04         ` tndave
  0 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-06 22:04 UTC (permalink / raw)
  To: intel-wired-lan



On 12/06/2016 09:10 AM, Alexander Duyck wrote:
> On Mon, Dec 5, 2016 at 2:23 PM, tndave <tushar.n.dave@oracle.com> wrote:
>>
>>
>> On 12/05/2016 01:54 PM, Alexander Duyck wrote:
>>>
>>> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com>
>>> wrote:
>>>>
>>>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
>>>> standard CSR where PCIe relaxed ordering can be set. Without PCIe relax
>>>> ordering enabled, i40e performance is significantly low on SPARC.
>>>>
>>>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>>>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>>>> This has shown 10x increase in performance numbers.
>>>>
>>>> e.g.
>>>> iperf TCP test with 10 threads on SPARC S7
>>>>
>>>> Test 1: Without this patch
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>>>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>>>> [ ID] Interval       Transfer     Bandwidth
>>>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>>>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>>>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>>>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>>>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>>>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>>>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>>>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>>>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>>>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>>>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>>>
>>>> Test 2: with this patch:
>>>>
>>>> # iperf -s
>>>> ------------------------------------------------------------
>>>> Server listening on TCP port 5001
>>>> TCP window size: 85.3 KByte (default)
>>>> ------------------------------------------------------------
>>>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
>>>> cookies.  Check SNMP counters.
>>>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>>>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>>>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>>>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>>>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>>>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>>>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>>>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>>>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>>>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>>>> [ ID] Interval       Transfer     Bandwidth
>>>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>>>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>>>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>>>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>>>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>>>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>>>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>>>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>>>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>>>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>>>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>>>
>>>> NOTE: In my testing, this patch does _not_ show any harm to i40e
>>>> performance numbers on x86.
>>>>
>>>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>>>
>>>
>>> You went through and replaced all of the dma_unmap/map_page calls with
>>> dma_map/unmap_single_attrs.  I would prefer you didn't do that.  I have
>>
>> Yes, because currently there is no DMA API for dma_map/unmap_page with dma
>> attr*
>>>
>>> patches to add the ability to map and unmap pages with attributes that
>>> should be available for 4.10-rc1, so if you could wait on this patch
>>> until then it would be preferred.
>>
>> :-) thanks. I will wait until your patches are out.
>>
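(For reference, the page map/unmap-with-attributes interface Alex
mentions would look roughly like this -- a sketch, not the final
signatures:)

	/* map/unmap a page with DMA attributes such as DMA_ATTR_WEAK_ORDERING */
	dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
				      size_t offset, size_t size,
				      enum dma_data_direction dir,
				      unsigned long attrs);
	void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr,
				  size_t size, enum dma_data_direction dir,
				  unsigned long attrs);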
>>>
>>>> ---
>>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> index 6287bf6..800dca7 100644
>>>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>>>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>>>                 else
>>>>                         dev_kfree_skb_any(tx_buffer->skb);
>>>>                 if (dma_unmap_len(tx_buffer, len))
>>>> -                       dma_unmap_single(ring->dev,
>>>> -                                        dma_unmap_addr(tx_buffer, dma),
>>>> -                                        dma_unmap_len(tx_buffer, len),
>>>> -                                        DMA_TO_DEVICE);
>>>> +                       dma_unmap_single_attrs(ring->dev,
>>>> +                                              dma_unmap_addr(tx_buffer, dma),
>>>> +                                              dma_unmap_len(tx_buffer, len),
>>>> +                                              DMA_TO_DEVICE,
>>>> +                                              ring->dma_attrs);
>>>>         } else if (dma_unmap_len(tx_buffer, len)) {
>>>> -               dma_unmap_page(ring->dev,
>>>> -                              dma_unmap_addr(tx_buffer, dma),
>>>> -                              dma_unmap_len(tx_buffer, len),
>>>> -                              DMA_TO_DEVICE);
>>>> +               dma_unmap_single_attrs(ring->dev,
>>>> +                                      dma_unmap_addr(tx_buffer, dma),
>>>> +                                      dma_unmap_len(tx_buffer, len),
>>>> +                                      DMA_TO_DEVICE,
>>>> +                                      ring->dma_attrs);
>>>>         }
>>>>
>>>>         tx_buffer->next_to_watch = NULL;
>>>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>         struct i40e_tx_buffer *tx_buf;
>>>>         struct i40e_tx_desc *tx_head;
>>>>         struct i40e_tx_desc *tx_desc;
>>>> +       dma_addr_t addr;
>>>> +       size_t size;
>>>>         unsigned int total_bytes = 0, total_packets = 0;
>>>>         unsigned int budget = vsi->work_limit;
>>>>
>>>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>                 napi_consume_skb(tx_buf->skb, napi_budget);
>>>>
>>>>                 /* unmap skb header data */
>>>> -               dma_unmap_single(tx_ring->dev,
>>>> -                                dma_unmap_addr(tx_buf, dma),
>>>> -                                dma_unmap_len(tx_buf, len),
>>>> -                                DMA_TO_DEVICE);
>>>> +               dma_unmap_single_attrs(tx_ring->dev,
>>>> +                                      dma_unmap_addr(tx_buf, dma),
>>>> +                                      dma_unmap_len(tx_buf, len),
>>>> +                                      DMA_TO_DEVICE,
>>>> +                                      tx_ring->dma_attrs);
>>>>
>>>>                 /* clear tx_buffer data */
>>>>                 tx_buf->skb = NULL;
>>>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>>>                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
>>>>                         }
>>>>
>>>> +                       addr = dma_unmap_addr(tx_buf, dma);
>>>> +                       size = dma_unmap_len(tx_buf, len);
>>>
>>>
>>> On some architectures this change could lead to issues since
>>> dma_unmap_len could be 0 meaning that addr would never be used.
>>
>> I see. Thanks.
>>
>>>
>>>>                         /* unmap any remaining paged data */
>>>>                         if (dma_unmap_len(tx_buf, len)) {
>>>> -                               dma_unmap_page(tx_ring->dev,
>>>> -                                              dma_unmap_addr(tx_buf, dma),
>>>> -                                              dma_unmap_len(tx_buf, len),
>>>> -                                              DMA_TO_DEVICE);
>>>> +                               dma_unmap_single_attrs(tx_ring->dev,
>>>> +                                                      addr,
>>>> +                                                      size,
>>>> +                                                      DMA_TO_DEVICE,
>>>> +                                                      tx_ring->dma_attrs);
>>>>                                 dma_unmap_len_set(tx_buf, len, 0);
>>>>                         }
>>>>                 }
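(A sketch of the corrected form per Alex's point above -- keeping the
dma_unmap_addr()/dma_unmap_len() reads inside the length check so they
are never read when len is 0:)

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				dma_unmap_single_attrs(tx_ring->dev,
						       dma_unmap_addr(tx_buf, dma),
						       dma_unmap_len(tx_buf, len),
						       DMA_TO_DEVICE,
						       tx_ring->dma_attrs);
				dma_unmap_len_set(tx_buf, len, 0);
			}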
>>>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>>>          */
>>>>         tx_ring->size += sizeof(u32);
>>>>         tx_ring->size = ALIGN(tx_ring->size, 4096);
>>>> +#ifdef CONFIG_SPARC
>>>> +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>>>> +#else
>>>> +       tx_ring->dma_attrs = 0;
>>>> +#endif
>>>>         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>>>                                            &tx_ring->dma, GFP_KERNEL);
>>>>         if (!tx_ring->desc) {
>>>
>>>
>>> Also not a fan of adding yet another ring attribute.  Is there any reason why
>>> you couldn't simply add a set of inline functions at the start of
>>> i40e_txrx.c that could replace the DMA map/unmap operations in this
>>> code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed for the
>>> drivers?  Then the x86 code doesn't have to change while the SPARC
>>> code will be able to be passed the attribute.
>>
>> Sure I can do that.
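(For example, something along these lines -- a sketch only; the
i40e_dma_map() helper name is hypothetical:)

/* map a buffer, relaxing PCIe ordering on SPARC; a no-op attribute elsewhere */
static inline dma_addr_t i40e_dma_map(struct device *dev, void *ptr,
				      size_t size,
				      enum dma_data_direction dir)
{
#ifdef CONFIG_SPARC
	return dma_map_single_attrs(dev, ptr, size, dir,
				    DMA_ATTR_WEAK_ORDERING);
#else
	return dma_map_single_attrs(dev, ptr, size, dir, 0);
#endif
}

Call sites would then use i40e_dma_map(tx_ring->dev, skb->data, size,
DMA_TO_DEVICE), and the per-ring dma_attrs field could be dropped.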
>>
>> I will follow up with patch after your patches for map/unmap page with dma
>> attr will be out.
>>
>> Thanks.
>>
>> -Tushar
>>
>
> I was thinking about it and I realized we can probably simplify this
> even further.  In the case of most other architectures the
> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can
> tell there is probably no reason not to just always pass that
> attribute with the DMA mappings.  From what I can tell the only
> other architecture that uses this is the PowerPC Cell architecture.
Yes, besides SPARC64, only the PowerPC Cell architecture uses
DMA_ATTR_WEAK_ORDERING; I guess it should be okay to always pass
DMA_ATTR_WEAK_ORDERING.
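(That is, the call sites could simply pass the attribute
unconditionally -- a sketch based on the Rx mapping in this patch:)

	dma = dma_map_single_attrs(rx_ring->dev, page_address(page),
				   PAGE_SIZE, DMA_FROM_DEVICE,
				   DMA_ATTR_WEAK_ORDERING);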
>
> Also I was wondering if you actually needed to enable this attribute
> for both Rx and Tx buffers or just Rx buffers?  The patch that
> enabled DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes,
> but I didn't see anything about reads.  I'm just wondering if
> changing the code for Tx has any effect?  If not you could probably
> drop those changes and just focus on Rx.
The patch I sent enabled DMA_ATTR_WEAK_ORDERING for sparc64 so that
writes to and reads from both rx and tx dma buffers can be relaxed
ordered.

Passing DMA_ATTR_WEAK_ORDERING for tx dma buffers doesn't have the same
impact as it does for rx dma buffers. However, I can only confirm
whether DMA_ATTR_WEAK_ORDERING is needed at all for tx dma buffers
after collecting some more data!

Thanks.

-Tushar

>
> Thanks.
>
> - Alex
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-05 21:54   ` Alexander Duyck
@ 2016-12-08 10:31     ` David Laight
  -1 siblings, 0 replies; 33+ messages in thread
From: David Laight @ 2016-12-08 10:31 UTC (permalink / raw)
  To: 'Alexander Duyck', Tushar Dave
  Cc: Jeff Kirsher, intel-wired-lan, Netdev

From: Alexander Duyck
> Sent: 05 December 2016 21:55
...
> > @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
> >          */
> >         tx_ring->size += sizeof(u32);
> >         tx_ring->size = ALIGN(tx_ring->size, 4096);
> > +#ifdef CONFIG_SPARC
> > +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> > +#else
> > +       tx_ring->dma_attrs = 0;
> > +#endif
> >         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
> >                                            &tx_ring->dma, GFP_KERNEL);
> >         if (!tx_ring->desc) {
> 
> Also not a fan of adding yet another ring attribute.  Is there any reason why
> you couldn't simply add a set of inline functions at the start of
> i40e_txrx.c that could replace the DMA map/unmap operations in this
> code but pass either 0 or DMA_ATTR_WEAK_ORDERING as needed for the
> drivers?  Then the x86 code doesn't have to change while the SPARC
> code will be able to be passed the attribute.

Or use something like:
#ifdef CONFIG_SPARC
       #define RING_DMA_ATTR DMA_ATTR_WEAK_ORDERING
#else
       #define RING_DMA_ATTR 0
#endif
and pass the constant to the function calls.
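E.g. one of the patch's unmap call sites would then read (sketch):

	dma_unmap_single_attrs(ring->dev,
			       dma_unmap_addr(tx_buffer, dma),
			       dma_unmap_len(tx_buffer, len),
			       DMA_TO_DEVICE,
			       RING_DMA_ATTR);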

Is there actually ever a problem passing DMA_ATTR_WEAK_ORDERING?
I'd guess that it will be ignored if it can't be implemented (or isn't needed).

	David


^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-06 17:10       ` Alexander Duyck
@ 2016-12-08 10:43         ` David Laight
  -1 siblings, 0 replies; 33+ messages in thread
From: David Laight @ 2016-12-08 10:43 UTC (permalink / raw)
  To: 'Alexander Duyck', tndave; +Cc: Jeff Kirsher, intel-wired-lan, Netdev

From: Alexander Duyck
> Sent: 06 December 2016 17:10
...
> I was thinking about it and I realized we can probably simplify this
> even further.  In the case of most other architectures the
> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can tell
> there is probably no reason not to just always pass that attribute
> with the DMA mappings.  From what I can tell the only other
> architecture that uses this is the PowerPC Cell architecture.

And I should have read all the thread :-(

> Also I was wondering if you actually needed to enable this attribute
> for both Rx and Tx buffers or just Rx buffers?  The patch that enabled
> DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes, but I
> didn't see anything about reads.  I'm just wondering if changing the
> code for Tx has any effect?  If not you could probably drop those
> changes and just focus on Rx.

'Weak ordering' only applies to PCIe read transfers, so can only have
an effect on descriptor reads and transmit buffer reads.

Basically PCIe is a comms protocol and an endpoint (or the host) can
have multiple outstanding read requests (each of which might generate
multiple response messages).
The responses for each request must arrive in order, but responses for
different requests can be interleaved.
Setting 'not weak ordering' lets the host interwork with broken endpoints.
(Or, like we did, you fix the fpga's PCIe implementation.)

In this case you need the reads of both transmit and receive rings to
'overtake' reads of transmit data.

I'm not at all clear how this 'flag' can be set on dma_map().
It is a property of the PCIe subsystem.

	David



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-08 10:43         ` David Laight
@ 2016-12-08 16:05           ` Alexander Duyck
  -1 siblings, 0 replies; 33+ messages in thread
From: Alexander Duyck @ 2016-12-08 16:05 UTC (permalink / raw)
  To: David Laight; +Cc: tndave, Jeff Kirsher, intel-wired-lan, Netdev

On Thu, Dec 8, 2016 at 2:43 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Alexander Duyck
>> Sent: 06 December 2016 17:10
> ...
>> I was thinking about it and I realized we can probably simplify this
>> even further.  In the case of most other architectures the
>> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can tell
>> there is probably no reason not to just always pass that attribute
>> with the DMA mappings.  From what I can tell the only other
>> architecture that uses this is the PowerPC Cell architecture.
>
> And I should have read all the thread :-(
>
>> Also I was wondering if you actually needed to enable this attribute
>> for both Rx and Tx buffers or just Rx buffers?  The patch that enabled
>> DMA_ATTR_WEAK_ORDERING for Sparc64 seems to call out writes, but I
>> didn't see anything about reads.  I'm just wondering if changing the
>> code for Tx has any effect?  If not you could probably drop those
>> changes and just focus on Rx.
>
> 'Weak ordering' only applies to PCIe read transfers, so can only have
> an effect on descriptor reads and transmit buffer reads.
>
> Basically PCIe is a comms protocol and an endpoint (or the host) can
> have multiple outstanding read requests (each of which might generate
> multiple response messages).
> The responses for each request must arrive in order, but responses for
> different requests can be interleaved.
> Setting 'not weak ordering' lets the host interwork with broken endpoints.
> (Or, like we did, you fix the fpga's PCIe implementation.)

I get the basics of relaxed ordering.  The question is how does the
Sparc64 IOMMU translate DMA_ATTR_WEAK_ORDERING into relaxed ordering
messages, and at what level the ordering is relaxed.  Odds are the
wording in the description where this attribute was added to Sparc is
just awkward, but I was wanting to verify if this only applies to
writes, or also read completions.

> In this case you need the reads of both transmit and receive rings to
> 'overtake' reads of transmit data.

Actually that isn't quite right.  With relaxed ordering completions
and writes can pass each other if I recall correctly, but reads will
always force all writes ahead of them to be completed before you can
begin generating the read completions.

> I'm not at all clear how this 'flag' can be set on dma_map().
> It is a property of the PCIe subsystem.

That was where my original question on this came in.  We can do a
blanket enable of relaxed ordering for Tx and Rx data buffers, but if
we only need it on Rx then there isn't any need for us to make
unnecessary changes.
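(In this patch's terms an Rx-only variant would be something like the
sketch below, applied in i40e_setup_rx_descriptors() only, with
tx_ring->dma_attrs left at 0:)

#ifdef CONFIG_SPARC
	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;	/* Rx buffers only */
#else
	rx_ring->dma_attrs = 0;
#endif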

- Alex

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-08 16:05           ` Alexander Duyck
@ 2016-12-09  0:45             ` tndave
  -1 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-09  0:45 UTC (permalink / raw)
  To: Alexander Duyck, David Laight; +Cc: Jeff Kirsher, intel-wired-lan, Netdev



On 12/08/2016 08:05 AM, Alexander Duyck wrote:
> On Thu, Dec 8, 2016 at 2:43 AM, David Laight
> <David.Laight@aculab.com> wrote:
>> From: Alexander Duyck
>>> Sent: 06 December 2016 17:10
>> ...
>>> I was thinking about it and I realized we can probably simplify
>>> this even further.  In the case of most other architectures the
>>> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can
>>> tell there is probably no reason not to just always pass that
>>> attribute with the DMA mappings.  From what I can tell the only
>>> other architecture that uses this is the PowerPC Cell
>>> architecture.
>>
>> And I should have read all the thread :-(
>>
>>> Also I was wondering if you actually needed to enable this
>>> attribute for both Rx and Tx buffers or just Rx buffers?  The
>>> patch that enabled DMA_ATTR_WEAK_ORDERING for Sparc64 seems to
>>> call out writes, but I didn't see anything about reads.  I'm just
>>> wondering if changing the code for Tx has any effect?  If not you
>>> could probably drop those changes and just focus on Rx.
>>
>> 'Weak ordering' only applies to PCIe read transfers, so can only
>> have an effect on descriptor reads and transmit buffer reads.
>>
>> Basically PCIe is a comms protocol and an endpoint (or the host)
>> can have multiple outstanding read requests (each of which might
>> generate multiple response messages). The responses for each request
>> must arrive in order, but responses for different requests can be
>> interleaved. Setting 'not weak ordering' lets the host interwork
>> with broken endpoints. (Or, like we did, you fix the fpga's PCIe
>> implementation.)
>
> I get the basics of relaxed ordering.  The question is how does the
> Sparc64 IOMMU translate DMA_ATTR_WEAK_ORDERING into relaxed ordering
> messages, and at what level the ordering is relaxed.  Odds are the
> wording in the description where this attribute was added to Sparc
> is just awkward, but I was wanting to verify if this only applies to
> writes, or also read completions.
In Sparc64, passing DMA_ATTR_WEAK_ORDERING in dma map/unmap only affects
the PCIe root complex (host bridge). With DMA_ATTR_WEAK_ORDERING, the
requested DMA transaction can be relaxed ordered within the PCIe root
complex.

In Sparc64, memory writes can be held at the PCIe root complex, not
letting other memory writes go through. Passing DMA_ATTR_WEAK_ORDERING
in dma map/unmap allows memory writes to bypass other memory writes in
the PCIe root complex. (This applies only to the PCIe root complex and
does not affect any other level of the PCIe hierarchy, e.g. PCIe bridges
et al. Also, the PCIe root complex, when bypassing memory writes, does
follow the PCIe relaxed ordering rules as per the PCIe specification.)

For reference [old but still relevant write-up]: PCI-Express Relaxed 
Ordering and the Sun SPARC Enterprise M-class Servers
https://blogs.oracle.com/olympus/entry/relaxed_ordering

>
>> In this case you need the reads of both transmit and receive rings
>> to 'overtake' reads of transmit data.
>
> Actually that isn't quite right.  With relaxed ordering completions
> and writes can pass each other if I recall correctly, but reads will
> always force all writes ahead of them to be completed before you can
> begin generating the read completions.
That is my understanding as well.

>
>> I'm not at all clear how this 'flag' can be set on dma_map(). It is
>> a property of the PCIe subsystem.
Because in Sparc64, passing the DMA_ATTR_WEAK_ORDERING flag in DMA
map/unmap adds an entry in the IOMMU/ATU table so that accesses to the
requested DMA address from the PCIe root complex can be relaxed ordered.
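(Roughly, in the sun4v mapping path the attribute becomes a property of
the IOMMU mapping -- a sketch from memory; HV_PCI_MAP_ATTR_RELAXED_ORDER
is the sparc64 hypervisor-interface name:)

	unsigned long prot = HV_PCI_MAP_ATTR_READ | HV_PCI_MAP_ATTR_WRITE;

	/* mark this mapping as eligible for relaxed-ordered accesses */
	if (attrs & DMA_ATTR_WEAK_ORDERING)
		prot |= HV_PCI_MAP_ATTR_RELAXED_ORDER;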
>
> That was where my original question on this came in.  We can do a
> blanket enable of relaxed ordering for Tx and Rx data buffers, but
> if we only need it on Rx then there isn't any need for us to make
> unnecessary changes.
I ran some quick tests and it is likely that we don't need
DMA_ATTR_WEAK_ORDERING for any TX dma buffer (because in the case of TX
dma buffers, it's all memory reads from device).

-Tushar
>
> - Alex
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-09  0:45             ` tndave
@ 2016-12-09  1:16               ` tndave
  -1 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-09  1:16 UTC (permalink / raw)
  To: Alexander Duyck, David Laight
  Cc: Jeff Kirsher, intel-wired-lan, Netdev, sparclinux



On 12/08/2016 04:45 PM, tndave wrote:
>
>
> On 12/08/2016 08:05 AM, Alexander Duyck wrote:
>> On Thu, Dec 8, 2016 at 2:43 AM, David Laight
>> <David.Laight@aculab.com> wrote:
>>> From: Alexander Duyck
>>>> Sent: 06 December 2016 17:10
>>> ...
>>>> I was thinking about it and I realized we can probably simplify
>>>> this even further.  In the case of most other architectures the
>>>> DMA_ATTR_WEAK_ORDERING has no effect anyway.  So from what I can
>>>> tell there is probably no reason not to just always pass that
>>>> attribute with the DMA mappings.  From what I can tell the only
>>>> other architecture that uses this is the PowerPC Cell
>>>> architecture.
>>>
>>> And I should have read all the thread :-(
>>>
>>>> Also I was wondering if you actually needed to enable this
>>>> attribute for both Rx and Tx buffers or just Rx buffers?  The
>>>> patch that enabled DMA_ATTR_WEAK_ORDERING for Sparc64 seems to
>>>> call out writes, but I didn't see anything about reads.  I'm just
>>>> wondering if changing the code for Tx has any effect?  If not you
>>>> could probably drop those changes and just focus on Rx.
>>>
>>> 'Weak ordering' only applies to PCIe read transfers, so can only
>>> have an effect on descriptor reads and transmit buffer reads.
>>>
>>> Basically PCIe is a comms protocol and an endpoint (or the host)
>>> can have multiple outstanding read requests (each of which might
>>> generate multiple response messages). The responses for each request
>>> must arrive in order, but responses for different requests can be
>>> interleaved. Setting 'not weak ordering' lets the host interwork
>>> with broken endpoints. (Or, like we did, you fix the fpga's PCIe
>>> implementation.)
>>
>> I get the basics of relaxed ordering.  The question is how does the
>> Sparc64 IOMMU translate DMA_ATTR_WEAK_ORDERING into relaxed ordering
>> messages, and at what level the ordering is relaxed.  Odds are the
>> wording in the description where this attribute was added to Sparc
>> is just awkward, but I was wanting to verify if this only applies to
>> writes, or also read completions.
> In Sparc64, passing DMA_ATTR_WEAK_ORDERING in dma map/unmap only affects
> the PCIe root complex (host bridge). With DMA_ATTR_WEAK_ORDERING, the
> requested DMA transaction can be relaxed ordered within the PCIe root
> complex.
>
> In Sparc64, memory writes can be held at the PCIe root complex, not
> letting other memory writes go through. Passing DMA_ATTR_WEAK_ORDERING
> in dma map/unmap allows memory writes to bypass other memory writes in
> the PCIe root complex. (This applies only to the PCIe root complex and
> does not affect any other level of the PCIe hierarchy, e.g. PCIe bridges
> et al. Also, the PCIe root complex, when bypassing memory writes, does
> follow the PCIe relaxed ordering rules as per the PCIe specification.)
>
> For reference [old but still relevant write-up]: PCI-Express Relaxed
> Ordering and the Sun SPARC Enterprise M-class Servers
> https://blogs.oracle.com/olympus/entry/relaxed_ordering
>
>>
>>> In this case you need the reads of both transmit and receive rings
>>> to 'overtake' reads of transmit data.
>>
>> Actually that isn't quite right.  With relaxed ordering completions
>> and writes can pass each other if I recall correctly, but reads will
>> always force all writes ahead of them to be completed before you can
>> begin generating the read completions.
> That is my understanding as well.
>
>>
>>> I'm not at all clear how this 'flag' can be set on dma_map(). It is
>>> a property of the PCIe subsystem.
> Because in Sparc64, passing the DMA_ATTR_WEAK_ORDERING flag in DMA
> map/unmap adds an entry in the IOMMU/ATU table so that accesses to the
> requested DMA address from the PCIe root complex can be relaxed ordered.
>>
>> That was where my original question on this came in.  We can do a
>> blanket enable of relaxed ordering for Tx and Rx data buffers, but
>> if we only need it on Rx then there isn't any need for us to make
>> unnecessary changes.
> I ran some quick tests and it is likely that we don't need
> DMA_ATTR_WEAK_ORDERING for any TX dma buffer (because in the case of TX
> dma buffers, it's all memory reads from device).
In the above line, s/from/by.

+ cc sparclinux@vger.kernel.org

-Tushar
>
> -Tushar
>>
>> - Alex
>>
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-05 17:07 ` [Intel-wired-lan] " Tushar Dave
@ 2016-12-26 11:39   ` maowenan
  -1 siblings, 0 replies; 33+ messages in thread
From: maowenan @ 2016-12-26 11:39 UTC (permalink / raw)
  To: Tushar Dave, jeffrey.t.kirsher, intel-wired-lan
  Cc: netdev, weiyongjun (A), Dingtianhong



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Tushar Dave
> Sent: Tuesday, December 06, 2016 1:07 AM
> To: jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org
> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
> 
> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have standard
> CSR where PCIe relaxed ordering can be set. Without PCIe relax ordering
> enabled, i40e performance is significantly low on SPARC.
> 
[Mao Wenan] Hi Tushar, you mentioned that i40e doesn't seem to have a standard CSR
to set PCIe relaxed ordering; is this CSR something like the Tx/Rx DCA Control
Register in the 82599? And is setting DMA_ATTR_WEAK_ORDERING the same as setting
those Tx/Rx control registers in the 82599?

And relaxed ordering mode is kept enabled in the 82599 for SPARC by the code below:
s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
{
	u32 i;

	/* Clear the rate limiters */
	for (i = 0; i < hw->mac.max_tx_queues; i++) {
		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
	}
	IXGBE_WRITE_FLUSH(hw);

#ifndef CONFIG_SPARC
	/* Disable relaxed ordering */
	for (i = 0; i < hw->mac.max_tx_queues; i++) {
		u32 regval;

		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
	}

	for (i = 0; i < hw->mac.max_rx_queues; i++) {
		u32 regval;

		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
	}
#endif
	return 0;
}



> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
> This has shown 10x increase in performance numbers.
> 
> e.g.
> iperf TCP test with 10 threads on SPARC S7
> 
> Test 1: Without this patch
> 
> [root@brm-snt1-03 net]# iperf -s
> ------------------------------------------------------------
> Server listening on TCP port 5001
> TCP window size: 85.3 KByte (default)
> ------------------------------------------------------------
> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
> 
> Test 2: with this patch:
> 
> [root@brm-snt1-03 net]# iperf -s
> ------------------------------------------------------------
> Server listening on TCP port 5001
> TCP window size: 85.3 KByte (default)
> ------------------------------------------------------------
> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending cookies.
> Check SNMP counters.
> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
> 
> NOTE: In my testing, this patch does _not_ show any harm to i40e performance
> numbers on x86.
> 
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>  2 files changed, 49 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 6287bf6..800dca7 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>  		else
>  			dev_kfree_skb_any(tx_buffer->skb);
>  		if (dma_unmap_len(tx_buffer, len))
> -			dma_unmap_single(ring->dev,
> -					 dma_unmap_addr(tx_buffer, dma),
> -					 dma_unmap_len(tx_buffer, len),
> -					 DMA_TO_DEVICE);
> +			dma_unmap_single_attrs(ring->dev,
> +					       dma_unmap_addr(tx_buffer, dma),
> +					       dma_unmap_len(tx_buffer, len),
> +					       DMA_TO_DEVICE,
> +					       ring->dma_attrs);
>  	} else if (dma_unmap_len(tx_buffer, len)) {
> -		dma_unmap_page(ring->dev,
> -			       dma_unmap_addr(tx_buffer, dma),
> -			       dma_unmap_len(tx_buffer, len),
> -			       DMA_TO_DEVICE);
> +		dma_unmap_single_attrs(ring->dev,
> +				       dma_unmap_addr(tx_buffer, dma),
> +				       dma_unmap_len(tx_buffer, len),
> +				       DMA_TO_DEVICE,
> +				       ring->dma_attrs);
>  	}
> 
>  	tx_buffer->next_to_watch = NULL;
> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>  	struct i40e_tx_buffer *tx_buf;
>  	struct i40e_tx_desc *tx_head;
>  	struct i40e_tx_desc *tx_desc;
> +	dma_addr_t addr;
> +	size_t size;
>  	unsigned int total_bytes = 0, total_packets = 0;
>  	unsigned int budget = vsi->work_limit;
> 
> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>  		napi_consume_skb(tx_buf->skb, napi_budget);
> 
>  		/* unmap skb header data */
> -		dma_unmap_single(tx_ring->dev,
> -				 dma_unmap_addr(tx_buf, dma),
> -				 dma_unmap_len(tx_buf, len),
> -				 DMA_TO_DEVICE);
> +		dma_unmap_single_attrs(tx_ring->dev,
> +				       dma_unmap_addr(tx_buf, dma),
> +				       dma_unmap_len(tx_buf, len),
> +				       DMA_TO_DEVICE,
> +				       tx_ring->dma_attrs);
> 
>  		/* clear tx_buffer data */
>  		tx_buf->skb = NULL;
> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>  				tx_desc = I40E_TX_DESC(tx_ring, 0);
>  			}
> 
> +			addr = dma_unmap_addr(tx_buf, dma);
> +			size = dma_unmap_len(tx_buf, len);
>  			/* unmap any remaining paged data */
>  			if (dma_unmap_len(tx_buf, len)) {
> -				dma_unmap_page(tx_ring->dev,
> -					       dma_unmap_addr(tx_buf, dma),
> -					       dma_unmap_len(tx_buf, len),
> -					       DMA_TO_DEVICE);
> +				dma_unmap_single_attrs(tx_ring->dev,
> +						       addr,
> +						       size,
> +						       DMA_TO_DEVICE,
> +						       tx_ring->dma_attrs);
>  				dma_unmap_len_set(tx_buf, len, 0);
>  			}
>  		}
> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>  	 */
>  	tx_ring->size += sizeof(u32);
>  	tx_ring->size = ALIGN(tx_ring->size, 4096);
> +#ifdef CONFIG_SPARC
> +	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> +#else
> +	tx_ring->dma_attrs = 0;
> +#endif
>  	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>  					   &tx_ring->dma, GFP_KERNEL);
>  	if (!tx_ring->desc) {
> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>  		if (!rx_bi->page)
>  			continue;
> 
> -		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
> +		dma_unmap_single_attrs(dev,
> +				       rx_bi->dma,
> +				       PAGE_SIZE,
> +				       DMA_FROM_DEVICE,
> +				       rx_ring->dma_attrs);
>  		__free_pages(rx_bi->page, 0);
> 
>  		rx_bi->page = NULL;
> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>  	/* Round up to nearest 4K */
>  	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>  	rx_ring->size = ALIGN(rx_ring->size, 4096);
> +#ifdef CONFIG_SPARC
> +	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> +#else
> +	rx_ring->dma_attrs = 0;
> +#endif
>  	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>  					   &rx_ring->dma, GFP_KERNEL);
> 
> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>  	}
> 
>  	/* map page for use */
> -	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
> +	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
> +				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
> 
>  	/* if mapping failed free memory back to system since
>  	 * there isn't much point in holding memory we can't use
> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>  		rx_ring->rx_stats.page_reuse_count++;
>  	} else {
>  		/* we are not reusing the buffer so unmap it */
> -		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
> -			       DMA_FROM_DEVICE);
> +		dma_unmap_single_attrs(rx_ring->dev,
> +				       rx_buffer->dma,
> +				       PAGE_SIZE,
> +				       DMA_FROM_DEVICE,
> +				       rx_ring->dma_attrs);
>  	}
> 
>  	/* clear contents of buffer_info */
> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>  	first->skb = skb;
>  	first->tx_flags = tx_flags;
> 
> -	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
> +	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
> +				   DMA_TO_DEVICE, tx_ring->dma_attrs);
> 
>  	tx_desc = I40E_TX_DESC(tx_ring, i);
>  	tx_bi = first;
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 5088405..9a86212 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -327,6 +327,7 @@ struct i40e_ring {
> 
>  	unsigned int size;		/* length of descriptor ring in bytes */
>  	dma_addr_t dma;			/* physical address of ring */
> +	unsigned long dma_attrs;	/* DMA attributes */
> 
>  	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
>  	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
> --
> 1.9.1

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
@ 2016-12-26 11:39   ` maowenan
  0 siblings, 0 replies; 33+ messages in thread
From: maowenan @ 2016-12-26 11:39 UTC (permalink / raw)
  To: intel-wired-lan



> -----Original Message-----
> From: netdev-owner at vger.kernel.org [mailto:netdev-owner at vger.kernel.org]
> On Behalf Of Tushar Dave
> Sent: Tuesday, December 06, 2016 1:07 AM
> To: jeffrey.t.kirsher at intel.com; intel-wired-lan at lists.osuosl.org
> Cc: netdev at vger.kernel.org
> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
> 
> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have standard
> CSR where PCIe relaxed ordering can be set. Without PCIe relax ordering
> enabled, i40e performance is significantly low on SPARC.
> 
[Mao Wenan] Hi Tushar, you mentioned that i40e doesn't seem to have a standard CSR
to set PCIe relaxed ordering; is this CSR something like the Tx/Rx DCA Control
Register in the 82599? And is setting DMA_ATTR_WEAK_ORDERING the same as setting
those Tx/Rx control registers in the 82599?

And to enable relax ordering mode in 82599 for SPARC using below codes:
s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
{
	u32 i;

	/* Clear the rate limiters */
	for (i = 0; i < hw->mac.max_tx_queues; i++) {
		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
	}
	IXGBE_WRITE_FLUSH(hw);

#ifndef CONFIG_SPARC
	/* Disable relaxed ordering */
	for (i = 0; i < hw->mac.max_tx_queues; i++) {
		u32 regval;

		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
	}

	for (i = 0; i < hw->mac.max_rx_queues; i++) {
		u32 regval;

		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
	}
#endif
	return 0;
}



> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
> This has shown 10x increase in performance numbers.
> 
> e.g.
> iperf TCP test with 10 threads on SPARC S7
> 
> Test 1: Without this patch
> 
> [root at brm-snt1-03 net]# iperf -s
> ------------------------------------------------------------
> Server listening on TCP port 5001
> TCP window size: 85.3 KByte (default)
> ------------------------------------------------------------
> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
> 
> Test 2: with this patch:
> 
> [root at brm-snt1-03 net]# iperf -s
> ------------------------------------------------------------
> Server listening on TCP port 5001
> TCP window size: 85.3 KByte (default)
> ------------------------------------------------------------
> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending cookies.
> Check SNMP counters.
> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
> [ ID] Interval       Transfer     Bandwidth
> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
> 
> NOTE: In my testing, this patch does _not_ show any harm to i40e performance
> numbers on x86.
> 
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>  2 files changed, 49 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 6287bf6..800dca7 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>  		else
>  			dev_kfree_skb_any(tx_buffer->skb);
>  		if (dma_unmap_len(tx_buffer, len))
> -			dma_unmap_single(ring->dev,
> -					 dma_unmap_addr(tx_buffer, dma),
> -					 dma_unmap_len(tx_buffer, len),
> -					 DMA_TO_DEVICE);
> +			dma_unmap_single_attrs(ring->dev,
> +					       dma_unmap_addr(tx_buffer, dma),
> +					       dma_unmap_len(tx_buffer, len),
> +					       DMA_TO_DEVICE,
> +					       ring->dma_attrs);
>  	} else if (dma_unmap_len(tx_buffer, len)) {
> -		dma_unmap_page(ring->dev,
> -			       dma_unmap_addr(tx_buffer, dma),
> -			       dma_unmap_len(tx_buffer, len),
> -			       DMA_TO_DEVICE);
> +		dma_unmap_single_attrs(ring->dev,
> +				       dma_unmap_addr(tx_buffer, dma),
> +				       dma_unmap_len(tx_buffer, len),
> +				       DMA_TO_DEVICE,
> +				       ring->dma_attrs);
>  	}
> 
>  	tx_buffer->next_to_watch = NULL;
> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>  	struct i40e_tx_buffer *tx_buf;
>  	struct i40e_tx_desc *tx_head;
>  	struct i40e_tx_desc *tx_desc;
> +	dma_addr_t addr;
> +	size_t size;
>  	unsigned int total_bytes = 0, total_packets = 0;
>  	unsigned int budget = vsi->work_limit;
> 
> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>  		napi_consume_skb(tx_buf->skb, napi_budget);
> 
>  		/* unmap skb header data */
> -		dma_unmap_single(tx_ring->dev,
> -				 dma_unmap_addr(tx_buf, dma),
> -				 dma_unmap_len(tx_buf, len),
> -				 DMA_TO_DEVICE);
> +		dma_unmap_single_attrs(tx_ring->dev,
> +				       dma_unmap_addr(tx_buf, dma),
> +				       dma_unmap_len(tx_buf, len),
> +				       DMA_TO_DEVICE,
> +				       tx_ring->dma_attrs);
> 
>  		/* clear tx_buffer data */
>  		tx_buf->skb = NULL;
> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>  				tx_desc = I40E_TX_DESC(tx_ring, 0);
>  			}
> 
> +			addr = dma_unmap_addr(tx_buf, dma);
> +			size = dma_unmap_len(tx_buf, len);
>  			/* unmap any remaining paged data */
>  			if (dma_unmap_len(tx_buf, len)) {
> -				dma_unmap_page(tx_ring->dev,
> -					       dma_unmap_addr(tx_buf, dma),
> -					       dma_unmap_len(tx_buf, len),
> -					       DMA_TO_DEVICE);
> +				dma_unmap_single_attrs(tx_ring->dev,
> +						       addr,
> +						       size,
> +						       DMA_TO_DEVICE,
> +						       tx_ring->dma_attrs);
>  				dma_unmap_len_set(tx_buf, len, 0);
>  			}
>  		}
> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>  	 */
>  	tx_ring->size += sizeof(u32);
>  	tx_ring->size = ALIGN(tx_ring->size, 4096);
> +#ifdef CONFIG_SPARC
> +	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> +#else
> +	tx_ring->dma_attrs = 0;
> +#endif
>  	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>  					   &tx_ring->dma, GFP_KERNEL);
>  	if (!tx_ring->desc) {
> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>  		if (!rx_bi->page)
>  			continue;
> 
> -		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
> +		dma_unmap_single_attrs(dev,
> +				       rx_bi->dma,
> +				       PAGE_SIZE,
> +				       DMA_FROM_DEVICE,
> +				       rx_ring->dma_attrs);
>  		__free_pages(rx_bi->page, 0);
> 
>  		rx_bi->page = NULL;
> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>  	/* Round up to nearest 4K */
>  	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>  	rx_ring->size = ALIGN(rx_ring->size, 4096);
> +#ifdef CONFIG_SPARC
> +	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> +#else
> +	rx_ring->dma_attrs = 0;
> +#endif
>  	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>  					   &rx_ring->dma, GFP_KERNEL);
> 
> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>  	}
> 
>  	/* map page for use */
> -	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
> +	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
> +				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
> 
>  	/* if mapping failed free memory back to system since
>  	 * there isn't much point in holding memory we can't use
> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>  		rx_ring->rx_stats.page_reuse_count++;
>  	} else {
>  		/* we are not reusing the buffer so unmap it */
> -		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
> -			       DMA_FROM_DEVICE);
> +		dma_unmap_single_attrs(rx_ring->dev,
> +				       rx_buffer->dma,
> +				       PAGE_SIZE,
> +				       DMA_FROM_DEVICE,
> +				       rx_ring->dma_attrs);
>  	}
> 
>  	/* clear contents of buffer_info */
> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>  	first->skb = skb;
>  	first->tx_flags = tx_flags;
> 
> -	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
> +	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
> +				   DMA_TO_DEVICE, tx_ring->dma_attrs);
> 
>  	tx_desc = I40E_TX_DESC(tx_ring, i);
>  	tx_bi = first;
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 5088405..9a86212 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -327,6 +327,7 @@ struct i40e_ring {
> 
>  	unsigned int size;		/* length of descriptor ring in bytes */
>  	dma_addr_t dma;			/* physical address of ring */
> +	unsigned long dma_attrs;	/* DMA attributes */
> 
>  	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
>  	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
> --
> 1.9.1


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-26 11:39   ` [Intel-wired-lan] " maowenan
@ 2016-12-27 18:32     ` Alexander Duyck
  -1 siblings, 0 replies; 33+ messages in thread
From: Alexander Duyck @ 2016-12-27 18:32 UTC (permalink / raw)
  To: maowenan
  Cc: Tushar Dave, jeffrey.t.kirsher, intel-wired-lan, netdev,
	weiyongjun (A),
	Dingtianhong

On Mon, Dec 26, 2016 at 3:39 AM, maowenan <maowenan@huawei.com> wrote:
>
>
>> -----Original Message-----
>> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
>> On Behalf Of Tushar Dave
>> Sent: Tuesday, December 06, 2016 1:07 AM
>> To: jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
>> Cc: netdev@vger.kernel.org
>> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
>>
>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have standard
>> CSR where PCIe relaxed ordering can be set. Without PCIe relax ordering
>> enabled, i40e performance is significantly low on SPARC.
>>
> [Mao Wenan] Hi Tushar, you mentioned that i40e doesn't seem to have a standard CSR
> for setting PCIe relaxed ordering. Is this CSR something like the TX&RX DCA Control
> Registers in the 82599? And is DMA_ATTR_WEAK_ORDERING equivalent to those TX&RX
> control registers in the 82599?
>
> For reference, the ixgbe driver enables relaxed ordering in the 82599 on SPARC with the code below:
> s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
> {
>         u32 i;
>
>         /* Clear the rate limiters */
>         for (i = 0; i < hw->mac.max_tx_queues; i++) {
>                 IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
>                 IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
>         }
>         IXGBE_WRITE_FLUSH(hw);
>
> #ifndef CONFIG_SPARC
>         /* Disable relaxed ordering */
>         for (i = 0; i < hw->mac.max_tx_queues; i++) {
>                 u32 regval;
>
>                 regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
>                 regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
>                 IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
>         }
>
>         for (i = 0; i < hw->mac.max_rx_queues; i++) {
>                 u32 regval;
>
>                 regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
>                 regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
>                             IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
>                 IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
>         }
> #endif
>         return 0;
> }
>
>
>

He isn't changing anything on the device itself.  The patches that
support this for SPARC were only added recently.  The IOMMU on some
SPARC platforms can indicate memory ordering rules, and by doing this
they get the goodness of relaxed ordering without having true relaxed
ordering enabled on the PCIe bus.
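
As a concrete illustration of that mechanism, here is a minimal sketch of the
pattern the patch follows.  It assumes a kernel where dma_map_single_attrs()
takes an unsigned long attrs argument, as the patch itself does; the helper
name map_rx_page() is invented for this example and is not part of the driver.

#include <linux/dma-mapping.h>
#include <linux/mm.h>

/* DMA_ATTR_WEAK_ORDERING is only a hint to the platform's DMA/IOMMU layer;
 * implementations that don't understand the attribute simply ignore it, so
 * passing it is harmless on platforms without such support.
 */
static dma_addr_t map_rx_page(struct device *dev, struct page *page)
{
	unsigned long attrs = 0;

#ifdef CONFIG_SPARC
	attrs = DMA_ATTR_WEAK_ORDERING;
#endif
	return dma_map_single_attrs(dev, page_address(page), PAGE_SIZE,
				    DMA_FROM_DEVICE, attrs);
}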

>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>> This has shown 10x increase in performance numbers.
>>
>> e.g.
>> iperf TCP test with 10 threads on SPARC S7
>>
>> Test 1: Without this patch
>>
>> [root@brm-snt1-03 net]# iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>
>> Test 2: with this patch:
>>
>> [root@brm-snt1-03 net]# iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending cookies.
>> Check SNMP counters.
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>
>> NOTE: In my testing, this patch does _not_ show any harm to i40e performance
>> numbers on x86.
>>
>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> index 6287bf6..800dca7 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>               else
>>                       dev_kfree_skb_any(tx_buffer->skb);
>>               if (dma_unmap_len(tx_buffer, len))
>> -                     dma_unmap_single(ring->dev,
>> -                                      dma_unmap_addr(tx_buffer, dma),
>> -                                      dma_unmap_len(tx_buffer, len),
>> -                                      DMA_TO_DEVICE);
>> +                     dma_unmap_single_attrs(ring->dev,
>> +                                            dma_unmap_addr(tx_buffer, dma),
>> +                                            dma_unmap_len(tx_buffer, len),
>> +                                            DMA_TO_DEVICE,
>> +                                            ring->dma_attrs);
>>       } else if (dma_unmap_len(tx_buffer, len)) {
>> -             dma_unmap_page(ring->dev,
>> -                            dma_unmap_addr(tx_buffer, dma),
>> -                            dma_unmap_len(tx_buffer, len),
>> -                            DMA_TO_DEVICE);
>> +             dma_unmap_single_attrs(ring->dev,
>> +                                    dma_unmap_addr(tx_buffer, dma),
>> +                                    dma_unmap_len(tx_buffer, len),
>> +                                    DMA_TO_DEVICE,
>> +                                    ring->dma_attrs);
>>       }
>>
>>       tx_buffer->next_to_watch = NULL;
>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>       struct i40e_tx_buffer *tx_buf;
>>       struct i40e_tx_desc *tx_head;
>>       struct i40e_tx_desc *tx_desc;
>> +     dma_addr_t addr;
>> +     size_t size;
>>       unsigned int total_bytes = 0, total_packets = 0;
>>       unsigned int budget = vsi->work_limit;
>>
>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>               napi_consume_skb(tx_buf->skb, napi_budget);
>>
>>               /* unmap skb header data */
>> -             dma_unmap_single(tx_ring->dev,
>> -                              dma_unmap_addr(tx_buf, dma),
>> -                              dma_unmap_len(tx_buf, len),
>> -                              DMA_TO_DEVICE);
>> +             dma_unmap_single_attrs(tx_ring->dev,
>> +                                    dma_unmap_addr(tx_buf, dma),
>> +                                    dma_unmap_len(tx_buf, len),
>> +                                    DMA_TO_DEVICE,
>> +                                    tx_ring->dma_attrs);
>>
>>               /* clear tx_buffer data */
>>               tx_buf->skb = NULL;
>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>                               tx_desc = I40E_TX_DESC(tx_ring, 0);
>>                       }
>>
>> +                     addr = dma_unmap_addr(tx_buf, dma);
>> +                     size = dma_unmap_len(tx_buf, len);
>>                       /* unmap any remaining paged data */
>>                       if (dma_unmap_len(tx_buf, len)) {
>> -                             dma_unmap_page(tx_ring->dev,
>> -                                            dma_unmap_addr(tx_buf, dma),
>> -                                            dma_unmap_len(tx_buf, len),
>> -                                            DMA_TO_DEVICE);
>> +                             dma_unmap_single_attrs(tx_ring->dev,
>> +                                                    addr,
>> +                                                    size,
>> +                                                    DMA_TO_DEVICE,
>> +                                                    tx_ring->dma_attrs);
>>                               dma_unmap_len_set(tx_buf, len, 0);
>>                       }
>>               }
>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>        */
>>       tx_ring->size += sizeof(u32);
>>       tx_ring->size = ALIGN(tx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +     tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +     tx_ring->dma_attrs = 0;
>> +#endif
>>       tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>                                          &tx_ring->dma, GFP_KERNEL);
>>       if (!tx_ring->desc) {
>> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>>               if (!rx_bi->page)
>>                       continue;
>>
>> -             dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
>> +             dma_unmap_single_attrs(dev,
>> +                                    rx_bi->dma,
>> +                                    PAGE_SIZE,
>> +                                    DMA_FROM_DEVICE,
>> +                                    rx_ring->dma_attrs);
>>               __free_pages(rx_bi->page, 0);
>>
>>               rx_bi->page = NULL;
>> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>>       /* Round up to nearest 4K */
>>       rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>>       rx_ring->size = ALIGN(rx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +     rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +     rx_ring->dma_attrs = 0;
>> +#endif
>>       rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>>                                          &rx_ring->dma, GFP_KERNEL);
>>
>> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>>       }
>>
>>       /* map page for use */
>> -     dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
>> +     dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
>> +                                DMA_FROM_DEVICE, rx_ring->dma_attrs);
>>
>>       /* if mapping failed free memory back to system since
>>        * there isn't much point in holding memory we can't use
>> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>>               rx_ring->rx_stats.page_reuse_count++;
>>       } else {
>>               /* we are not reusing the buffer so unmap it */
>> -             dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
>> -                            DMA_FROM_DEVICE);
>> +             dma_unmap_single_attrs(rx_ring->dev,
>> +                                    rx_buffer->dma,
>> +                                    PAGE_SIZE,
>> +                                    DMA_FROM_DEVICE,
>> +                                    rx_ring->dma_attrs);
>>       }
>>
>>       /* clear contents of buffer_info */
>> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>>       first->skb = skb;
>>       first->tx_flags = tx_flags;
>>
>> -     dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
>> +     dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
>> +                                DMA_TO_DEVICE, tx_ring->dma_attrs);
>>
>>       tx_desc = I40E_TX_DESC(tx_ring, i);
>>       tx_bi = first;
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> index 5088405..9a86212 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> @@ -327,6 +327,7 @@ struct i40e_ring {
>>
>>       unsigned int size;              /* length of descriptor ring in bytes */
>>       dma_addr_t dma;                 /* physical address of ring */
>> +     unsigned long dma_attrs;        /* DMA attributes */
>>
>>       struct i40e_vsi *vsi;           /* Backreference to associated VSI */
>>       struct i40e_q_vector *q_vector; /* Backreference to associated vector */
>> --
>> 1.9.1
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-26 11:39   ` [Intel-wired-lan] " maowenan
@ 2016-12-27 22:27     ` tndave
  -1 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-27 22:27 UTC (permalink / raw)
  To: maowenan, jeffrey.t.kirsher, intel-wired-lan
  Cc: netdev, weiyongjun (A), Dingtianhong



On 12/26/2016 03:39 AM, maowenan wrote:
>
>
>> -----Original Message-----
>> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
>> On Behalf Of Tushar Dave
>> Sent: Tuesday, December 06, 2016 1:07 AM
>> To: jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
>> Cc: netdev@vger.kernel.org
>> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
>>
>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have standard
>> CSR where PCIe relaxed ordering can be set. Without PCIe relax ordering
>> enabled, i40e performance is significantly low on SPARC.
>>
> [Mao Wenan] Hi Tushar, you mentioned that i40e doesn't seem to have a standard CSR
> for setting PCIe relaxed ordering. Is this CSR something like the TX&RX DCA Control
> Registers in the 82599?
Yes.
The i40e datasheet mentions some CSRs that can be used to enable/disable PCIe
relaxed ordering in the device; however, I don't see the exact definition of
those registers in the datasheet.
(https://www.mail-archive.com/netdev@vger.kernel.org/msg117219.html).

> And is DMA_ATTR_WEAK_ORDERING equivalent to those TX&RX control registers in
> the 82599?
No.
DMA_ATTR_WEAK_ORDERING applies to the PCIe root complex of the system.

-Tushar
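
To make the distinction concrete, a minimal sketch (not driver code; demo_buf
and the helpers are invented for illustration): the attribute is consumed by
the platform's DMA/IOMMU layer at map time and never touches a NIC register,
and the unmap must pass the same attrs value as the matching map.

#include <linux/dma-mapping.h>
#include <linux/errno.h>

struct demo_buf {
	void *cpu_addr;		/* kernel virtual address of the buffer */
	dma_addr_t dma;		/* bus address returned by the map */
	unsigned long attrs;	/* remembered so the unmap matches the map */
};

static int demo_map(struct device *dev, struct demo_buf *b, size_t len)
{
#ifdef CONFIG_SPARC
	b->attrs = DMA_ATTR_WEAK_ORDERING;	/* hint to the root complex/IOMMU */
#else
	b->attrs = 0;
#endif
	b->dma = dma_map_single_attrs(dev, b->cpu_addr, len,
				      DMA_TO_DEVICE, b->attrs);
	return dma_mapping_error(dev, b->dma) ? -ENOMEM : 0;
}

static void demo_unmap(struct device *dev, struct demo_buf *b, size_t len)
{
	dma_unmap_single_attrs(dev, b->dma, len, DMA_TO_DEVICE, b->attrs);
}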

>
> For reference, the ixgbe driver enables relaxed ordering in the 82599 on SPARC with the code below:
> s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
> {
> 	u32 i;
>
> 	/* Clear the rate limiters */
> 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> 		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
> 		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
> 	}
> 	IXGBE_WRITE_FLUSH(hw);
>
> #ifndef CONFIG_SPARC
> 	/* Disable relaxed ordering */
> 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> 		u32 regval;
>
> 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
> 		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
> 		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
> 	}
>
> 	for (i = 0; i < hw->mac.max_rx_queues; i++) {
> 		u32 regval;
>
> 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
> 		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
> 			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
> 		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
> 	}
> #endif
> 	return 0;
> }
>
>
>
>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>> This has shown 10x increase in performance numbers.
>>
>> e.g.
>> iperf TCP test with 10 threads on SPARC S7
>>
>> Test 1: Without this patch
>>
>> [root@brm-snt1-03 net]# iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
>> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>
>> Test 2: with this patch:
>>
>> [root@brm-snt1-03 net]# iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending cookies.
>> Check SNMP counters.
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>
>> NOTE: In my testing, this patch does _not_ show any harm to i40e performance
>> numbers on x86.
>>
>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> index 6287bf6..800dca7 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>  		else
>>  			dev_kfree_skb_any(tx_buffer->skb);
>>  		if (dma_unmap_len(tx_buffer, len))
>> -			dma_unmap_single(ring->dev,
>> -					 dma_unmap_addr(tx_buffer, dma),
>> -					 dma_unmap_len(tx_buffer, len),
>> -					 DMA_TO_DEVICE);
>> +			dma_unmap_single_attrs(ring->dev,
>> +					       dma_unmap_addr(tx_buffer, dma),
>> +					       dma_unmap_len(tx_buffer, len),
>> +					       DMA_TO_DEVICE,
>> +					       ring->dma_attrs);
>>  	} else if (dma_unmap_len(tx_buffer, len)) {
>> -		dma_unmap_page(ring->dev,
>> -			       dma_unmap_addr(tx_buffer, dma),
>> -			       dma_unmap_len(tx_buffer, len),
>> -			       DMA_TO_DEVICE);
>> +		dma_unmap_single_attrs(ring->dev,
>> +				       dma_unmap_addr(tx_buffer, dma),
>> +				       dma_unmap_len(tx_buffer, len),
>> +				       DMA_TO_DEVICE,
>> +				       ring->dma_attrs);
>>  	}
>>
>>  	tx_buffer->next_to_watch = NULL;
>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>  	struct i40e_tx_buffer *tx_buf;
>>  	struct i40e_tx_desc *tx_head;
>>  	struct i40e_tx_desc *tx_desc;
>> +	dma_addr_t addr;
>> +	size_t size;
>>  	unsigned int total_bytes = 0, total_packets = 0;
>>  	unsigned int budget = vsi->work_limit;
>>
>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>  		napi_consume_skb(tx_buf->skb, napi_budget);
>>
>>  		/* unmap skb header data */
>> -		dma_unmap_single(tx_ring->dev,
>> -				 dma_unmap_addr(tx_buf, dma),
>> -				 dma_unmap_len(tx_buf, len),
>> -				 DMA_TO_DEVICE);
>> +		dma_unmap_single_attrs(tx_ring->dev,
>> +				       dma_unmap_addr(tx_buf, dma),
>> +				       dma_unmap_len(tx_buf, len),
>> +				       DMA_TO_DEVICE,
>> +				       tx_ring->dma_attrs);
>>
>>  		/* clear tx_buffer data */
>>  		tx_buf->skb = NULL;
>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>  				tx_desc = I40E_TX_DESC(tx_ring, 0);
>>  			}
>>
>> +			addr = dma_unmap_addr(tx_buf, dma);
>> +			size = dma_unmap_len(tx_buf, len);
>>  			/* unmap any remaining paged data */
>>  			if (dma_unmap_len(tx_buf, len)) {
>> -				dma_unmap_page(tx_ring->dev,
>> -					       dma_unmap_addr(tx_buf, dma),
>> -					       dma_unmap_len(tx_buf, len),
>> -					       DMA_TO_DEVICE);
>> +				dma_unmap_single_attrs(tx_ring->dev,
>> +						       addr,
>> +						       size,
>> +						       DMA_TO_DEVICE,
>> +						       tx_ring->dma_attrs);
>>  				dma_unmap_len_set(tx_buf, len, 0);
>>  			}
>>  		}
>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>  	 */
>>  	tx_ring->size += sizeof(u32);
>>  	tx_ring->size = ALIGN(tx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +	tx_ring->dma_attrs = 0;
>> +#endif
>>  	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>  					   &tx_ring->dma, GFP_KERNEL);
>>  	if (!tx_ring->desc) {
>> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>>  		if (!rx_bi->page)
>>  			continue;
>>
>> -		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
>> +		dma_unmap_single_attrs(dev,
>> +				       rx_bi->dma,
>> +				       PAGE_SIZE,
>> +				       DMA_FROM_DEVICE,
>> +				       rx_ring->dma_attrs);
>>  		__free_pages(rx_bi->page, 0);
>>
>>  		rx_bi->page = NULL;
>> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>>  	/* Round up to nearest 4K */
>>  	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>>  	rx_ring->size = ALIGN(rx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +	rx_ring->dma_attrs = 0;
>> +#endif
>>  	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>>  					   &rx_ring->dma, GFP_KERNEL);
>>
>> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>>  	}
>>
>>  	/* map page for use */
>> -	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
>> +	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
>> +				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
>>
>>  	/* if mapping failed free memory back to system since
>>  	 * there isn't much point in holding memory we can't use
>> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>>  		rx_ring->rx_stats.page_reuse_count++;
>>  	} else {
>>  		/* we are not reusing the buffer so unmap it */
>> -		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
>> -			       DMA_FROM_DEVICE);
>> +		dma_unmap_single_attrs(rx_ring->dev,
>> +				       rx_buffer->dma,
>> +				       PAGE_SIZE,
>> +				       DMA_FROM_DEVICE,
>> +				       rx_ring->dma_attrs);
>>  	}
>>
>>  	/* clear contents of buffer_info */
>> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>>  	first->skb = skb;
>>  	first->tx_flags = tx_flags;
>>
>> -	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
>> +	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
>> +				   DMA_TO_DEVICE, tx_ring->dma_attrs);
>>
>>  	tx_desc = I40E_TX_DESC(tx_ring, i);
>>  	tx_bi = first;
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> index 5088405..9a86212 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> @@ -327,6 +327,7 @@ struct i40e_ring {
>>
>>  	unsigned int size;		/* length of descriptor ring in bytes */
>>  	dma_addr_t dma;			/* physical address of ring */
>> +	unsigned long dma_attrs;	/* DMA attributes */
>>
>>  	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
>>  	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
>> --
>> 1.9.1
>
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
@ 2016-12-27 22:27     ` tndave
  0 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-27 22:27 UTC (permalink / raw)
  To: intel-wired-lan



On 12/26/2016 03:39 AM, maowenan wrote:
>
>
>> -----Original Message-----
>> From: netdev-owner at vger.kernel.org [mailto:netdev-owner at vger.kernel.org]
>> On Behalf Of Tushar Dave
>> Sent: Tuesday, December 06, 2016 1:07 AM
>> To: jeffrey.t.kirsher at intel.com; intel-wired-lan at lists.osuosl.org
>> Cc: netdev at vger.kernel.org
>> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
>>
>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have standard
>> CSR where PCIe relaxed ordering can be set. Without PCIe relax ordering
>> enabled, i40e performance is significantly low on SPARC.
>>
> [Mao Wenan]Hi Tushar, you have referred to i40e doesn't seem to have standard CSR
> to set PCIe relaxed ordering, this CSR like TX&Rx DCA Control Register in 82599, right?
Yes.
i40e datasheet mentions some CSR that can be used to enable/disable PCIe
relaxed ordering in device; however I don't see the exact definition of
those register in datasheet.
(https://www.mail-archive.com/netdev at vger.kernel.org/msg117219.html).

> Is DMA_ATTR_WEAK_ORDERING the same as TX&RX control register in
> 82599?
No.
DMA_ATTR_WEAK_ORDERING applies to the PCIe root complex of the system.

-Tushar

>
> And to enable relax ordering mode in 82599 for SPARC using below codes:
> s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
> {
> 	u32 i;
>
> 	/* Clear the rate limiters */
> 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> 		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
> 		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
> 	}
> 	IXGBE_WRITE_FLUSH(hw);
>
> #ifndef CONFIG_SPARC
> 	/* Disable relaxed ordering */
> 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> 		u32 regval;
>
> 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
> 		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
> 		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
> 	}
>
> 	for (i = 0; i < hw->mac.max_rx_queues; i++) {
> 		u32 regval;
>
> 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
> 		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
> 			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
> 		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
> 	}
> #endif
> 	return 0;
> }
>
>
>
>> This patch sets PCIe relax ordering for SPARC arch by setting dma attr
>> DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
>> This has shown 10x increase in performance numbers.
>>
>> e.g.
>> iperf TCP test with 10 threads on SPARC S7
>>
>> Test 1: Without this patch
>>
>> [root at brm-snt1-03 net]# iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926 [  5] local
>> 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934 [  6] local 16.0.0.7 port
>> 5001 connected with 16.0.0.1 port 40930 [  7] local 16.0.0.7 port 5001
>> connected with 16.0.0.1 port 40928 [  8] local 16.0.0.7 port 5001 connected
>> with 16.0.0.1 port 40922 [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1
>> port 40932 [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924 [ 14] local
>> 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982 [ 12] local 16.0.0.7 port
>> 5001 connected with 16.0.0.1 port 40980
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
>> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
>> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
>> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
>> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
>> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
>> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
>> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
>> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
>> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
>> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
>>
>> Test 2: with this patch:
>>
>> [root at brm-snt1-03 net]# iperf -s
>> ------------------------------------------------------------
>> Server listening on TCP port 5001
>> TCP window size: 85.3 KByte (default)
>> ------------------------------------------------------------
>> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending cookies.
>> Check SNMP counters.
>> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
>> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
>> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
>> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
>> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
>> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
>> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
>> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
>> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
>> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
>> [ ID] Interval       Transfer     Bandwidth
>> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
>> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
>> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
>> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
>> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
>> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
>> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
>> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
>> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
>> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
>> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
>>
>> NOTE: In my testing, this patch does _not_ show any harm to i40e performance
>> numbers on x86.
>>
>> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
>>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>>  2 files changed, 49 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> index 6287bf6..800dca7 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
>>  		else
>>  			dev_kfree_skb_any(tx_buffer->skb);
>>  		if (dma_unmap_len(tx_buffer, len))
>> -			dma_unmap_single(ring->dev,
>> -					 dma_unmap_addr(tx_buffer, dma),
>> -					 dma_unmap_len(tx_buffer, len),
>> -					 DMA_TO_DEVICE);
>> +			dma_unmap_single_attrs(ring->dev,
>> +					       dma_unmap_addr(tx_buffer, dma),
>> +					       dma_unmap_len(tx_buffer, len),
>> +					       DMA_TO_DEVICE,
>> +					       ring->dma_attrs);
>>  	} else if (dma_unmap_len(tx_buffer, len)) {
>> -		dma_unmap_page(ring->dev,
>> -			       dma_unmap_addr(tx_buffer, dma),
>> -			       dma_unmap_len(tx_buffer, len),
>> -			       DMA_TO_DEVICE);
>> +		dma_unmap_single_attrs(ring->dev,
>> +				       dma_unmap_addr(tx_buffer, dma),
>> +				       dma_unmap_len(tx_buffer, len),
>> +				       DMA_TO_DEVICE,
>> +				       ring->dma_attrs);
>>  	}
>>
>>  	tx_buffer->next_to_watch = NULL;
>> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>  	struct i40e_tx_buffer *tx_buf;
>>  	struct i40e_tx_desc *tx_head;
>>  	struct i40e_tx_desc *tx_desc;
>> +	dma_addr_t addr;
>> +	size_t size;
>>  	unsigned int total_bytes = 0, total_packets = 0;
>>  	unsigned int budget = vsi->work_limit;
>>
>> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>  		napi_consume_skb(tx_buf->skb, napi_budget);
>>
>>  		/* unmap skb header data */
>> -		dma_unmap_single(tx_ring->dev,
>> -				 dma_unmap_addr(tx_buf, dma),
>> -				 dma_unmap_len(tx_buf, len),
>> -				 DMA_TO_DEVICE);
>> +		dma_unmap_single_attrs(tx_ring->dev,
>> +				       dma_unmap_addr(tx_buf, dma),
>> +				       dma_unmap_len(tx_buf, len),
>> +				       DMA_TO_DEVICE,
>> +				       tx_ring->dma_attrs);
>>
>>  		/* clear tx_buffer data */
>>  		tx_buf->skb = NULL;
>> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
>>  				tx_desc = I40E_TX_DESC(tx_ring, 0);
>>  			}
>>
>> +			addr = dma_unmap_addr(tx_buf, dma);
>> +			size = dma_unmap_len(tx_buf, len);
>>  			/* unmap any remaining paged data */
>>  			if (dma_unmap_len(tx_buf, len)) {
>> -				dma_unmap_page(tx_ring->dev,
>> -					       dma_unmap_addr(tx_buf, dma),
>> -					       dma_unmap_len(tx_buf, len),
>> -					       DMA_TO_DEVICE);
>> +				dma_unmap_single_attrs(tx_ring->dev,
>> +						       addr,
>> +						       size,
>> +						       DMA_TO_DEVICE,
>> +						       tx_ring->dma_attrs);
>>  				dma_unmap_len_set(tx_buf, len, 0);
>>  			}
>>  		}
>> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
>>  	 */
>>  	tx_ring->size += sizeof(u32);
>>  	tx_ring->size = ALIGN(tx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +	tx_ring->dma_attrs = 0;
>> +#endif
>>  	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
>>  					   &tx_ring->dma, GFP_KERNEL);
>>  	if (!tx_ring->desc) {
>> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>>  		if (!rx_bi->page)
>>  			continue;
>>
>> -		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
>> +		dma_unmap_single_attrs(dev,
>> +				       rx_bi->dma,
>> +				       PAGE_SIZE,
>> +				       DMA_FROM_DEVICE,
>> +				       rx_ring->dma_attrs);
>>  		__free_pages(rx_bi->page, 0);
>>
>>  		rx_bi->page = NULL;
>> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
>>  	/* Round up to nearest 4K */
>>  	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
>>  	rx_ring->size = ALIGN(rx_ring->size, 4096);
>> +#ifdef CONFIG_SPARC
>> +	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
>> +#else
>> +	rx_ring->dma_attrs = 0;
>> +#endif
>>  	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
>>  					   &rx_ring->dma, GFP_KERNEL);
>>
>> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>>  	}
>>
>>  	/* map page for use */
>> -	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
>> +	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
>> +				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
>>
>>  	/* if mapping failed free memory back to system since
>>  	 * there isn't much point in holding memory we can't use
>> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
>>  		rx_ring->rx_stats.page_reuse_count++;
>>  	} else {
>>  		/* we are not reusing the buffer so unmap it */
>> -		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
>> -			       DMA_FROM_DEVICE);
>> +		dma_unmap_single_attrs(rx_ring->dev,
>> +				       rx_buffer->dma,
>> +				       PAGE_SIZE,
>> +				       DMA_FROM_DEVICE,
>> +				       rx_ring->dma_attrs);
>>  	}
>>
>>  	/* clear contents of buffer_info */
>> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
>>  	first->skb = skb;
>>  	first->tx_flags = tx_flags;
>>
>> -	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
>> +	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
>> +				   DMA_TO_DEVICE, tx_ring->dma_attrs);
>>
>>  	tx_desc = I40E_TX_DESC(tx_ring, i);
>>  	tx_bi = first;
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> index 5088405..9a86212 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> @@ -327,6 +327,7 @@ struct i40e_ring {
>>
>>  	unsigned int size;		/* length of descriptor ring in bytes */
>>  	dma_addr_t dma;			/* physical address of ring */
>> +	unsigned long dma_attrs;	/* DMA attributes */
>>
>>  	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
>>  	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
>> --
>> 1.9.1
>
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-05 21:54   ` Alexander Duyck
@ 2016-12-28  0:23     ` maowenan
  -1 siblings, 0 replies; 33+ messages in thread
From: maowenan @ 2016-12-28  0:23 UTC (permalink / raw)
  To: Alexander Duyck, Tushar Dave; +Cc: Jeff Kirsher, intel-wired-lan, Netdev



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Alexander Duyck
> Sent: Tuesday, December 06, 2016 5:55 AM
> To: Tushar Dave
> Cc: Jeff Kirsher; intel-wired-lan; Netdev
> Subject: Re: [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for
> SPARC
> 
> On Mon, Dec 5, 2016 at 9:07 AM, Tushar Dave <tushar.n.dave@oracle.com>
> wrote:
> > Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
> > standard CSR where PCIe relaxed ordering can be set. Without PCIe
> > relax ordering enabled, i40e performance is significantly low on SPARC.
> >
> > This patch sets PCIe relax ordering for SPARC arch by setting dma attr
> > DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
> > This has shown 10x increase in performance numbers.
> >
> > e.g.
> > iperf TCP test with 10 threads on SPARC S7
> >
> > Test 1: Without this patch
> >
> > [root@brm-snt1-03 net]# iperf -s
> > ------------------------------------------------------------
> > Server listening on TCP port 5001
> > TCP window size: 85.3 KByte (default)
> > ------------------------------------------------------------
> > [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
> > [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
> > [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
> > [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
> > [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
> > [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
> > [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
> > [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
> > [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
> > [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
> > [ ID] Interval       Transfer     Bandwidth
> > [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
> > [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
> > [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
> > [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
> > [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
> > [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
> > [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
> > [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
> > [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
> > [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
> > [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
> >
> > Test 2: with this patch:
> >
> > [root@brm-snt1-03 net]# iperf -s
> > ------------------------------------------------------------
> > Server listening on TCP port 5001
> > TCP window size: 85.3 KByte (default)
> > ------------------------------------------------------------
> > TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
> > cookies.  Check SNMP counters.
> > [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
> > [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
> > [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
> > [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
> > [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
> > [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
> > [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
> > [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
> > [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
> > [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
> > [ ID] Interval       Transfer     Bandwidth
> > [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
> > [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
> > [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
> > [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
> > [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
> > [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
> > [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
> > [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
> > [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
> > [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
> > [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
> >
> > NOTE: In my testing, this patch does _not_ show any harm to i40e
> > performance numbers on x86.
> >
> > Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> 
> You went through and replaced all of the dma_unmap/map_page calls with
> dma_map/unmap_single_attrs.  I would prefer you didn't do that.  I have
> patches to add the ability to map and unmap pages with attributes that should
> be available for 4.10-rc1, so if you could wait on this patch until then it
> would be preferred.
> 
[Mao Wenan] Have you already sent out the related patches? I would like to see
how to enable this capability, so that we can adopt it to configure relaxed
ordering through the DCA control register on the 82599.
Thank you.
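
(For reference: if this is the interface that later became dma_map_page_attrs()/
dma_unmap_page_attrs() in the generic DMA API, usage would look roughly like the
sketch below. This is an assumption based on that API, not code from this thread.)

	dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
				 DMA_FROM_DEVICE, DMA_ATTR_WEAK_ORDERING);
	/* hand the buffer to the hardware and receive into it, then: */
	dma_unmap_page_attrs(rx_ring->dev, dma, PAGE_SIZE,
			     DMA_FROM_DEVICE, DMA_ATTR_WEAK_ORDERING);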

> > ---
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
> >  2 files changed, 49 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > index 6287bf6..800dca7 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
> >                 else
> >                         dev_kfree_skb_any(tx_buffer->skb);
> >                 if (dma_unmap_len(tx_buffer, len))
> > -                       dma_unmap_single(ring->dev,
> > -                                        dma_unmap_addr(tx_buffer, dma),
> > -                                        dma_unmap_len(tx_buffer, len),
> > -                                        DMA_TO_DEVICE);
> > +                       dma_unmap_single_attrs(ring->dev,
> > +                                              dma_unmap_addr(tx_buffer, dma),
> > +                                              dma_unmap_len(tx_buffer, len),
> > +                                              DMA_TO_DEVICE,
> > +                                              ring->dma_attrs);
> >         } else if (dma_unmap_len(tx_buffer, len)) {
> > -               dma_unmap_page(ring->dev,
> > -                              dma_unmap_addr(tx_buffer, dma),
> > -                              dma_unmap_len(tx_buffer, len),
> > -                              DMA_TO_DEVICE);
> > +               dma_unmap_single_attrs(ring->dev,
> > +                                      dma_unmap_addr(tx_buffer, dma),
> > +                                      dma_unmap_len(tx_buffer, len),
> > +                                      DMA_TO_DEVICE,
> > +                                      ring->dma_attrs);
> >         }
> >
> >         tx_buffer->next_to_watch = NULL;
> > @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >         struct i40e_tx_buffer *tx_buf;
> >         struct i40e_tx_desc *tx_head;
> >         struct i40e_tx_desc *tx_desc;
> > +       dma_addr_t addr;
> > +       size_t size;
> >         unsigned int total_bytes = 0, total_packets = 0;
> >         unsigned int budget = vsi->work_limit;
> >
> > @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >                 napi_consume_skb(tx_buf->skb, napi_budget);
> >
> >                 /* unmap skb header data */
> > -               dma_unmap_single(tx_ring->dev,
> > -                                dma_unmap_addr(tx_buf, dma),
> > -                                dma_unmap_len(tx_buf, len),
> > -                                DMA_TO_DEVICE);
> > +               dma_unmap_single_attrs(tx_ring->dev,
> > +                                      dma_unmap_addr(tx_buf, dma),
> > +                                      dma_unmap_len(tx_buf, len),
> > +                                      DMA_TO_DEVICE,
> > +                                      tx_ring->dma_attrs);
> >
> >                 /* clear tx_buffer data */
> >                 tx_buf->skb = NULL;
> > @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >                                 tx_desc = I40E_TX_DESC(tx_ring, 0);
> >                         }
> >
> > +                       addr = dma_unmap_addr(tx_buf, dma);
> > +                       size = dma_unmap_len(tx_buf, len);
> 
> On some architectures this change could lead to issues since dma_unmap_len
> could be 0 meaning that addr would never be used.
> 
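(A sketch of the reshuffle Alexander is pointing at — reading the unmap info
only under the existing length check, so nothing is loaded when the length is
0. Untested, not a hunk from the thread:)

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buf, len)) {
				addr = dma_unmap_addr(tx_buf, dma);
				size = dma_unmap_len(tx_buf, len);
				dma_unmap_single_attrs(tx_ring->dev, addr, size,
						       DMA_TO_DEVICE,
						       tx_ring->dma_attrs);
				dma_unmap_len_set(tx_buf, len, 0);
			}
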
> >                         /* unmap any remaining paged data */
> >                         if (dma_unmap_len(tx_buf, len)) {
> > -                               dma_unmap_page(tx_ring->dev,
> > -                                              dma_unmap_addr(tx_buf, dma),
> > -                                              dma_unmap_len(tx_buf, len),
> > -                                              DMA_TO_DEVICE);
> > +                               dma_unmap_single_attrs(tx_ring->dev,
> > +                                                      addr,
> > +                                                      size,
> > +                                                      DMA_TO_DEVICE,
> > +                                                      tx_ring->dma_attrs);
> >                                 dma_unmap_len_set(tx_buf, len, 0);
> >                         }
> >                 }
> > @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
> >          */
> >         tx_ring->size += sizeof(u32);
> >         tx_ring->size = ALIGN(tx_ring->size, 4096);
> > +#ifdef CONFIG_SPARC
> > +       tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> > +#else
> > +       tx_ring->dma_attrs = 0;
> > +#endif
> >         tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
> >                                            &tx_ring->dma, GFP_KERNEL);
> >         if (!tx_ring->desc) {
> 
> Also not a fan of adding yet another ring attribute.  Is there any reason why you
> couldn't simply add a set of inline functions at the start of i40e_txrx.c that could
> replace the DMA map/unmap operations in this code but pass either 0 or
> DMA_ATTR_WEAK_ORDERING as needed for the drivers?  Then the x86 code
> doesn't have to change while the SPARC code will be able to be passed the
> attribute.
> 
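(A rough sketch of those wrappers — untested, with made-up names, assuming the
_attrs variants of the DMA API:)

static inline dma_addr_t i40e_dma_map(struct i40e_ring *ring, void *ptr,
				      size_t size, enum dma_data_direction dir)
{
#ifdef CONFIG_SPARC
	/* SPARC needs weak ordering for acceptable i40e throughput */
	return dma_map_single_attrs(ring->dev, ptr, size, dir,
				    DMA_ATTR_WEAK_ORDERING);
#else
	return dma_map_single_attrs(ring->dev, ptr, size, dir, 0);
#endif
}

static inline void i40e_dma_unmap(struct i40e_ring *ring, dma_addr_t addr,
				  size_t size, enum dma_data_direction dir)
{
#ifdef CONFIG_SPARC
	dma_unmap_single_attrs(ring->dev, addr, size, dir,
			       DMA_ATTR_WEAK_ORDERING);
#else
	dma_unmap_single_attrs(ring->dev, addr, size, dir, 0);
#endif
}
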
> > @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
> >                 if (!rx_bi->page)
> >                         continue;
> >
> > -               dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
> > +               dma_unmap_single_attrs(dev,
> > +                                      rx_bi->dma,
> > +                                      PAGE_SIZE,
> > +                                      DMA_FROM_DEVICE,
> > +                                      rx_ring->dma_attrs);
> >                 __free_pages(rx_bi->page, 0);
> >
> >                 rx_bi->page = NULL;
> > @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
> >         /* Round up to nearest 4K */
> >         rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
> >         rx_ring->size = ALIGN(rx_ring->size, 4096);
> > +#ifdef CONFIG_SPARC
> > +       rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> > +#else
> > +       rx_ring->dma_attrs = 0;
> > +#endif
> >         rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
> >                                            &rx_ring->dma, GFP_KERNEL);
> >
> > @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
> >         }
> >
> >         /* map page for use */
> > -       dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
> > +       dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
> > +                                  DMA_FROM_DEVICE, rx_ring->dma_attrs);
> >
> >         /* if mapping failed free memory back to system since
> >          * there isn't much point in holding memory we can't use
> > @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
> >                 rx_ring->rx_stats.page_reuse_count++;
> >         } else {
> >                 /* we are not reusing the buffer so unmap it */
> > -               dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
> > -                              DMA_FROM_DEVICE);
> > +               dma_unmap_single_attrs(rx_ring->dev,
> > +                                      rx_buffer->dma,
> > +                                      PAGE_SIZE,
> > +                                      DMA_FROM_DEVICE,
> > +                                      rx_ring->dma_attrs);
> >         }
> >
> >         /* clear contents of buffer_info */
> > @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
> >         first->skb = skb;
> >         first->tx_flags = tx_flags;
> >
> > -       dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
> > +       dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
> > +                                  DMA_TO_DEVICE, tx_ring->dma_attrs);
> >
> >         tx_desc = I40E_TX_DESC(tx_ring, i);
> >         tx_bi = first;
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > index 5088405..9a86212 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > @@ -327,6 +327,7 @@ struct i40e_ring {
> >
> >         unsigned int size;              /* length of descriptor ring in bytes */
> >         dma_addr_t dma;                 /* physical address of ring */
> > +       unsigned long dma_attrs;        /* DMA attributes */
> >
> >         struct i40e_vsi *vsi;           /* Backreference to associated VSI */
> >         struct i40e_q_vector *q_vector; /* Backreference to associated vector */
> > --
> > 1.9.1
> >
> > _______________________________________________
> > Intel-wired-lan mailing list
> > Intel-wired-lan@lists.osuosl.org
> > http://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply	[flat|nested] 33+ messages in thread

* RE: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-27 22:27     ` [Intel-wired-lan] " tndave
@ 2016-12-28  0:40       ` maowenan
  -1 siblings, 0 replies; 33+ messages in thread
From: maowenan @ 2016-12-28  0:40 UTC (permalink / raw)
  To: tndave, jeffrey.t.kirsher, intel-wired-lan
  Cc: netdev, weiyongjun (A), Dingtianhong



> -----Original Message-----
> From: tndave [mailto:tushar.n.dave@oracle.com]
> Sent: Wednesday, December 28, 2016 6:28 AM
> To: maowenan; jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; weiyongjun (A); Dingtianhong
> Subject: Re: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
> 
> 
> 
> On 12/26/2016 03:39 AM, maowenan wrote:
> >
> >
> >> -----Original Message-----
> >> From: netdev-owner@vger.kernel.org
> >> [mailto:netdev-owner@vger.kernel.org]
> >> On Behalf Of Tushar Dave
> >> Sent: Tuesday, December 06, 2016 1:07 AM
> >> To: jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
> >> Cc: netdev@vger.kernel.org
> >> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
> >>
> >> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
> >> standard CSR where PCIe relaxed ordering can be set. Without PCIe
> >> relax ordering enabled, i40e performance is significantly low on SPARC.
> >>
> > [Mao Wenan]Hi Tushar, you have referred to i40e doesn't seem to have
> > standard CSR to set PCIe relaxed ordering, this CSR like TX&Rx DCA Control
> > Register in 82599, right?
> Yes.
> The i40e datasheet mentions some CSRs that can be used to enable/disable PCIe
> relaxed ordering in the device; however, I don't see the exact definition of
> those registers in the datasheet.
> (https://www.mail-archive.com/netdev@vger.kernel.org/msg117219.html).
> 
> > Is DMA_ATTR_WEAK_ORDERING the same as TX&RX control register in
> > 82599?
> No.
> DMA_ATTR_WEAK_ORDERING applies to the PCIe root complex of the system.
> 
> -Tushar

I understand that the PCIe Root Complex is the host bridge in the CPU that
connects the CPU and memory to the PCIe fabric. So the attribute
DMA_ATTR_WEAK_ORDERING is only applied on the CPU side (the SPARC in your
system); it can't be applied on the i40e itself, right?
And it is not the same as the 82599 DCA control register's relaxed ordering bits.
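(In code terms, the two knobs look like this — both mechanisms already shown
in this thread, placed side by side only for illustration:)

	/* 82599, device side: per-queue CSR bit lets the NIC emit relaxed
	 * ordering TLPs for Rx data writes
	 */
	regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
	regval |= IXGBE_DCA_RXCTRL_DATA_WRO_EN;
	IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);

	/* SPARC, host side: per-mapping attribute tells the root
	 * complex/IOMMU that DMA through this mapping may be weakly ordered
	 */
	dma = dma_map_single_attrs(dev, buf, len, DMA_FROM_DEVICE,
				   DMA_ATTR_WEAK_ORDERING);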
-Mao Wenan

> >
> > And to enable relax ordering mode in 82599 for SPARC using below codes:
> > s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw) {
> > 	u32 i;
> >
> > 	/* Clear the rate limiters */
> > 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> > 		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
> > 		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
> > 	}
> > 	IXGBE_WRITE_FLUSH(hw);
> >
> > #ifndef CONFIG_SPARC
> > 	/* Disable relaxed ordering */
> > 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> > 		u32 regval;
> >
> > 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
> > 		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
> > 		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
> > 	}
> >
> > 	for (i = 0; i < hw->mac.max_rx_queues; i++) {
> > 		u32 regval;
> >
> > 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
> > 		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
> > 			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
> > 		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
> > 	}
> > #endif
> > 	return 0;
> > }
> >
> >
> >
> >> This patch sets PCIe relax ordering for SPARC arch by setting dma
> >> attr DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
> >> This has shown 10x increase in performance numbers.
> >>
> >> e.g.
> >> iperf TCP test with 10 threads on SPARC S7
> >>
> >> Test 1: Without this patch
> >>
> >> [root@brm-snt1-03 net]# iperf -s
> >> ------------------------------------------------------------
> >> Server listening on TCP port 5001
> >> TCP window size: 85.3 KByte (default)
> >> ------------------------------------------------------------
> >> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926
> >> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934
> >> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40930
> >> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40928
> >> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40922
> >> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40932
> >> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40920
> >> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40924
> >> [ 14] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982
> >> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40980
> >> [ ID] Interval       Transfer     Bandwidth
> >> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
> >> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
> >> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
> >> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
> >> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
> >> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
> >> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
> >> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
> >> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
> >> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
> >> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
> >>
> >> Test 2: with this patch:
> >>
> >> [root@brm-snt1-03 net]# iperf -s
> >> ------------------------------------------------------------
> >> Server listening on TCP port 5001
> >> TCP window size: 85.3 KByte (default)
> >> ------------------------------------------------------------
> >> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
> cookies.
> >> Check SNMP counters.
> >> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
> >> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
> >> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
> >> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
> >> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
> >> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
> >> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
> >> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
> >> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
> >> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
> >> [ ID] Interval       Transfer     Bandwidth
> >> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
> >> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
> >> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
> >> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
> >> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
> >> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
> >> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
> >> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
> >> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
> >> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
> >> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
> >>
> >> NOTE: In my testing, this patch does _not_ show any harm to i40e
> >> performance numbers on x86.
> >>
> >> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> >> ---
> >>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++---------
> >>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
> >>  2 files changed, 49 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> index 6287bf6..800dca7 100644
> >> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
> >>  		else
> >>  			dev_kfree_skb_any(tx_buffer->skb);
> >>  		if (dma_unmap_len(tx_buffer, len))
> >> -			dma_unmap_single(ring->dev,
> >> -					 dma_unmap_addr(tx_buffer, dma),
> >> -					 dma_unmap_len(tx_buffer, len),
> >> -					 DMA_TO_DEVICE);
> >> +			dma_unmap_single_attrs(ring->dev,
> >> +					       dma_unmap_addr(tx_buffer, dma),
> >> +					       dma_unmap_len(tx_buffer, len),
> >> +					       DMA_TO_DEVICE,
> >> +					       ring->dma_attrs);
> >>  	} else if (dma_unmap_len(tx_buffer, len)) {
> >> -		dma_unmap_page(ring->dev,
> >> -			       dma_unmap_addr(tx_buffer, dma),
> >> -			       dma_unmap_len(tx_buffer, len),
> >> -			       DMA_TO_DEVICE);
> >> +		dma_unmap_single_attrs(ring->dev,
> >> +				       dma_unmap_addr(tx_buffer, dma),
> >> +				       dma_unmap_len(tx_buffer, len),
> >> +				       DMA_TO_DEVICE,
> >> +				       ring->dma_attrs);
> >>  	}
> >>
> >>  	tx_buffer->next_to_watch = NULL;
> >> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >>  	struct i40e_tx_buffer *tx_buf;
> >>  	struct i40e_tx_desc *tx_head;
> >>  	struct i40e_tx_desc *tx_desc;
> >> +	dma_addr_t addr;
> >> +	size_t size;
> >>  	unsigned int total_bytes = 0, total_packets = 0;
> >>  	unsigned int budget = vsi->work_limit;
> >>
> >> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >>  		napi_consume_skb(tx_buf->skb, napi_budget);
> >>
> >>  		/* unmap skb header data */
> >> -		dma_unmap_single(tx_ring->dev,
> >> -				 dma_unmap_addr(tx_buf, dma),
> >> -				 dma_unmap_len(tx_buf, len),
> >> -				 DMA_TO_DEVICE);
> >> +		dma_unmap_single_attrs(tx_ring->dev,
> >> +				       dma_unmap_addr(tx_buf, dma),
> >> +				       dma_unmap_len(tx_buf, len),
> >> +				       DMA_TO_DEVICE,
> >> +				       tx_ring->dma_attrs);
> >>
> >>  		/* clear tx_buffer data */
> >>  		tx_buf->skb = NULL;
> >> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >>  				tx_desc = I40E_TX_DESC(tx_ring, 0);
> >>  			}
> >>
> >> +			addr = dma_unmap_addr(tx_buf, dma);
> >> +			size = dma_unmap_len(tx_buf, len);
> >>  			/* unmap any remaining paged data */
> >>  			if (dma_unmap_len(tx_buf, len)) {
> >> -				dma_unmap_page(tx_ring->dev,
> >> -					       dma_unmap_addr(tx_buf, dma),
> >> -					       dma_unmap_len(tx_buf, len),
> >> -					       DMA_TO_DEVICE);
> >> +				dma_unmap_single_attrs(tx_ring->dev,
> >> +						       addr,
> >> +						       size,
> >> +						       DMA_TO_DEVICE,
> >> +						       tx_ring->dma_attrs);
> >>  				dma_unmap_len_set(tx_buf, len, 0);
> >>  			}
> >>  		}
> >> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
> >>  	 */
> >>  	tx_ring->size += sizeof(u32);
> >>  	tx_ring->size = ALIGN(tx_ring->size, 4096);
> >> +#ifdef CONFIG_SPARC
> >> +	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> >> +#else
> >> +	tx_ring->dma_attrs = 0;
> >> +#endif
> >>  	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
> >>  					   &tx_ring->dma, GFP_KERNEL);
> >>  	if (!tx_ring->desc) {
> >> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
> >>  		if (!rx_bi->page)
> >>  			continue;
> >>
> >> -		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
> >> +		dma_unmap_single_attrs(dev,
> >> +				       rx_bi->dma,
> >> +				       PAGE_SIZE,
> >> +				       DMA_FROM_DEVICE,
> >> +				       rx_ring->dma_attrs);
> >>  		__free_pages(rx_bi->page, 0);
> >>
> >>  		rx_bi->page = NULL;
> >> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
> >>  	/* Round up to nearest 4K */
> >>  	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
> >>  	rx_ring->size = ALIGN(rx_ring->size, 4096);
> >> +#ifdef CONFIG_SPARC
> >> +	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> >> +#else
> >> +	rx_ring->dma_attrs = 0;
> >> +#endif
> >>  	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
> >>  					   &rx_ring->dma, GFP_KERNEL);
> >>
> >> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
> >>  	}
> >>
> >>  	/* map page for use */
> >> -	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
> >> +	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
> >> +				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
> >>
> >>  	/* if mapping failed free memory back to system since
> >>  	 * there isn't much point in holding memory we can't use
> >> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
> >>  		rx_ring->rx_stats.page_reuse_count++;
> >>  	} else {
> >>  		/* we are not reusing the buffer so unmap it */
> >> -		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
> >> -			       DMA_FROM_DEVICE);
> >> +		dma_unmap_single_attrs(rx_ring->dev,
> >> +				       rx_buffer->dma,
> >> +				       PAGE_SIZE,
> >> +				       DMA_FROM_DEVICE,
> >> +				       rx_ring->dma_attrs);
> >>  	}
> >>
> >>  	/* clear contents of buffer_info */
> >> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
> >>  	first->skb = skb;
> >>  	first->tx_flags = tx_flags;
> >>
> >> -	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
> >> +	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
> >> +				   DMA_TO_DEVICE, tx_ring->dma_attrs);
> >>
> >>  	tx_desc = I40E_TX_DESC(tx_ring, i);
> >>  	tx_bi = first;
> >> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> index 5088405..9a86212 100644
> >> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> @@ -327,6 +327,7 @@ struct i40e_ring {
> >>
> >>  	unsigned int size;		/* length of descriptor ring in bytes */
> >>  	dma_addr_t dma;			/* physical address of ring */
> >> +	unsigned long dma_attrs;	/* DMA attributes */
> >>
> >>  	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
> >>  	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
> >> --
> >> 1.9.1
> >
> >

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [Intel-wired-lan] [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
@ 2016-12-28  0:40       ` maowenan
  0 siblings, 0 replies; 33+ messages in thread
From: maowenan @ 2016-12-28  0:40 UTC (permalink / raw)
  To: intel-wired-lan



> -----Original Message-----
> From: tndave [mailto:tushar.n.dave at oracle.com]
> Sent: Wednesday, December 28, 2016 6:28 AM
> To: maowenan; jeffrey.t.kirsher at intel.com; intel-wired-lan at lists.osuosl.org
> Cc: netdev at vger.kernel.org; weiyongjun (A); Dingtianhong
> Subject: Re: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
> 
> 
> 
> On 12/26/2016 03:39 AM, maowenan wrote:
> >
> >
> >> -----Original Message-----
> >> From: netdev-owner at vger.kernel.org
> >> [mailto:netdev-owner at vger.kernel.org]
> >> On Behalf Of Tushar Dave
> >> Sent: Tuesday, December 06, 2016 1:07 AM
> >> To: jeffrey.t.kirsher at intel.com; intel-wired-lan at lists.osuosl.org
> >> Cc: netdev at vger.kernel.org
> >> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
> >>
> >> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
> >> standard CSR where PCIe relaxed ordering can be set. Without PCIe
> >> relax ordering enabled, i40e performance is significantly low on SPARC.
> >>
> > [Mao Wenan]Hi Tushar, you have referred to i40e doesn't seem to have
> > standard CSR to set PCIe relaxed ordering, this CSR like TX&Rx DCA Control
> Register in 82599, right?
> Yes.
> i40e datasheet mentions some CSR that can be used to enable/disable PCIe
> relaxed ordering in device; however I don't see the exact definition of those
> register in datasheet.
> (https://www.mail-archive.com/netdev at vger.kernel.org/msg117219.html).
> 
> > Is DMA_ATTR_WEAK_ORDERING the same as TX&RX control register in
> 82599?
> No.
> DMA_ATTR_WEAK_ORDERING applies to the PCIe root complex of the system.
> 
> -Tushar

I understand that the PCIe Root Complex is the Host Bridge in the CPU that
connects the CPU and memory to the PCIe architecture. So this attribute 
DMA_ATTR_WEAK_ORDERING is only applied on CPU side(the SPARC in you 
system), it can't apply on i40e, is it right? 
And it is not the same as 82599 DCA control register's relax ordering bits.
-Mao Wenan

> >
> > And to enable relax ordering mode in 82599 for SPARC using below codes:
> > s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw) {
> > 	u32 i;
> >
> > 	/* Clear the rate limiters */
> > 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> > 		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
> > 		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
> > 	}
> > 	IXGBE_WRITE_FLUSH(hw);
> >
> > #ifndef CONFIG_SPARC
> > 	/* Disable relaxed ordering */
> > 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
> > 		u32 regval;
> >
> > 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
> > 		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
> > 		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
> > 	}
> >
> > 	for (i = 0; i < hw->mac.max_rx_queues; i++) {
> > 		u32 regval;
> >
> > 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
> > 		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
> > 			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
> > 		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
> > 	}
> > #endif
> > 	return 0;
> > }
> >
> >
> >
> >> This patch sets PCIe relax ordering for SPARC arch by setting dma
> >> attr DMA_ATTR_WEAK_ORDERING for every tx and rx DMA map/unmap.
> >> This has shown 10x increase in performance numbers.
> >>
> >> e.g.
> >> iperf TCP test with 10 threads on SPARC S7
> >>
> >> Test 1: Without this patch
> >>
> >> [root at brm-snt1-03 net]# iperf -s
> >> ------------------------------------------------------------
> >> Server listening on TCP port 5001
> >> TCP window size: 85.3 KByte (default)
> >> ------------------------------------------------------------
> >> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 40926 [
> >> 5] local
> >> 16.0.0.7 port 5001 connected with 16.0.0.1 port 40934 [  6] local
> >> 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40930 [  7] local 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40928 [  8] local 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40922 [  9] local 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40932 [ 10] local 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40920 [ 11] local 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40924 [ 14] local
> >> 16.0.0.7 port 5001 connected with 16.0.0.1 port 40982 [ 12] local
> >> 16.0.0.7 port
> >> 5001 connected with 16.0.0.1 port 40980
> >> [ ID] Interval       Transfer     Bandwidth
> >> [  4]  0.0-20.0 sec   566 MBytes   237 Mbits/sec
> >> [  5]  0.0-20.0 sec   532 MBytes   223 Mbits/sec
> >> [  6]  0.0-20.0 sec   537 MBytes   225 Mbits/sec
> >> [  8]  0.0-20.0 sec   546 MBytes   229 Mbits/sec
> >> [ 11]  0.0-20.0 sec   592 MBytes   248 Mbits/sec
> >> [  7]  0.0-20.0 sec   539 MBytes   226 Mbits/sec
> >> [  9]  0.0-20.0 sec   572 MBytes   240 Mbits/sec
> >> [ 10]  0.0-20.0 sec   604 MBytes   253 Mbits/sec
> >> [ 14]  0.0-20.0 sec   567 MBytes   238 Mbits/sec
> >> [ 12]  0.0-20.0 sec   511 MBytes   214 Mbits/sec
> >> [SUM]  0.0-20.0 sec  5.44 GBytes  2.33 Gbits/sec
> >>
> >> Test 2: with this patch:
> >>
> >> [root at brm-snt1-03 net]# iperf -s
> >> ------------------------------------------------------------
> >> Server listening on TCP port 5001
> >> TCP window size: 85.3 KByte (default)
> >> ------------------------------------------------------------
> >> TCP: request_sock_TCP: Possible SYN flooding on port 5001. Sending
> >> cookies.  Check SNMP counters.
> >> [  4] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46876
> >> [  5] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46874
> >> [  6] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46872
> >> [  7] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46880
> >> [  8] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46878
> >> [  9] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46884
> >> [ 10] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46886
> >> [ 11] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46890
> >> [ 12] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46888
> >> [ 13] local 16.0.0.7 port 5001 connected with 16.0.0.1 port 46882
> >> [ ID] Interval       Transfer     Bandwidth
> >> [  4]  0.0-20.0 sec  7.45 GBytes  3.19 Gbits/sec
> >> [  5]  0.0-20.0 sec  7.48 GBytes  3.21 Gbits/sec
> >> [  7]  0.0-20.0 sec  7.34 GBytes  3.15 Gbits/sec
> >> [  8]  0.0-20.0 sec  7.42 GBytes  3.18 Gbits/sec
> >> [  9]  0.0-20.0 sec  7.24 GBytes  3.11 Gbits/sec
> >> [ 10]  0.0-20.0 sec  7.40 GBytes  3.17 Gbits/sec
> >> [ 12]  0.0-20.0 sec  7.49 GBytes  3.21 Gbits/sec
> >> [  6]  0.0-20.0 sec  7.30 GBytes  3.13 Gbits/sec
> >> [ 11]  0.0-20.0 sec  7.44 GBytes  3.19 Gbits/sec
> >> [ 13]  0.0-20.0 sec  7.22 GBytes  3.10 Gbits/sec
> >> [SUM]  0.0-20.0 sec  73.8 GBytes  31.6 Gbits/sec
> >>
> >> NOTE: In my testing, this patch does _not_ show any harm to i40e
> >> performance numbers on x86.
> >>
> >> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> >> ---
> >>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++---------------------
> >>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
> >>  2 files changed, 49 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> index 6287bf6..800dca7 100644
> >> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> >> @@ -551,15 +551,17 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
> >>  		else
> >>  			dev_kfree_skb_any(tx_buffer->skb);
> >>  		if (dma_unmap_len(tx_buffer, len))
> >> -			dma_unmap_single(ring->dev,
> >> -					 dma_unmap_addr(tx_buffer, dma),
> >> -					 dma_unmap_len(tx_buffer, len),
> >> -					 DMA_TO_DEVICE);
> >> +			dma_unmap_single_attrs(ring->dev,
> >> +					       dma_unmap_addr(tx_buffer, dma),
> >> +					       dma_unmap_len(tx_buffer, len),
> >> +					       DMA_TO_DEVICE,
> >> +					       ring->dma_attrs);
> >>  	} else if (dma_unmap_len(tx_buffer, len)) {
> >> -		dma_unmap_page(ring->dev,
> >> -			       dma_unmap_addr(tx_buffer, dma),
> >> -			       dma_unmap_len(tx_buffer, len),
> >> -			       DMA_TO_DEVICE);
> >> +		dma_unmap_single_attrs(ring->dev,
> >> +				       dma_unmap_addr(tx_buffer, dma),
> >> +				       dma_unmap_len(tx_buffer, len),
> >> +				       DMA_TO_DEVICE,
> >> +				       ring->dma_attrs);
> >>  	}
> >>
> >>  	tx_buffer->next_to_watch = NULL;
> >> @@ -662,6 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >>  	struct i40e_tx_buffer *tx_buf;
> >>  	struct i40e_tx_desc *tx_head;
> >>  	struct i40e_tx_desc *tx_desc;
> >> +	dma_addr_t addr;
> >> +	size_t size;
> >>  	unsigned int total_bytes = 0, total_packets = 0;
> >>  	unsigned int budget = vsi->work_limit;
> >>
> >> @@ -696,10 +700,11 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >>  		napi_consume_skb(tx_buf->skb, napi_budget);
> >>
> >>  		/* unmap skb header data */
> >> -		dma_unmap_single(tx_ring->dev,
> >> -				 dma_unmap_addr(tx_buf, dma),
> >> -				 dma_unmap_len(tx_buf, len),
> >> -				 DMA_TO_DEVICE);
> >> +		dma_unmap_single_attrs(tx_ring->dev,
> >> +				       dma_unmap_addr(tx_buf, dma),
> >> +				       dma_unmap_len(tx_buf, len),
> >> +				       DMA_TO_DEVICE,
> >> +				       tx_ring->dma_attrs);
> >>
> >>  		/* clear tx_buffer data */
> >>  		tx_buf->skb = NULL;
> >> @@ -717,12 +722,15 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
> >>  				tx_desc = I40E_TX_DESC(tx_ring, 0);
> >>  			}
> >>
> >> +			addr = dma_unmap_addr(tx_buf, dma);
> >> +			size = dma_unmap_len(tx_buf, len);
> >>  			/* unmap any remaining paged data */
> >>  			if (dma_unmap_len(tx_buf, len)) {
> >> -				dma_unmap_page(tx_ring->dev,
> >> -					       dma_unmap_addr(tx_buf, dma),
> >> -					       dma_unmap_len(tx_buf, len),
> >> -					       DMA_TO_DEVICE);
> >> +				dma_unmap_single_attrs(tx_ring->dev,
> >> +						       addr,
> >> +						       size,
> >> +						       DMA_TO_DEVICE,
> >> +						       tx_ring->dma_attrs);
> >>  				dma_unmap_len_set(tx_buf, len, 0);
> >>  			}
> >>  		}
> >> @@ -1010,6 +1018,11 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
> >>  	 */
> >>  	tx_ring->size += sizeof(u32);
> >>  	tx_ring->size = ALIGN(tx_ring->size, 4096);
> >> +#ifdef CONFIG_SPARC
> >> +	tx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> >> +#else
> >> +	tx_ring->dma_attrs = 0;
> >> +#endif
> >>  	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
> >>  					   &tx_ring->dma, GFP_KERNEL);
> >>  	if (!tx_ring->desc) {
> >> @@ -1053,7 +1066,11 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
> >>  		if (!rx_bi->page)
> >>  			continue;
> >>
> >> -		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
> >> +		dma_unmap_single_attrs(dev,
> >> +				       rx_bi->dma,
> >> +				       PAGE_SIZE,
> >> +				       DMA_FROM_DEVICE,
> >> +				       rx_ring->dma_attrs);
> >>  		__free_pages(rx_bi->page, 0);
> >>
> >>  		rx_bi->page = NULL;
> >> @@ -1113,6 +1130,11 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
> >>  	/* Round up to nearest 4K */
> >>  	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
> >>  	rx_ring->size = ALIGN(rx_ring->size, 4096);
> >> +#ifdef CONFIG_SPARC
> >> +	rx_ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
> >> +#else
> >> +	rx_ring->dma_attrs = 0;
> >> +#endif
> >>  	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
> >>  					   &rx_ring->dma, GFP_KERNEL);
> >>
> >> @@ -1182,7 +1204,8 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
> >>  	}
> >>
> >>  	/* map page for use */
> >> -	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
> >> +	dma = dma_map_single_attrs(rx_ring->dev, page_address(page), PAGE_SIZE,
> >> +				   DMA_FROM_DEVICE, rx_ring->dma_attrs);
> >>
> >>  	/* if mapping failed free memory back to system since
> >>  	 * there isn't much point in holding memory we can't use
> >> @@ -1695,8 +1718,11 @@ struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
> >>  		rx_ring->rx_stats.page_reuse_count++;
> >>  	} else {
> >>  		/* we are not reusing the buffer so unmap it */
> >> -		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
> >> -			       DMA_FROM_DEVICE);
> >> +		dma_unmap_single_attrs(rx_ring->dev,
> >> +				       rx_buffer->dma,
> >> +				       PAGE_SIZE,
> >> +				       DMA_FROM_DEVICE,
> >> +				       rx_ring->dma_attrs);
> >>  	}
> >>
> >>  	/* clear contents of buffer_info */
> >> @@ -2737,7 +2763,8 @@ static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
> >>  	first->skb = skb;
> >>  	first->tx_flags = tx_flags;
> >>
> >> -	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
> >> +	dma = dma_map_single_attrs(tx_ring->dev, skb->data, size,
> >> +				   DMA_TO_DEVICE, tx_ring->dma_attrs);
> >>
> >>  	tx_desc = I40E_TX_DESC(tx_ring, i);
> >>  	tx_bi = first;
> >> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> index 5088405..9a86212 100644
> >> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> >> @@ -327,6 +327,7 @@ struct i40e_ring {
> >>
> >>  	unsigned int size;		/* length of descriptor ring in bytes */
> >>  	dma_addr_t dma;			/* physical address of ring */
> >> +	unsigned long dma_attrs;	/* DMA attributes */
> >>
> >>  	struct i40e_vsi *vsi;		/* Backreference to associated VSI */
> >>  	struct i40e_q_vector *q_vector;	/* Backreference to associated vector */
> >> --
> >> 1.9.1
> >
> >

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
  2016-12-28  0:40       ` [Intel-wired-lan] " maowenan
@ 2016-12-28 21:55         ` tndave
  -1 siblings, 0 replies; 33+ messages in thread
From: tndave @ 2016-12-28 21:55 UTC (permalink / raw)
  To: maowenan, jeffrey.t.kirsher, intel-wired-lan
  Cc: netdev, weiyongjun (A), Dingtianhong



On 12/27/2016 04:40 PM, maowenan wrote:
>
>
>> -----Original Message-----
>> From: tndave [mailto:tushar.n.dave@oracle.com]
>> Sent: Wednesday, December 28, 2016 6:28 AM
>> To: maowenan; jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
>> Cc: netdev@vger.kernel.org; weiyongjun (A); Dingtianhong
>> Subject: Re: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
>>
>>
>>
>> On 12/26/2016 03:39 AM, maowenan wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: netdev-owner@vger.kernel.org
>>>> [mailto:netdev-owner@vger.kernel.org]
>>>> On Behalf Of Tushar Dave
>>>> Sent: Tuesday, December 06, 2016 1:07 AM
>>>> To: jeffrey.t.kirsher@intel.com; intel-wired-lan@lists.osuosl.org
>>>> Cc: netdev@vger.kernel.org
>>>> Subject: [RFC PATCH] i40e: enable PCIe relax ordering for SPARC
>>>>
>>>> Unlike previous generation NIC (e.g. ixgbe) i40e doesn't seem to have
>>>> standard CSR where PCIe relaxed ordering can be set. Without PCIe
>>>> relax ordering enabled, i40e performance is significantly low on SPARC.
>>>>
>>> [Mao Wenan] Hi Tushar, you mentioned that i40e doesn't seem to have a
>>> standard CSR to set PCIe relaxed ordering; is this CSR like the TX & RX
>>> DCA Control Registers in the 82599?
>> Yes.
>> The i40e datasheet mentions some CSRs that can be used to enable/disable
>> PCIe relaxed ordering in the device; however, I don't see the exact
>> definition of those registers in the datasheet.
>> (https://www.mail-archive.com/netdev@vger.kernel.org/msg117219.html)
>>
>>> Is DMA_ATTR_WEAK_ORDERING the same as the TX & RX DCA control registers
>>> in the 82599?
>> No.
>> DMA_ATTR_WEAK_ORDERING applies to the PCIe root complex of the system.
>>
>> -Tushar
>
> I understand that the PCIe Root Complex is the host bridge that connects
> the CPU and memory to the PCIe fabric. So this attribute,
> DMA_ATTR_WEAK_ORDERING, is only applied on the CPU side (the SPARC in your
> system); it can't apply to the i40e itself, is that right?
Yes.
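
(To be precise: whether the endpoint is even allowed to mark its TLPs with
the relaxed-ordering attribute is governed by the standard Enable Relaxed
Ordering bit in the PCIe Device Control register. A minimal sketch of
checking that bit from a driver; "pdev" here is a hypothetical
struct pci_dev * for the NIC, and <linux/pci.h> is assumed:

	u16 devctl;

	/* read Device Control from the device's PCIe capability */
	pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &devctl);
	if (devctl & PCI_EXP_DEVCTL_RELAX_EN)
		dev_info(&pdev->dev, "endpoint may set RO in its TLPs\n");

That bit lives on the device side; DMA_ATTR_WEAK_ORDERING, by contrast, is
honored by the IOMMU/root complex when the mapping is created.)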
> And it is not the same as the 82599 DCA control registers' relaxed ordering bits.
Correct, it is not the same as the 82599 DCA control registers' relaxed
ordering bits.
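
To make the contrast concrete, here is a minimal sketch of the CPU-side
approach this patch takes. It assumes the unsigned long dma_attrs field
this patch adds to struct i40e_ring and the *_attrs DMA API that takes an
unsigned long attrs argument; "buf" and "len" stand in for a real driver
buffer:

	dma_addr_t dma;

	/* chosen once at ring setup: weak ordering only on SPARC */
#ifdef CONFIG_SPARC
	ring->dma_attrs = DMA_ATTR_WEAK_ORDERING;
#else
	ring->dma_attrs = 0;
#endif

	/* every map/unmap on the ring then carries the same attributes */
	dma = dma_map_single_attrs(ring->dev, buf, len, DMA_TO_DEVICE,
				   ring->dma_attrs);
	if (dma_mapping_error(ring->dev, dma))
		return -ENOMEM;
	/* ... device DMA happens here ... */
	dma_unmap_single_attrs(ring->dev, dma, len, DMA_TO_DEVICE,
			       ring->dma_attrs);

On non-SPARC builds the attribute is 0, so the mapping behaves exactly as
before.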

-Tushar

> -Mao Wenan
>
>>>
>>> And to enable relaxed ordering mode in the 82599 for SPARC, the code
>>> below is used:
>>> s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
>>> {
>>> 	u32 i;
>>>
>>> 	/* Clear the rate limiters */
>>> 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
>>> 		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, i);
>>> 		IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, 0);
>>> 	}
>>> 	IXGBE_WRITE_FLUSH(hw);
>>>
>>> #ifndef CONFIG_SPARC
>>> 	/* Disable relaxed ordering */
>>> 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
>>> 		u32 regval;
>>>
>>> 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
>>> 		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
>>> 		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
>>> 	}
>>>
>>> 	for (i = 0; i < hw->mac.max_rx_queues; i++) {
>>> 		u32 regval;
>>>
>>> 		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
>>> 		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
>>> 			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
>>> 		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
>>> 	}
>>> #endif
>>> 	return 0;
>>> }
>>>
>>>
>>>
>>>> [quoted patch body and iperf results snipped; identical to the copy
>>>> quoted in full earlier in the thread]
>>>
>>>
>

^ permalink raw reply	[flat|nested] 33+ messages in thread


end of thread, other threads:[~2016-12-28 21:57 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-05 17:07 [RFC PATCH] i40e: enable PCIe relax ordering for SPARC Tushar Dave
2016-12-05 17:07 ` [Intel-wired-lan] " Tushar Dave
2016-12-05 21:54 ` Alexander Duyck
2016-12-05 21:54   ` Alexander Duyck
2016-12-05 22:23   ` tndave
2016-12-05 22:23     ` tndave
2016-12-06 17:10     ` Alexander Duyck
2016-12-06 17:10       ` Alexander Duyck
2016-12-06 22:04       ` tndave
2016-12-06 22:04         ` tndave
2016-12-08 10:43       ` David Laight
2016-12-08 10:43         ` David Laight
2016-12-08 16:05         ` Alexander Duyck
2016-12-08 16:05           ` Alexander Duyck
2016-12-09  0:45           ` tndave
2016-12-09  0:45             ` tndave
2016-12-09  1:16             ` tndave
2016-12-09  1:16               ` tndave
2016-12-09  1:16               ` tndave
2016-12-08 10:31   ` David Laight
2016-12-08 10:31     ` David Laight
2016-12-28  0:23   ` maowenan
2016-12-28  0:23     ` maowenan
2016-12-26 11:39 ` maowenan
2016-12-26 11:39   ` [Intel-wired-lan] " maowenan
2016-12-27 18:32   ` Alexander Duyck
2016-12-27 18:32     ` Alexander Duyck
2016-12-27 22:27   ` tndave
2016-12-27 22:27     ` [Intel-wired-lan] " tndave
2016-12-28  0:40     ` maowenan
2016-12-28  0:40       ` [Intel-wired-lan] " maowenan
2016-12-28 21:55       ` tndave
2016-12-28 21:55         ` [Intel-wired-lan] " tndave
