* [PATCH net-next 0/5] tsnep: XDP socket zero-copy support
@ 2023-04-02 19:38 Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 1/5] tsnep: Rework TX/RX queue initialization Gerhard Engleder
                   ` (4 more replies)
  0 siblings, 5 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-02 19:38 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, kuba, edumazet, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, Gerhard Engleder

Implement XDP socket zero-copy support for the tsnep driver. I tried to
follow existing drivers like igc as far as possible, but one main
difference is that tsnep does not need any reconfiguration for XDP BPF
program setup. So I decided to keep this behavior regardless of whether
an XSK pool is used or not. As a result, tsnep starts using the XSK pool
even if no XDP BPF program is attached.
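
As an illustration of that behavior, a minimal userspace sketch that binds
an AF_XDP socket in zero-copy mode without attaching any XDP program could
look roughly like this (untested; assumes the libxdp xsk helpers, interface
name "eth0" and queue 0 are placeholders):

#include <xdp/xsk.h>

static struct xsk_ring_prod fill_ring, tx_ring;
static struct xsk_ring_cons comp_ring, rx_ring;
static struct xsk_socket *xsk;
static struct xsk_umem *umem;

static int bind_zc_socket(void *umem_area, __u64 size)
{
	struct xsk_socket_config cfg = {
		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
		/* do not let libxdp attach its default XDP program */
		.libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD,
		.bind_flags = XDP_ZEROCOPY,
	};
	int err;

	err = xsk_umem__create(&umem, umem_area, size, &fill_ring,
			       &comp_ring, NULL);
	if (err)
		return err;

	/* queue 0 of the tsnep netdev, no XDP program required */
	return xsk_socket__create(&xsk, "eth0", 0, umem, &rx_ring, &tx_ring,
				  &cfg);
}

Without an XDP program redirecting to an XSKMAP, received frames are still
delivered to the regular stack (copied out of the XSK buffers by the
driver); the point is only that binding the pool does not require a
program.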

Another difference is that I tried to avoid potentially failing
allocations during XSK pool setup. For example, the memory models for
both the page pool and the XSK pool are registered all the time. Thus,
XSK pool setup cannot end up with non-working queues.

Some prework is done so that the last two commits contain only the
actual XSK changes.

Gerhard Engleder (5):
  tsnep: Rework TX/RX queue initialization
  tsnep: Add functions for queue enable/disable
  tsnep: Move skb receive action to separate function
  tsnep: Add XDP socket zero-copy RX support
  tsnep: Add XDP socket zero-copy TX support

 drivers/net/ethernet/engleder/tsnep.h      |   9 +
 drivers/net/ethernet/engleder/tsnep_main.c | 770 ++++++++++++++++++---
 drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++
 3 files changed, 738 insertions(+), 108 deletions(-)

-- 
2.30.2


* [PATCH net-next 1/5] tsnep: Rework TX/RX queue initialization
  2023-04-02 19:38 [PATCH net-next 0/5] tsnep: XDP socket zero-copy support Gerhard Engleder
@ 2023-04-02 19:38 ` Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 2/5] tsnep: Add functions for queue enable/disable Gerhard Engleder
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-02 19:38 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, kuba, edumazet, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, Gerhard Engleder

Make initialization of TX and RX queues less dynamic by moving some
initialization from netdev open/close to device probing.

Additionally, move some initialization code to separate functions to
enable future use in other execution paths.

This is done as preparation for queue reconfiguration at runtime, which
is necessary for XSK zero-copy support.
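
After this patch the split is roughly:

  probe: tsnep_queue_init()        - assigns adapter, addr and queue_index once
  open:  tsnep_tx/rx_ring_create() - allocates the descriptor rings
         tsnep_tx/rx_init()        - programs the ring base address and resets
                                     read/write pointers and the owner counter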

Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
---
 drivers/net/ethernet/engleder/tsnep_main.c | 90 ++++++++++++----------
 1 file changed, 49 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index ed1b6102cfeb..f2162a25e97c 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -265,7 +265,7 @@ static void tsnep_tx_ring_cleanup(struct tsnep_tx *tx)
 	}
 }
 
-static int tsnep_tx_ring_init(struct tsnep_tx *tx)
+static int tsnep_tx_ring_create(struct tsnep_tx *tx)
 {
 	struct device *dmadev = tx->adapter->dmadev;
 	struct tsnep_tx_entry *entry;
@@ -288,6 +288,7 @@ static int tsnep_tx_ring_init(struct tsnep_tx *tx)
 			entry->desc = (struct tsnep_tx_desc *)
 				(((u8 *)entry->desc_wb) + TSNEP_DESC_OFFSET);
 			entry->desc_dma = tx->page_dma[i] + TSNEP_DESC_SIZE * j;
+			entry->owner_user_flag = false;
 		}
 	}
 	for (i = 0; i < TSNEP_RING_SIZE; i++) {
@@ -303,6 +304,19 @@ static int tsnep_tx_ring_init(struct tsnep_tx *tx)
 	return retval;
 }
 
+static void tsnep_tx_init(struct tsnep_tx *tx)
+{
+	dma_addr_t dma;
+
+	dma = tx->entry[0].desc_dma | TSNEP_RESET_OWNER_COUNTER;
+	iowrite32(DMA_ADDR_LOW(dma), tx->addr + TSNEP_TX_DESC_ADDR_LOW);
+	iowrite32(DMA_ADDR_HIGH(dma), tx->addr + TSNEP_TX_DESC_ADDR_HIGH);
+	tx->write = 0;
+	tx->read = 0;
+	tx->owner_counter = 1;
+	tx->increment_owner_counter = TSNEP_RING_SIZE - 1;
+}
+
 static void tsnep_tx_activate(struct tsnep_tx *tx, int index, int length,
 			      bool last)
 {
@@ -731,26 +745,15 @@ static bool tsnep_tx_pending(struct tsnep_tx *tx)
 	return pending;
 }
 
-static int tsnep_tx_open(struct tsnep_adapter *adapter, void __iomem *addr,
-			 int queue_index, struct tsnep_tx *tx)
+static int tsnep_tx_open(struct tsnep_tx *tx)
 {
-	dma_addr_t dma;
 	int retval;
 
-	memset(tx, 0, sizeof(*tx));
-	tx->adapter = adapter;
-	tx->addr = addr;
-	tx->queue_index = queue_index;
-
-	retval = tsnep_tx_ring_init(tx);
+	retval = tsnep_tx_ring_create(tx);
 	if (retval)
 		return retval;
 
-	dma = tx->entry[0].desc_dma | TSNEP_RESET_OWNER_COUNTER;
-	iowrite32(DMA_ADDR_LOW(dma), tx->addr + TSNEP_TX_DESC_ADDR_LOW);
-	iowrite32(DMA_ADDR_HIGH(dma), tx->addr + TSNEP_TX_DESC_ADDR_HIGH);
-	tx->owner_counter = 1;
-	tx->increment_owner_counter = TSNEP_RING_SIZE - 1;
+	tsnep_tx_init(tx);
 
 	return 0;
 }
@@ -795,7 +798,7 @@ static void tsnep_rx_ring_cleanup(struct tsnep_rx *rx)
 	}
 }
 
-static int tsnep_rx_ring_init(struct tsnep_rx *rx)
+static int tsnep_rx_ring_create(struct tsnep_rx *rx)
 {
 	struct device *dmadev = rx->adapter->dmadev;
 	struct tsnep_rx_entry *entry;
@@ -850,6 +853,19 @@ static int tsnep_rx_ring_init(struct tsnep_rx *rx)
 	return retval;
 }
 
+static void tsnep_rx_init(struct tsnep_rx *rx)
+{
+	dma_addr_t dma;
+
+	dma = rx->entry[0].desc_dma | TSNEP_RESET_OWNER_COUNTER;
+	iowrite32(DMA_ADDR_LOW(dma), rx->addr + TSNEP_RX_DESC_ADDR_LOW);
+	iowrite32(DMA_ADDR_HIGH(dma), rx->addr + TSNEP_RX_DESC_ADDR_HIGH);
+	rx->write = 0;
+	rx->read = 0;
+	rx->owner_counter = 1;
+	rx->increment_owner_counter = TSNEP_RING_SIZE - 1;
+}
+
 static int tsnep_rx_desc_available(struct tsnep_rx *rx)
 {
 	if (rx->read <= rx->write)
@@ -1181,26 +1197,15 @@ static bool tsnep_rx_pending(struct tsnep_rx *rx)
 	return false;
 }
 
-static int tsnep_rx_open(struct tsnep_adapter *adapter, void __iomem *addr,
-			 int queue_index, struct tsnep_rx *rx)
+static int tsnep_rx_open(struct tsnep_rx *rx)
 {
-	dma_addr_t dma;
 	int retval;
 
-	memset(rx, 0, sizeof(*rx));
-	rx->adapter = adapter;
-	rx->addr = addr;
-	rx->queue_index = queue_index;
-
-	retval = tsnep_rx_ring_init(rx);
+	retval = tsnep_rx_ring_create(rx);
 	if (retval)
 		return retval;
 
-	dma = rx->entry[0].desc_dma | TSNEP_RESET_OWNER_COUNTER;
-	iowrite32(DMA_ADDR_LOW(dma), rx->addr + TSNEP_RX_DESC_ADDR_LOW);
-	iowrite32(DMA_ADDR_HIGH(dma), rx->addr + TSNEP_RX_DESC_ADDR_HIGH);
-	rx->owner_counter = 1;
-	rx->increment_owner_counter = TSNEP_RING_SIZE - 1;
+	tsnep_rx_init(rx);
 
 	tsnep_rx_refill(rx, tsnep_rx_desc_available(rx), false);
 
@@ -1377,27 +1382,18 @@ static int tsnep_queue_open(struct tsnep_adapter *adapter,
 static int tsnep_netdev_open(struct net_device *netdev)
 {
 	struct tsnep_adapter *adapter = netdev_priv(netdev);
-	int tx_queue_index = 0;
-	int rx_queue_index = 0;
-	void __iomem *addr;
 	int i, retval;
 
 	for (i = 0; i < adapter->num_queues; i++) {
 		if (adapter->queue[i].tx) {
-			addr = adapter->addr + TSNEP_QUEUE(tx_queue_index);
-			retval = tsnep_tx_open(adapter, addr, tx_queue_index,
-					       adapter->queue[i].tx);
+			retval = tsnep_tx_open(adapter->queue[i].tx);
 			if (retval)
 				goto failed;
-			tx_queue_index++;
 		}
 		if (adapter->queue[i].rx) {
-			addr = adapter->addr + TSNEP_QUEUE(rx_queue_index);
-			retval = tsnep_rx_open(adapter, addr, rx_queue_index,
-					       adapter->queue[i].rx);
+			retval = tsnep_rx_open(adapter->queue[i].rx);
 			if (retval)
 				goto failed;
-			rx_queue_index++;
 		}
 
 		retval = tsnep_queue_open(adapter, &adapter->queue[i], i == 0);
@@ -1798,7 +1794,13 @@ static int tsnep_queue_init(struct tsnep_adapter *adapter, int queue_count)
 	adapter->num_queues = 1;
 	adapter->queue[0].irq = retval;
 	adapter->queue[0].tx = &adapter->tx[0];
+	adapter->queue[0].tx->adapter = adapter;
+	adapter->queue[0].tx->addr = adapter->addr + TSNEP_QUEUE(0);
+	adapter->queue[0].tx->queue_index = 0;
 	adapter->queue[0].rx = &adapter->rx[0];
+	adapter->queue[0].rx->adapter = adapter;
+	adapter->queue[0].rx->addr = adapter->addr + TSNEP_QUEUE(0);
+	adapter->queue[0].rx->queue_index = 0;
 	adapter->queue[0].irq_mask = irq_mask;
 	adapter->queue[0].irq_delay_addr = adapter->addr + ECM_INT_DELAY;
 	retval = tsnep_set_irq_coalesce(&adapter->queue[0],
@@ -1822,7 +1824,13 @@ static int tsnep_queue_init(struct tsnep_adapter *adapter, int queue_count)
 		adapter->num_queues++;
 		adapter->queue[i].irq = retval;
 		adapter->queue[i].tx = &adapter->tx[i];
+		adapter->queue[i].tx->adapter = adapter;
+		adapter->queue[i].tx->addr = adapter->addr + TSNEP_QUEUE(i);
+		adapter->queue[i].tx->queue_index = i;
 		adapter->queue[i].rx = &adapter->rx[i];
+		adapter->queue[i].rx->adapter = adapter;
+		adapter->queue[i].rx->addr = adapter->addr + TSNEP_QUEUE(i);
+		adapter->queue[i].rx->queue_index = i;
 		adapter->queue[i].irq_mask =
 			irq_mask << (ECM_INT_TXRX_SHIFT * i);
 		adapter->queue[i].irq_delay_addr =
-- 
2.30.2


* [PATCH net-next 2/5] tsnep: Add functions for queue enable/disable
  2023-04-02 19:38 [PATCH net-next 0/5] tsnep: XDP socket zero-copy support Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 1/5] tsnep: Rework TX/RX queue initialization Gerhard Engleder
@ 2023-04-02 19:38 ` Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 3/5] tsnep: Move skb receive action to separate function Gerhard Engleder
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-02 19:38 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, kuba, edumazet, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, Gerhard Engleder

Move the queue enable and disable code to separate functions. This way,
activation and deactivation of the queues become well-defined actions,
which can be used in future execution paths.

These functions will be used for queue reconfiguration at runtime, which
is necessary for XSK zero-copy support.
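
The intended usage for runtime reconfiguration is roughly the following
(sketch; the actual user is the XSK pool setup added later in this series):

	if (netif_running(adapter->netdev))
		tsnep_queue_disable(queue);

	/* reconfigure queue resources here, e.g. switch the buffer pool */

	if (netif_running(adapter->netdev))
		tsnep_queue_enable(queue);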

Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
---
 drivers/net/ethernet/engleder/tsnep_main.c | 94 +++++++++++++++-------
 1 file changed, 63 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index f2162a25e97c..c49e1d322def 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -866,6 +866,24 @@ static void tsnep_rx_init(struct tsnep_rx *rx)
 	rx->increment_owner_counter = TSNEP_RING_SIZE - 1;
 }
 
+static void tsnep_rx_enable(struct tsnep_rx *rx)
+{
+	/* descriptor properties shall be valid before hardware is notified */
+	dma_wmb();
+
+	iowrite32(TSNEP_CONTROL_RX_ENABLE, rx->addr + TSNEP_CONTROL);
+}
+
+static void tsnep_rx_disable(struct tsnep_rx *rx)
+{
+	u32 val;
+
+	iowrite32(TSNEP_CONTROL_RX_DISABLE, rx->addr + TSNEP_CONTROL);
+	readx_poll_timeout(ioread32, rx->addr + TSNEP_CONTROL, val,
+			   ((val & TSNEP_CONTROL_RX_ENABLE) == 0), 10000,
+			   1000000);
+}
+
 static int tsnep_rx_desc_available(struct tsnep_rx *rx)
 {
 	if (rx->read <= rx->write)
@@ -932,13 +950,10 @@ static void tsnep_rx_activate(struct tsnep_rx *rx, int index)
 	entry->desc->properties = __cpu_to_le32(entry->properties);
 }
 
-static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
+static int tsnep_rx_alloc(struct tsnep_rx *rx, int count, bool reuse)
 {
-	int index;
 	bool alloc_failed = false;
-	bool enable = false;
-	int i;
-	int retval;
+	int i, index, retval;
 
 	for (i = 0; i < count && !alloc_failed; i++) {
 		index = (rx->write + i) % TSNEP_RING_SIZE;
@@ -956,22 +971,23 @@ static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
 		}
 
 		tsnep_rx_activate(rx, index);
-
-		enable = true;
 	}
 
-	if (enable) {
+	if (i)
 		rx->write = (rx->write + i) % TSNEP_RING_SIZE;
 
-		/* descriptor properties shall be valid before hardware is
-		 * notified
-		 */
-		dma_wmb();
+	return i;
+}
 
-		iowrite32(TSNEP_CONTROL_RX_ENABLE, rx->addr + TSNEP_CONTROL);
-	}
+static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
+{
+	int desc_refilled;
 
-	return i;
+	desc_refilled = tsnep_rx_alloc(rx, count, reuse);
+	if (desc_refilled)
+		tsnep_rx_enable(rx);
+
+	return desc_refilled;
 }
 
 static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
@@ -1199,6 +1215,7 @@ static bool tsnep_rx_pending(struct tsnep_rx *rx)
 
 static int tsnep_rx_open(struct tsnep_rx *rx)
 {
+	int desc_available;
 	int retval;
 
 	retval = tsnep_rx_ring_create(rx);
@@ -1207,20 +1224,19 @@ static int tsnep_rx_open(struct tsnep_rx *rx)
 
 	tsnep_rx_init(rx);
 
-	tsnep_rx_refill(rx, tsnep_rx_desc_available(rx), false);
+	desc_available = tsnep_rx_desc_available(rx);
+	retval = tsnep_rx_alloc(rx, desc_available, false);
+	if (retval != desc_available) {
+		tsnep_rx_ring_cleanup(rx);
+
+		return -ENOMEM;
+	}
 
 	return 0;
 }
 
 static void tsnep_rx_close(struct tsnep_rx *rx)
 {
-	u32 val;
-
-	iowrite32(TSNEP_CONTROL_RX_DISABLE, rx->addr + TSNEP_CONTROL);
-	readx_poll_timeout(ioread32, rx->addr + TSNEP_CONTROL, val,
-			   ((val & TSNEP_CONTROL_RX_ENABLE) == 0), 10000,
-			   1000000);
-
 	tsnep_rx_ring_cleanup(rx);
 }
 
@@ -1379,6 +1395,27 @@ static int tsnep_queue_open(struct tsnep_adapter *adapter,
 	return retval;
 }
 
+static void tsnep_queue_enable(struct tsnep_queue *queue)
+{
+	napi_enable(&queue->napi);
+	tsnep_enable_irq(queue->adapter, queue->irq_mask);
+
+	if (queue->rx)
+		tsnep_rx_enable(queue->rx);
+}
+
+static void tsnep_queue_disable(struct tsnep_queue *queue)
+{
+	napi_disable(&queue->napi);
+	tsnep_disable_irq(queue->adapter, queue->irq_mask);
+
+	/* disable RX after NAPI polling has been disabled, because RX can be
+	 * enabled during NAPI polling
+	 */
+	if (queue->rx)
+		tsnep_rx_disable(queue->rx);
+}
+
 static int tsnep_netdev_open(struct net_device *netdev)
 {
 	struct tsnep_adapter *adapter = netdev_priv(netdev);
@@ -1415,11 +1452,8 @@ static int tsnep_netdev_open(struct net_device *netdev)
 	if (retval)
 		goto phy_failed;
 
-	for (i = 0; i < adapter->num_queues; i++) {
-		napi_enable(&adapter->queue[i].napi);
-
-		tsnep_enable_irq(adapter, adapter->queue[i].irq_mask);
-	}
+	for (i = 0; i < adapter->num_queues; i++)
+		tsnep_queue_enable(&adapter->queue[i]);
 
 	return 0;
 
@@ -1446,9 +1480,7 @@ static int tsnep_netdev_close(struct net_device *netdev)
 	tsnep_phy_close(adapter);
 
 	for (i = 0; i < adapter->num_queues; i++) {
-		tsnep_disable_irq(adapter, adapter->queue[i].irq_mask);
-
-		napi_disable(&adapter->queue[i].napi);
+		tsnep_queue_disable(&adapter->queue[i]);
 
 		tsnep_queue_close(&adapter->queue[i], i == 0);
 
-- 
2.30.2


* [PATCH net-next 3/5] tsnep: Move skb receive action to separate function
  2023-04-02 19:38 [PATCH net-next 0/5] tsnep: XDP socket zero-copy support Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 1/5] tsnep: Rework TX/RX queue initialization Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 2/5] tsnep: Add functions for queue enable/disable Gerhard Engleder
@ 2023-04-02 19:38 ` Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support Gerhard Engleder
  2023-04-02 19:38 ` [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support Gerhard Engleder
  4 siblings, 0 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-02 19:38 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, kuba, edumazet, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, Gerhard Engleder

The function tsnep_rx_poll() is already pretty long and the skb receive
action can be reused for XSK zero-copy support. Move the page-based skb
receive to a separate function.
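
For reference, the zero-copy RX path added later in this series reuses this
helper for frames not consumed by an XDP program, by copying the XSK buffer
into a page pool page first (taken from the XSK RX patch):

	page = page_pool_dev_alloc_pages(rx->page_pool);
	if (page) {
		memcpy(page_address(page) + TSNEP_RX_OFFSET,
		       entry->xdp->data - TSNEP_RX_INLINE_METADATA_SIZE,
		       length + TSNEP_RX_INLINE_METADATA_SIZE);
		tsnep_rx_page(rx, napi, page, length);
	} else {
		rx->dropped++;
	}
	xsk_buff_free(entry->xdp);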

Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
---
 drivers/net/ethernet/engleder/tsnep_main.c | 39 +++++++++++++---------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index c49e1d322def..6d63b379f05a 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -1077,6 +1077,28 @@ static struct sk_buff *tsnep_build_skb(struct tsnep_rx *rx, struct page *page,
 	return skb;
 }
 
+static void tsnep_rx_page(struct tsnep_rx *rx, struct napi_struct *napi,
+			  struct page *page, int length)
+{
+	struct sk_buff *skb;
+
+	skb = tsnep_build_skb(rx, page, length);
+	if (skb) {
+		page_pool_release_page(rx->page_pool, page);
+
+		rx->packets++;
+		rx->bytes += length;
+		if (skb->pkt_type == PACKET_MULTICAST)
+			rx->multicast++;
+
+		napi_gro_receive(napi, skb);
+	} else {
+		page_pool_recycle_direct(rx->page_pool, page);
+
+		rx->dropped++;
+	}
+}
+
 static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
 			 int budget)
 {
@@ -1086,7 +1108,6 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
 	struct netdev_queue *tx_nq;
 	struct bpf_prog *prog;
 	struct xdp_buff xdp;
-	struct sk_buff *skb;
 	struct tsnep_tx *tx;
 	int desc_available;
 	int xdp_status = 0;
@@ -1171,21 +1192,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
 			}
 		}
 
-		skb = tsnep_build_skb(rx, entry->page, length);
-		if (skb) {
-			page_pool_release_page(rx->page_pool, entry->page);
-
-			rx->packets++;
-			rx->bytes += length;
-			if (skb->pkt_type == PACKET_MULTICAST)
-				rx->multicast++;
-
-			napi_gro_receive(napi, skb);
-		} else {
-			page_pool_recycle_direct(rx->page_pool, entry->page);
-
-			rx->dropped++;
-		}
+		tsnep_rx_page(rx, napi, entry->page, length);
 		entry->page = NULL;
 	}
 
-- 
2.30.2


* [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-02 19:38 [PATCH net-next 0/5] tsnep: XDP socket zero-copy support Gerhard Engleder
                   ` (2 preceding siblings ...)
  2023-04-02 19:38 ` [PATCH net-next 3/5] tsnep: Move skb receive action to separate function Gerhard Engleder
@ 2023-04-02 19:38 ` Gerhard Engleder
  2023-04-03 17:19   ` Maciej Fijalkowski
  2023-04-02 19:38 ` [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support Gerhard Engleder
  4 siblings, 1 reply; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-02 19:38 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, kuba, edumazet, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, Gerhard Engleder

Add support for XSK zero-copy to RX path. The setup of the XSK pool can
be done at runtime. If the netdev is running, then the queue must be
disabled and enabled during reconfiguration. This can be done easily
with functions introduced in previous commits.

A more important property is that, if the netdev is running, then the
setup of the XSK pool shall not stop the netdev in case of errors. A
broken netdev after a failed XSK pool setup is bad behavior. Therefore,
the allocation and setup of resources during XSK pool setup is done only
before any queue is disabled. Additionally, freeing and later allocation
of resources is eliminated in some cases. Page pool entries are kept for
later use. Two memory models are registered in parallel. As a result,
the XSK pool setup cannot fail during queue reconfiguration.
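
Concretely, the enable path is ordered so that everything that can fail
runs while the queue is still active (call order sketch):

	tsnep_xdp_enable_pool()
		xsk_pool_dma_map()		/* may fail, queue still running */
		tsnep_enable_xsk()
			xsk_pool_set_rxq_info()
			tsnep_rx_prealloc_zc()	/* best effort, no hard failure */
			tsnep_queue_disable()	/* queue is stopped only now */
			rx->xsk_pool = pool
			tsnep_rx_reopen()	/* reuses preallocated buffers */
			tsnep_queue_enable()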

In contrast to other drivers, XSK pool setup and XDP BPF program setup
are separate actions. XSK pool setup can be done without any XDP BPF
program. The XDP BPF program can be added, removed or changed without
any reconfiguration of the XSK pool.

Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
---
 drivers/net/ethernet/engleder/tsnep.h      |   7 +
 drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
 drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
 3 files changed, 488 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/engleder/tsnep.h b/drivers/net/ethernet/engleder/tsnep.h
index 058c2bcf31a7..836fd6b1d62e 100644
--- a/drivers/net/ethernet/engleder/tsnep.h
+++ b/drivers/net/ethernet/engleder/tsnep.h
@@ -101,6 +101,7 @@ struct tsnep_rx_entry {
 	u32 properties;
 
 	struct page *page;
+	struct xdp_buff *xdp;
 	size_t len;
 	dma_addr_t dma;
 };
@@ -120,6 +121,7 @@ struct tsnep_rx {
 	u32 owner_counter;
 	int increment_owner_counter;
 	struct page_pool *page_pool;
+	struct xsk_buff_pool *xsk_pool;
 
 	u32 packets;
 	u32 bytes;
@@ -128,6 +130,7 @@ struct tsnep_rx {
 	u32 alloc_failed;
 
 	struct xdp_rxq_info xdp_rxq;
+	struct xdp_rxq_info xdp_rxq_zc;
 };
 
 struct tsnep_queue {
@@ -213,6 +216,8 @@ int tsnep_rxnfc_del_rule(struct tsnep_adapter *adapter,
 
 int tsnep_xdp_setup_prog(struct tsnep_adapter *adapter, struct bpf_prog *prog,
 			 struct netlink_ext_ack *extack);
+int tsnep_xdp_setup_pool(struct tsnep_adapter *adapter,
+			 struct xsk_buff_pool *pool, u16 queue_id);
 
 #if IS_ENABLED(CONFIG_TSNEP_SELFTESTS)
 int tsnep_ethtool_get_test_count(void);
@@ -241,5 +246,7 @@ static inline void tsnep_ethtool_self_test(struct net_device *dev,
 void tsnep_get_system_time(struct tsnep_adapter *adapter, u64 *time);
 int tsnep_set_irq_coalesce(struct tsnep_queue *queue, u32 usecs);
 u32 tsnep_get_irq_coalesce(struct tsnep_queue *queue);
+int tsnep_enable_xsk(struct tsnep_queue *queue, struct xsk_buff_pool *pool);
+void tsnep_disable_xsk(struct tsnep_queue *queue);
 
 #endif /* _TSNEP_H */
diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index 6d63b379f05a..e05835d675aa 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -28,11 +28,16 @@
 #include <linux/iopoll.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <net/xdp_sock_drv.h>
 
 #define TSNEP_RX_OFFSET (max(NET_SKB_PAD, XDP_PACKET_HEADROOM) + NET_IP_ALIGN)
 #define TSNEP_HEADROOM ALIGN(TSNEP_RX_OFFSET, 4)
 #define TSNEP_MAX_RX_BUF_SIZE (PAGE_SIZE - TSNEP_HEADROOM - \
 			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+/* XSK buffer shall store at least Q-in-Q frame */
+#define TSNEP_XSK_RX_BUF_SIZE (ALIGN(TSNEP_RX_INLINE_METADATA_SIZE + \
+				     ETH_FRAME_LEN + ETH_FCS_LEN + \
+				     VLAN_HLEN * 2, 4))
 
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 #define DMA_ADDR_HIGH(dma_addr) ((u32)(((dma_addr) >> 32) & 0xFFFFFFFF))
@@ -781,6 +786,9 @@ static void tsnep_rx_ring_cleanup(struct tsnep_rx *rx)
 			page_pool_put_full_page(rx->page_pool, entry->page,
 						false);
 		entry->page = NULL;
+		if (entry->xdp)
+			xsk_buff_free(entry->xdp);
+		entry->xdp = NULL;
 	}
 
 	if (rx->page_pool)
@@ -927,7 +935,7 @@ static void tsnep_rx_activate(struct tsnep_rx *rx, int index)
 {
 	struct tsnep_rx_entry *entry = &rx->entry[index];
 
-	/* TSNEP_MAX_RX_BUF_SIZE is a multiple of 4 */
+	/* TSNEP_MAX_RX_BUF_SIZE and TSNEP_XSK_RX_BUF_SIZE are multiple of 4 */
 	entry->properties = entry->len & TSNEP_DESC_LENGTH_MASK;
 	entry->properties |= TSNEP_DESC_INTERRUPT_FLAG;
 	if (index == rx->increment_owner_counter) {
@@ -979,6 +987,24 @@ static int tsnep_rx_alloc(struct tsnep_rx *rx, int count, bool reuse)
 	return i;
 }
 
+static int tsnep_rx_prealloc(struct tsnep_rx *rx)
+{
+	struct tsnep_rx_entry *entry;
+	int i;
+
+	/* prealloc all ring entries except the last one, because ring cannot be
+	 * filled completely
+	 */
+	for (i = 0; i < TSNEP_RING_SIZE - 1; i++) {
+		entry = &rx->entry[i];
+		entry->page = page_pool_dev_alloc_pages(rx->page_pool);
+		if (!entry->page)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
 {
 	int desc_refilled;
@@ -990,22 +1016,118 @@ static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
 	return desc_refilled;
 }
 
+static void tsnep_rx_set_xdp(struct tsnep_rx *rx, struct tsnep_rx_entry *entry,
+			     struct xdp_buff *xdp)
+{
+	entry->xdp = xdp;
+	entry->len = TSNEP_XSK_RX_BUF_SIZE;
+	entry->dma = xsk_buff_xdp_get_dma(entry->xdp);
+	entry->desc->rx = __cpu_to_le64(entry->dma);
+}
+
+static int tsnep_rx_alloc_buffer_zc(struct tsnep_rx *rx, int index)
+{
+	struct tsnep_rx_entry *entry = &rx->entry[index];
+	struct xdp_buff *xdp;
+
+	xdp = xsk_buff_alloc(rx->xsk_pool);
+	if (unlikely(!xdp))
+		return -ENOMEM;
+	tsnep_rx_set_xdp(rx, entry, xdp);
+
+	return 0;
+}
+
+static void tsnep_rx_reuse_buffer_zc(struct tsnep_rx *rx, int index)
+{
+	struct tsnep_rx_entry *entry = &rx->entry[index];
+	struct tsnep_rx_entry *read = &rx->entry[rx->read];
+
+	tsnep_rx_set_xdp(rx, entry, read->xdp);
+	read->xdp = NULL;
+}
+
+static int tsnep_rx_alloc_zc(struct tsnep_rx *rx, int count, bool reuse)
+{
+	bool alloc_failed = false;
+	int i, index, retval;
+
+	for (i = 0; i < count && !alloc_failed; i++) {
+		index = (rx->write + i) % TSNEP_RING_SIZE;
+
+		retval = tsnep_rx_alloc_buffer_zc(rx, index);
+		if (unlikely(retval)) {
+			rx->alloc_failed++;
+			alloc_failed = true;
+
+			/* reuse only if no other allocation was successful */
+			if (i == 0 && reuse)
+				tsnep_rx_reuse_buffer_zc(rx, index);
+			else
+				break;
+		}
+		tsnep_rx_activate(rx, index);
+	}
+
+	if (i)
+		rx->write = (rx->write + i) % TSNEP_RING_SIZE;
+
+	return i;
+}
+
+static void tsnep_rx_free_zc(struct tsnep_rx *rx, struct xsk_buff_pool *pool)
+{
+	struct tsnep_rx_entry *entry;
+	int i;
+
+	for (i = 0; i < TSNEP_RING_SIZE; i++) {
+		entry = &rx->entry[i];
+		if (entry->xdp)
+			xsk_buff_free(entry->xdp);
+		entry->xdp = NULL;
+	}
+}
+
+static void tsnep_rx_prealloc_zc(struct tsnep_rx *rx,
+				 struct xsk_buff_pool *pool)
+{
+	struct tsnep_rx_entry *entry;
+	int i;
+
+	/* prealloc all ring entries except the last one, because ring cannot be
+	 * filled completely, as many buffers as possible is enough as wakeup is
+	 * done if new buffers are available
+	 */
+	for (i = 0; i < TSNEP_RING_SIZE - 1; i++) {
+		entry = &rx->entry[i];
+		entry->xdp = xsk_buff_alloc(pool);
+		if (!entry->xdp)
+			break;
+	}
+}
+
+static int tsnep_rx_refill_zc(struct tsnep_rx *rx, int count, bool reuse)
+{
+	int desc_refilled;
+
+	desc_refilled = tsnep_rx_alloc_zc(rx, count, reuse);
+	if (desc_refilled)
+		tsnep_rx_enable(rx);
+
+	return desc_refilled;
+}
+
 static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
 			       struct xdp_buff *xdp, int *status,
-			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
+			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
+			       bool zc)
 {
 	unsigned int length;
-	unsigned int sync;
 	u32 act;
 
 	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
 
 	act = bpf_prog_run_xdp(prog, xdp);
-
-	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
-	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
-	sync = max(sync, length);
-
 	switch (act) {
 	case XDP_PASS:
 		return false;
@@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
 		trace_xdp_exception(rx->adapter->netdev, prog, act);
 		fallthrough;
 	case XDP_DROP:
-		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
-				   sync, true);
+		if (zc) {
+			xsk_buff_free(xdp);
+		} else {
+			unsigned int sync;
+
+			/* Due xdp_adjust_tail: DMA sync for_device cover max
+			 * len CPU touch
+			 */
+			sync = xdp->data_end - xdp->data_hard_start -
+			       XDP_PACKET_HEADROOM;
+			sync = max(sync, length);
+			page_pool_put_page(rx->page_pool,
+					   virt_to_head_page(xdp->data), sync,
+					   true);
+		}
 		return true;
 	}
 }
@@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
 					 length, false);
 
 			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
-						     &xdp_status, tx_nq, tx);
+						     &xdp_status, tx_nq, tx,
+						     false);
 			if (consume) {
 				rx->packets++;
 				rx->bytes += length;
@@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
 	return done;
 }
 
+static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
+			    int budget)
+{
+	struct tsnep_rx_entry *entry;
+	struct netdev_queue *tx_nq;
+	struct bpf_prog *prog;
+	struct tsnep_tx *tx;
+	int desc_available;
+	int xdp_status = 0;
+	struct page *page;
+	int done = 0;
+	int length;
+
+	desc_available = tsnep_rx_desc_available(rx);
+	prog = READ_ONCE(rx->adapter->xdp_prog);
+	if (prog) {
+		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
+					    rx->tx_queue_index);
+		tx = &rx->adapter->tx[rx->tx_queue_index];
+	}
+
+	while (likely(done < budget) && (rx->read != rx->write)) {
+		entry = &rx->entry[rx->read];
+		if ((__le32_to_cpu(entry->desc_wb->properties) &
+		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
+		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
+			break;
+		done++;
+
+		if (desc_available >= TSNEP_RING_RX_REFILL) {
+			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
+
+			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
+							     reuse);
+			if (!entry->xdp) {
+				/* buffer has been reused for refill to prevent
+				 * empty RX ring, thus buffer cannot be used for
+				 * RX processing
+				 */
+				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
+				desc_available++;
+
+				rx->dropped++;
+
+				continue;
+			}
+		}
+
+		/* descriptor properties shall be read first, because valid data
+		 * is signaled there
+		 */
+		dma_rmb();
+
+		prefetch(entry->xdp->data);
+		length = __le32_to_cpu(entry->desc_wb->properties) &
+			 TSNEP_DESC_LENGTH_MASK;
+		entry->xdp->data_end = entry->xdp->data + length;
+		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
+
+		/* RX metadata with timestamps is in front of actual data,
+		 * subtract metadata size to get length of actual data and
+		 * consider metadata size as offset of actual data during RX
+		 * processing
+		 */
+		length -= TSNEP_RX_INLINE_METADATA_SIZE;
+
+		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
+		desc_available++;
+
+		if (prog) {
+			bool consume;
+
+			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
+			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
+
+			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
+						     &xdp_status, tx_nq, tx,
+						     true);
+			if (consume) {
+				rx->packets++;
+				rx->bytes += length;
+
+				entry->xdp = NULL;
+
+				continue;
+			}
+		}
+
+		page = page_pool_dev_alloc_pages(rx->page_pool);
+		if (page) {
+			memcpy(page_address(page) + TSNEP_RX_OFFSET,
+			       entry->xdp->data - TSNEP_RX_INLINE_METADATA_SIZE,
+			       length + TSNEP_RX_INLINE_METADATA_SIZE);
+			tsnep_rx_page(rx, napi, page, length);
+		} else {
+			rx->dropped++;
+		}
+		xsk_buff_free(entry->xdp);
+		entry->xdp = NULL;
+	}
+
+	if (xdp_status)
+		tsnep_finalize_xdp(rx->adapter, xdp_status, tx_nq, tx);
+
+	if (desc_available)
+		desc_available -= tsnep_rx_refill_zc(rx, desc_available, false);
+
+	if (xsk_uses_need_wakeup(rx->xsk_pool)) {
+		if (desc_available)
+			xsk_set_rx_need_wakeup(rx->xsk_pool);
+		else
+			xsk_clear_rx_need_wakeup(rx->xsk_pool);
+
+		return done;
+	}
+
+	return desc_available ? budget : done;
+}
+
 static bool tsnep_rx_pending(struct tsnep_rx *rx)
 {
 	struct tsnep_rx_entry *entry;
@@ -1232,14 +1487,30 @@ static int tsnep_rx_open(struct tsnep_rx *rx)
 	tsnep_rx_init(rx);
 
 	desc_available = tsnep_rx_desc_available(rx);
-	retval = tsnep_rx_alloc(rx, desc_available, false);
+	if (rx->xsk_pool)
+		retval = tsnep_rx_alloc_zc(rx, desc_available, false);
+	else
+		retval = tsnep_rx_alloc(rx, desc_available, false);
 	if (retval != desc_available) {
-		tsnep_rx_ring_cleanup(rx);
+		retval = -ENOMEM;
 
-		return -ENOMEM;
+		goto alloc_failed;
+	}
+
+	/* prealloc pages to prevent allocation failures when XSK pool is
+	 * disabled at runtime
+	 */
+	if (rx->xsk_pool) {
+		retval = tsnep_rx_prealloc(rx);
+		if (retval)
+			goto alloc_failed;
 	}
 
 	return 0;
+
+alloc_failed:
+	tsnep_rx_ring_cleanup(rx);
+	return retval;
 }
 
 static void tsnep_rx_close(struct tsnep_rx *rx)
@@ -1247,6 +1518,43 @@ static void tsnep_rx_close(struct tsnep_rx *rx)
 	tsnep_rx_ring_cleanup(rx);
 }
 
+static void tsnep_rx_reopen(struct tsnep_rx *rx)
+{
+	int i;
+
+	tsnep_rx_init(rx);
+
+	/* prevent allocation failures by using already allocated RX buffers */
+	for (i = 0; i < TSNEP_RING_SIZE; i++) {
+		struct tsnep_rx_entry *entry = &rx->entry[i];
+
+		/* defined initial values for properties are required for
+		 * correct owner counter checking
+		 */
+		entry->desc->properties = 0;
+		entry->desc_wb->properties = 0;
+
+		if (!rx->xsk_pool && entry->page) {
+			struct tsnep_rx_entry *write_entry;
+
+			/* move page to write index entry */
+			write_entry = &rx->entry[rx->write];
+			if (write_entry != entry) {
+				write_entry->page = entry->page;
+				entry->page = NULL;
+			}
+
+			tsnep_rx_set_page(rx, write_entry, write_entry->page);
+			tsnep_rx_activate(rx, rx->write);
+			rx->write++;
+		} else if (rx->xsk_pool && entry->xdp) {
+			tsnep_rx_set_xdp(rx, entry, entry->xdp);
+			tsnep_rx_activate(rx, rx->write);
+			rx->write++;
+		}
+	}
+}
+
 static bool tsnep_pending(struct tsnep_queue *queue)
 {
 	if (queue->tx && tsnep_tx_pending(queue->tx))
@@ -1269,7 +1577,9 @@ static int tsnep_poll(struct napi_struct *napi, int budget)
 		complete = tsnep_tx_poll(queue->tx, budget);
 
 	if (queue->rx) {
-		done = tsnep_rx_poll(queue->rx, napi, budget);
+		done = queue->rx->xsk_pool ?
+		       tsnep_rx_poll_zc(queue->rx, napi, budget) :
+		       tsnep_rx_poll(queue->rx, napi, budget);
 		if (done >= budget)
 			complete = false;
 	}
@@ -1350,8 +1660,12 @@ static void tsnep_queue_close(struct tsnep_queue *queue, bool first)
 
 	tsnep_free_irq(queue, first);
 
-	if (rx && xdp_rxq_info_is_reg(&rx->xdp_rxq))
-		xdp_rxq_info_unreg(&rx->xdp_rxq);
+	if (rx) {
+		if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
+			xdp_rxq_info_unreg(&rx->xdp_rxq);
+		if (xdp_rxq_info_is_reg(&rx->xdp_rxq_zc))
+			xdp_rxq_info_unreg(&rx->xdp_rxq_zc);
+	}
 
 	netif_napi_del(&queue->napi);
 }
@@ -1376,6 +1690,10 @@ static int tsnep_queue_open(struct tsnep_adapter *adapter,
 		else
 			rx->tx_queue_index = 0;
 
+		/* prepare both memory models to eliminate possible registration
+		 * errors when memory model is switched between page pool and
+		 * XSK pool during runtime
+		 */
 		retval = xdp_rxq_info_reg(&rx->xdp_rxq, adapter->netdev,
 					  rx->queue_index, queue->napi.napi_id);
 		if (retval)
@@ -1385,6 +1703,17 @@ static int tsnep_queue_open(struct tsnep_adapter *adapter,
 						    rx->page_pool);
 		if (retval)
 			goto failed;
+		retval = xdp_rxq_info_reg(&rx->xdp_rxq_zc, adapter->netdev,
+					  rx->queue_index, queue->napi.napi_id);
+		if (retval)
+			goto failed;
+		retval = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq_zc,
+						    MEM_TYPE_XSK_BUFF_POOL,
+						    NULL);
+		if (retval)
+			goto failed;
+		if (rx->xsk_pool)
+			xsk_pool_set_rxq_info(rx->xsk_pool, &rx->xdp_rxq_zc);
 	}
 
 	retval = tsnep_request_irq(queue, first);
@@ -1500,6 +1829,51 @@ static int tsnep_netdev_close(struct net_device *netdev)
 	return 0;
 }
 
+int tsnep_enable_xsk(struct tsnep_queue *queue, struct xsk_buff_pool *pool)
+{
+	bool running = netif_running(queue->adapter->netdev);
+	u32 frame_size;
+
+	frame_size = xsk_pool_get_rx_frame_size(pool);
+	if (frame_size < TSNEP_XSK_RX_BUF_SIZE)
+		return -EOPNOTSUPP;
+
+	xsk_pool_set_rxq_info(pool, &queue->rx->xdp_rxq_zc);
+
+	tsnep_rx_prealloc_zc(queue->rx, pool);
+
+	if (running)
+		tsnep_queue_disable(queue);
+
+	queue->rx->xsk_pool = pool;
+
+	tsnep_rx_reopen(queue->rx);
+
+	if (running)
+		tsnep_queue_enable(queue);
+
+	return 0;
+}
+
+void tsnep_disable_xsk(struct tsnep_queue *queue)
+{
+	bool running = netif_running(queue->adapter->netdev);
+	struct xsk_buff_pool *old_pool;
+
+	if (running)
+		tsnep_queue_disable(queue);
+
+	old_pool = queue->rx->xsk_pool;
+	queue->rx->xsk_pool = NULL;
+
+	tsnep_rx_reopen(queue->rx);
+
+	if (running)
+		tsnep_queue_enable(queue);
+
+	tsnep_rx_free_zc(queue->rx, old_pool);
+}
+
 static netdev_tx_t tsnep_netdev_xmit_frame(struct sk_buff *skb,
 					   struct net_device *netdev)
 {
@@ -1649,6 +2023,9 @@ static int tsnep_netdev_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 	switch (bpf->command) {
 	case XDP_SETUP_PROG:
 		return tsnep_xdp_setup_prog(adapter, bpf->prog, bpf->extack);
+	case XDP_SETUP_XSK_POOL:
+		return tsnep_xdp_setup_pool(adapter, bpf->xsk.pool,
+					    bpf->xsk.queue_id);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -1703,6 +2080,23 @@ static int tsnep_netdev_xdp_xmit(struct net_device *dev, int n,
 	return nxmit;
 }
 
+static int tsnep_netdev_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
+{
+	struct tsnep_adapter *adapter = netdev_priv(dev);
+	struct tsnep_queue *queue;
+
+	if (queue_id >= adapter->num_rx_queues ||
+	    queue_id >= adapter->num_tx_queues)
+		return -EINVAL;
+
+	queue = &adapter->queue[queue_id];
+
+	if (!napi_if_scheduled_mark_missed(&queue->napi))
+		napi_schedule(&queue->napi);
+
+	return 0;
+}
+
 static const struct net_device_ops tsnep_netdev_ops = {
 	.ndo_open = tsnep_netdev_open,
 	.ndo_stop = tsnep_netdev_close,
@@ -1716,6 +2110,7 @@ static const struct net_device_ops tsnep_netdev_ops = {
 	.ndo_setup_tc = tsnep_tc_setup,
 	.ndo_bpf = tsnep_netdev_bpf,
 	.ndo_xdp_xmit = tsnep_netdev_xdp_xmit,
+	.ndo_xsk_wakeup = tsnep_netdev_xsk_wakeup,
 };
 
 static int tsnep_mac_init(struct tsnep_adapter *adapter)
@@ -1974,7 +2369,8 @@ static int tsnep_probe(struct platform_device *pdev)
 
 	netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
 			       NETDEV_XDP_ACT_NDO_XMIT |
-			       NETDEV_XDP_ACT_NDO_XMIT_SG;
+			       NETDEV_XDP_ACT_NDO_XMIT_SG |
+			       NETDEV_XDP_ACT_XSK_ZEROCOPY;
 
 	/* carrier off reporting is important to ethtool even BEFORE open */
 	netif_carrier_off(netdev);
diff --git a/drivers/net/ethernet/engleder/tsnep_xdp.c b/drivers/net/ethernet/engleder/tsnep_xdp.c
index 4d14cb1fd772..6ec137870b59 100644
--- a/drivers/net/ethernet/engleder/tsnep_xdp.c
+++ b/drivers/net/ethernet/engleder/tsnep_xdp.c
@@ -6,6 +6,8 @@
 
 #include "tsnep.h"
 
+#define TSNEP_XSK_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC)
+
 int tsnep_xdp_setup_prog(struct tsnep_adapter *adapter, struct bpf_prog *prog,
 			 struct netlink_ext_ack *extack)
 {
@@ -17,3 +19,68 @@ int tsnep_xdp_setup_prog(struct tsnep_adapter *adapter, struct bpf_prog *prog,
 
 	return 0;
 }
+
+static int tsnep_xdp_enable_pool(struct tsnep_adapter *adapter,
+				 struct xsk_buff_pool *pool, u16 queue_id)
+{
+	struct tsnep_queue *queue;
+	int retval;
+
+	if (queue_id >= adapter->num_rx_queues ||
+	    queue_id >= adapter->num_tx_queues)
+		return -EINVAL;
+
+	queue = &adapter->queue[queue_id];
+	if (queue->rx->queue_index != queue_id ||
+	    queue->tx->queue_index != queue_id) {
+		netdev_err(adapter->netdev,
+			   "XSK support only for TX/RX queue pairs\n");
+
+		return -EOPNOTSUPP;
+	}
+
+	retval = xsk_pool_dma_map(pool, adapter->dmadev, TSNEP_XSK_DMA_ATTR);
+	if (retval) {
+		netdev_err(adapter->netdev, "failed to map XSK pool\n");
+
+		return retval;
+	}
+
+	retval = tsnep_enable_xsk(queue, pool);
+	if (retval) {
+		xsk_pool_dma_unmap(pool, TSNEP_XSK_DMA_ATTR);
+
+		return retval;
+	}
+
+	return 0;
+}
+
+static int tsnep_xdp_disable_pool(struct tsnep_adapter *adapter, u16 queue_id)
+{
+	struct xsk_buff_pool *pool;
+	struct tsnep_queue *queue;
+
+	if (queue_id >= adapter->num_rx_queues ||
+	    queue_id >= adapter->num_tx_queues)
+		return -EINVAL;
+
+	pool = xsk_get_pool_from_qid(adapter->netdev, queue_id);
+	if (!pool)
+		return -EINVAL;
+
+	queue = &adapter->queue[queue_id];
+
+	tsnep_disable_xsk(queue);
+
+	xsk_pool_dma_unmap(pool, TSNEP_XSK_DMA_ATTR);
+
+	return 0;
+}
+
+int tsnep_xdp_setup_pool(struct tsnep_adapter *adapter,
+			 struct xsk_buff_pool *pool, u16 queue_id)
+{
+	return pool ? tsnep_xdp_enable_pool(adapter, pool, queue_id) :
+		      tsnep_xdp_disable_pool(adapter, queue_id);
+}
-- 
2.30.2


* [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support
  2023-04-02 19:38 [PATCH net-next 0/5] tsnep: XDP socket zero-copy support Gerhard Engleder
                   ` (3 preceding siblings ...)
  2023-04-02 19:38 ` [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support Gerhard Engleder
@ 2023-04-02 19:38 ` Gerhard Engleder
  2023-04-03 22:36   ` Maciej Fijalkowski
  4 siblings, 1 reply; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-02 19:38 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, kuba, edumazet, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, Gerhard Engleder

Send and complete XSK pool frames within the TX NAPI context. The NAPI
context is triggered by ndo_xsk_wakeup.
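
The resulting wakeup flow (ndo_xsk_wakeup handler from the previous patch
plus the TX poll changes below):

	tsnep_netdev_xsk_wakeup()
		napi_schedule()
			tsnep_poll()
				tsnep_tx_poll()
					xsk_tx_completed()	/* complete sent frames */
					tsnep_xdp_xmit_zc()	/* send pending XSK descriptors */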

Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
---
 drivers/net/ethernet/engleder/tsnep.h      |   2 +
 drivers/net/ethernet/engleder/tsnep_main.c | 131 +++++++++++++++++++--
 2 files changed, 123 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/engleder/tsnep.h b/drivers/net/ethernet/engleder/tsnep.h
index 836fd6b1d62e..a336ed08cb3e 100644
--- a/drivers/net/ethernet/engleder/tsnep.h
+++ b/drivers/net/ethernet/engleder/tsnep.h
@@ -69,6 +69,7 @@ struct tsnep_tx_entry {
 	union {
 		struct sk_buff *skb;
 		struct xdp_frame *xdpf;
+		bool zc;
 	};
 	size_t len;
 	DEFINE_DMA_UNMAP_ADDR(dma);
@@ -87,6 +88,7 @@ struct tsnep_tx {
 	int read;
 	u32 owner_counter;
 	int increment_owner_counter;
+	struct xsk_buff_pool *xsk_pool;
 
 	u32 packets;
 	u32 bytes;
diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index e05835d675aa..c70c5ee3693e 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -54,6 +54,8 @@
 #define TSNEP_TX_TYPE_SKB_FRAG	BIT(1)
 #define TSNEP_TX_TYPE_XDP_TX	BIT(2)
 #define TSNEP_TX_TYPE_XDP_NDO	BIT(3)
+#define TSNEP_TX_TYPE_XDP	(TSNEP_TX_TYPE_XDP_TX | TSNEP_TX_TYPE_XDP_NDO)
+#define TSNEP_TX_TYPE_XSK	BIT(4)
 
 #define TSNEP_XDP_TX		BIT(0)
 #define TSNEP_XDP_REDIRECT	BIT(1)
@@ -322,13 +324,51 @@ static void tsnep_tx_init(struct tsnep_tx *tx)
 	tx->increment_owner_counter = TSNEP_RING_SIZE - 1;
 }
 
+static void tsnep_tx_enable(struct tsnep_tx *tx)
+{
+	struct netdev_queue *nq;
+
+	nq = netdev_get_tx_queue(tx->adapter->netdev, tx->queue_index);
+
+	local_bh_disable();
+	__netif_tx_lock(nq, smp_processor_id());
+	netif_tx_wake_queue(nq);
+	__netif_tx_unlock(nq);
+	local_bh_enable();
+}
+
+static void tsnep_tx_disable(struct tsnep_tx *tx, struct napi_struct *napi)
+{
+	struct netdev_queue *nq;
+	u32 val;
+
+	nq = netdev_get_tx_queue(tx->adapter->netdev, tx->queue_index);
+
+	local_bh_disable();
+	__netif_tx_lock(nq, smp_processor_id());
+	netif_tx_stop_queue(nq);
+	__netif_tx_unlock(nq);
+	local_bh_enable();
+
+	/* wait until TX is done in hardware */
+	readx_poll_timeout(ioread32, tx->addr + TSNEP_CONTROL, val,
+			   ((val & TSNEP_CONTROL_TX_ENABLE) == 0), 10000,
+			   1000000);
+
+	/* wait until TX is also done in software */
+	while (READ_ONCE(tx->read) != tx->write) {
+		napi_schedule(napi);
+		napi_synchronize(napi);
+	}
+}
+
 static void tsnep_tx_activate(struct tsnep_tx *tx, int index, int length,
 			      bool last)
 {
 	struct tsnep_tx_entry *entry = &tx->entry[index];
 
 	entry->properties = 0;
-	/* xdpf is union with skb */
+	/* xdpf and zc are union with skb */
 	if (entry->skb) {
 		entry->properties = length & TSNEP_DESC_LENGTH_MASK;
 		entry->properties |= TSNEP_DESC_INTERRUPT_FLAG;
@@ -646,10 +686,69 @@ static bool tsnep_xdp_xmit_back(struct tsnep_adapter *adapter,
 	return xmit;
 }
 
+static int tsnep_xdp_tx_map_zc(struct xdp_desc *xdpd, struct tsnep_tx *tx)
+{
+	struct tsnep_tx_entry *entry;
+	dma_addr_t dma;
+
+	entry = &tx->entry[tx->write];
+	entry->zc = true;
+
+	dma = xsk_buff_raw_get_dma(tx->xsk_pool, xdpd->addr);
+	xsk_buff_raw_dma_sync_for_device(tx->xsk_pool, dma, xdpd->len);
+
+	entry->type = TSNEP_TX_TYPE_XSK;
+	entry->len = xdpd->len;
+
+	entry->desc->tx = __cpu_to_le64(dma);
+
+	return xdpd->len;
+}
+
+static void tsnep_xdp_xmit_frame_ring_zc(struct xdp_desc *xdpd,
+					 struct tsnep_tx *tx)
+{
+	int length;
+
+	length = tsnep_xdp_tx_map_zc(xdpd, tx);
+
+	tsnep_tx_activate(tx, tx->write, length, true);
+	tx->write = (tx->write + 1) % TSNEP_RING_SIZE;
+
+	/* descriptor properties shall be valid before hardware is notified */
+	dma_wmb();
+}
+
+static void tsnep_xdp_xmit_zc(struct tsnep_tx *tx)
+{
+	int desc_available = tsnep_tx_desc_available(tx);
+	struct xdp_desc xdp_desc;
+	bool xmit = false;
+
+	/* ensure that TX ring is not filled up by XDP, always MAX_SKB_FRAGS
+	 * will be available for normal TX path and queue is stopped there if
+	 * necessary
+	 */
+	if (desc_available <= (MAX_SKB_FRAGS + 1))
+		return;
+	desc_available -= MAX_SKB_FRAGS + 1;
+
+	while (xsk_tx_peek_desc(tx->xsk_pool, &xdp_desc) && desc_available--) {
+		tsnep_xdp_xmit_frame_ring_zc(&xdp_desc, tx);
+		xmit = true;
+	}
+
+	if (xmit) {
+		tsnep_xdp_xmit_flush(tx);
+		xsk_tx_release(tx->xsk_pool);
+	}
+}
+
 static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
 {
 	struct tsnep_tx_entry *entry;
 	struct netdev_queue *nq;
+	int xsk_frames = 0;
 	int budget = 128;
 	int length;
 	int count;
@@ -676,7 +775,7 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
 		if ((entry->type & TSNEP_TX_TYPE_SKB) &&
 		    skb_shinfo(entry->skb)->nr_frags > 0)
 			count += skb_shinfo(entry->skb)->nr_frags;
-		else if (!(entry->type & TSNEP_TX_TYPE_SKB) &&
+		else if ((entry->type & TSNEP_TX_TYPE_XDP) &&
 			 xdp_frame_has_frags(entry->xdpf))
 			count += xdp_get_shared_info_from_frame(entry->xdpf)->nr_frags;
 
@@ -705,9 +804,11 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
 
 		if (entry->type & TSNEP_TX_TYPE_SKB)
 			napi_consume_skb(entry->skb, napi_budget);
-		else
+		else if (entry->type & TSNEP_TX_TYPE_XDP)
 			xdp_return_frame_rx_napi(entry->xdpf);
-		/* xdpf is union with skb */
+		else
+			xsk_frames++;
+		/* xdpf and zc are union with skb */
 		entry->skb = NULL;
 
 		tx->read = (tx->read + count) % TSNEP_RING_SIZE;
@@ -718,6 +819,14 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
 		budget--;
 	} while (likely(budget));
 
+	if (tx->xsk_pool) {
+		if (xsk_frames)
+			xsk_tx_completed(tx->xsk_pool, xsk_frames);
+		if (xsk_uses_need_wakeup(tx->xsk_pool))
+			xsk_set_tx_need_wakeup(tx->xsk_pool);
+		tsnep_xdp_xmit_zc(tx);
+	}
+
 	if ((tsnep_tx_desc_available(tx) >= ((MAX_SKB_FRAGS + 1) * 2)) &&
 	    netif_tx_queue_stopped(nq)) {
 		netif_tx_wake_queue(nq);
@@ -765,12 +874,6 @@ static int tsnep_tx_open(struct tsnep_tx *tx)
 
 static void tsnep_tx_close(struct tsnep_tx *tx)
 {
-	u32 val;
-
-	readx_poll_timeout(ioread32, tx->addr + TSNEP_CONTROL, val,
-			   ((val & TSNEP_CONTROL_TX_ENABLE) == 0), 10000,
-			   1000000);
-
 	tsnep_tx_ring_cleanup(tx);
 }
 
@@ -1736,12 +1839,18 @@ static void tsnep_queue_enable(struct tsnep_queue *queue)
 	napi_enable(&queue->napi);
 	tsnep_enable_irq(queue->adapter, queue->irq_mask);
 
+	if (queue->tx)
+		tsnep_tx_enable(queue->tx);
+
 	if (queue->rx)
 		tsnep_rx_enable(queue->rx);
 }
 
 static void tsnep_queue_disable(struct tsnep_queue *queue)
 {
+	if (queue->tx)
+		tsnep_tx_disable(queue->tx, &queue->napi);
+
 	napi_disable(&queue->napi);
 	tsnep_disable_irq(queue->adapter, queue->irq_mask);
 
@@ -1845,6 +1954,7 @@ int tsnep_enable_xsk(struct tsnep_queue *queue, struct xsk_buff_pool *pool)
 	if (running)
 		tsnep_queue_disable(queue);
 
+	queue->tx->xsk_pool = pool;
 	queue->rx->xsk_pool = pool;
 
 	tsnep_rx_reopen(queue->rx);
@@ -1865,6 +1975,7 @@ void tsnep_disable_xsk(struct tsnep_queue *queue)
 
 	old_pool = queue->rx->xsk_pool;
 	queue->rx->xsk_pool = NULL;
+	queue->tx->xsk_pool = NULL;
 
 	tsnep_rx_reopen(queue->rx);
 
-- 
2.30.2


* Re: [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-02 19:38 ` [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support Gerhard Engleder
@ 2023-04-03 17:19   ` Maciej Fijalkowski
  2023-04-03 17:26     ` Maciej Fijalkowski
  2023-04-05 18:49     ` Gerhard Engleder
  0 siblings, 2 replies; 14+ messages in thread
From: Maciej Fijalkowski @ 2023-04-03 17:19 UTC (permalink / raw)
  To: Gerhard Engleder
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On Sun, Apr 02, 2023 at 09:38:37PM +0200, Gerhard Engleder wrote:

Hey Gerhard,

> Add support for XSK zero-copy to RX path. The setup of the XSK pool can
> be done at runtime. If the netdev is running, then the queue must be
> disabled and enabled during reconfiguration. This can be done easily
> with functions introduced in previous commits.
> 
> A more important property is that, if the netdev is running, then the
> setup of the XSK pool shall not stop the netdev in case of errors. A
> broken netdev after a failed XSK pool setup is bad behavior. Therefore,
> the allocation and setup of resources during XSK pool setup is done only
> before any queue is disabled. Additionally, freeing and later allocation
> of resources is eliminated in some cases. Page pool entries are kept for
> later use. Two memory models are registered in parallel. As a result,
> the XSK pool setup cannot fail during queue reconfiguration.
> 
> In contrast to other drivers, XSK pool setup and XDP BPF program setup
> are separate actions. XSK pool setup can be done without any XDP BPF
> program. The XDP BPF program can be added, removed or changed without
> any reconfiguration of the XSK pool.

I won't argue about your design, but I'd be glad if you would present any
perf numbers (ZC vs copy mode) just to give us some overview how your
implementation works out. Also, please consider using batching APIs and
see if this gives you any boost (my assumption is that it would).

> 
> Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
> ---
>  drivers/net/ethernet/engleder/tsnep.h      |   7 +
>  drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
>  drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
>  3 files changed, 488 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/net/ethernet/engleder/tsnep.h b/drivers/net/ethernet/engleder/tsnep.h
> index 058c2bcf31a7..836fd6b1d62e 100644
> --- a/drivers/net/ethernet/engleder/tsnep.h
> +++ b/drivers/net/ethernet/engleder/tsnep.h
> @@ -101,6 +101,7 @@ struct tsnep_rx_entry {
>  	u32 properties;
>  
>  	struct page *page;
> +	struct xdp_buff *xdp;

couldn't page and xdp be a union now?

>  	size_t len;
>  	dma_addr_t dma;
>  };
> @@ -120,6 +121,7 @@ struct tsnep_rx {
>  	u32 owner_counter;
>  	int increment_owner_counter;
>  	struct page_pool *page_pool;
> +	struct xsk_buff_pool *xsk_pool;
>  
>  	u32 packets;
>  	u32 bytes;
> @@ -128,6 +130,7 @@ struct tsnep_rx {
>  	u32 alloc_failed;
>  
>  	struct xdp_rxq_info xdp_rxq;
> +	struct xdp_rxq_info xdp_rxq_zc;
>  };
>  
>  struct tsnep_queue {
> @@ -213,6 +216,8 @@ int tsnep_rxnfc_del_rule(struct tsnep_adapter *adapter,
>  
>  int tsnep_xdp_setup_prog(struct tsnep_adapter *adapter, struct bpf_prog *prog,
>  			 struct netlink_ext_ack *extack);
> +int tsnep_xdp_setup_pool(struct tsnep_adapter *adapter,
> +			 struct xsk_buff_pool *pool, u16 queue_id);
>  
>  #if IS_ENABLED(CONFIG_TSNEP_SELFTESTS)
>  int tsnep_ethtool_get_test_count(void);
> @@ -241,5 +246,7 @@ static inline void tsnep_ethtool_self_test(struct net_device *dev,
>  void tsnep_get_system_time(struct tsnep_adapter *adapter, u64 *time);
>  int tsnep_set_irq_coalesce(struct tsnep_queue *queue, u32 usecs);
>  u32 tsnep_get_irq_coalesce(struct tsnep_queue *queue);
> +int tsnep_enable_xsk(struct tsnep_queue *queue, struct xsk_buff_pool *pool);
> +void tsnep_disable_xsk(struct tsnep_queue *queue);
>  
>  #endif /* _TSNEP_H */
> diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
> index 6d63b379f05a..e05835d675aa 100644
> --- a/drivers/net/ethernet/engleder/tsnep_main.c
> +++ b/drivers/net/ethernet/engleder/tsnep_main.c
> @@ -28,11 +28,16 @@
>  #include <linux/iopoll.h>
>  #include <linux/bpf.h>
>  #include <linux/bpf_trace.h>
> +#include <net/xdp_sock_drv.h>
>  
>  #define TSNEP_RX_OFFSET (max(NET_SKB_PAD, XDP_PACKET_HEADROOM) + NET_IP_ALIGN)
>  #define TSNEP_HEADROOM ALIGN(TSNEP_RX_OFFSET, 4)
>  #define TSNEP_MAX_RX_BUF_SIZE (PAGE_SIZE - TSNEP_HEADROOM - \
>  			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
> +/* XSK buffer shall store at least Q-in-Q frame */
> +#define TSNEP_XSK_RX_BUF_SIZE (ALIGN(TSNEP_RX_INLINE_METADATA_SIZE + \
> +				     ETH_FRAME_LEN + ETH_FCS_LEN + \
> +				     VLAN_HLEN * 2, 4))
>  
>  #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>  #define DMA_ADDR_HIGH(dma_addr) ((u32)(((dma_addr) >> 32) & 0xFFFFFFFF))
> @@ -781,6 +786,9 @@ static void tsnep_rx_ring_cleanup(struct tsnep_rx *rx)
>  			page_pool_put_full_page(rx->page_pool, entry->page,
>  						false);
>  		entry->page = NULL;
> +		if (entry->xdp)
> +			xsk_buff_free(entry->xdp);
> +		entry->xdp = NULL;
>  	}
>  
>  	if (rx->page_pool)
> @@ -927,7 +935,7 @@ static void tsnep_rx_activate(struct tsnep_rx *rx, int index)
>  {
>  	struct tsnep_rx_entry *entry = &rx->entry[index];
>  
> -	/* TSNEP_MAX_RX_BUF_SIZE is a multiple of 4 */
> +	/* TSNEP_MAX_RX_BUF_SIZE and TSNEP_XSK_RX_BUF_SIZE are multiple of 4 */
>  	entry->properties = entry->len & TSNEP_DESC_LENGTH_MASK;
>  	entry->properties |= TSNEP_DESC_INTERRUPT_FLAG;
>  	if (index == rx->increment_owner_counter) {
> @@ -979,6 +987,24 @@ static int tsnep_rx_alloc(struct tsnep_rx *rx, int count, bool reuse)
>  	return i;
>  }
>  
> +static int tsnep_rx_prealloc(struct tsnep_rx *rx)
> +{
> +	struct tsnep_rx_entry *entry;
> +	int i;
> +
> +	/* prealloc all ring entries except the last one, because ring cannot be
> +	 * filled completely
> +	 */
> +	for (i = 0; i < TSNEP_RING_SIZE - 1; i++) {
> +		entry = &rx->entry[i];
> +		entry->page = page_pool_dev_alloc_pages(rx->page_pool);
> +		if (!entry->page)
> +			return -ENOMEM;
> +	}
> +
> +	return 0;
> +}
> +
>  static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
>  {
>  	int desc_refilled;
> @@ -990,22 +1016,118 @@ static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
>  	return desc_refilled;
>  }
>  
> +static void tsnep_rx_set_xdp(struct tsnep_rx *rx, struct tsnep_rx_entry *entry,
> +			     struct xdp_buff *xdp)
> +{
> +	entry->xdp = xdp;
> +	entry->len = TSNEP_XSK_RX_BUF_SIZE;
> +	entry->dma = xsk_buff_xdp_get_dma(entry->xdp);
> +	entry->desc->rx = __cpu_to_le64(entry->dma);
> +}
> +
> +static int tsnep_rx_alloc_buffer_zc(struct tsnep_rx *rx, int index)
> +{
> +	struct tsnep_rx_entry *entry = &rx->entry[index];
> +	struct xdp_buff *xdp;
> +
> +	xdp = xsk_buff_alloc(rx->xsk_pool);
> +	if (unlikely(!xdp))
> +		return -ENOMEM;
> +	tsnep_rx_set_xdp(rx, entry, xdp);
> +
> +	return 0;
> +}
> +
> +static void tsnep_rx_reuse_buffer_zc(struct tsnep_rx *rx, int index)
> +{
> +	struct tsnep_rx_entry *entry = &rx->entry[index];
> +	struct tsnep_rx_entry *read = &rx->entry[rx->read];
> +
> +	tsnep_rx_set_xdp(rx, entry, read->xdp);
> +	read->xdp = NULL;
> +}
> +
> +static int tsnep_rx_alloc_zc(struct tsnep_rx *rx, int count, bool reuse)
> +{
> +	bool alloc_failed = false;
> +	int i, index, retval;
> +
> +	for (i = 0; i < count && !alloc_failed; i++) {
> +		index = (rx->write + i) % TSNEP_RING_SIZE;

If your ring size is static (256) then you could use the trick of:

		index = (rx->write + i) & (TSNEP_RING_SIZE - 1);

since TSNEP_RING_SIZE is a power of 2. This way you avoid the modulo op in
the hot path.

> +
> +		retval = tsnep_rx_alloc_buffer_zc(rx, index);
> +		if (unlikely(retval)) {

retval is not needed. Just do:
		if (unlikely(tsnep_rx_alloc_buffer_zc(rx, index))) {

> +			rx->alloc_failed++;
> +			alloc_failed = true;
> +
> +			/* reuse only if no other allocation was successful */
> +			if (i == 0 && reuse)
> +				tsnep_rx_reuse_buffer_zc(rx, index);
> +			else
> +				break;

isn't the else branch unnecessary, as you've set alloc_failed to true,
which will break out of the loop?

> +		}
> +		tsnep_rx_activate(rx, index);
> +	}
> +
> +	if (i)
> +		rx->write = (rx->write + i) % TSNEP_RING_SIZE;
> +
> +	return i;
> +}
> +
> +static void tsnep_rx_free_zc(struct tsnep_rx *rx, struct xsk_buff_pool *pool)
> +{
> +	struct tsnep_rx_entry *entry;

can be scoped within the loop

> +	int i;
> +
> +	for (i = 0; i < TSNEP_RING_SIZE; i++) {
> +		entry = &rx->entry[i];
> +		if (entry->xdp)
> +			xsk_buff_free(entry->xdp);
> +		entry->xdp = NULL;
> +	}
> +}
> +
> +static void tsnep_rx_prealloc_zc(struct tsnep_rx *rx,
> +				 struct xsk_buff_pool *pool)
> +{
> +	struct tsnep_rx_entry *entry;

ditto

> +	int i;
> +
> +	/* prealloc all ring entries except the last one, because ring cannot be
> +	 * filled completely, as many buffers as possible is enough as wakeup is
> +	 * done if new buffers are available
> +	 */
> +	for (i = 0; i < TSNEP_RING_SIZE - 1; i++) {
> +		entry = &rx->entry[i];
> +		entry->xdp = xsk_buff_alloc(pool);

Is there anything stopping you from using xsk_buff_alloc_batch()?

> +		if (!entry->xdp)
> +			break;
> +	}
> +}
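
Roughly something like this, as an untested sketch of the batch variant
of the loop above (xsk_buff_alloc_batch() fills an array of xdp_buff
pointers and returns how many buffers it could actually allocate):

	/* scratch array for the sketch only; a real implementation would
	 * likely not put 256 pointers on the stack
	 */
	struct xdp_buff *bufs[TSNEP_RING_SIZE];
	u32 allocated, i;

	allocated = xsk_buff_alloc_batch(pool, bufs, TSNEP_RING_SIZE - 1);
	for (i = 0; i < allocated; i++)
		rx->entry[i].xdp = bufs[i];
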
> +
> +static int tsnep_rx_refill_zc(struct tsnep_rx *rx, int count, bool reuse)
> +{
> +	int desc_refilled;
> +
> +	desc_refilled = tsnep_rx_alloc_zc(rx, count, reuse);
> +	if (desc_refilled)
> +		tsnep_rx_enable(rx);
> +
> +	return desc_refilled;
> +}
> +
>  static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>  			       struct xdp_buff *xdp, int *status,
> -			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
> +			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
> +			       bool zc)
>  {
>  	unsigned int length;
> -	unsigned int sync;
>  	u32 act;
>  
>  	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>  
>  	act = bpf_prog_run_xdp(prog, xdp);
> -
> -	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
> -	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
> -	sync = max(sync, length);
> -
>  	switch (act) {
>  	case XDP_PASS:
>  		return false;
> @@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>  		trace_xdp_exception(rx->adapter->netdev, prog, act);
>  		fallthrough;
>  	case XDP_DROP:
> -		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
> -				   sync, true);
> +		if (zc) {
> +			xsk_buff_free(xdp);
> +		} else {
> +			unsigned int sync;
> +
> +			/* Due xdp_adjust_tail: DMA sync for_device cover max
> +			 * len CPU touch
> +			 */
> +			sync = xdp->data_end - xdp->data_hard_start -
> +			       XDP_PACKET_HEADROOM;
> +			sync = max(sync, length);
> +			page_pool_put_page(rx->page_pool,
> +					   virt_to_head_page(xdp->data), sync,
> +					   true);
> +		}
>  		return true;
>  	}
>  }
> @@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>  					 length, false);
>  
>  			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
> -						     &xdp_status, tx_nq, tx);
> +						     &xdp_status, tx_nq, tx,
> +						     false);
>  			if (consume) {
>  				rx->packets++;
>  				rx->bytes += length;
> @@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>  	return done;
>  }
>  
> +static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
> +			    int budget)
> +{
> +	struct tsnep_rx_entry *entry;
> +	struct netdev_queue *tx_nq;
> +	struct bpf_prog *prog;
> +	struct tsnep_tx *tx;
> +	int desc_available;
> +	int xdp_status = 0;
> +	struct page *page;
> +	int done = 0;
> +	int length;
> +
> +	desc_available = tsnep_rx_desc_available(rx);
> +	prog = READ_ONCE(rx->adapter->xdp_prog);
> +	if (prog) {
> +		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
> +					    rx->tx_queue_index);
> +		tx = &rx->adapter->tx[rx->tx_queue_index];
> +	}
> +
> +	while (likely(done < budget) && (rx->read != rx->write)) {
> +		entry = &rx->entry[rx->read];
> +		if ((__le32_to_cpu(entry->desc_wb->properties) &
> +		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
> +		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
> +			break;
> +		done++;
> +
> +		if (desc_available >= TSNEP_RING_RX_REFILL) {
> +			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
> +
> +			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
> +							     reuse);
> +			if (!entry->xdp) {
> +				/* buffer has been reused for refill to prevent
> +				 * empty RX ring, thus buffer cannot be used for
> +				 * RX processing
> +				 */
> +				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
> +				desc_available++;
> +
> +				rx->dropped++;
> +
> +				continue;
> +			}
> +		}
> +
> +		/* descriptor properties shall be read first, because valid data
> +		 * is signaled there
> +		 */
> +		dma_rmb();
> +
> +		prefetch(entry->xdp->data);
> +		length = __le32_to_cpu(entry->desc_wb->properties) &
> +			 TSNEP_DESC_LENGTH_MASK;
> +		entry->xdp->data_end = entry->xdp->data + length;
> +		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
> +
> +		/* RX metadata with timestamps is in front of actual data,
> +		 * subtract metadata size to get length of actual data and
> +		 * consider metadata size as offset of actual data during RX
> +		 * processing
> +		 */
> +		length -= TSNEP_RX_INLINE_METADATA_SIZE;
> +
> +		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
> +		desc_available++;
> +
> +		if (prog) {
> +			bool consume;
> +
> +			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
> +			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
> +
> +			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
> +						     &xdp_status, tx_nq, tx,
> +						     true);

The reason for a separate XDP run-prog routine for ZC was usually
"likely-fying" the XDP_REDIRECT action, as this is the main action for
AF_XDP, which was giving us a perf improvement. Please try this out on
your side to see if it yields any positive value.
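
Something along these lines, perhaps (untested sketch only; the
TSNEP_XDP_REDIRECT status flag and the tsnep_xdp_run_prog_act() fallback
below are placeholders, not the actual driver code):

	static bool tsnep_xdp_run_prog_zc(struct tsnep_rx *rx,
					  struct bpf_prog *prog,
					  struct xdp_buff *xdp, int *status,
					  struct netdev_queue *tx_nq,
					  struct tsnep_tx *tx)
	{
		u32 act = bpf_prog_run_xdp(prog, xdp);

		/* XDP_REDIRECT is by far the most likely action for AF_XDP,
		 * so handle it before the switch
		 */
		if (likely(act == XDP_REDIRECT)) {
			if (xdp_do_redirect(rx->adapter->netdev, xdp, prog) < 0) {
				trace_xdp_exception(rx->adapter->netdev, prog, act);
				xsk_buff_free(xdp);
				return true;
			}
			*status |= TSNEP_XDP_REDIRECT;
			return true;
		}

		/* remaining actions as in the generic routine, but with
		 * xsk_buff_free() instead of page_pool_put_page() on drop
		 */
		return tsnep_xdp_run_prog_act(rx, prog, xdp, act, status,
					      tx_nq, tx);
	}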

> +			if (consume) {
> +				rx->packets++;
> +				rx->bytes += length;
> +
> +				entry->xdp = NULL;
> +
> +				continue;
> +			}
> +		}
> +
> +		page = page_pool_dev_alloc_pages(rx->page_pool);
> +		if (page) {
> +			memcpy(page_address(page) + TSNEP_RX_OFFSET,
> +			       entry->xdp->data - TSNEP_RX_INLINE_METADATA_SIZE,
> +			       length + TSNEP_RX_INLINE_METADATA_SIZE);
> +			tsnep_rx_page(rx, napi, page, length);
> +		} else {
> +			rx->dropped++;
> +		}
> +		xsk_buff_free(entry->xdp);
> +		entry->xdp = NULL;
> +	}
> +
> +	if (xdp_status)
> +		tsnep_finalize_xdp(rx->adapter, xdp_status, tx_nq, tx);
> +
> +	if (desc_available)
> +		desc_available -= tsnep_rx_refill_zc(rx, desc_available, false);
> +
> +	if (xsk_uses_need_wakeup(rx->xsk_pool)) {
> +		if (desc_available)
> +			xsk_set_rx_need_wakeup(rx->xsk_pool);
> +		else
> +			xsk_clear_rx_need_wakeup(rx->xsk_pool);
> +
> +		return done;
> +	}
> +
> +	return desc_available ? budget : done;
> +}
>  

(...)

>  static int tsnep_mac_init(struct tsnep_adapter *adapter)
> @@ -1974,7 +2369,8 @@ static int tsnep_probe(struct platform_device *pdev)
>  
>  	netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
>  			       NETDEV_XDP_ACT_NDO_XMIT |
> -			       NETDEV_XDP_ACT_NDO_XMIT_SG;
> +			       NETDEV_XDP_ACT_NDO_XMIT_SG |
> +			       NETDEV_XDP_ACT_XSK_ZEROCOPY;
>  
>  	/* carrier off reporting is important to ethtool even BEFORE open */
>  	netif_carrier_off(netdev);
> diff --git a/drivers/net/ethernet/engleder/tsnep_xdp.c b/drivers/net/ethernet/engleder/tsnep_xdp.c
> index 4d14cb1fd772..6ec137870b59 100644
> --- a/drivers/net/ethernet/engleder/tsnep_xdp.c
> +++ b/drivers/net/ethernet/engleder/tsnep_xdp.c
> @@ -6,6 +6,8 @@
>  
>  #include "tsnep.h"
>  
> +#define TSNEP_XSK_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC)

This is not introducing any value; you can operate directly on
DMA_ATTR_SKIP_CPU_SYNC.
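
E.g. something like (assuming the map call otherwise stays the same):

	retval = xsk_pool_dma_map(pool, dmadev, DMA_ATTR_SKIP_CPU_SYNC);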

(...)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-03 17:19   ` Maciej Fijalkowski
@ 2023-04-03 17:26     ` Maciej Fijalkowski
  2023-04-05 19:13       ` Gerhard Engleder
  2023-04-05 18:49     ` Gerhard Engleder
  1 sibling, 1 reply; 14+ messages in thread
From: Maciej Fijalkowski @ 2023-04-03 17:26 UTC (permalink / raw)
  To: Gerhard Engleder
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On Mon, Apr 03, 2023 at 07:19:15PM +0200, Maciej Fijalkowski wrote:
> On Sun, Apr 02, 2023 at 09:38:37PM +0200, Gerhard Engleder wrote:
> 
> Hey Gerhard,
> 
> > Add support for XSK zero-copy to RX path. The setup of the XSK pool can
> > be done at runtime. If the netdev is running, then the queue must be
> > disabled and enabled during reconfiguration. This can be done easily
> > with functions introduced in previous commits.
> > 
> > A more important property is that, if the netdev is running, then the
> > setup of the XSK pool shall not stop the netdev in case of errors. A
> > broken netdev after a failed XSK pool setup is bad behavior. Therefore,
> > the allocation and setup of resources during XSK pool setup is done only
> > before any queue is disabled. Additionally, freeing and later allocation
> > of resources is eliminated in some cases. Page pool entries are kept for
> > later use. Two memory models are registered in parallel. As a result,
> > the XSK pool setup cannot fail during queue reconfiguration.
> > 
> > In contrast to other drivers, XSK pool setup and XDP BPF program setup
> > are separate actions. XSK pool setup can be done without any XDP BPF
> > program. The XDP BPF program can be added, removed or changed without
> > any reconfiguration of the XSK pool.
> 
> I won't argue about your design, but I'd be glad if you would present any
> perf numbers (ZC vs copy mode) just to give us some overview how your
> implementation works out. Also, please consider using batching APIs and
> see if this gives you any boost (my assumption is that it would).
> 
> > 
> > Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
> > ---
> >  drivers/net/ethernet/engleder/tsnep.h      |   7 +
> >  drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
> >  drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
> >  3 files changed, 488 insertions(+), 18 deletions(-)

(...)

> > +
> >  static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
> >  			       struct xdp_buff *xdp, int *status,
> > -			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
> > +			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
> > +			       bool zc)
> >  {
> >  	unsigned int length;
> > -	unsigned int sync;
> >  	u32 act;
> >  
> >  	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
> >  
> >  	act = bpf_prog_run_xdp(prog, xdp);
> > -
> > -	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
> > -	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
> > -	sync = max(sync, length);
> > -
> >  	switch (act) {
> >  	case XDP_PASS:
> >  		return false;
> > @@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
> >  		trace_xdp_exception(rx->adapter->netdev, prog, act);
> >  		fallthrough;
> >  	case XDP_DROP:
> > -		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
> > -				   sync, true);
> > +		if (zc) {
> > +			xsk_buff_free(xdp);
> > +		} else {
> > +			unsigned int sync;
> > +
> > +			/* Due xdp_adjust_tail: DMA sync for_device cover max
> > +			 * len CPU touch
> > +			 */
> > +			sync = xdp->data_end - xdp->data_hard_start -
> > +			       XDP_PACKET_HEADROOM;
> > +			sync = max(sync, length);
> > +			page_pool_put_page(rx->page_pool,
> > +					   virt_to_head_page(xdp->data), sync,
> > +					   true);
> > +		}
> >  		return true;
> >  	}
> >  }
> > @@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
> >  					 length, false);
> >  
> >  			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
> > -						     &xdp_status, tx_nq, tx);
> > +						     &xdp_status, tx_nq, tx,
> > +						     false);
> >  			if (consume) {
> >  				rx->packets++;
> >  				rx->bytes += length;
> > @@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
> >  	return done;
> >  }
> >  
> > +static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
> > +			    int budget)
> > +{
> > +	struct tsnep_rx_entry *entry;
> > +	struct netdev_queue *tx_nq;
> > +	struct bpf_prog *prog;
> > +	struct tsnep_tx *tx;
> > +	int desc_available;
> > +	int xdp_status = 0;
> > +	struct page *page;
> > +	int done = 0;
> > +	int length;
> > +
> > +	desc_available = tsnep_rx_desc_available(rx);
> > +	prog = READ_ONCE(rx->adapter->xdp_prog);
> > +	if (prog) {
> > +		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
> > +					    rx->tx_queue_index);
> > +		tx = &rx->adapter->tx[rx->tx_queue_index];
> > +	}
> > +
> > +	while (likely(done < budget) && (rx->read != rx->write)) {
> > +		entry = &rx->entry[rx->read];
> > +		if ((__le32_to_cpu(entry->desc_wb->properties) &
> > +		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
> > +		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
> > +			break;
> > +		done++;
> > +
> > +		if (desc_available >= TSNEP_RING_RX_REFILL) {
> > +			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
> > +
> > +			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
> > +							     reuse);
> > +			if (!entry->xdp) {
> > +				/* buffer has been reused for refill to prevent
> > +				 * empty RX ring, thus buffer cannot be used for
> > +				 * RX processing
> > +				 */
> > +				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
> > +				desc_available++;
> > +
> > +				rx->dropped++;
> > +
> > +				continue;
> > +			}
> > +		}
> > +
> > +		/* descriptor properties shall be read first, because valid data
> > +		 * is signaled there
> > +		 */
> > +		dma_rmb();
> > +
> > +		prefetch(entry->xdp->data);
> > +		length = __le32_to_cpu(entry->desc_wb->properties) &
> > +			 TSNEP_DESC_LENGTH_MASK;
> > +		entry->xdp->data_end = entry->xdp->data + length;
> > +		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
> > +
> > +		/* RX metadata with timestamps is in front of actual data,
> > +		 * subtract metadata size to get length of actual data and
> > +		 * consider metadata size as offset of actual data during RX
> > +		 * processing
> > +		 */
> > +		length -= TSNEP_RX_INLINE_METADATA_SIZE;
> > +
> > +		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
> > +		desc_available++;
> > +
> > +		if (prog) {
> > +			bool consume;
> > +
> > +			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
> > +			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
> > +
> > +			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
> > +						     &xdp_status, tx_nq, tx,
> > +						     true);
> 
> reason for separate xdp run prog routine for ZC was usually "likely-fying"
> XDP_REDIRECT action as this is the main action for AF_XDP which was giving
> us perf improvement. Please try this out on your side to see if this
> yields any positive value.

One more thing - you have to handle the XDP_TX action in a ZC-specific
way. Your current code will break if you enable an xsk_pool and return
XDP_TX from the XDP prog.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support
  2023-04-02 19:38 ` [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support Gerhard Engleder
@ 2023-04-03 22:36   ` Maciej Fijalkowski
  2023-04-05 19:15     ` Gerhard Engleder
  0 siblings, 1 reply; 14+ messages in thread
From: Maciej Fijalkowski @ 2023-04-03 22:36 UTC (permalink / raw)
  To: Gerhard Engleder
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On Sun, Apr 02, 2023 at 09:38:38PM +0200, Gerhard Engleder wrote:
> Send and complete XSK pool frames within TX NAPI context. NAPI context
> is triggered by ndo_xsk_wakeup.
> 
> Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
> ---
>  drivers/net/ethernet/engleder/tsnep.h      |   2 +
>  drivers/net/ethernet/engleder/tsnep_main.c | 131 +++++++++++++++++++--
>  2 files changed, 123 insertions(+), 10 deletions(-)
> 

(...)

> +static void tsnep_xdp_xmit_zc(struct tsnep_tx *tx)
> +{
> +	int desc_available = tsnep_tx_desc_available(tx);
> +	struct xdp_desc xdp_desc;
> +	bool xmit = false;
> +
> +	/* ensure that TX ring is not filled up by XDP, always MAX_SKB_FRAGS
> +	 * will be available for normal TX path and queue is stopped there if
> +	 * necessary
> +	 */
> +	if (desc_available <= (MAX_SKB_FRAGS + 1))
> +		return;
> +	desc_available -= MAX_SKB_FRAGS + 1;
> +
> +	while (xsk_tx_peek_desc(tx->xsk_pool, &xdp_desc) && desc_available--) {

Again, I am curious how batch API usage would improve your perf.

> +		tsnep_xdp_xmit_frame_ring_zc(&xdp_desc, tx);
> +		xmit = true;
> +	}
> +
> +	if (xmit) {
> +		tsnep_xdp_xmit_flush(tx);
> +		xsk_tx_release(tx->xsk_pool);
> +	}
> +}
> +

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-03 17:19   ` Maciej Fijalkowski
  2023-04-03 17:26     ` Maciej Fijalkowski
@ 2023-04-05 18:49     ` Gerhard Engleder
  1 sibling, 0 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-05 18:49 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On 03.04.23 19:19, Maciej Fijalkowski wrote:

Hello Maciej,

>> Add support for XSK zero-copy to RX path. The setup of the XSK pool can
>> be done at runtime. If the netdev is running, then the queue must be
>> disabled and enabled during reconfiguration. This can be done easily
>> with functions introduced in previous commits.
>>
>> A more important property is that, if the netdev is running, then the
>> setup of the XSK pool shall not stop the netdev in case of errors. A
>> broken netdev after a failed XSK pool setup is bad behavior. Therefore,
>> the allocation and setup of resources during XSK pool setup is done only
>> before any queue is disabled. Additionally, freeing and later allocation
>> of resources is eliminated in some cases. Page pool entries are kept for
>> later use. Two memory models are registered in parallel. As a result,
>> the XSK pool setup cannot fail during queue reconfiguration.
>>
>> In contrast to other drivers, XSK pool setup and XDP BPF program setup
>> are separate actions. XSK pool setup can be done without any XDP BPF
>> program. The XDP BPF program can be added, removed or changed without
>> any reconfiguration of the XSK pool.
> 
> I won't argue about your design, but I'd be glad if you would present any
> perf numbers (ZC vs copy mode) just to give us some overview how your
> implementation works out. Also, please consider using batching APIs and
> see if this gives you any boost (my assumption is that it would).

I will add some numbers about ZC vs copy mode.

I assume the batching API consists of xsk_tx_peek_release_desc_batch()
and xsk_buff_alloc_batch(). I will give it a try and measure the
difference.

>>
>> Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
>> ---
>>   drivers/net/ethernet/engleder/tsnep.h      |   7 +
>>   drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
>>   drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
>>   3 files changed, 488 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/engleder/tsnep.h b/drivers/net/ethernet/engleder/tsnep.h
>> index 058c2bcf31a7..836fd6b1d62e 100644
>> --- a/drivers/net/ethernet/engleder/tsnep.h
>> +++ b/drivers/net/ethernet/engleder/tsnep.h
>> @@ -101,6 +101,7 @@ struct tsnep_rx_entry {
>>   	u32 properties;
>>   
>>   	struct page *page;
>> +	struct xdp_buff *xdp;
> 
> couldn't page and xdp be a union now?

No, because the pages are not freed during XSK pool setup. At least
some pages remain referenced by this pointer during XSK pool setup.
Therefore, when switching back to the page pool, those pages can be used
again without the risk of failed allocations.

>>   	size_t len;
>>   	dma_addr_t dma;
>>   };
>> @@ -120,6 +121,7 @@ struct tsnep_rx {
>>   	u32 owner_counter;
>>   	int increment_owner_counter;
>>   	struct page_pool *page_pool;
>> +	struct xsk_buff_pool *xsk_pool;
>>   
>>   	u32 packets;
>>   	u32 bytes;
>> @@ -128,6 +130,7 @@ struct tsnep_rx {
>>   	u32 alloc_failed;
>>   
>>   	struct xdp_rxq_info xdp_rxq;
>> +	struct xdp_rxq_info xdp_rxq_zc;
>>   };
>>   
>>   struct tsnep_queue {
>> @@ -213,6 +216,8 @@ int tsnep_rxnfc_del_rule(struct tsnep_adapter *adapter,
>>   
>>   int tsnep_xdp_setup_prog(struct tsnep_adapter *adapter, struct bpf_prog *prog,
>>   			 struct netlink_ext_ack *extack);
>> +int tsnep_xdp_setup_pool(struct tsnep_adapter *adapter,
>> +			 struct xsk_buff_pool *pool, u16 queue_id);
>>   
>>   #if IS_ENABLED(CONFIG_TSNEP_SELFTESTS)
>>   int tsnep_ethtool_get_test_count(void);
>> @@ -241,5 +246,7 @@ static inline void tsnep_ethtool_self_test(struct net_device *dev,
>>   void tsnep_get_system_time(struct tsnep_adapter *adapter, u64 *time);
>>   int tsnep_set_irq_coalesce(struct tsnep_queue *queue, u32 usecs);
>>   u32 tsnep_get_irq_coalesce(struct tsnep_queue *queue);
>> +int tsnep_enable_xsk(struct tsnep_queue *queue, struct xsk_buff_pool *pool);
>> +void tsnep_disable_xsk(struct tsnep_queue *queue);
>>   
>>   #endif /* _TSNEP_H */
>> diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
>> index 6d63b379f05a..e05835d675aa 100644
>> --- a/drivers/net/ethernet/engleder/tsnep_main.c
>> +++ b/drivers/net/ethernet/engleder/tsnep_main.c
>> @@ -28,11 +28,16 @@
>>   #include <linux/iopoll.h>
>>   #include <linux/bpf.h>
>>   #include <linux/bpf_trace.h>
>> +#include <net/xdp_sock_drv.h>
>>   
>>   #define TSNEP_RX_OFFSET (max(NET_SKB_PAD, XDP_PACKET_HEADROOM) + NET_IP_ALIGN)
>>   #define TSNEP_HEADROOM ALIGN(TSNEP_RX_OFFSET, 4)
>>   #define TSNEP_MAX_RX_BUF_SIZE (PAGE_SIZE - TSNEP_HEADROOM - \
>>   			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
>> +/* XSK buffer shall store at least Q-in-Q frame */
>> +#define TSNEP_XSK_RX_BUF_SIZE (ALIGN(TSNEP_RX_INLINE_METADATA_SIZE + \
>> +				     ETH_FRAME_LEN + ETH_FCS_LEN + \
>> +				     VLAN_HLEN * 2, 4))
>>   
>>   #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>>   #define DMA_ADDR_HIGH(dma_addr) ((u32)(((dma_addr) >> 32) & 0xFFFFFFFF))
>> @@ -781,6 +786,9 @@ static void tsnep_rx_ring_cleanup(struct tsnep_rx *rx)
>>   			page_pool_put_full_page(rx->page_pool, entry->page,
>>   						false);
>>   		entry->page = NULL;
>> +		if (entry->xdp)
>> +			xsk_buff_free(entry->xdp);
>> +		entry->xdp = NULL;
>>   	}
>>   
>>   	if (rx->page_pool)
>> @@ -927,7 +935,7 @@ static void tsnep_rx_activate(struct tsnep_rx *rx, int index)
>>   {
>>   	struct tsnep_rx_entry *entry = &rx->entry[index];
>>   
>> -	/* TSNEP_MAX_RX_BUF_SIZE is a multiple of 4 */
>> +	/* TSNEP_MAX_RX_BUF_SIZE and TSNEP_XSK_RX_BUF_SIZE are multiple of 4 */
>>   	entry->properties = entry->len & TSNEP_DESC_LENGTH_MASK;
>>   	entry->properties |= TSNEP_DESC_INTERRUPT_FLAG;
>>   	if (index == rx->increment_owner_counter) {
>> @@ -979,6 +987,24 @@ static int tsnep_rx_alloc(struct tsnep_rx *rx, int count, bool reuse)
>>   	return i;
>>   }
>>   
>> +static int tsnep_rx_prealloc(struct tsnep_rx *rx)
>> +{
>> +	struct tsnep_rx_entry *entry;
>> +	int i;
>> +
>> +	/* prealloc all ring entries except the last one, because ring cannot be
>> +	 * filled completely
>> +	 */
>> +	for (i = 0; i < TSNEP_RING_SIZE - 1; i++) {
>> +		entry = &rx->entry[i];
>> +		entry->page = page_pool_dev_alloc_pages(rx->page_pool);
>> +		if (!entry->page)
>> +			return -ENOMEM;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
>>   {
>>   	int desc_refilled;
>> @@ -990,22 +1016,118 @@ static int tsnep_rx_refill(struct tsnep_rx *rx, int count, bool reuse)
>>   	return desc_refilled;
>>   }
>>   
>> +static void tsnep_rx_set_xdp(struct tsnep_rx *rx, struct tsnep_rx_entry *entry,
>> +			     struct xdp_buff *xdp)
>> +{
>> +	entry->xdp = xdp;
>> +	entry->len = TSNEP_XSK_RX_BUF_SIZE;
>> +	entry->dma = xsk_buff_xdp_get_dma(entry->xdp);
>> +	entry->desc->rx = __cpu_to_le64(entry->dma);
>> +}
>> +
>> +static int tsnep_rx_alloc_buffer_zc(struct tsnep_rx *rx, int index)
>> +{
>> +	struct tsnep_rx_entry *entry = &rx->entry[index];
>> +	struct xdp_buff *xdp;
>> +
>> +	xdp = xsk_buff_alloc(rx->xsk_pool);
>> +	if (unlikely(!xdp))
>> +		return -ENOMEM;
>> +	tsnep_rx_set_xdp(rx, entry, xdp);
>> +
>> +	return 0;
>> +}
>> +
>> +static void tsnep_rx_reuse_buffer_zc(struct tsnep_rx *rx, int index)
>> +{
>> +	struct tsnep_rx_entry *entry = &rx->entry[index];
>> +	struct tsnep_rx_entry *read = &rx->entry[rx->read];
>> +
>> +	tsnep_rx_set_xdp(rx, entry, read->xdp);
>> +	read->xdp = NULL;
>> +}
>> +
>> +static int tsnep_rx_alloc_zc(struct tsnep_rx *rx, int count, bool reuse)
>> +{
>> +	bool alloc_failed = false;
>> +	int i, index, retval;
>> +
>> +	for (i = 0; i < count && !alloc_failed; i++) {
>> +		index = (rx->write + i) % TSNEP_RING_SIZE;
> 
> If your ring size is static (256) then you could use the trick of:
> 
> 		index = (rx->write + i) & (TSNEP_RING_SIZE - 1);
> 
> since TSNEP_RING_SIZE is of power of 2. This way you avoid modulo op in
> hotpath.

I assumed that the compiler does that work for me. The same modulo
operation is used in the existing TX/RX path. I will rework all
locations.

>> +
>> +		retval = tsnep_rx_alloc_buffer_zc(rx, index);
>> +		if (unlikely(retval)) {
> 
> retval is not needed. just do:
> 		if (unlikely(tsnep_rx_alloc_buffer_zc(rx, index))) {

I will eliminate retval.

>> +			rx->alloc_failed++;
>> +			alloc_failed = true;
>> +
>> +			/* reuse only if no other allocation was successful */
>> +			if (i == 0 && reuse)
>> +				tsnep_rx_reuse_buffer_zc(rx, index);
>> +			else
>> +				break;
> 
> isn't the else branch not needed as you've set the alloc_failed to true
> which will break out the loop?

No, because tsnep_rx_activate() shall not be called in this case.

>> +		}
>> +		tsnep_rx_activate(rx, index);
>> +	}
>> +
>> +	if (i)
>> +		rx->write = (rx->write + i) % TSNEP_RING_SIZE;
>> +
>> +	return i;
>> +}
>> +
>> +static void tsnep_rx_free_zc(struct tsnep_rx *rx, struct xsk_buff_pool *pool)
>> +{
>> +	struct tsnep_rx_entry *entry;
> 
> can be scoped within loop

Will be done.

>> +	int i;
>> +
>> +	for (i = 0; i < TSNEP_RING_SIZE; i++) {
>> +		entry = &rx->entry[i];
>> +		if (entry->xdp)
>> +			xsk_buff_free(entry->xdp);
>> +		entry->xdp = NULL;
>> +	}
>> +}
>> +
>> +static void tsnep_rx_prealloc_zc(struct tsnep_rx *rx,
>> +				 struct xsk_buff_pool *pool)
>> +{
>> +	struct tsnep_rx_entry *entry;
> 
> ditto

Will be done.

>> +	int i;
>> +
>> +	/* prealloc all ring entries except the last one, because ring cannot be
>> +	 * filled completely, as many buffers as possible is enough as wakeup is
>> +	 * done if new buffers are available
>> +	 */
>> +	for (i = 0; i < TSNEP_RING_SIZE - 1; i++) {
>> +		entry = &rx->entry[i];
>> +		entry->xdp = xsk_buff_alloc(pool);
> 
> anything stops you from using xsk_buff_alloc_batch() ?

I will use xsk_buff_alloc_batch().

>> +		if (!entry->xdp)
>> +			break;
>> +	}
>> +}
>> +
>> +static int tsnep_rx_refill_zc(struct tsnep_rx *rx, int count, bool reuse)
>> +{
>> +	int desc_refilled;
>> +
>> +	desc_refilled = tsnep_rx_alloc_zc(rx, count, reuse);
>> +	if (desc_refilled)
>> +		tsnep_rx_enable(rx);
>> +
>> +	return desc_refilled;
>> +}
>> +
>>   static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>>   			       struct xdp_buff *xdp, int *status,
>> -			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
>> +			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
>> +			       bool zc)
>>   {
>>   	unsigned int length;
>> -	unsigned int sync;
>>   	u32 act;
>>   
>>   	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>>   
>>   	act = bpf_prog_run_xdp(prog, xdp);
>> -
>> -	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
>> -	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>> -	sync = max(sync, length);
>> -
>>   	switch (act) {
>>   	case XDP_PASS:
>>   		return false;
>> @@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>>   		trace_xdp_exception(rx->adapter->netdev, prog, act);
>>   		fallthrough;
>>   	case XDP_DROP:
>> -		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
>> -				   sync, true);
>> +		if (zc) {
>> +			xsk_buff_free(xdp);
>> +		} else {
>> +			unsigned int sync;
>> +
>> +			/* Due xdp_adjust_tail: DMA sync for_device cover max
>> +			 * len CPU touch
>> +			 */
>> +			sync = xdp->data_end - xdp->data_hard_start -
>> +			       XDP_PACKET_HEADROOM;
>> +			sync = max(sync, length);
>> +			page_pool_put_page(rx->page_pool,
>> +					   virt_to_head_page(xdp->data), sync,
>> +					   true);
>> +		}
>>   		return true;
>>   	}
>>   }
>> @@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>>   					 length, false);
>>   
>>   			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
>> -						     &xdp_status, tx_nq, tx);
>> +						     &xdp_status, tx_nq, tx,
>> +						     false);
>>   			if (consume) {
>>   				rx->packets++;
>>   				rx->bytes += length;
>> @@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>>   	return done;
>>   }
>>   
>> +static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
>> +			    int budget)
>> +{
>> +	struct tsnep_rx_entry *entry;
>> +	struct netdev_queue *tx_nq;
>> +	struct bpf_prog *prog;
>> +	struct tsnep_tx *tx;
>> +	int desc_available;
>> +	int xdp_status = 0;
>> +	struct page *page;
>> +	int done = 0;
>> +	int length;
>> +
>> +	desc_available = tsnep_rx_desc_available(rx);
>> +	prog = READ_ONCE(rx->adapter->xdp_prog);
>> +	if (prog) {
>> +		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
>> +					    rx->tx_queue_index);
>> +		tx = &rx->adapter->tx[rx->tx_queue_index];
>> +	}
>> +
>> +	while (likely(done < budget) && (rx->read != rx->write)) {
>> +		entry = &rx->entry[rx->read];
>> +		if ((__le32_to_cpu(entry->desc_wb->properties) &
>> +		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
>> +		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
>> +			break;
>> +		done++;
>> +
>> +		if (desc_available >= TSNEP_RING_RX_REFILL) {
>> +			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
>> +
>> +			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
>> +							     reuse);
>> +			if (!entry->xdp) {
>> +				/* buffer has been reused for refill to prevent
>> +				 * empty RX ring, thus buffer cannot be used for
>> +				 * RX processing
>> +				 */
>> +				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
>> +				desc_available++;
>> +
>> +				rx->dropped++;
>> +
>> +				continue;
>> +			}
>> +		}
>> +
>> +		/* descriptor properties shall be read first, because valid data
>> +		 * is signaled there
>> +		 */
>> +		dma_rmb();
>> +
>> +		prefetch(entry->xdp->data);
>> +		length = __le32_to_cpu(entry->desc_wb->properties) &
>> +			 TSNEP_DESC_LENGTH_MASK;
>> +		entry->xdp->data_end = entry->xdp->data + length;
>> +		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
>> +
>> +		/* RX metadata with timestamps is in front of actual data,
>> +		 * subtract metadata size to get length of actual data and
>> +		 * consider metadata size as offset of actual data during RX
>> +		 * processing
>> +		 */
>> +		length -= TSNEP_RX_INLINE_METADATA_SIZE;
>> +
>> +		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
>> +		desc_available++;
>> +
>> +		if (prog) {
>> +			bool consume;
>> +
>> +			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
>> +			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
>> +
>> +			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
>> +						     &xdp_status, tx_nq, tx,
>> +						     true);
> 
> reason for separate xdp run prog routine for ZC was usually "likely-fying"
> XDP_REDIRECT action as this is the main action for AF_XDP which was giving
> us perf improvement. Please try this out on your side to see if this
> yields any positive value.

I will try it.

>> +			if (consume) {
>> +				rx->packets++;
>> +				rx->bytes += length;
>> +
>> +				entry->xdp = NULL;
>> +
>> +				continue;
>> +			}
>> +		}
>> +
>> +		page = page_pool_dev_alloc_pages(rx->page_pool);
>> +		if (page) {
>> +			memcpy(page_address(page) + TSNEP_RX_OFFSET,
>> +			       entry->xdp->data - TSNEP_RX_INLINE_METADATA_SIZE,
>> +			       length + TSNEP_RX_INLINE_METADATA_SIZE);
>> +			tsnep_rx_page(rx, napi, page, length);
>> +		} else {
>> +			rx->dropped++;
>> +		}
>> +		xsk_buff_free(entry->xdp);
>> +		entry->xdp = NULL;
>> +	}
>> +
>> +	if (xdp_status)
>> +		tsnep_finalize_xdp(rx->adapter, xdp_status, tx_nq, tx);
>> +
>> +	if (desc_available)
>> +		desc_available -= tsnep_rx_refill_zc(rx, desc_available, false);
>> +
>> +	if (xsk_uses_need_wakeup(rx->xsk_pool)) {
>> +		if (desc_available)
>> +			xsk_set_rx_need_wakeup(rx->xsk_pool);
>> +		else
>> +			xsk_clear_rx_need_wakeup(rx->xsk_pool);
>> +
>> +		return done;
>> +	}
>> +
>> +	return desc_available ? budget : done;
>> +}
>>   
> 
> (...)
> 
>>   static int tsnep_mac_init(struct tsnep_adapter *adapter)
>> @@ -1974,7 +2369,8 @@ static int tsnep_probe(struct platform_device *pdev)
>>   
>>   	netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
>>   			       NETDEV_XDP_ACT_NDO_XMIT |
>> -			       NETDEV_XDP_ACT_NDO_XMIT_SG;
>> +			       NETDEV_XDP_ACT_NDO_XMIT_SG |
>> +			       NETDEV_XDP_ACT_XSK_ZEROCOPY;
>>   
>>   	/* carrier off reporting is important to ethtool even BEFORE open */
>>   	netif_carrier_off(netdev);
>> diff --git a/drivers/net/ethernet/engleder/tsnep_xdp.c b/drivers/net/ethernet/engleder/tsnep_xdp.c
>> index 4d14cb1fd772..6ec137870b59 100644
>> --- a/drivers/net/ethernet/engleder/tsnep_xdp.c
>> +++ b/drivers/net/ethernet/engleder/tsnep_xdp.c
>> @@ -6,6 +6,8 @@
>>   
>>   #include "tsnep.h"
>>   
>> +#define TSNEP_XSK_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC)
> 
> This is not introducing any value, you can operate directly on
> DMA_ATTR_SKIP_CPU_SYNC

Will be done.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-03 17:26     ` Maciej Fijalkowski
@ 2023-04-05 19:13       ` Gerhard Engleder
  2023-04-07 13:41         ` Maciej Fijalkowski
  0 siblings, 1 reply; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-05 19:13 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On 03.04.23 19:26, Maciej Fijalkowski wrote:
> On Mon, Apr 03, 2023 at 07:19:15PM +0200, Maciej Fijalkowski wrote:
>> On Sun, Apr 02, 2023 at 09:38:37PM +0200, Gerhard Engleder wrote:
>>
>> Hey Gerhard,
>>
>>> Add support for XSK zero-copy to RX path. The setup of the XSK pool can
>>> be done at runtime. If the netdev is running, then the queue must be
>>> disabled and enabled during reconfiguration. This can be done easily
>>> with functions introduced in previous commits.
>>>
>>> A more important property is that, if the netdev is running, then the
>>> setup of the XSK pool shall not stop the netdev in case of errors. A
>>> broken netdev after a failed XSK pool setup is bad behavior. Therefore,
>>> the allocation and setup of resources during XSK pool setup is done only
>>> before any queue is disabled. Additionally, freeing and later allocation
>>> of resources is eliminated in some cases. Page pool entries are kept for
>>> later use. Two memory models are registered in parallel. As a result,
>>> the XSK pool setup cannot fail during queue reconfiguration.
>>>
>>> In contrast to other drivers, XSK pool setup and XDP BPF program setup
>>> are separate actions. XSK pool setup can be done without any XDP BPF
>>> program. The XDP BPF program can be added, removed or changed without
>>> any reconfiguration of the XSK pool.
>>
>> I won't argue about your design, but I'd be glad if you would present any
>> perf numbers (ZC vs copy mode) just to give us some overview how your
>> implementation works out. Also, please consider using batching APIs and
>> see if this gives you any boost (my assumption is that it would).
>>
>>>
>>> Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
>>> ---
>>>   drivers/net/ethernet/engleder/tsnep.h      |   7 +
>>>   drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
>>>   drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
>>>   3 files changed, 488 insertions(+), 18 deletions(-)
> 
> (...)
> 
>>> +
>>>   static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>>>   			       struct xdp_buff *xdp, int *status,
>>> -			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
>>> +			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
>>> +			       bool zc)
>>>   {
>>>   	unsigned int length;
>>> -	unsigned int sync;
>>>   	u32 act;
>>>   
>>>   	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>>>   
>>>   	act = bpf_prog_run_xdp(prog, xdp);
>>> -
>>> -	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
>>> -	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>>> -	sync = max(sync, length);
>>> -
>>>   	switch (act) {
>>>   	case XDP_PASS:
>>>   		return false;
>>> @@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>>>   		trace_xdp_exception(rx->adapter->netdev, prog, act);
>>>   		fallthrough;
>>>   	case XDP_DROP:
>>> -		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
>>> -				   sync, true);
>>> +		if (zc) {
>>> +			xsk_buff_free(xdp);
>>> +		} else {
>>> +			unsigned int sync;
>>> +
>>> +			/* Due xdp_adjust_tail: DMA sync for_device cover max
>>> +			 * len CPU touch
>>> +			 */
>>> +			sync = xdp->data_end - xdp->data_hard_start -
>>> +			       XDP_PACKET_HEADROOM;
>>> +			sync = max(sync, length);
>>> +			page_pool_put_page(rx->page_pool,
>>> +					   virt_to_head_page(xdp->data), sync,
>>> +					   true);
>>> +		}
>>>   		return true;
>>>   	}
>>>   }
>>> @@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>>>   					 length, false);
>>>   
>>>   			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
>>> -						     &xdp_status, tx_nq, tx);
>>> +						     &xdp_status, tx_nq, tx,
>>> +						     false);
>>>   			if (consume) {
>>>   				rx->packets++;
>>>   				rx->bytes += length;
>>> @@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>>>   	return done;
>>>   }
>>>   
>>> +static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
>>> +			    int budget)
>>> +{
>>> +	struct tsnep_rx_entry *entry;
>>> +	struct netdev_queue *tx_nq;
>>> +	struct bpf_prog *prog;
>>> +	struct tsnep_tx *tx;
>>> +	int desc_available;
>>> +	int xdp_status = 0;
>>> +	struct page *page;
>>> +	int done = 0;
>>> +	int length;
>>> +
>>> +	desc_available = tsnep_rx_desc_available(rx);
>>> +	prog = READ_ONCE(rx->adapter->xdp_prog);
>>> +	if (prog) {
>>> +		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
>>> +					    rx->tx_queue_index);
>>> +		tx = &rx->adapter->tx[rx->tx_queue_index];
>>> +	}
>>> +
>>> +	while (likely(done < budget) && (rx->read != rx->write)) {
>>> +		entry = &rx->entry[rx->read];
>>> +		if ((__le32_to_cpu(entry->desc_wb->properties) &
>>> +		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
>>> +		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
>>> +			break;
>>> +		done++;
>>> +
>>> +		if (desc_available >= TSNEP_RING_RX_REFILL) {
>>> +			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
>>> +
>>> +			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
>>> +							     reuse);
>>> +			if (!entry->xdp) {
>>> +				/* buffer has been reused for refill to prevent
>>> +				 * empty RX ring, thus buffer cannot be used for
>>> +				 * RX processing
>>> +				 */
>>> +				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
>>> +				desc_available++;
>>> +
>>> +				rx->dropped++;
>>> +
>>> +				continue;
>>> +			}
>>> +		}
>>> +
>>> +		/* descriptor properties shall be read first, because valid data
>>> +		 * is signaled there
>>> +		 */
>>> +		dma_rmb();
>>> +
>>> +		prefetch(entry->xdp->data);
>>> +		length = __le32_to_cpu(entry->desc_wb->properties) &
>>> +			 TSNEP_DESC_LENGTH_MASK;
>>> +		entry->xdp->data_end = entry->xdp->data + length;
>>> +		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
>>> +
>>> +		/* RX metadata with timestamps is in front of actual data,
>>> +		 * subtract metadata size to get length of actual data and
>>> +		 * consider metadata size as offset of actual data during RX
>>> +		 * processing
>>> +		 */
>>> +		length -= TSNEP_RX_INLINE_METADATA_SIZE;
>>> +
>>> +		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
>>> +		desc_available++;
>>> +
>>> +		if (prog) {
>>> +			bool consume;
>>> +
>>> +			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
>>> +			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
>>> +
>>> +			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
>>> +						     &xdp_status, tx_nq, tx,
>>> +						     true);
>>
>> reason for separate xdp run prog routine for ZC was usually "likely-fying"
>> XDP_REDIRECT action as this is the main action for AF_XDP which was giving
>> us perf improvement. Please try this out on your side to see if this
>> yields any positive value.
> 
> One more thing - you have to handle XDP_TX action in a ZC specific way.
> Your current code will break if you enable xsk_pool and return XDP_TX from
> XDP prog.

I took another look at igc, but I didn't find any specifics for XDP_TX
ZC. Only some buffer flipping, which I assume is needed for shared
pages.
For ice I see a call to xdp_convert_buff_to_frame() in the ZC path, which
has some XSK logic within. Is this the ZC-specific way? igc calls
xdp_convert_buff_to_frame() in both cases, so I'm not sure. But I will
try the XDP_TX action. I have only tested with xdpsock.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support
  2023-04-03 22:36   ` Maciej Fijalkowski
@ 2023-04-05 19:15     ` Gerhard Engleder
  0 siblings, 0 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-05 19:15 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On 04.04.23 00:36, Maciej Fijalkowski wrote:
> On Sun, Apr 02, 2023 at 09:38:38PM +0200, Gerhard Engleder wrote:
>> Send and complete XSK pool frames within TX NAPI context. NAPI context
>> is triggered by ndo_xsk_wakeup.
>>
>> Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
>> ---
>>   drivers/net/ethernet/engleder/tsnep.h      |   2 +
>>   drivers/net/ethernet/engleder/tsnep_main.c | 131 +++++++++++++++++++--
>>   2 files changed, 123 insertions(+), 10 deletions(-)
>>
> 
> (...)
> 
>> +static void tsnep_xdp_xmit_zc(struct tsnep_tx *tx)
>> +{
>> +	int desc_available = tsnep_tx_desc_available(tx);
>> +	struct xdp_desc xdp_desc;
>> +	bool xmit = false;
>> +
>> +	/* ensure that TX ring is not filled up by XDP, always MAX_SKB_FRAGS
>> +	 * will be available for normal TX path and queue is stopped there if
>> +	 * necessary
>> +	 */
>> +	if (desc_available <= (MAX_SKB_FRAGS + 1))
>> +		return;
>> +	desc_available -= MAX_SKB_FRAGS + 1;
>> +
>> +	while (xsk_tx_peek_desc(tx->xsk_pool, &xdp_desc) && desc_available--) {
> 
> Again, I am curious how batch API usage would improve your perf.

I will try xsk_tx_peek_release_desc_batch().
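
A first rough sketch of how that could look (untested; it assumes the
pool's tx_descs array is used the same way as in other ZC drivers):

	static void tsnep_xdp_xmit_zc(struct tsnep_tx *tx)
	{
		int desc_available = tsnep_tx_desc_available(tx);
		struct xdp_desc *descs = tx->xsk_pool->tx_descs;
		int nb_pkts, i;

		/* keep MAX_SKB_FRAGS + 1 descriptors free for the normal TX
		 * path, as before
		 */
		if (desc_available <= (MAX_SKB_FRAGS + 1))
			return;
		desc_available -= MAX_SKB_FRAGS + 1;

		/* peek and release in one call, no xsk_tx_release() needed */
		nb_pkts = xsk_tx_peek_release_desc_batch(tx->xsk_pool,
							 desc_available);
		if (!nb_pkts)
			return;

		for (i = 0; i < nb_pkts; i++)
			tsnep_xdp_xmit_frame_ring_zc(&descs[i], tx);

		tsnep_xdp_xmit_flush(tx);
	}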

Thank you for the review!

Gerhard

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-05 19:13       ` Gerhard Engleder
@ 2023-04-07 13:41         ` Maciej Fijalkowski
  2023-04-15 14:19           ` Gerhard Engleder
  0 siblings, 1 reply; 14+ messages in thread
From: Maciej Fijalkowski @ 2023-04-07 13:41 UTC (permalink / raw)
  To: Gerhard Engleder
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On Wed, Apr 05, 2023 at 09:13:58PM +0200, Gerhard Engleder wrote:
> On 03.04.23 19:26, Maciej Fijalkowski wrote:
> > On Mon, Apr 03, 2023 at 07:19:15PM +0200, Maciej Fijalkowski wrote:
> > > On Sun, Apr 02, 2023 at 09:38:37PM +0200, Gerhard Engleder wrote:
> > > 
> > > Hey Gerhard,
> > > 
> > > > Add support for XSK zero-copy to RX path. The setup of the XSK pool can
> > > > be done at runtime. If the netdev is running, then the queue must be
> > > > disabled and enabled during reconfiguration. This can be done easily
> > > > with functions introduced in previous commits.
> > > > 
> > > > A more important property is that, if the netdev is running, then the
> > > > setup of the XSK pool shall not stop the netdev in case of errors. A
> > > > broken netdev after a failed XSK pool setup is bad behavior. Therefore,
> > > > the allocation and setup of resources during XSK pool setup is done only
> > > > before any queue is disabled. Additionally, freeing and later allocation
> > > > of resources is eliminated in some cases. Page pool entries are kept for
> > > > later use. Two memory models are registered in parallel. As a result,
> > > > the XSK pool setup cannot fail during queue reconfiguration.
> > > > 
> > > > In contrast to other drivers, XSK pool setup and XDP BPF program setup
> > > > are separate actions. XSK pool setup can be done without any XDP BPF
> > > > program. The XDP BPF program can be added, removed or changed without
> > > > any reconfiguration of the XSK pool.
> > > 
> > > I won't argue about your design, but I'd be glad if you would present any
> > > perf numbers (ZC vs copy mode) just to give us some overview how your
> > > implementation works out. Also, please consider using batching APIs and
> > > see if this gives you any boost (my assumption is that it would).
> > > 
> > > > 
> > > > Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
> > > > ---
> > > >   drivers/net/ethernet/engleder/tsnep.h      |   7 +
> > > >   drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
> > > >   drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
> > > >   3 files changed, 488 insertions(+), 18 deletions(-)
> > 
> > (...)
> > 
> > > > +
> > > >   static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
> > > >   			       struct xdp_buff *xdp, int *status,
> > > > -			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
> > > > +			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
> > > > +			       bool zc)
> > > >   {
> > > >   	unsigned int length;
> > > > -	unsigned int sync;
> > > >   	u32 act;
> > > >   	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
> > > >   	act = bpf_prog_run_xdp(prog, xdp);
> > > > -
> > > > -	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
> > > > -	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
> > > > -	sync = max(sync, length);
> > > > -
> > > >   	switch (act) {
> > > >   	case XDP_PASS:
> > > >   		return false;
> > > > @@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
> > > >   		trace_xdp_exception(rx->adapter->netdev, prog, act);
> > > >   		fallthrough;
> > > >   	case XDP_DROP:
> > > > -		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
> > > > -				   sync, true);
> > > > +		if (zc) {
> > > > +			xsk_buff_free(xdp);
> > > > +		} else {
> > > > +			unsigned int sync;
> > > > +
> > > > +			/* Due xdp_adjust_tail: DMA sync for_device cover max
> > > > +			 * len CPU touch
> > > > +			 */
> > > > +			sync = xdp->data_end - xdp->data_hard_start -
> > > > +			       XDP_PACKET_HEADROOM;
> > > > +			sync = max(sync, length);
> > > > +			page_pool_put_page(rx->page_pool,
> > > > +					   virt_to_head_page(xdp->data), sync,
> > > > +					   true);
> > > > +		}
> > > >   		return true;
> > > >   	}
> > > >   }
> > > > @@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
> > > >   					 length, false);
> > > >   			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
> > > > -						     &xdp_status, tx_nq, tx);
> > > > +						     &xdp_status, tx_nq, tx,
> > > > +						     false);
> > > >   			if (consume) {
> > > >   				rx->packets++;
> > > >   				rx->bytes += length;
> > > > @@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
> > > >   	return done;
> > > >   }
> > > > +static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
> > > > +			    int budget)
> > > > +{
> > > > +	struct tsnep_rx_entry *entry;
> > > > +	struct netdev_queue *tx_nq;
> > > > +	struct bpf_prog *prog;
> > > > +	struct tsnep_tx *tx;
> > > > +	int desc_available;
> > > > +	int xdp_status = 0;
> > > > +	struct page *page;
> > > > +	int done = 0;
> > > > +	int length;
> > > > +
> > > > +	desc_available = tsnep_rx_desc_available(rx);
> > > > +	prog = READ_ONCE(rx->adapter->xdp_prog);
> > > > +	if (prog) {
> > > > +		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
> > > > +					    rx->tx_queue_index);
> > > > +		tx = &rx->adapter->tx[rx->tx_queue_index];
> > > > +	}
> > > > +
> > > > +	while (likely(done < budget) && (rx->read != rx->write)) {
> > > > +		entry = &rx->entry[rx->read];
> > > > +		if ((__le32_to_cpu(entry->desc_wb->properties) &
> > > > +		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
> > > > +		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
> > > > +			break;
> > > > +		done++;
> > > > +
> > > > +		if (desc_available >= TSNEP_RING_RX_REFILL) {
> > > > +			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
> > > > +
> > > > +			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
> > > > +							     reuse);
> > > > +			if (!entry->xdp) {
> > > > +				/* buffer has been reused for refill to prevent
> > > > +				 * empty RX ring, thus buffer cannot be used for
> > > > +				 * RX processing
> > > > +				 */
> > > > +				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
> > > > +				desc_available++;
> > > > +
> > > > +				rx->dropped++;
> > > > +
> > > > +				continue;
> > > > +			}
> > > > +		}
> > > > +
> > > > +		/* descriptor properties shall be read first, because valid data
> > > > +		 * is signaled there
> > > > +		 */
> > > > +		dma_rmb();
> > > > +
> > > > +		prefetch(entry->xdp->data);
> > > > +		length = __le32_to_cpu(entry->desc_wb->properties) &
> > > > +			 TSNEP_DESC_LENGTH_MASK;
> > > > +		entry->xdp->data_end = entry->xdp->data + length;
> > > > +		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
> > > > +
> > > > +		/* RX metadata with timestamps is in front of actual data,
> > > > +		 * subtract metadata size to get length of actual data and
> > > > +		 * consider metadata size as offset of actual data during RX
> > > > +		 * processing
> > > > +		 */
> > > > +		length -= TSNEP_RX_INLINE_METADATA_SIZE;
> > > > +
> > > > +		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
> > > > +		desc_available++;
> > > > +
> > > > +		if (prog) {
> > > > +			bool consume;
> > > > +
> > > > +			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
> > > > +			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
> > > > +
> > > > +			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
> > > > +						     &xdp_status, tx_nq, tx,
> > > > +						     true);
> > > 
> > > reason for separate xdp run prog routine for ZC was usually "likely-fying"
> > > XDP_REDIRECT action as this is the main action for AF_XDP which was giving
> > > us perf improvement. Please try this out on your side to see if this
> > > yields any positive value.
> > 
> > One more thing - you have to handle XDP_TX action in a ZC specific way.
> > Your current code will break if you enable xsk_pool and return XDP_TX from
> > XDP prog.
> 
> I took again a look to igc, but I didn't found any specifics for XDP_TX
> ZC. Only some buffer flipping, which I assume is needed for shared
> pages.
> For ice I see a call to xdp_convert_buff_to_frame() in ZC path, which
> has some XSK logic within. Is this the ZC specific way? igc calls
> xdp_convert_buff_to_frame() in both cases, so I'm not sure. But I will
> try the XDP_TX action. I did test only with xdpsock.

I think I will back off a bit from my statement that your XDP_TX is
clearly broken; here's why.

igc, when converting an xdp_buff to an xdp_frame while the xdp_buff's
memory is backed by an xsk_buff_pool, will grab a new page from the
kernel, copy the contents of the xdp_buff to it, recycle the xdp_buff
back to the xsk_buff_pool and return the new page back to the driver (I
have just described what xdp_convert_zc_to_xdp_frame() is doing). The
thing is that this is expensive and hurts perf, so we stepped away from
it in ice; instead it is a matter of storing the xdp_buff in an adequate
Tx buffer struct that you can access while cleaning Tx descriptors, so
that you'll be able to xsk_buff_free() it.
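
As a hypothetical sketch of that completion side (the entry->xdp field
is not an existing tsnep field, just an illustration of keeping the
buffer around until Tx clean):

	static void tsnep_tx_unmap_zc(struct tsnep_tx_entry *entry)
	{
		if (entry->xdp) {
			/* give the buffer back to the XSK pool */
			xsk_buff_free(entry->xdp);
			entry->xdp = NULL;
		}
	}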

So saying 'your current code will break' might have been too much from my
side. Just make sure that your XDP_TX action on ZC works. In order to do
that, I was loading xdpsock with the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD
flag on a queue receiving frames and then running xdp_rxq_info with the
XDP_TX action.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support
  2023-04-07 13:41         ` Maciej Fijalkowski
@ 2023-04-15 14:19           ` Gerhard Engleder
  0 siblings, 0 replies; 14+ messages in thread
From: Gerhard Engleder @ 2023-04-15 14:19 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: netdev, bpf, davem, kuba, edumazet, pabeni, bjorn,
	magnus.karlsson, jonathan.lemon

On 07.04.23 15:41, Maciej Fijalkowski wrote:
> On Wed, Apr 05, 2023 at 09:13:58PM +0200, Gerhard Engleder wrote:
>> On 03.04.23 19:26, Maciej Fijalkowski wrote:
>>> On Mon, Apr 03, 2023 at 07:19:15PM +0200, Maciej Fijalkowski wrote:
>>>> On Sun, Apr 02, 2023 at 09:38:37PM +0200, Gerhard Engleder wrote:
>>>>
>>>> Hey Gerhard,
>>>>
>>>>> Add support for XSK zero-copy to RX path. The setup of the XSK pool can
>>>>> be done at runtime. If the netdev is running, then the queue must be
>>>>> disabled and enabled during reconfiguration. This can be done easily
>>>>> with functions introduced in previous commits.
>>>>>
>>>>> A more important property is that, if the netdev is running, then the
>>>>> setup of the XSK pool shall not stop the netdev in case of errors. A
>>>>> broken netdev after a failed XSK pool setup is bad behavior. Therefore,
>>>>> the allocation and setup of resources during XSK pool setup is done only
>>>>> before any queue is disabled. Additionally, freeing and later allocation
>>>>> of resources is eliminated in some cases. Page pool entries are kept for
>>>>> later use. Two memory models are registered in parallel. As a result,
>>>>> the XSK pool setup cannot fail during queue reconfiguration.
>>>>>
>>>>> In contrast to other drivers, XSK pool setup and XDP BPF program setup
>>>>> are separate actions. XSK pool setup can be done without any XDP BPF
>>>>> program. The XDP BPF program can be added, removed or changed without
>>>>> any reconfiguration of the XSK pool.
>>>>
>>>> I won't argue about your design, but I'd be glad if you would present any
>>>> perf numbers (ZC vs copy mode) just to give us some overview how your
>>>> implementation works out. Also, please consider using batching APIs and
>>>> see if this gives you any boost (my assumption is that it would).
>>>>
>>>>>
>>>>> Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
>>>>> ---
>>>>>    drivers/net/ethernet/engleder/tsnep.h      |   7 +
>>>>>    drivers/net/ethernet/engleder/tsnep_main.c | 432 ++++++++++++++++++++-
>>>>>    drivers/net/ethernet/engleder/tsnep_xdp.c  |  67 ++++
>>>>>    3 files changed, 488 insertions(+), 18 deletions(-)
>>>
>>> (...)
>>>
>>>>> +
>>>>>    static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>>>>>    			       struct xdp_buff *xdp, int *status,
>>>>> -			       struct netdev_queue *tx_nq, struct tsnep_tx *tx)
>>>>> +			       struct netdev_queue *tx_nq, struct tsnep_tx *tx,
>>>>> +			       bool zc)
>>>>>    {
>>>>>    	unsigned int length;
>>>>> -	unsigned int sync;
>>>>>    	u32 act;
>>>>>    	length = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>>>>>    	act = bpf_prog_run_xdp(prog, xdp);
>>>>> -
>>>>> -	/* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
>>>>> -	sync = xdp->data_end - xdp->data_hard_start - XDP_PACKET_HEADROOM;
>>>>> -	sync = max(sync, length);
>>>>> -
>>>>>    	switch (act) {
>>>>>    	case XDP_PASS:
>>>>>    		return false;
>>>>> @@ -1027,8 +1149,21 @@ static bool tsnep_xdp_run_prog(struct tsnep_rx *rx, struct bpf_prog *prog,
>>>>>    		trace_xdp_exception(rx->adapter->netdev, prog, act);
>>>>>    		fallthrough;
>>>>>    	case XDP_DROP:
>>>>> -		page_pool_put_page(rx->page_pool, virt_to_head_page(xdp->data),
>>>>> -				   sync, true);
>>>>> +		if (zc) {
>>>>> +			xsk_buff_free(xdp);
>>>>> +		} else {
>>>>> +			unsigned int sync;
>>>>> +
>>>>> +			/* Due xdp_adjust_tail: DMA sync for_device cover max
>>>>> +			 * len CPU touch
>>>>> +			 */
>>>>> +			sync = xdp->data_end - xdp->data_hard_start -
>>>>> +			       XDP_PACKET_HEADROOM;
>>>>> +			sync = max(sync, length);
>>>>> +			page_pool_put_page(rx->page_pool,
>>>>> +					   virt_to_head_page(xdp->data), sync,
>>>>> +					   true);
>>>>> +		}
>>>>>    		return true;
>>>>>    	}
>>>>>    }
>>>>> @@ -1181,7 +1316,8 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>>>>>    					 length, false);
>>>>>    			consume = tsnep_xdp_run_prog(rx, prog, &xdp,
>>>>> -						     &xdp_status, tx_nq, tx);
>>>>> +						     &xdp_status, tx_nq, tx,
>>>>> +						     false);
>>>>>    			if (consume) {
>>>>>    				rx->packets++;
>>>>>    				rx->bytes += length;
>>>>> @@ -1205,6 +1341,125 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
>>>>>    	return done;
>>>>>    }
>>>>> +static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
>>>>> +			    int budget)
>>>>> +{
>>>>> +	struct tsnep_rx_entry *entry;
>>>>> +	struct netdev_queue *tx_nq;
>>>>> +	struct bpf_prog *prog;
>>>>> +	struct tsnep_tx *tx;
>>>>> +	int desc_available;
>>>>> +	int xdp_status = 0;
>>>>> +	struct page *page;
>>>>> +	int done = 0;
>>>>> +	int length;
>>>>> +
>>>>> +	desc_available = tsnep_rx_desc_available(rx);
>>>>> +	prog = READ_ONCE(rx->adapter->xdp_prog);
>>>>> +	if (prog) {
>>>>> +		tx_nq = netdev_get_tx_queue(rx->adapter->netdev,
>>>>> +					    rx->tx_queue_index);
>>>>> +		tx = &rx->adapter->tx[rx->tx_queue_index];
>>>>> +	}
>>>>> +
>>>>> +	while (likely(done < budget) && (rx->read != rx->write)) {
>>>>> +		entry = &rx->entry[rx->read];
>>>>> +		if ((__le32_to_cpu(entry->desc_wb->properties) &
>>>>> +		     TSNEP_DESC_OWNER_COUNTER_MASK) !=
>>>>> +		    (entry->properties & TSNEP_DESC_OWNER_COUNTER_MASK))
>>>>> +			break;
>>>>> +		done++;
>>>>> +
>>>>> +		if (desc_available >= TSNEP_RING_RX_REFILL) {
>>>>> +			bool reuse = desc_available >= TSNEP_RING_RX_REUSE;
>>>>> +
>>>>> +			desc_available -= tsnep_rx_refill_zc(rx, desc_available,
>>>>> +							     reuse);
>>>>> +			if (!entry->xdp) {
>>>>> +				/* buffer has been reused for refill to prevent
>>>>> +				 * empty RX ring, thus buffer cannot be used for
>>>>> +				 * RX processing
>>>>> +				 */
>>>>> +				rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
>>>>> +				desc_available++;
>>>>> +
>>>>> +				rx->dropped++;
>>>>> +
>>>>> +				continue;
>>>>> +			}
>>>>> +		}
>>>>> +
>>>>> +		/* descriptor properties shall be read first, because valid data
>>>>> +		 * is signaled there
>>>>> +		 */
>>>>> +		dma_rmb();
>>>>> +
>>>>> +		prefetch(entry->xdp->data);
>>>>> +		length = __le32_to_cpu(entry->desc_wb->properties) &
>>>>> +			 TSNEP_DESC_LENGTH_MASK;
>>>>> +		entry->xdp->data_end = entry->xdp->data + length;
>>>>> +		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
>>>>> +
>>>>> +		/* RX metadata with timestamps is in front of actual data,
>>>>> +		 * subtract metadata size to get length of actual data and
>>>>> +		 * consider metadata size as offset of actual data during RX
>>>>> +		 * processing
>>>>> +		 */
>>>>> +		length -= TSNEP_RX_INLINE_METADATA_SIZE;
>>>>> +
>>>>> +		rx->read = (rx->read + 1) % TSNEP_RING_SIZE;
>>>>> +		desc_available++;
>>>>> +
>>>>> +		if (prog) {
>>>>> +			bool consume;
>>>>> +
>>>>> +			entry->xdp->data += TSNEP_RX_INLINE_METADATA_SIZE;
>>>>> +			entry->xdp->data_meta += TSNEP_RX_INLINE_METADATA_SIZE;
>>>>> +
>>>>> +			consume = tsnep_xdp_run_prog(rx, prog, entry->xdp,
>>>>> +						     &xdp_status, tx_nq, tx,
>>>>> +						     true);
>>>>
>>>> The reason for a separate XDP run-prog routine for ZC was usually to mark
>>>> the XDP_REDIRECT action as likely(), since this is the main action for
>>>> AF_XDP, and that gave us a perf improvement. Please try this out on your
>>>> side to see if it yields any positive value.
>>>
>>> One more thing - you have to handle the XDP_TX action in a ZC-specific way.
>>> Your current code will break if you enable an xsk_pool and return XDP_TX
>>> from the XDP prog.
>>
>> I took another look at igc, but I didn't find anything specific to XDP_TX
>> in ZC mode. Only some buffer flipping, which I assume is needed for shared
>> pages. For ice I see a call to xdp_convert_buff_to_frame() in the ZC path,
>> which has some XSK logic inside. Is this the ZC-specific way? igc calls
>> xdp_convert_buff_to_frame() in both cases, so I'm not sure. But I will try
>> the XDP_TX action; so far I have only tested with xdpsock.
> 
> I will back off a bit from the statement that your XDP_TX is clearly
> broken; here's why.
> 
> When igc converts an xdp_buff whose memory is backed by an xsk_buff_pool
> into an xdp_frame, it grabs a new page from the kernel, copies the
> contents of the xdp_buff into it, recycles the xdp_buff back to the
> xsk_buff_pool and returns the new page to the driver (I have just
> described what xdp_convert_zc_to_xdp_frame() is doing). The thing is that
> this is expensive and hurts perf, so we stepped away from it on ice.
> There it is a matter of storing the xdp_buff in an adequate Tx buffer
> struct that you can access while cleaning Tx descriptors, so that you'll
> be able to xsk_buff_free() it.
> 
> So saying 'your current code will break' might have been too much on my
> side. Just make sure that your XDP_TX action on ZC works. To test that,
> I was loading xdpsock with the XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD flag
> on a queue receiving frames and then running xdp_rxq_info with the XDP_TX
> action.

I took a look at the new ice code. It makes sense to avoid that copy;
I took a note for future work. For now, XDP_REDIRECT is the path being
optimized.

Tested as suggested. XDP_TX works.
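
For reference, a minimal, untested sketch of the ice-style XDP_TX handling
for the ZC path discussed above. The tsnep-side names used here (entry->xdp,
entry->type, TSNEP_TX_TYPE_XSK_TX and the tsnep_tx_desc_available()/
tsnep_tx_activate() helpers) are assumptions for illustration only, not
existing driver code; the point is that the buffer stays in the XSK pool and
is returned with xsk_buff_free() at TX completion instead of being copied
into an xdp_frame:

	/* Sketch only: XDP_TX for the zero-copy RX path, modeled after ice.
	 * entry->xdp, entry->type and TSNEP_TX_TYPE_XSK_TX are assumed
	 * fields/flags, they do not exist in tsnep today.
	 */
	static bool tsnep_xdp_xmit_zc(struct tsnep_tx *tx, struct xdp_buff *xdp)
	{
		struct tsnep_tx_entry *entry;
		dma_addr_t dma;
		u32 length;

		if (!tsnep_tx_desc_available(tx))	/* assumed helper */
			return false;

		length = xdp->data_end - xdp->data;

		entry = &tx->entry[tx->write];
		entry->xdp = xdp;			/* keep for TX completion */
		entry->type = TSNEP_TX_TYPE_XSK_TX;

		/* no copy: the buffer stays in the XSK pool, only its DMA
		 * address goes into the TX descriptor
		 */
		dma = xsk_buff_xdp_get_dma(xdp);
		xsk_buff_raw_dma_sync_for_device(tx->xsk_pool, dma, length);

		/* fill the TX descriptor and activate it as in the copy path,
		 * e.g. via an assumed tsnep_tx_activate(tx, tx->write, length, true)
		 */
		tx->write = (tx->write + 1) % TSNEP_RING_SIZE;

		return true;
	}

	/* In the TX completion path the buffer would then be returned to the
	 * XSK pool instead of unmapping and freeing a page:
	 */
		if (entry->type == TSNEP_TX_TYPE_XSK_TX)
			xsk_buff_free(entry->xdp);

The "likely-fying" suggestion above would be a separate change: a ZC-specific
variant of tsnep_xdp_run_prog() could check for XDP_REDIRECT with likely()
before falling back to the switch, since redirect is the dominant action for
AF_XDP traffic.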

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2023-04-15 14:20 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-02 19:38 [PATCH net-next 0/5] tsnep: XDP socket zero-copy support Gerhard Engleder
2023-04-02 19:38 ` [PATCH net-next 1/5] tsnep: Rework TX/RX queue initialization Gerhard Engleder
2023-04-02 19:38 ` [PATCH net-next 2/5] tsnep: Add functions for queue enable/disable Gerhard Engleder
2023-04-02 19:38 ` [PATCH net-next 3/5] tsnep: Move skb receive action to separate function Gerhard Engleder
2023-04-02 19:38 ` [PATCH net-next 4/5] tsnep: Add XDP socket zero-copy RX support Gerhard Engleder
2023-04-03 17:19   ` Maciej Fijalkowski
2023-04-03 17:26     ` Maciej Fijalkowski
2023-04-05 19:13       ` Gerhard Engleder
2023-04-07 13:41         ` Maciej Fijalkowski
2023-04-15 14:19           ` Gerhard Engleder
2023-04-05 18:49     ` Gerhard Engleder
2023-04-02 19:38 ` [PATCH net-next 5/5] tsnep: Add XDP socket zero-copy TX support Gerhard Engleder
2023-04-03 22:36   ` Maciej Fijalkowski
2023-04-05 19:15     ` Gerhard Engleder

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).