All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v3 0/4] GVE Raw Addressing.
@ 2020-09-24  1:01 David Awogbemila
  2020-09-24  1:01 ` [PATCH net-next v3 1/4] gve: Add support for raw addressing device option David Awogbemila
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: David Awogbemila @ 2020-09-24  1:01 UTC (permalink / raw)
  To: netdev; +Cc: David Awogbemila

Changes from v2:
Patch 1: Get end of struct more simply: descriptor + 1, no need to cast void pointers.
Patch 2: Separate page recycling to its own commit (patch 3 in v3).
Patch 4 (patch 3 in v2): Fix alignment of gve_dma_sync_for_device params.

Catherine Sullivan (3):
  gve: Add support for raw addressing device option
  gve: Add support for raw addressing to the rx path
  gve: Add support for raw addressing in the tx path

David Awogbemila (1):
  gve: Rx Buffer Recycling

 drivers/net/ethernet/google/gve/gve.h        |  40 ++-
 drivers/net/ethernet/google/gve/gve_adminq.c |  64 +++-
 drivers/net/ethernet/google/gve/gve_adminq.h |  15 +-
 drivers/net/ethernet/google/gve/gve_desc.h   |  18 +-
 drivers/net/ethernet/google/gve/gve_main.c   |  12 +-
 drivers/net/ethernet/google/gve/gve_rx.c     | 350 ++++++++++++++-----
 drivers/net/ethernet/google/gve/gve_tx.c     | 205 +++++++++--
 7 files changed, 554 insertions(+), 150 deletions(-)

-- 
2.28.0.681.g6f77f65b4e-goog


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH net-next v3 1/4] gve: Add support for raw addressing device option
  2020-09-24  1:01 [PATCH net-next v3 0/4] GVE Raw Addressing David Awogbemila
@ 2020-09-24  1:01 ` David Awogbemila
  2020-09-24 23:03   ` Jakub Kicinski
  2020-09-24  1:01 ` [PATCH net-next v3 2/4] gve: Add support for raw addressing to the rx path David Awogbemila
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 14+ messages in thread
From: David Awogbemila @ 2020-09-24  1:01 UTC (permalink / raw)
  To: netdev; +Cc: Catherine Sullivan, Yangchun Fu, David Awogbemila

From: Catherine Sullivan <csully@google.com>

Add support to describe device for parsing device options. As
the first device option, add raw addressing.

"Raw Addressing" mode (as opposed to the current "qpl" mode) is an
operational mode which allows the driver avoid bounce buffer copies
which it currently performs using pre-allocated qpls (queue_page_lists)
when sending and receiving packets.
For egress packets, the provided skb data addresses will be dma_map'ed and
passed to the device, allowing the NIC can perform DMA directly - the
driver will not have to copy the buffer content into pre-allocated
buffers/qpls (as in qpl mode).
For ingress packets, copies are also eliminated as buffers are handed to
the networking stack and then recycled or re-allocated as
necessary, avoiding the use of skb_copy_to_linear_data().

This patch only introduces the option to the driver.
Subsequent patches will add the ingress and egress functionality.

Reviewed-by: Yangchun Fu <yangchun@google.com>
Signed-off-by: Catherine Sullivan <csully@google.com>
Signed-off-by: David Awogbemila <awogbemila@google.com>
---
 drivers/net/ethernet/google/gve/gve.h        |  1 +
 drivers/net/ethernet/google/gve/gve_adminq.c | 46 ++++++++++++++++++++
 drivers/net/ethernet/google/gve/gve_adminq.h | 15 +++++--
 drivers/net/ethernet/google/gve/gve_main.c   |  9 ++++
 4 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index f5c80229ea96..80cdae06ee39 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -199,6 +199,7 @@ struct gve_priv {
 	u64 num_registered_pages; /* num pages registered with NIC */
 	u32 rx_copybreak; /* copy packets smaller than this */
 	u16 default_num_queues; /* default num queues to set up */
+	bool raw_addressing; /* true if this dev supports raw addressing */
 
 	struct gve_queue_config tx_cfg;
 	struct gve_queue_config rx_cfg;
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index 24ae6a28a806..37221e11b5da 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -460,11 +460,14 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
 int gve_adminq_describe_device(struct gve_priv *priv)
 {
 	struct gve_device_descriptor *descriptor;
+	struct gve_device_option *dev_opt;
 	union gve_adminq_command cmd;
 	dma_addr_t descriptor_bus;
+	u16 num_options;
 	int err = 0;
 	u8 *mac;
 	u16 mtu;
+	int i;
 
 	memset(&cmd, 0, sizeof(cmd));
 	descriptor = dma_alloc_coherent(&priv->pdev->dev, PAGE_SIZE,
@@ -518,6 +521,49 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 		priv->rx_desc_cnt = priv->rx_pages_per_qpl;
 	}
 	priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
+	dev_opt = (void *)(descriptor + 1);
+
+	num_options = be16_to_cpu(descriptor->num_device_options);
+	for (i = 0; i < num_options; i++) {
+		u16 option_length = be16_to_cpu(dev_opt->option_length);
+		u16 option_id = be16_to_cpu(dev_opt->option_id);
+
+		if ((void *)dev_opt + sizeof(*dev_opt) + option_length > (void *)descriptor +
+				      be16_to_cpu(descriptor->total_length)) {
+			dev_err(&priv->dev->dev,
+				"options exceed device_descriptor's total length.\n");
+			err = -EINVAL;
+			goto free_device_descriptor;
+		}
+
+		switch (option_id) {
+		case GVE_DEV_OPT_ID_RAW_ADDRESSING:
+			/* If the length or feature mask doesn't match,
+			 * continue without enabling the feature.
+			 */
+			if (option_length != GVE_DEV_OPT_LEN_RAW_ADDRESSING ||
+			    be32_to_cpu(dev_opt->feat_mask) !=
+			    GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING) {
+				dev_info(&priv->pdev->dev,
+					 "Raw addressing device option not enabled, length or features mask did not match expected.\n");
+				priv->raw_addressing = false;
+			} else {
+				dev_info(&priv->pdev->dev,
+					 "Raw addressing device option enabled.\n");
+				priv->raw_addressing = true;
+			}
+			break;
+		default:
+			/* If we don't recognize the option just continue
+			 * without doing anything.
+			 */
+			dev_info(&priv->pdev->dev,
+				 "Unrecognized device option 0x%hx not enabled.\n",
+				   option_id);
+			break;
+		}
+		dev_opt = (void *)dev_opt + sizeof(*dev_opt) + option_length;
+	}
 
 free_device_descriptor:
 	dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor,
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h
index 281de8326bc5..af5f586167bd 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.h
+++ b/drivers/net/ethernet/google/gve/gve_adminq.h
@@ -79,12 +79,17 @@ struct gve_device_descriptor {
 
 static_assert(sizeof(struct gve_device_descriptor) == 40);
 
-struct device_option {
-	__be32 option_id;
-	__be32 option_length;
+struct gve_device_option {
+	__be16 option_id;
+	__be16 option_length;
+	__be32 feat_mask;
 };
 
-static_assert(sizeof(struct device_option) == 8);
+static_assert(sizeof(struct gve_device_option) == 8);
+
+#define GVE_DEV_OPT_ID_RAW_ADDRESSING 0x1
+#define GVE_DEV_OPT_LEN_RAW_ADDRESSING 0x0
+#define GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING 0x0
 
 struct gve_adminq_configure_device_resources {
 	__be64 counter_array;
@@ -111,6 +116,8 @@ struct gve_adminq_unregister_page_list {
 
 static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4);
 
+#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF
+
 struct gve_adminq_create_tx_queue {
 	__be32 queue_id;
 	__be32 reserved;
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 48a433154ce0..70685c10db0e 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -678,6 +678,10 @@ static int gve_alloc_qpls(struct gve_priv *priv)
 	int i, j;
 	int err;
 
+	/* Raw addressing means no QPLs */
+	if (priv->raw_addressing)
+		return 0;
+
 	priv->qpls = kvzalloc(num_qpls * sizeof(*priv->qpls), GFP_KERNEL);
 	if (!priv->qpls)
 		return -ENOMEM;
@@ -718,6 +722,10 @@ static void gve_free_qpls(struct gve_priv *priv)
 	int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv);
 	int i;
 
+	/* Raw addressing means no QPLs */
+	if (priv->raw_addressing)
+		return;
+
 	kvfree(priv->qpl_cfg.qpl_id_map);
 
 	for (i = 0; i < num_qpls; i++)
@@ -1078,6 +1086,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
 	if (skip_describe_device)
 		goto setup_device;
 
+	priv->raw_addressing = false;
 	/* Get the initial information we need from the device */
 	err = gve_adminq_describe_device(priv);
 	if (err) {
-- 
2.28.0.681.g6f77f65b4e-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH net-next v3 2/4] gve: Add support for raw addressing to the rx path
  2020-09-24  1:01 [PATCH net-next v3 0/4] GVE Raw Addressing David Awogbemila
  2020-09-24  1:01 ` [PATCH net-next v3 1/4] gve: Add support for raw addressing device option David Awogbemila
@ 2020-09-24  1:01 ` David Awogbemila
  2020-09-24  1:01 ` [PATCH net-next v3 3/4] gve: Rx Buffer Recycling David Awogbemila
  2020-09-24  1:01 ` [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path David Awogbemila
  3 siblings, 0 replies; 14+ messages in thread
From: David Awogbemila @ 2020-09-24  1:01 UTC (permalink / raw)
  To: netdev; +Cc: Catherine Sullivan, Yangchun Fu, David Awogbemila

From: Catherine Sullivan <csully@google.com>

Add support to use raw dma addresses in the rx path. Due to this new
support we can alloc a new buffer instead of making a copy.

RX buffers are handed to the networking stack and are
re-allocated as needed, avoiding the need to use
skb_copy_to_linear_data() as in "qpl" mode.

Reviewed-by: Yangchun Fu <yangchun@google.com>
Signed-off-by: Catherine Sullivan <csully@google.com>
Signed-off-by: David Awogbemila <awogbemila@google.com>
---
 drivers/net/ethernet/google/gve/gve.h        |  11 +-
 drivers/net/ethernet/google/gve/gve_adminq.c |  14 +-
 drivers/net/ethernet/google/gve/gve_desc.h   |  10 +-
 drivers/net/ethernet/google/gve/gve_main.c   |   3 +-
 drivers/net/ethernet/google/gve/gve_rx.c     | 185 +++++++++++++++----
 5 files changed, 177 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index 80cdae06ee39..b853efb0b17f 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -68,6 +68,7 @@ struct gve_rx_data_queue {
 	dma_addr_t data_bus; /* dma mapping of the slots */
 	struct gve_rx_slot_page_info *page_info; /* page info of the buffers */
 	struct gve_queue_page_list *qpl; /* qpl assigned to this queue */
+	bool raw_addressing; /* use raw_addressing? */
 };
 
 struct gve_priv;
@@ -82,11 +83,14 @@ struct gve_rx_ring {
 	u32 cnt; /* free-running total number of completed packets */
 	u32 fill_cnt; /* free-running total number of descs and buffs posted */
 	u32 mask; /* masks the cnt and fill_cnt to the size of the ring */
+	u32 db_threshold; /* threshold for posting new buffs and descs */
 	u64 rx_copybreak_pkt; /* free-running count of copybreak packets */
 	u64 rx_copied_pkt; /* free-running total number of copied packets */
 	u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */
 	u64 rx_buf_alloc_fail; /* free-running count of buffer alloc fails */
 	u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */
+	/* free-running count of packets dropped because of lack of buffer refill */
+	u64 rx_no_refill_dropped_pkt;
 	u32 q_num; /* queue index */
 	u32 ntfy_id; /* notification block index */
 	struct gve_queue_resources *q_resources; /* head and tail pointer idx */
@@ -194,7 +198,7 @@ struct gve_priv {
 	u16 tx_desc_cnt; /* num desc per ring */
 	u16 rx_desc_cnt; /* num desc per ring */
 	u16 tx_pages_per_qpl; /* tx buffer length */
-	u16 rx_pages_per_qpl; /* rx buffer length */
+	u16 rx_data_slot_cnt; /* rx buffer length */
 	u64 max_registered_pages;
 	u64 num_registered_pages; /* num pages registered with NIC */
 	u32 rx_copybreak; /* copy packets smaller than this */
@@ -444,7 +448,10 @@ static inline u32 gve_num_tx_qpls(struct gve_priv *priv)
  */
 static inline u32 gve_num_rx_qpls(struct gve_priv *priv)
 {
-	return priv->rx_cfg.num_queues;
+	if (priv->raw_addressing)
+		return 0;
+	else
+		return priv->rx_cfg.num_queues;
 }
 
 /* Returns a pointer to the next available tx qpl in the list of qpls
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index 37221e11b5da..094146e5cc22 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -357,8 +357,10 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
 {
 	struct gve_rx_ring *rx = &priv->rx[queue_index];
 	union gve_adminq_command cmd;
+	u32 qpl_id;
 	int err;
 
+	qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id;
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);
 	cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
@@ -369,7 +371,7 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
 		.queue_resources_addr = cpu_to_be64(rx->q_resources_bus),
 		.rx_desc_ring_addr = cpu_to_be64(rx->desc.bus),
 		.rx_data_ring_addr = cpu_to_be64(rx->data.data_bus),
-		.queue_page_list_id = cpu_to_be32(rx->data.qpl->id),
+		.queue_page_list_id = cpu_to_be32(qpl_id),
 	};
 
 	err = gve_adminq_issue_cmd(priv, &cmd);
@@ -514,11 +516,11 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 	mac = descriptor->mac;
 	dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac);
 	priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl);
-	priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl);
-	if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) {
-		dev_err(&priv->pdev->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
-			priv->rx_pages_per_qpl);
-		priv->rx_desc_cnt = priv->rx_pages_per_qpl;
+	priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl);
+	if (priv->rx_data_slot_cnt < priv->rx_desc_cnt) {
+		dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
+			priv->rx_data_slot_cnt);
+		priv->rx_desc_cnt = priv->rx_data_slot_cnt;
 	}
 	priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
 	dev_opt = (void *)(descriptor + 1);
diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h
index 54779871d52e..0aad314aefaf 100644
--- a/drivers/net/ethernet/google/gve/gve_desc.h
+++ b/drivers/net/ethernet/google/gve/gve_desc.h
@@ -72,12 +72,14 @@ struct gve_rx_desc {
 } __packed;
 static_assert(sizeof(struct gve_rx_desc) == 64);
 
-/* As with the Tx ring format, the qpl_offset entries below are offsets into an
- * ordered list of registered pages.
+/* If the device supports raw dma addressing then the addr in data slot is
+ * the dma address of the buffer.
+ * If the device only supports registered segments than the addr is a byte
+ * offset into the registered segment (an ordered list of pages) where the
+ * buffer is.
  */
 struct gve_rx_data_slot {
-	/* byte offset into the rx registered segment of this slot */
-	__be64 qpl_offset;
+	__be64 addr;
 };
 
 /* GVE Recive Packet Descriptor Seq No */
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 70685c10db0e..225e17dd1ae5 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -596,6 +596,7 @@ int gve_alloc_page(struct gve_priv *priv, struct device *dev,
 	if (dma_mapping_error(dev, *dma)) {
 		priv->dma_mapping_error++;
 		put_page(*page);
+		*page = NULL;
 		return -ENOMEM;
 	}
 	return 0;
@@ -694,7 +695,7 @@ static int gve_alloc_qpls(struct gve_priv *priv)
 	}
 	for (; i < num_qpls; i++) {
 		err = gve_alloc_queue_page_list(priv, i,
-						priv->rx_pages_per_qpl);
+						priv->rx_data_slot_cnt);
 		if (err)
 			goto free_qpls;
 	}
diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index 008fa897a3e6..ae76d2547d13 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -6,6 +6,7 @@
 
 #include "gve.h"
 #include "gve_adminq.h"
+#include "linux/device-mapper.h"
 #include <linux/etherdevice.h>
 
 static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx)
@@ -16,12 +17,22 @@ static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx)
 	block->rx = NULL;
 }
 
+static void gve_rx_free_buffer(struct device *dev,
+			       struct gve_rx_slot_page_info *page_info,
+			       struct gve_rx_data_slot *data_slot)
+{
+	dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) -
+				      page_info->page_offset);
+
+	gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
+}
+
 static void gve_rx_free_ring(struct gve_priv *priv, int idx)
 {
 	struct gve_rx_ring *rx = &priv->rx[idx];
 	struct device *dev = &priv->pdev->dev;
+	u32 slots = rx->mask + 1;
 	size_t bytes;
-	u32 slots;
 
 	gve_rx_remove_from_block(priv, idx);
 
@@ -33,11 +44,18 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx)
 			  rx->q_resources, rx->q_resources_bus);
 	rx->q_resources = NULL;
 
-	gve_unassign_qpl(priv, rx->data.qpl->id);
-	rx->data.qpl = NULL;
+	if (rx->data.raw_addressing) {
+		int i;
+
+		for (i = 0; i < slots; i++)
+			gve_rx_free_buffer(dev, &rx->data.page_info[i],
+					   &rx->data.data_ring[i]);
+	} else {
+		gve_unassign_qpl(priv, rx->data.qpl->id);
+		rx->data.qpl = NULL;
+	}
 	kvfree(rx->data.page_info);
 
-	slots = rx->mask + 1;
 	bytes = sizeof(*rx->data.data_ring) * slots;
 	dma_free_coherent(dev, bytes, rx->data.data_ring,
 			  rx->data.data_bus);
@@ -52,13 +70,14 @@ static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
 	page_info->page = page;
 	page_info->page_offset = 0;
 	page_info->page_address = page_address(page);
-	slot->qpl_offset = cpu_to_be64(addr);
+	slot->addr = cpu_to_be64(addr);
 }
 
 static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
 {
 	struct gve_priv *priv = rx->gve;
 	u32 slots;
+	int err;
 	int i;
 
 	/* Allocate one page per Rx queue slot. Each page is split into two
@@ -71,12 +90,30 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
 	if (!rx->data.page_info)
 		return -ENOMEM;
 
-	rx->data.qpl = gve_assign_rx_qpl(priv);
-
+	if (!rx->data.raw_addressing)
+		rx->data.qpl = gve_assign_rx_qpl(priv);
 	for (i = 0; i < slots; i++) {
-		struct page *page = rx->data.qpl->pages[i];
-		dma_addr_t addr = i * PAGE_SIZE;
+		struct page *page;
+		dma_addr_t addr;
+
+		if (rx->data.raw_addressing) {
+			err = gve_alloc_page(priv, &priv->pdev->dev, &page,
+					     &addr, DMA_FROM_DEVICE);
+			if (err) {
+				int j;
 
+				u64_stats_update_begin(&rx->statss);
+				rx->rx_buf_alloc_fail++;
+				u64_stats_update_end(&rx->statss);
+				for (j = 0; j < i; j++)
+					gve_free_page(&priv->pdev->dev, page,
+						      addr, DMA_FROM_DEVICE);
+				return err;
+			}
+		} else {
+			page = rx->data.qpl->pages[i];
+			addr = i * PAGE_SIZE;
+		}
 		gve_setup_rx_buffer(&rx->data.page_info[i],
 				    &rx->data.data_ring[i], addr, page);
 	}
@@ -110,8 +147,9 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
 	rx->gve = priv;
 	rx->q_num = idx;
 
-	slots = priv->rx_pages_per_qpl;
+	slots = priv->rx_data_slot_cnt;
 	rx->mask = slots - 1;
+	rx->data.raw_addressing = priv->raw_addressing;
 
 	/* alloc rx data ring */
 	bytes = sizeof(*rx->data.data_ring) * slots;
@@ -156,8 +194,8 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
 		err = -ENOMEM;
 		goto abort_with_q_resources;
 	}
-	rx->mask = slots - 1;
 	rx->cnt = 0;
+	rx->db_threshold = priv->rx_desc_cnt / 2;
 	rx->desc.seqno = 1;
 	gve_rx_add_to_block(priv, idx);
 
@@ -251,8 +289,7 @@ static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
 	return skb;
 }
 
-static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
-					struct napi_struct *napi,
+static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
 					struct gve_rx_slot_page_info *page_info,
 					u16 len)
 {
@@ -268,14 +305,49 @@ static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
 	return skb;
 }
 
+static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
+			       struct gve_rx_slot_page_info *page_info,
+			       struct gve_rx_data_slot *data_slot,
+			       struct gve_rx_ring *rx)
+{
+	struct page *page;
+	dma_addr_t dma;
+	int err;
+
+	err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE);
+	if (err) {
+		u64_stats_update_begin(&rx->statss);
+		rx->rx_buf_alloc_fail++;
+		u64_stats_update_end(&rx->statss);
+		return err;
+	}
+
+	gve_setup_rx_buffer(page_info, data_slot, dma, page);
+	return 0;
+}
+
 static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
 			     struct gve_rx_data_slot *data_ring)
 {
-	u64 addr = be64_to_cpu(data_ring->qpl_offset);
+	u64 addr = be64_to_cpu(data_ring->addr);
 
 	page_info->page_offset ^= PAGE_SIZE / 2;
 	addr ^= PAGE_SIZE / 2;
-	data_ring->qpl_offset = cpu_to_be64(addr);
+	data_ring->addr = cpu_to_be64(addr);
+}
+
+static struct sk_buff *
+gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
+		      struct gve_rx_slot_page_info *page_info, u16 len,
+		      struct napi_struct *napi,
+		      struct gve_rx_data_slot *data_slot)
+{
+	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);
+
+	if (!skb)
+		return NULL;
+
+	return skb;
 }
 
 static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
@@ -285,7 +357,9 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 	struct gve_priv *priv = rx->gve;
 	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
 	struct net_device *dev = priv->dev;
-	struct sk_buff *skb;
+	struct gve_rx_data_slot *data_slot;
+	struct sk_buff *skb = NULL;
+	dma_addr_t page_bus;
 	int pagecount;
 	u16 len;
 
@@ -294,18 +368,18 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 		u64_stats_update_begin(&rx->statss);
 		rx->rx_desc_err_dropped_pkt++;
 		u64_stats_update_end(&rx->statss);
-		return true;
+		return false;
 	}
 
 	len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD;
 	page_info = &rx->data.page_info[idx];
-	dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx],
-				PAGE_SIZE, DMA_FROM_DEVICE);
 
-	/* gvnic can only receive into registered segments. If the buffer
-	 * can't be recycled, our only choice is to copy the data out of
-	 * it so that we can return it to the device.
-	 */
+	data_slot = &rx->data.data_ring[idx];
+	page_bus = (rx->data.raw_addressing) ?
+					be64_to_cpu(data_slot->addr) - page_info->page_offset :
+					rx->data.qpl->page_buses[idx];
+	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
+				PAGE_SIZE, DMA_FROM_DEVICE);
 
 	if (PAGE_SIZE == 4096) {
 		if (len <= priv->rx_copybreak) {
@@ -316,6 +390,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 			u64_stats_update_end(&rx->statss);
 			goto have_skb;
 		}
+		if (rx->data.raw_addressing) {
+			skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
+						    page_info, len, napi,
+						     data_slot);
+			goto have_skb;
+		}
 		if (unlikely(!gve_can_recycle_pages(dev))) {
 			skb = gve_rx_copy(rx, dev, napi, page_info, len);
 			goto have_skb;
@@ -326,12 +406,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 			 * the page fragment to a new SKB and pass it up the
 			 * stack.
 			 */
-			skb = gve_rx_add_frags(dev, napi, page_info, len);
+			skb = gve_rx_add_frags(napi, page_info, len);
 			if (!skb) {
 				u64_stats_update_begin(&rx->statss);
 				rx->rx_skb_alloc_fail++;
 				u64_stats_update_end(&rx->statss);
-				return true;
+				return false;
 			}
 			/* Make sure the kernel stack can't release the page */
 			get_page(page_info->page);
@@ -347,7 +427,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 			return false;
 		}
 	} else {
-		skb = gve_rx_copy(rx, dev, napi, page_info, len);
+		if (rx->data.raw_addressing)
+			skb = gve_rx_copy(rx, dev, napi, page_info, len);
+		else
+			skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
+						    page_info, len, napi,
+						    data_slot);
 	}
 
 have_skb:
@@ -358,7 +443,7 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 		u64_stats_update_begin(&rx->statss);
 		rx->rx_skb_alloc_fail++;
 		u64_stats_update_end(&rx->statss);
-		return true;
+		return false;
 	}
 
 	if (likely(feat & NETIF_F_RXCSUM)) {
@@ -399,19 +484,41 @@ static bool gve_rx_work_pending(struct gve_rx_ring *rx)
 	return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
 }
 
+static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
+{
+	u32 fill_cnt = rx->fill_cnt;
+
+	while ((fill_cnt & rx->mask) != (rx->cnt & rx->mask)) {
+		u32 idx = fill_cnt & rx->mask;
+		struct gve_rx_slot_page_info *page_info = &rx->data.page_info[idx];
+		struct gve_rx_data_slot *data_slot = &rx->data.data_ring[idx];
+		struct device *dev = &priv->pdev->dev;
+
+		gve_rx_free_buffer(dev, page_info, data_slot);
+		page_info->page = NULL;
+		if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot, rx))
+			break;
+		fill_cnt++;
+	}
+	rx->fill_cnt = fill_cnt;
+	return true;
+}
+
 bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
 		       netdev_features_t feat)
 {
 	struct gve_priv *priv = rx->gve;
+	u32 work_done = 0, packets = 0;
 	struct gve_rx_desc *desc;
 	u32 cnt = rx->cnt;
 	u32 idx = cnt & rx->mask;
-	u32 work_done = 0;
 	u64 bytes = 0;
 
 	desc = rx->desc.desc_ring + idx;
 	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
 	       work_done < budget) {
+		bool dropped;
+
 		netif_info(priv, rx_status, priv->dev,
 			   "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
 			   rx->q_num, idx, desc, desc->flags_seq);
@@ -419,9 +526,11 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
 			   "[%d] seqno=%d rx->desc.seqno=%d\n",
 			   rx->q_num, GVE_SEQNO(desc->flags_seq),
 			   rx->desc.seqno);
-		bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
-		if (!gve_rx(rx, desc, feat, idx))
-			gve_schedule_reset(priv);
+		dropped = !gve_rx(rx, desc, feat, idx);
+		if (!dropped) {
+			bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
+			packets++;
+		}
 		cnt++;
 		idx = cnt & rx->mask;
 		desc = rx->desc.desc_ring + idx;
@@ -433,11 +542,21 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
 		return false;
 
 	u64_stats_update_begin(&rx->statss);
-	rx->rpackets += work_done;
+	rx->rpackets += packets;
 	rx->rbytes += bytes;
 	u64_stats_update_end(&rx->statss);
 	rx->cnt = cnt;
-	rx->fill_cnt += work_done;
+	/* restock ring slots */
+	if (!rx->data.raw_addressing) {
+		/* In QPL mode buffs are refilled as the desc are processed */
+		rx->fill_cnt += work_done;
+	} else if (rx->fill_cnt - cnt <= rx->db_threshold) {
+		/* In raw addressing mode buffs are only refilled if the avail
+		 * falls below a threshold.
+		 */
+		if (!gve_rx_refill_buffers(priv, rx))
+			return false;
+	}
 
 	gve_rx_write_doorbell(priv, rx);
 	return gve_rx_work_pending(rx);
-- 
2.28.0.681.g6f77f65b4e-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH net-next v3 3/4] gve: Rx Buffer Recycling
  2020-09-24  1:01 [PATCH net-next v3 0/4] GVE Raw Addressing David Awogbemila
  2020-09-24  1:01 ` [PATCH net-next v3 1/4] gve: Add support for raw addressing device option David Awogbemila
  2020-09-24  1:01 ` [PATCH net-next v3 2/4] gve: Add support for raw addressing to the rx path David Awogbemila
@ 2020-09-24  1:01 ` David Awogbemila
  2020-09-24 23:00   ` Jakub Kicinski
  2020-09-24  1:01 ` [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path David Awogbemila
  3 siblings, 1 reply; 14+ messages in thread
From: David Awogbemila @ 2020-09-24  1:01 UTC (permalink / raw)
  To: netdev; +Cc: David Awogbemila

This patch lets the driver reuse buffers that have been freed by the
networking stack.

In the raw addressing case, this allows the driver avoid allocating new
buffers.
In the qpl case, the driver can avoid copies.

Signed-off-by: David Awogbemila <awogbemila@google.com>
---
 drivers/net/ethernet/google/gve/gve.h    |  10 +-
 drivers/net/ethernet/google/gve/gve_rx.c | 197 +++++++++++++++--------
 2 files changed, 133 insertions(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index b853efb0b17f..9cce2b356235 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -50,6 +50,7 @@ struct gve_rx_slot_page_info {
 	struct page *page;
 	void *page_address;
 	u32 page_offset; /* offset to write to in page */
+	bool can_flip;
 };
 
 /* A list of pages registered with the device during setup and used by a queue
@@ -505,15 +506,6 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv,
 		return DMA_FROM_DEVICE;
 }
 
-/* Returns true if the max mtu allows page recycling */
-static inline bool gve_can_recycle_pages(struct net_device *dev)
-{
-	/* We can't recycle the pages if we can't fit a packet into half a
-	 * page.
-	 */
-	return dev->max_mtu <= PAGE_SIZE / 2;
-}
-
 /* buffers */
 int gve_alloc_page(struct gve_priv *priv, struct device *dev,
 		   struct page **page, dma_addr_t *dma,
diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index ae76d2547d13..bea483db28f5 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -263,8 +263,7 @@ static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
 	return PKT_HASH_TYPE_L2;
 }
 
-static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
-				   struct net_device *dev,
+static struct sk_buff *gve_rx_copy(struct net_device *dev,
 				   struct napi_struct *napi,
 				   struct gve_rx_slot_page_info *page_info,
 				   u16 len)
@@ -282,10 +281,6 @@ static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
 
 	skb->protocol = eth_type_trans(skb, dev);
 
-	u64_stats_update_begin(&rx->statss);
-	rx->rx_copied_pkt++;
-	u64_stats_update_end(&rx->statss);
-
 	return skb;
 }
 
@@ -331,22 +326,91 @@ static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
 {
 	u64 addr = be64_to_cpu(data_ring->addr);
 
+	/* "flip" to other packet buffer on this page */
 	page_info->page_offset ^= PAGE_SIZE / 2;
 	addr ^= PAGE_SIZE / 2;
 	data_ring->addr = cpu_to_be64(addr);
 }
 
+static bool gve_rx_can_flip_buffers(struct net_device *netdev)
+{
+#if PAGE_SIZE == 4096
+	/* We can't flip a buffer if we can't fit a packet
+	 * into half a page.
+	 */
+	if (netdev->max_mtu + GVE_RX_PAD + ETH_HLEN  > PAGE_SIZE / 2)
+		return false;
+	return true;
+#else
+	/* PAGE_SIZE != 4096 - don't try to reuse */
+	return false;
+#endif
+}
+
+static int gve_rx_can_recycle_buffer(struct page *page)
+{
+	int pagecount = page_count(page);
+
+	/* This page is not being used by any SKBs - reuse */
+	if (pagecount == 1) {
+		return 1;
+	/* This page is still being used by an SKB - we can't reuse */
+	} else if (pagecount >= 2) {
+		return 0;
+	}
+	WARN(pagecount < 1, "Pagecount should never be < 1");
+	return -1;
+}
+
 static struct sk_buff *
 gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
 		      struct gve_rx_slot_page_info *page_info, u16 len,
 		      struct napi_struct *napi,
-		      struct gve_rx_data_slot *data_slot)
+		      struct gve_rx_data_slot *data_slot, bool can_flip)
 {
 	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);
 
 	if (!skb)
 		return NULL;
 
+	/* Optimistically stop the kernel from freeing the page by increasing
+	 * the page bias. We will check the refcount in refill to determine if
+	 * we need to alloc a new page.
+	 */
+	get_page(page_info->page);
+	page_info->can_flip = can_flip;
+
+	return skb;
+}
+
+static struct sk_buff *
+gve_rx_qpl(struct device *dev, struct net_device *netdev,
+	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
+	   u16 len, struct napi_struct *napi,
+	   struct gve_rx_data_slot *data_slot, bool recycle)
+{
+	struct sk_buff *skb;
+	/* if raw_addressing mode is not enabled gvnic can only receive into
+	 * registered segmens. If the buffer can't be recycled, our only
+	 * choice is to copy the data out of it so that we can return it to the
+	 * device.
+	 */
+	if (recycle) {
+		skb = gve_rx_add_frags(napi, page_info, len);
+		/* No point in recycling if we didn't get the skb */
+		if (skb) {
+			/* Make sure the networking stack can't free the page */
+			get_page(page_info->page);
+			gve_rx_flip_buff(page_info, data_slot);
+		}
+	} else {
+		skb = gve_rx_copy(netdev, napi, page_info, len);
+		if (skb) {
+			u64_stats_update_begin(&rx->statss);
+			rx->rx_copied_pkt++;
+			u64_stats_update_end(&rx->statss);
+		}
+	}
 	return skb;
 }
 
@@ -360,7 +424,6 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 	struct gve_rx_data_slot *data_slot;
 	struct sk_buff *skb = NULL;
 	dma_addr_t page_bus;
-	int pagecount;
 	u16 len;
 
 	/* drop this packet */
@@ -381,64 +444,36 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
 	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
 				PAGE_SIZE, DMA_FROM_DEVICE);
 
-	if (PAGE_SIZE == 4096) {
-		if (len <= priv->rx_copybreak) {
-			/* Just copy small packets */
-			skb = gve_rx_copy(rx, dev, napi, page_info, len);
-			u64_stats_update_begin(&rx->statss);
-			rx->rx_copybreak_pkt++;
-			u64_stats_update_end(&rx->statss);
-			goto have_skb;
+	if (len <= priv->rx_copybreak) {
+		/* Just copy small packets */
+		skb = gve_rx_copy(dev, napi, page_info, len);
+		u64_stats_update_begin(&rx->statss);
+		rx->rx_copied_pkt++;
+		rx->rx_copybreak_pkt++;
+		u64_stats_update_end(&rx->statss);
+	} else {
+		bool can_flip = gve_rx_can_flip_buffers(dev);
+		int recycle = 0;
+
+		if (can_flip) {
+			recycle = gve_rx_can_recycle_buffer(page_info->page);
+			if (recycle < 0) {
+				gve_schedule_reset(priv);
+				return false;
+			}
 		}
 		if (rx->data.raw_addressing) {
 			skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
 						    page_info, len, napi,
-						     data_slot);
-			goto have_skb;
-		}
-		if (unlikely(!gve_can_recycle_pages(dev))) {
-			skb = gve_rx_copy(rx, dev, napi, page_info, len);
-			goto have_skb;
-		}
-		pagecount = page_count(page_info->page);
-		if (pagecount == 1) {
-			/* No part of this page is used by any SKBs; we attach
-			 * the page fragment to a new SKB and pass it up the
-			 * stack.
-			 */
-			skb = gve_rx_add_frags(napi, page_info, len);
-			if (!skb) {
-				u64_stats_update_begin(&rx->statss);
-				rx->rx_skb_alloc_fail++;
-				u64_stats_update_end(&rx->statss);
-				return false;
-			}
-			/* Make sure the kernel stack can't release the page */
-			get_page(page_info->page);
-			/* "flip" to other packet buffer on this page */
-			gve_rx_flip_buff(page_info, &rx->data.data_ring[idx]);
-		} else if (pagecount >= 2) {
-			/* We have previously passed the other half of this
-			 * page up the stack, but it has not yet been freed.
-			 */
-			skb = gve_rx_copy(rx, dev, napi, page_info, len);
+						    data_slot,
+						    can_flip && recycle);
 		} else {
-			WARN(pagecount < 1, "Pagecount should never be < 1");
-			return false;
+			skb = gve_rx_qpl(&priv->pdev->dev, dev, rx,
+					 page_info, len, napi, data_slot,
+					 can_flip && recycle);
 		}
-	} else {
-		if (rx->data.raw_addressing)
-			skb = gve_rx_copy(rx, dev, napi, page_info, len);
-		else
-			skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
-						    page_info, len, napi,
-						    data_slot);
 	}
 
-have_skb:
-	/* We didn't manage to allocate an skb but we haven't had any
-	 * reset worthy failures.
-	 */
 	if (!skb) {
 		u64_stats_update_begin(&rx->statss);
 		rx->rx_skb_alloc_fail++;
@@ -490,14 +525,46 @@ static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
 
 	while ((fill_cnt & rx->mask) != (rx->cnt & rx->mask)) {
 		u32 idx = fill_cnt & rx->mask;
-		struct gve_rx_slot_page_info *page_info = &rx->data.page_info[idx];
-		struct gve_rx_data_slot *data_slot = &rx->data.data_ring[idx];
-		struct device *dev = &priv->pdev->dev;
+		struct gve_rx_slot_page_info *page_info =
+						&rx->data.page_info[idx];
 
-		gve_rx_free_buffer(dev, page_info, data_slot);
-		page_info->page = NULL;
-		if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot, rx))
-			break;
+		if (page_info->can_flip) {
+			/* The other half of the page is free because it was
+			 * free when we processed the descriptor. Flip to it.
+			 */
+			struct gve_rx_data_slot *data_slot =
+						&rx->data.data_ring[idx];
+
+			gve_rx_flip_buff(page_info, data_slot);
+			page_info->can_flip = false;
+		} else {
+			/* It is possible that the networking stack has already
+			 * finished processing all outstanding packets in the buffer
+			 * and it can be reused.
+			 * Flipping is unnecessary here - if the networking stack still
+			 * owns half the page it is impossible to tell which half. Either
+			 * the whole page is free or it needs to be replaced.
+			 */
+			int recycle = gve_rx_can_recycle_buffer(page_info->page);
+
+			if (recycle < 0) {
+				gve_schedule_reset(priv);
+				return false;
+			}
+			if (!recycle) {
+				/* We can't reuse the buffer - alloc a new one*/
+				struct gve_rx_data_slot *data_slot =
+						&rx->data.data_ring[idx];
+				struct device *dev = &priv->pdev->dev;
+
+				gve_rx_free_buffer(dev, page_info, data_slot);
+				page_info->page = NULL;
+				if (gve_rx_alloc_buffer(priv, dev, page_info,
+							data_slot, rx)) {
+					break;
+				}
+			}
+		}
 		fill_cnt++;
 	}
 	rx->fill_cnt = fill_cnt;
-- 
2.28.0.681.g6f77f65b4e-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path
  2020-09-24  1:01 [PATCH net-next v3 0/4] GVE Raw Addressing David Awogbemila
                   ` (2 preceding siblings ...)
  2020-09-24  1:01 ` [PATCH net-next v3 3/4] gve: Rx Buffer Recycling David Awogbemila
@ 2020-09-24  1:01 ` David Awogbemila
  2020-09-24 22:51   ` Jakub Kicinski
  2020-10-02  9:54     ` kernel test robot
  3 siblings, 2 replies; 14+ messages in thread
From: David Awogbemila @ 2020-09-24  1:01 UTC (permalink / raw)
  To: netdev; +Cc: Catherine Sullivan, Yangchun Fu, David Awogbemila

From: Catherine Sullivan <csully@google.com>

During TX, skbs' data addresses are dma_map'ed and passed to the NIC.
This means that the device can perform DMA directly from these addresses
and the driver does not have to copy the buffer content into
pre-allocated buffers/qpls (as in qpl mode).

Reviewed-by: Yangchun Fu <yangchun@google.com>
Signed-off-by: Catherine Sullivan <csully@google.com>
Signed-off-by: David Awogbemila <awogbemila@google.com>
---
 drivers/net/ethernet/google/gve/gve.h        |  18 +-
 drivers/net/ethernet/google/gve/gve_adminq.c |   4 +-
 drivers/net/ethernet/google/gve/gve_desc.h   |   8 +-
 drivers/net/ethernet/google/gve/gve_tx.c     | 205 +++++++++++++++----
 4 files changed, 193 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index 9cce2b356235..2d87d67718e4 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -112,12 +112,20 @@ struct gve_tx_iovec {
 	u32 iov_padding; /* padding associated with this segment */
 };
 
+struct gve_tx_dma_buf {
+	DEFINE_DMA_UNMAP_ADDR(dma);
+	DEFINE_DMA_UNMAP_LEN(len);
+};
+
 /* Tracks the memory in the fifo occupied by the skb. Mapped 1:1 to a desc
  * ring entry but only used for a pkt_desc not a seg_desc
  */
 struct gve_tx_buffer_state {
 	struct sk_buff *skb; /* skb for this pkt */
-	struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */
+	union {
+		struct gve_tx_iovec iov[GVE_TX_MAX_IOVEC]; /* segments of this pkt */
+		struct gve_tx_dma_buf buf;
+	};
 };
 
 /* A TX buffer - each queue has one */
@@ -140,13 +148,16 @@ struct gve_tx_ring {
 	__be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */
 	u64 pkt_done; /* free-running - total packets completed */
 	u64 bytes_done; /* free-running - total bytes completed */
+	u32 dropped_pkt; /* free-running - total packets dropped */
 
 	/* Cacheline 2 -- Read-mostly fields */
 	union gve_tx_desc *desc ____cacheline_aligned;
 	struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */
 	struct netdev_queue *netdev_txq;
 	struct gve_queue_resources *q_resources; /* head and tail pointer idx */
+	struct device *dev;
 	u32 mask; /* masks req and done down to queue size */
+	bool raw_addressing; /* use raw_addressing? */
 
 	/* Slow-path fields */
 	u32 q_num ____cacheline_aligned; /* queue idx */
@@ -442,7 +453,10 @@ static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx)
  */
 static inline u32 gve_num_tx_qpls(struct gve_priv *priv)
 {
-	return priv->tx_cfg.num_queues;
+	if (priv->raw_addressing)
+		return 0;
+	else
+		return priv->tx_cfg.num_queues;
 }
 
 /* Returns the number of rx queue page lists
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index 094146e5cc22..91ef07169629 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -318,8 +318,10 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
 {
 	struct gve_tx_ring *tx = &priv->tx[queue_index];
 	union gve_adminq_command cmd;
+	u32 qpl_id;
 	int err;
 
+	qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : tx->tx_fifo.qpl->id;
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE);
 	cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) {
@@ -328,7 +330,7 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
 		.queue_resources_addr =
 			cpu_to_be64(tx->q_resources_bus),
 		.tx_ring_addr = cpu_to_be64(tx->bus),
-		.queue_page_list_id = cpu_to_be32(tx->tx_fifo.qpl->id),
+		.queue_page_list_id = cpu_to_be32(qpl_id),
 		.ntfy_id = cpu_to_be32(tx->ntfy_id),
 	};
 
diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h
index 0aad314aefaf..a7da364e81c8 100644
--- a/drivers/net/ethernet/google/gve/gve_desc.h
+++ b/drivers/net/ethernet/google/gve/gve_desc.h
@@ -16,9 +16,11 @@
  * Base addresses encoded in seg_addr are not assumed to be physical
  * addresses. The ring format assumes these come from some linear address
  * space. This could be physical memory, kernel virtual memory, user virtual
- * memory. gVNIC uses lists of registered pages. Each queue is assumed
- * to be associated with a single such linear address space to ensure a
- * consistent meaning for seg_addrs posted to its rings.
+ * memory.
+ * If raw dma addressing is not supported then gVNIC uses lists of registered
+ * pages. Each queue is assumed to be associated with a single such linear
+ * address space to ensure a consistent meaning for seg_addrs posted to its
+ * rings.
  */
 
 struct gve_tx_pkt_desc {
diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c
index d0244feb0301..fa1197b35b39 100644
--- a/drivers/net/ethernet/google/gve/gve_tx.c
+++ b/drivers/net/ethernet/google/gve/gve_tx.c
@@ -158,9 +158,11 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx)
 			  tx->q_resources, tx->q_resources_bus);
 	tx->q_resources = NULL;
 
-	gve_tx_fifo_release(priv, &tx->tx_fifo);
-	gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
-	tx->tx_fifo.qpl = NULL;
+	if (!tx->raw_addressing) {
+		gve_tx_fifo_release(priv, &tx->tx_fifo);
+		gve_unassign_qpl(priv, tx->tx_fifo.qpl->id);
+		tx->tx_fifo.qpl = NULL;
+	}
 
 	bytes = sizeof(*tx->desc) * slots;
 	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
@@ -206,11 +208,15 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)
 	if (!tx->desc)
 		goto abort_with_info;
 
-	tx->tx_fifo.qpl = gve_assign_tx_qpl(priv);
+	tx->raw_addressing = priv->raw_addressing;
+	tx->dev = &priv->pdev->dev;
+	if (!tx->raw_addressing) {
+		tx->tx_fifo.qpl = gve_assign_tx_qpl(priv);
 
-	/* map Tx FIFO */
-	if (gve_tx_fifo_init(priv, &tx->tx_fifo))
-		goto abort_with_desc;
+		/* map Tx FIFO */
+		if (gve_tx_fifo_init(priv, &tx->tx_fifo))
+			goto abort_with_desc;
+	}
 
 	tx->q_resources =
 		dma_alloc_coherent(hdev,
@@ -228,7 +234,8 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx)
 	return 0;
 
 abort_with_fifo:
-	gve_tx_fifo_release(priv, &tx->tx_fifo);
+	if (!tx->raw_addressing)
+		gve_tx_fifo_release(priv, &tx->tx_fifo);
 abort_with_desc:
 	dma_free_coherent(hdev, bytes, tx->desc, tx->bus);
 	tx->desc = NULL;
@@ -301,27 +308,47 @@ static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
 	return bytes;
 }
 
-/* The most descriptors we could need are 3 - 1 for the headers, 1 for
- * the beginning of the payload at the end of the FIFO, and 1 if the
- * payload wraps to the beginning of the FIFO.
+/* The most descriptors we could need is MAX_SKB_FRAGS + 3 : 1 for each skb frag,
+ * +1 for the skb linear portion, +1 for when tcp hdr needs to be in separate descriptor,
+ * and +1 if the payload wraps to the beginning of the FIFO.
  */
-#define MAX_TX_DESC_NEEDED	3
+#define MAX_TX_DESC_NEEDED	(MAX_SKB_FRAGS + 3)
+static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
+{
+	if (info->skb) {
+		dma_unmap_single(dev, dma_unmap_addr(&info->buf, dma),
+				 dma_unmap_len(&info->buf, len),
+				 DMA_TO_DEVICE);
+		dma_unmap_len_set(&info->buf, len, 0);
+	} else {
+		dma_unmap_page(dev, dma_unmap_addr(&info->buf, dma),
+			       dma_unmap_len(&info->buf, len),
+			       DMA_TO_DEVICE);
+		dma_unmap_len_set(&info->buf, len, 0);
+	}
+}
 
 /* Check if sufficient resources (descriptor ring space, FIFO space) are
  * available to transmit the given number of bytes.
  */
 static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
 {
-	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED &&
-		gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required));
+	bool can_alloc = true;
+
+	if (!tx->raw_addressing)
+		can_alloc = gve_tx_fifo_can_alloc(&tx->tx_fifo, bytes_required);
+
+	return (gve_tx_avail(tx) >= MAX_TX_DESC_NEEDED && can_alloc);
 }
 
 /* Stops the queue if the skb cannot be transmitted. */
 static int gve_maybe_stop_tx(struct gve_tx_ring *tx, struct sk_buff *skb)
 {
-	int bytes_required;
+	int bytes_required = 0;
+
+	if (!tx->raw_addressing)
+		bytes_required = gve_skb_fifo_bytes_required(tx, skb);
 
-	bytes_required = gve_skb_fifo_bytes_required(tx, skb);
 	if (likely(gve_can_tx(tx, bytes_required)))
 		return 0;
 
@@ -390,22 +417,22 @@ static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
 	seg_desc->seg.seg_addr = cpu_to_be64(addr);
 }
 
-static void gve_dma_sync_for_device(struct device *dev, dma_addr_t *page_buses,
+static void gve_dma_sync_for_device(struct gve_priv *priv,
+				    dma_addr_t *page_buses,
 				    u64 iov_offset, u64 iov_len)
 {
 	u64 last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
 	u64 first_page = iov_offset / PAGE_SIZE;
-	dma_addr_t dma;
 	u64 page;
 
 	for (page = first_page; page <= last_page; page++) {
-		dma = page_buses[page];
-		dma_sync_single_for_device(dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
+		dma_addr_t dma = page_buses[page];
+
+		dma_sync_single_for_device(&priv->pdev->dev, dma, PAGE_SIZE, DMA_TO_DEVICE);
 	}
 }
 
-static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,
-			  struct device *dev)
+static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
 {
 	int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
 	union gve_tx_desc *pkt_desc, *seg_desc;
@@ -447,7 +474,7 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,
 	skb_copy_bits(skb, 0,
 		      tx->tx_fifo.base + info->iov[hdr_nfrags - 1].iov_offset,
 		      hlen);
-	gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses,
+	gve_dma_sync_for_device(priv, tx->tx_fifo.qpl->page_buses,
 				info->iov[hdr_nfrags - 1].iov_offset,
 				info->iov[hdr_nfrags - 1].iov_len);
 	copy_offset = hlen;
@@ -463,7 +490,7 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,
 		skb_copy_bits(skb, copy_offset,
 			      tx->tx_fifo.base + info->iov[i].iov_offset,
 			      info->iov[i].iov_len);
-		gve_dma_sync_for_device(dev, tx->tx_fifo.qpl->page_buses,
+		gve_dma_sync_for_device(priv, tx->tx_fifo.qpl->page_buses,
 					info->iov[i].iov_offset,
 					info->iov[i].iov_len);
 		copy_offset += info->iov[i].iov_len;
@@ -472,6 +499,98 @@ static int gve_tx_add_skb(struct gve_tx_ring *tx, struct sk_buff *skb,
 	return 1 + payload_nfrags;
 }
 
+static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
+				  struct sk_buff *skb)
+{
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int hlen, payload_nfrags, l4_hdr_offset, seg_idx_bias;
+	union gve_tx_desc *pkt_desc, *seg_desc;
+	struct gve_tx_buffer_state *info;
+	bool is_gso = skb_is_gso(skb);
+	u32 idx = tx->req & tx->mask;
+	struct gve_tx_dma_buf *buf;
+	int last_mapped = 0;
+	u64 addr;
+	u32 len;
+	int i;
+
+	info = &tx->info[idx];
+	pkt_desc = &tx->desc[idx];
+
+	l4_hdr_offset = skb_checksum_start_offset(skb);
+	/* If the skb is gso, then we want only up to the tcp header in the first segment
+	 * to efficiently replicate on each segment otherwise we want the linear portion
+	 * of the skb (which will contain the checksum because skb->csum_start and
+	 * skb->csum_offset are given relative to skb->head) in the first segment.
+	 */
+	hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
+			skb_headlen(skb);
+	len = skb_headlen(skb);
+
+	info->skb =  skb;
+
+	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(tx->dev, addr))) {
+		priv->dma_mapping_error++;
+		goto drop;
+	}
+	buf = &info->buf;
+	dma_unmap_len_set(buf, len, len);
+	dma_unmap_addr_set(buf, dma, addr);
+
+	payload_nfrags = shinfo->nr_frags;
+	if (hlen < len) {
+		/* For gso the rest of the linear portion of the skb needs to
+		 * be in its own descriptor.
+		 */
+		payload_nfrags++;
+		gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
+				     1 + payload_nfrags, hlen, addr);
+
+		len -= hlen;
+		addr += hlen;
+		seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
+		seg_idx_bias = 2;
+		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
+	} else {
+		seg_idx_bias = 1;
+		gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
+				     1 + payload_nfrags, hlen, addr);
+	}
+	idx = (tx->req + seg_idx_bias) & tx->mask;
+
+	for (i = 0; i < payload_nfrags - (seg_idx_bias - 1); i++) {
+		const skb_frag_t *frag = &shinfo->frags[i];
+
+		seg_desc = &tx->desc[idx];
+		len = skb_frag_size(frag);
+		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(tx->dev, addr))) {
+			priv->dma_mapping_error++;
+			goto unmap_drop;
+		}
+		buf = &tx->info[idx].buf;
+		tx->info[idx].skb = NULL;
+		dma_unmap_len_set(buf, len, len);
+		dma_unmap_addr_set(buf, dma, addr);
+
+		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
+		idx = (idx + 1) & tx->mask;
+	}
+
+	return 1 + payload_nfrags;
+
+unmap_drop:
+	i--;
+	for (last_mapped = i + seg_idx_bias; last_mapped >= 0; last_mapped--) {
+		idx = (tx->req + last_mapped) & tx->mask;
+		gve_tx_unmap_buf(tx->dev, &tx->info[idx]);
+	}
+drop:
+	tx->dropped_pkt++;
+	return 0;
+}
+
 netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
 {
 	struct gve_priv *priv = netdev_priv(dev);
@@ -490,12 +609,20 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev)
 		gve_tx_put_doorbell(priv, tx->q_resources, tx->req);
 		return NETDEV_TX_BUSY;
 	}
-	nsegs = gve_tx_add_skb(tx, skb, &priv->pdev->dev);
-
-	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
-	skb_tx_timestamp(skb);
+	if (tx->raw_addressing)
+		nsegs = gve_tx_add_skb_no_copy(priv, tx, skb);
+	else
+		nsegs = gve_tx_add_skb_copy(priv, tx, skb);
+
+	/* If the packet is getting sent, we need to update the skb */
+	if (nsegs) {
+		netdev_tx_sent_queue(tx->netdev_txq, skb->len);
+		skb_tx_timestamp(skb);
+	}
 
-	/* give packets to NIC */
+	/* Give packets to NIC. Even if this packet failed to send the doorbell
+	 * might need to be rung because of xmit_more.
+	 */
 	tx->req += nsegs;
 
 	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
@@ -525,24 +652,30 @@ static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx,
 		info = &tx->info[idx];
 		skb = info->skb;
 
+		/* Unmap the buffer */
+		if (tx->raw_addressing)
+			gve_tx_unmap_buf(tx->dev, info);
 		/* Mark as free */
 		if (skb) {
 			info->skb = NULL;
 			bytes += skb->len;
 			pkts++;
 			dev_consume_skb_any(skb);
-			/* FIFO free */
-			for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
-				space_freed += info->iov[i].iov_len +
-					       info->iov[i].iov_padding;
-				info->iov[i].iov_len = 0;
-				info->iov[i].iov_padding = 0;
+			if (!tx->raw_addressing) {
+				/* FIFO free */
+				for (i = 0; i < ARRAY_SIZE(info->iov); i++) {
+					space_freed += info->iov[i].iov_len +
+						       info->iov[i].iov_padding;
+					info->iov[i].iov_len = 0;
+					info->iov[i].iov_padding = 0;
+				}
 			}
 		}
 		tx->done++;
 	}
 
-	gve_tx_free_fifo(&tx->tx_fifo, space_freed);
+	if (!tx->raw_addressing)
+		gve_tx_free_fifo(&tx->tx_fifo, space_freed);
 	u64_stats_update_begin(&tx->statss);
 	tx->bytes_done += bytes;
 	tx->pkt_done += pkts;
-- 
2.28.0.681.g6f77f65b4e-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path
  2020-09-24  1:01 ` [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path David Awogbemila
@ 2020-09-24 22:51   ` Jakub Kicinski
  2020-09-30 16:09     ` David Awogbemila
  2020-10-02  9:54     ` kernel test robot
  1 sibling, 1 reply; 14+ messages in thread
From: Jakub Kicinski @ 2020-09-24 22:51 UTC (permalink / raw)
  To: David Awogbemila; +Cc: netdev, Catherine Sullivan, Yangchun Fu

On Wed, 23 Sep 2020 18:01:04 -0700 David Awogbemila wrote:
> +	info->skb =  skb;

double space

> +	addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
> +	if (unlikely(dma_mapping_error(tx->dev, addr))) {
> +		priv->dma_mapping_error++;
> +		goto drop;
> +	}
> +	buf = &info->buf;
> +	dma_unmap_len_set(buf, len, len);
> +	dma_unmap_addr_set(buf, dma, addr);
> +
> +	payload_nfrags = shinfo->nr_frags;
> +	if (hlen < len) {
> +		/* For gso the rest of the linear portion of the skb needs to
> +		 * be in its own descriptor.
> +		 */
> +		payload_nfrags++;
> +		gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
> +				     1 + payload_nfrags, hlen, addr);

This..

> +		len -= hlen;
> +		addr += hlen;
> +		seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
> +		seg_idx_bias = 2;
> +		gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
> +	} else {
> +		seg_idx_bias = 1;
> +		gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
> +				     1 + payload_nfrags, hlen, addr);

and this look identical. You can probably move it before the if.

Otherwise this one is:

Reviewed-by: Jakub Kicinski <kuba@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 3/4] gve: Rx Buffer Recycling
  2020-09-24  1:01 ` [PATCH net-next v3 3/4] gve: Rx Buffer Recycling David Awogbemila
@ 2020-09-24 23:00   ` Jakub Kicinski
  2020-09-30 16:10     ` David Awogbemila
  0 siblings, 1 reply; 14+ messages in thread
From: Jakub Kicinski @ 2020-09-24 23:00 UTC (permalink / raw)
  To: David Awogbemila; +Cc: netdev

On Wed, 23 Sep 2020 18:01:03 -0700 David Awogbemila wrote:
> This patch lets the driver reuse buffers that have been freed by the
> networking stack.
> 
> In the raw addressing case, this allows the driver avoid allocating new
> buffers.
> In the qpl case, the driver can avoid copies.
> 
> Signed-off-by: David Awogbemila <awogbemila@google.com>
> ---
>  drivers/net/ethernet/google/gve/gve.h    |  10 +-
>  drivers/net/ethernet/google/gve/gve_rx.c | 197 +++++++++++++++--------
>  2 files changed, 133 insertions(+), 74 deletions(-)
> 
> diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
> index b853efb0b17f..9cce2b356235 100644
> --- a/drivers/net/ethernet/google/gve/gve.h
> +++ b/drivers/net/ethernet/google/gve/gve.h
> @@ -50,6 +50,7 @@ struct gve_rx_slot_page_info {
>  	struct page *page;
>  	void *page_address;
>  	u32 page_offset; /* offset to write to in page */
> +	bool can_flip;
>  };
>  
>  /* A list of pages registered with the device during setup and used by a queue
> @@ -505,15 +506,6 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv,
>  		return DMA_FROM_DEVICE;
>  }
>  
> -/* Returns true if the max mtu allows page recycling */
> -static inline bool gve_can_recycle_pages(struct net_device *dev)
> -{
> -	/* We can't recycle the pages if we can't fit a packet into half a
> -	 * page.
> -	 */
> -	return dev->max_mtu <= PAGE_SIZE / 2;
> -}
> -
>  /* buffers */
>  int gve_alloc_page(struct gve_priv *priv, struct device *dev,
>  		   struct page **page, dma_addr_t *dma,
> diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
> index ae76d2547d13..bea483db28f5 100644
> --- a/drivers/net/ethernet/google/gve/gve_rx.c
> +++ b/drivers/net/ethernet/google/gve/gve_rx.c
> @@ -263,8 +263,7 @@ static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
>  	return PKT_HASH_TYPE_L2;
>  }
>  
> -static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
> -				   struct net_device *dev,
> +static struct sk_buff *gve_rx_copy(struct net_device *dev,
>  				   struct napi_struct *napi,
>  				   struct gve_rx_slot_page_info *page_info,
>  				   u16 len)
> @@ -282,10 +281,6 @@ static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
>  
>  	skb->protocol = eth_type_trans(skb, dev);
>  
> -	u64_stats_update_begin(&rx->statss);
> -	rx->rx_copied_pkt++;
> -	u64_stats_update_end(&rx->statss);
> -
>  	return skb;
>  }
>  
> @@ -331,22 +326,91 @@ static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
>  {
>  	u64 addr = be64_to_cpu(data_ring->addr);
>  
> +	/* "flip" to other packet buffer on this page */
>  	page_info->page_offset ^= PAGE_SIZE / 2;
>  	addr ^= PAGE_SIZE / 2;
>  	data_ring->addr = cpu_to_be64(addr);
>  }
>  
> +static bool gve_rx_can_flip_buffers(struct net_device *netdev)
> +{
> +#if PAGE_SIZE == 4096
> +	/* We can't flip a buffer if we can't fit a packet
> +	 * into half a page.
> +	 */
> +	if (netdev->max_mtu + GVE_RX_PAD + ETH_HLEN  > PAGE_SIZE / 2)

double space

> +		return false;
> +	return true;

Flip the condition and just return it.

return netdev->max_mtu + GVE_RX_PAD + ETH_HLEN <= PAGE_SIZE / 2

Also you should probably look at mtu not max_mtu. More likely to be in
cache.

> +#else
> +	/* PAGE_SIZE != 4096 - don't try to reuse */
> +	return false;
> +#endif
> +}
> +
> +static int gve_rx_can_recycle_buffer(struct page *page)
> +{
> +	int pagecount = page_count(page);
> +
> +	/* This page is not being used by any SKBs - reuse */
> +	if (pagecount == 1) {
> +		return 1;
> +	/* This page is still being used by an SKB - we can't reuse */
> +	} else if (pagecount >= 2) {
> +		return 0;
> +	}

parenthesis not necessary around single line statements.

> +	WARN(pagecount < 1, "Pagecount should never be < 1");
> +	return -1;
> +}
> +
>  static struct sk_buff *
>  gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
>  		      struct gve_rx_slot_page_info *page_info, u16 len,
>  		      struct napi_struct *napi,
> -		      struct gve_rx_data_slot *data_slot)
> +		      struct gve_rx_data_slot *data_slot, bool can_flip)
>  {
>  	struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);

IMHO it looks weird when a function is called on variable init and then
error checking is done after an empty line.

>  	if (!skb)
>  		return NULL;
>  
> +	/* Optimistically stop the kernel from freeing the page by increasing
> +	 * the page bias. We will check the refcount in refill to determine if
> +	 * we need to alloc a new page.
> +	 */
> +	get_page(page_info->page);
> +	page_info->can_flip = can_flip;
> +
> +	return skb;
> +}
> +
> +static struct sk_buff *
> +gve_rx_qpl(struct device *dev, struct net_device *netdev,
> +	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
> +	   u16 len, struct napi_struct *napi,
> +	   struct gve_rx_data_slot *data_slot, bool recycle)
> +{
> +	struct sk_buff *skb;

empty line here

> +	/* if raw_addressing mode is not enabled gvnic can only receive into
> +	 * registered segmens. If the buffer can't be recycled, our only

segments?

> +	 * choice is to copy the data out of it so that we can return it to the
> +	 * device.
> +	 */
> +	if (recycle) {
> +		skb = gve_rx_add_frags(napi, page_info, len);
> +		/* No point in recycling if we didn't get the skb */
> +		if (skb) {
> +			/* Make sure the networking stack can't free the page */
> +			get_page(page_info->page);
> +			gve_rx_flip_buff(page_info, data_slot);
> +		}
> +	} else {
> +		skb = gve_rx_copy(netdev, napi, page_info, len);
> +		if (skb) {
> +			u64_stats_update_begin(&rx->statss);
> +			rx->rx_copied_pkt++;
> +			u64_stats_update_end(&rx->statss);
> +		}
> +	}
>  	return skb;
>  }

> @@ -490,14 +525,46 @@ static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
>  
>  	while ((fill_cnt & rx->mask) != (rx->cnt & rx->mask)) {
>  		u32 idx = fill_cnt & rx->mask;
> -		struct gve_rx_slot_page_info *page_info = &rx->data.page_info[idx];
> -		struct gve_rx_data_slot *data_slot = &rx->data.data_ring[idx];
> -		struct device *dev = &priv->pdev->dev;
> +		struct gve_rx_slot_page_info *page_info =
> +						&rx->data.page_info[idx];
>  
> -		gve_rx_free_buffer(dev, page_info, data_slot);
> -		page_info->page = NULL;
> -		if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot, rx))
> -			break;
> +		if (page_info->can_flip) {
> +			/* The other half of the page is free because it was
> +			 * free when we processed the descriptor. Flip to it.
> +			 */
> +			struct gve_rx_data_slot *data_slot =
> +						&rx->data.data_ring[idx];
> +
> +			gve_rx_flip_buff(page_info, data_slot);
> +			page_info->can_flip = false;
> +		} else {
> +			/* It is possible that the networking stack has already
> +			 * finished processing all outstanding packets in the buffer
> +			 * and it can be reused.
> +			 * Flipping is unnecessary here - if the networking stack still
> +			 * owns half the page it is impossible to tell which half. Either
> +			 * the whole page is free or it needs to be replaced.
> +			 */
> +			int recycle = gve_rx_can_recycle_buffer(page_info->page);
> +
> +			if (recycle < 0) {
> +				gve_schedule_reset(priv);
> +				return false;
> +			}
> +			if (!recycle) {
> +				/* We can't reuse the buffer - alloc a new one*/
> +				struct gve_rx_data_slot *data_slot =
> +						&rx->data.data_ring[idx];
> +				struct device *dev = &priv->pdev->dev;
> +
> +				gve_rx_free_buffer(dev, page_info, data_slot);
> +				page_info->page = NULL;
> +				if (gve_rx_alloc_buffer(priv, dev, page_info,
> +							data_slot, rx)) {
> +					break;

What if the queue runs completely dry during memory shortage? 
You need some form of delayed work to periodically refill 
the free buffers, right?

> +				}

parens unnecessary

> +			}
> +		}
>  		fill_cnt++;
>  	}
>  	rx->fill_cnt = fill_cnt;


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 1/4] gve: Add support for raw addressing device option
  2020-09-24  1:01 ` [PATCH net-next v3 1/4] gve: Add support for raw addressing device option David Awogbemila
@ 2020-09-24 23:03   ` Jakub Kicinski
  0 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2020-09-24 23:03 UTC (permalink / raw)
  To: David Awogbemila; +Cc: Catherine Sullivan, Yangchun Fu

On Wed, 23 Sep 2020 18:01:01 -0700 David Awogbemila wrote:
> @@ -518,6 +521,49 @@ int gve_adminq_describe_device(struct gve_priv *priv)
>  		priv->rx_desc_cnt = priv->rx_pages_per_qpl;
>  	}
>  	priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
> +	dev_opt = (void *)(descriptor + 1);
> +
> +	num_options = be16_to_cpu(descriptor->num_device_options);
> +	for (i = 0; i < num_options; i++) {
> +		u16 option_length = be16_to_cpu(dev_opt->option_length);
> +		u16 option_id = be16_to_cpu(dev_opt->option_id);
> +
> +		if ((void *)dev_opt + sizeof(*dev_opt) + option_length > (void *)descriptor +
> +				      be16_to_cpu(descriptor->total_length)) {

Can you calculate an void *end pointer before the loop and avoid this
very long condition?

> +			dev_err(&priv->dev->dev,
> +				"options exceed device_descriptor's total length.\n");
> +			err = -EINVAL;
> +			goto free_device_descriptor;
> +		}
> +
> +		switch (option_id) {
> +		case GVE_DEV_OPT_ID_RAW_ADDRESSING:
> +			/* If the length or feature mask doesn't match,
> +			 * continue without enabling the feature.
> +			 */
> +			if (option_length != GVE_DEV_OPT_LEN_RAW_ADDRESSING ||
> +			    be32_to_cpu(dev_opt->feat_mask) !=
> +			    GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING) {

Apply the byteswap to the constant so it's done at compilation time.

> +				dev_info(&priv->pdev->dev,
> +					 "Raw addressing device option not enabled, length or features mask did not match expected.\n");

dev_warn(), also do yourself a favor and print what the values were.

> +				priv->raw_addressing = false;
> +			} else {
> +				dev_info(&priv->pdev->dev,
> +					 "Raw addressing device option enabled.\n");
> +				priv->raw_addressing = true;

Does this really need to be printed on every boot?

> +			}
> +			break;
> +		default:
> +			/* If we don't recognize the option just continue
> +			 * without doing anything.
> +			 */
> +			dev_info(&priv->pdev->dev,
> +				 "Unrecognized device option 0x%hx not enabled.\n",

dev_dbg()

> +				   option_id);

alignment is off

> +			break;
> +		}
> +		dev_opt = (void *)dev_opt + sizeof(*dev_opt) + option_length;
> +	}

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path
  2020-09-24 22:51   ` Jakub Kicinski
@ 2020-09-30 16:09     ` David Awogbemila
  2020-09-30 21:24       ` Jakub Kicinski
  0 siblings, 1 reply; 14+ messages in thread
From: David Awogbemila @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, Catherine Sullivan, Yangchun Fu

On Thu, Sep 24, 2020 at 3:51 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 23 Sep 2020 18:01:04 -0700 David Awogbemila wrote:
> > +     info->skb =  skb;
>
> double space

Ok, I'll fix this.

>
> > +     addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
> > +     if (unlikely(dma_mapping_error(tx->dev, addr))) {
> > +             priv->dma_mapping_error++;
> > +             goto drop;
> > +     }
> > +     buf = &info->buf;
> > +     dma_unmap_len_set(buf, len, len);
> > +     dma_unmap_addr_set(buf, dma, addr);
> > +
> > +     payload_nfrags = shinfo->nr_frags;
> > +     if (hlen < len) {
> > +             /* For gso the rest of the linear portion of the skb needs to
> > +              * be in its own descriptor.
> > +              */
> > +             payload_nfrags++;
> > +             gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
> > +                                  1 + payload_nfrags, hlen, addr);
>
> This..
>
> > +             len -= hlen;
> > +             addr += hlen;
> > +             seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
> > +             seg_idx_bias = 2;
> > +             gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
> > +     } else {
> > +             seg_idx_bias = 1;
> > +             gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
> > +                                  1 + payload_nfrags, hlen, addr);
>
> and this look identical. You can probably move it before the if.

Thanks, I need to make sure I understand: you're referring to the call
to gve_tx_fill_pkt_desc? if so, the calls look the same but
payload_nfrags is different in the if and else cases, perhaps I could
move it after the else? but I'm not sure if that helps.


>
> Otherwise this one is:
>
> Reviewed-by: Jakub Kicinski <kuba@kernel.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 3/4] gve: Rx Buffer Recycling
  2020-09-24 23:00   ` Jakub Kicinski
@ 2020-09-30 16:10     ` David Awogbemila
  2020-09-30 21:23       ` Jakub Kicinski
  0 siblings, 1 reply; 14+ messages in thread
From: David Awogbemila @ 2020-09-30 16:10 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev

On Thu, Sep 24, 2020 at 4:01 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 23 Sep 2020 18:01:03 -0700 David Awogbemila wrote:
> > This patch lets the driver reuse buffers that have been freed by the
> > networking stack.
> >
> > In the raw addressing case, this allows the driver avoid allocating new
> > buffers.
> > In the qpl case, the driver can avoid copies.
> >
> > Signed-off-by: David Awogbemila <awogbemila@google.com>
> > ---
> >  drivers/net/ethernet/google/gve/gve.h    |  10 +-
> >  drivers/net/ethernet/google/gve/gve_rx.c | 197 +++++++++++++++--------
> >  2 files changed, 133 insertions(+), 74 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
> > index b853efb0b17f..9cce2b356235 100644
> > --- a/drivers/net/ethernet/google/gve/gve.h
> > +++ b/drivers/net/ethernet/google/gve/gve.h
> > @@ -50,6 +50,7 @@ struct gve_rx_slot_page_info {
> >       struct page *page;
> >       void *page_address;
> >       u32 page_offset; /* offset to write to in page */
> > +     bool can_flip;
> >  };
> >
> >  /* A list of pages registered with the device during setup and used by a queue
> > @@ -505,15 +506,6 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv,
> >               return DMA_FROM_DEVICE;
> >  }
> >
> > -/* Returns true if the max mtu allows page recycling */
> > -static inline bool gve_can_recycle_pages(struct net_device *dev)
> > -{
> > -     /* We can't recycle the pages if we can't fit a packet into half a
> > -      * page.
> > -      */
> > -     return dev->max_mtu <= PAGE_SIZE / 2;
> > -}
> > -
> >  /* buffers */
> >  int gve_alloc_page(struct gve_priv *priv, struct device *dev,
> >                  struct page **page, dma_addr_t *dma,
> > diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
> > index ae76d2547d13..bea483db28f5 100644
> > --- a/drivers/net/ethernet/google/gve/gve_rx.c
> > +++ b/drivers/net/ethernet/google/gve/gve_rx.c
> > @@ -263,8 +263,7 @@ static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
> >       return PKT_HASH_TYPE_L2;
> >  }
> >
> > -static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
> > -                                struct net_device *dev,
> > +static struct sk_buff *gve_rx_copy(struct net_device *dev,
> >                                  struct napi_struct *napi,
> >                                  struct gve_rx_slot_page_info *page_info,
> >                                  u16 len)
> > @@ -282,10 +281,6 @@ static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
> >
> >       skb->protocol = eth_type_trans(skb, dev);
> >
> > -     u64_stats_update_begin(&rx->statss);
> > -     rx->rx_copied_pkt++;
> > -     u64_stats_update_end(&rx->statss);
> > -
> >       return skb;
> >  }
> >
> > @@ -331,22 +326,91 @@ static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
> >  {
> >       u64 addr = be64_to_cpu(data_ring->addr);
> >
> > +     /* "flip" to other packet buffer on this page */
> >       page_info->page_offset ^= PAGE_SIZE / 2;
> >       addr ^= PAGE_SIZE / 2;
> >       data_ring->addr = cpu_to_be64(addr);
> >  }
> >
> > +static bool gve_rx_can_flip_buffers(struct net_device *netdev)
> > +{
> > +#if PAGE_SIZE == 4096
> > +     /* We can't flip a buffer if we can't fit a packet
> > +      * into half a page.
> > +      */
> > +     if (netdev->max_mtu + GVE_RX_PAD + ETH_HLEN  > PAGE_SIZE / 2)
>
> double space

I'll adjust this.

> > +             return false;
> > +     return true;
>
> Flip the condition and just return it.
>
> return netdev->max_mtu + GVE_RX_PAD + ETH_HLEN <= PAGE_SIZE / 2
>
> Also you should probably look at mtu not max_mtu. More likely to be in
> cache.

Ok, I'll adjust this to use mtu instead.

> > +#else
> > +     /* PAGE_SIZE != 4096 - don't try to reuse */
> > +     return false;
> > +#endif
> > +}
> > +
> > +static int gve_rx_can_recycle_buffer(struct page *page)
> > +{
> > +     int pagecount = page_count(page);
> > +
> > +     /* This page is not being used by any SKBs - reuse */
> > +     if (pagecount == 1) {
> > +             return 1;
> > +     /* This page is still being used by an SKB - we can't reuse */
> > +     } else if (pagecount >= 2) {
> > +             return 0;
> > +     }
>
> parenthesis not necessary around single line statements.

I'll adjust this.

> > +     WARN(pagecount < 1, "Pagecount should never be < 1");
> > +     return -1;
> > +}
> > +
> >  static struct sk_buff *
> >  gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
> >                     struct gve_rx_slot_page_info *page_info, u16 len,
> >                     struct napi_struct *napi,
> > -                   struct gve_rx_data_slot *data_slot)
> > +                   struct gve_rx_data_slot *data_slot, bool can_flip)
> >  {
> >       struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);
>
> IMHO it looks weird when a function is called on variable init and then
> error checking is done after an empty line.

Ok, I'll declare skb and then initialize it separately.

> >       if (!skb)
> >               return NULL;
> >
> > +     /* Optimistically stop the kernel from freeing the page by increasing
> > +      * the page bias. We will check the refcount in refill to determine if
> > +      * we need to alloc a new page.
> > +      */
> > +     get_page(page_info->page);
> > +     page_info->can_flip = can_flip;
> > +
> > +     return skb;
> > +}
> > +
> > +static struct sk_buff *
> > +gve_rx_qpl(struct device *dev, struct net_device *netdev,
> > +        struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
> > +        u16 len, struct napi_struct *napi,
> > +        struct gve_rx_data_slot *data_slot, bool recycle)
> > +{
> > +     struct sk_buff *skb;
>
> empty line here

I'll adjust this.

> > +     /* if raw_addressing mode is not enabled gvnic can only receive into
> > +      * registered segmens. If the buffer can't be recycled, our only
>
> segments?

I'll correct this.

> > +      * choice is to copy the data out of it so that we can return it to the
> > +      * device.
> > +      */
> > +     if (recycle) {
> > +             skb = gve_rx_add_frags(napi, page_info, len);
> > +             /* No point in recycling if we didn't get the skb */
> > +             if (skb) {
> > +                     /* Make sure the networking stack can't free the page */
> > +                     get_page(page_info->page);
> > +                     gve_rx_flip_buff(page_info, data_slot);
> > +             }
> > +     } else {
> > +             skb = gve_rx_copy(netdev, napi, page_info, len);
> > +             if (skb) {
> > +                     u64_stats_update_begin(&rx->statss);
> > +                     rx->rx_copied_pkt++;
> > +                     u64_stats_update_end(&rx->statss);
> > +             }
> > +     }
> >       return skb;
> >  }
>
> > @@ -490,14 +525,46 @@ static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
> >
> >       while ((fill_cnt & rx->mask) != (rx->cnt & rx->mask)) {
> >               u32 idx = fill_cnt & rx->mask;
> > -             struct gve_rx_slot_page_info *page_info = &rx->data.page_info[idx];
> > -             struct gve_rx_data_slot *data_slot = &rx->data.data_ring[idx];
> > -             struct device *dev = &priv->pdev->dev;
> > +             struct gve_rx_slot_page_info *page_info =
> > +                                             &rx->data.page_info[idx];
> >
> > -             gve_rx_free_buffer(dev, page_info, data_slot);
> > -             page_info->page = NULL;
> > -             if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot, rx))
> > -                     break;
> > +             if (page_info->can_flip) {
> > +                     /* The other half of the page is free because it was
> > +                      * free when we processed the descriptor. Flip to it.
> > +                      */
> > +                     struct gve_rx_data_slot *data_slot =
> > +                                             &rx->data.data_ring[idx];
> > +
> > +                     gve_rx_flip_buff(page_info, data_slot);
> > +                     page_info->can_flip = false;
> > +             } else {
> > +                     /* It is possible that the networking stack has already
> > +                      * finished processing all outstanding packets in the buffer
> > +                      * and it can be reused.
> > +                      * Flipping is unnecessary here - if the networking stack still
> > +                      * owns half the page it is impossible to tell which half. Either
> > +                      * the whole page is free or it needs to be replaced.
> > +                      */
> > +                     int recycle = gve_rx_can_recycle_buffer(page_info->page);
> > +
> > +                     if (recycle < 0) {
> > +                             gve_schedule_reset(priv);
> > +                             return false;
> > +                     }
> > +                     if (!recycle) {
> > +                             /* We can't reuse the buffer - alloc a new one*/
> > +                             struct gve_rx_data_slot *data_slot =
> > +                                             &rx->data.data_ring[idx];
> > +                             struct device *dev = &priv->pdev->dev;
> > +
> > +                             gve_rx_free_buffer(dev, page_info, data_slot);
> > +                             page_info->page = NULL;
> > +                             if (gve_rx_alloc_buffer(priv, dev, page_info,
> > +                                                     data_slot, rx)) {
> > +                                     break;
>
> What if the queue runs completely dry during memory shortage?
> You need some form of delayed work to periodically refill
> the free buffers, right?

Thanks, this looks like it will require modifications that will need
to be well tested.
Just for my own curiosity, how common of an occurrence are such memory
shortages?

>
> > +                             }
>
> parens unnecessary

I'll adjust this.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 3/4] gve: Rx Buffer Recycling
  2020-09-30 16:10     ` David Awogbemila
@ 2020-09-30 21:23       ` Jakub Kicinski
  0 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2020-09-30 21:23 UTC (permalink / raw)
  To: David Awogbemila; +Cc: netdev

On Wed, 30 Sep 2020 09:10:02 -0700 David Awogbemila wrote:
> > What if the queue runs completely dry during memory shortage?
> > You need some form of delayed work to periodically refill
> > the free buffers, right?  
> 
> Thanks, this looks like it will require modifications that will need
> to be well tested.
> Just for my own curiosity, how common of an occurrence are such memory
> shortages?

Depends on the workload. For some apps they are very common.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path
  2020-09-30 16:09     ` David Awogbemila
@ 2020-09-30 21:24       ` Jakub Kicinski
  0 siblings, 0 replies; 14+ messages in thread
From: Jakub Kicinski @ 2020-09-30 21:24 UTC (permalink / raw)
  To: David Awogbemila; +Cc: netdev, Catherine Sullivan, Yangchun Fu

On Wed, 30 Sep 2020 09:09:57 -0700 David Awogbemila wrote:
> > > +             len -= hlen;
> > > +             addr += hlen;
> > > +             seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
> > > +             seg_idx_bias = 2;
> > > +             gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
> > > +     } else {
> > > +             seg_idx_bias = 1;
> > > +             gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
> > > +                                  1 + payload_nfrags, hlen, addr);  
> >
> > and this look identical. You can probably move it before the if.  
> 
> Thanks, I need to make sure I understand: you're referring to the call
> to gve_tx_fill_pkt_desc? if so, the calls look the same but
> payload_nfrags is different in the if and else cases, perhaps I could
> move it after the else? but I'm not sure if that helps.

Fair, you can leave it as is.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path
  2020-09-24  1:01 ` [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path David Awogbemila
@ 2020-10-02  9:54     ` kernel test robot
  2020-10-02  9:54     ` kernel test robot
  1 sibling, 0 replies; 14+ messages in thread
From: kernel test robot @ 2020-10-02  9:54 UTC (permalink / raw)
  To: David Awogbemila, netdev
  Cc: kbuild-all, Catherine Sullivan, Yangchun Fu, David Awogbemila

[-- Attachment #1: Type: text/plain, Size: 4980 bytes --]

Hi David,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/David-Awogbemila/GVE-Raw-Addressing/20200924-091514
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 68d4fd30c83b1b208e08c954cd45e6474b148c87
config: riscv-randconfig-r003-20200930 (attached as .config)
compiler: riscv32-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a1e5a942c313ca3513dcf760370e780af10f694d
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review David-Awogbemila/GVE-Raw-Addressing/20200924-091514
        git checkout a1e5a942c313ca3513dcf760370e780af10f694d
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/net/ethernet/google/gve/gve_tx.c: In function 'gve_tx_add_skb_no_copy':
>> drivers/net/ethernet/google/gve/gve_tx.c:511:25: warning: variable 'buf' set but not used [-Wunused-but-set-variable]
     511 |  struct gve_tx_dma_buf *buf;
         |                         ^~~

vim +/buf +511 drivers/net/ethernet/google/gve/gve_tx.c

   501	
   502	static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
   503					  struct sk_buff *skb)
   504	{
   505		const struct skb_shared_info *shinfo = skb_shinfo(skb);
   506		int hlen, payload_nfrags, l4_hdr_offset, seg_idx_bias;
   507		union gve_tx_desc *pkt_desc, *seg_desc;
   508		struct gve_tx_buffer_state *info;
   509		bool is_gso = skb_is_gso(skb);
   510		u32 idx = tx->req & tx->mask;
 > 511		struct gve_tx_dma_buf *buf;
   512		int last_mapped = 0;
   513		u64 addr;
   514		u32 len;
   515		int i;
   516	
   517		info = &tx->info[idx];
   518		pkt_desc = &tx->desc[idx];
   519	
   520		l4_hdr_offset = skb_checksum_start_offset(skb);
   521		/* If the skb is gso, then we want only up to the tcp header in the first segment
   522		 * to efficiently replicate on each segment otherwise we want the linear portion
   523		 * of the skb (which will contain the checksum because skb->csum_start and
   524		 * skb->csum_offset are given relative to skb->head) in the first segment.
   525		 */
   526		hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
   527				skb_headlen(skb);
   528		len = skb_headlen(skb);
   529	
   530		info->skb =  skb;
   531	
   532		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
   533		if (unlikely(dma_mapping_error(tx->dev, addr))) {
   534			priv->dma_mapping_error++;
   535			goto drop;
   536		}
   537		buf = &info->buf;
   538		dma_unmap_len_set(buf, len, len);
   539		dma_unmap_addr_set(buf, dma, addr);
   540	
   541		payload_nfrags = shinfo->nr_frags;
   542		if (hlen < len) {
   543			/* For gso the rest of the linear portion of the skb needs to
   544			 * be in its own descriptor.
   545			 */
   546			payload_nfrags++;
   547			gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
   548					     1 + payload_nfrags, hlen, addr);
   549	
   550			len -= hlen;
   551			addr += hlen;
   552			seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
   553			seg_idx_bias = 2;
   554			gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
   555		} else {
   556			seg_idx_bias = 1;
   557			gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
   558					     1 + payload_nfrags, hlen, addr);
   559		}
   560		idx = (tx->req + seg_idx_bias) & tx->mask;
   561	
   562		for (i = 0; i < payload_nfrags - (seg_idx_bias - 1); i++) {
   563			const skb_frag_t *frag = &shinfo->frags[i];
   564	
   565			seg_desc = &tx->desc[idx];
   566			len = skb_frag_size(frag);
   567			addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
   568			if (unlikely(dma_mapping_error(tx->dev, addr))) {
   569				priv->dma_mapping_error++;
   570				goto unmap_drop;
   571			}
   572			buf = &tx->info[idx].buf;
   573			tx->info[idx].skb = NULL;
   574			dma_unmap_len_set(buf, len, len);
   575			dma_unmap_addr_set(buf, dma, addr);
   576	
   577			gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
   578			idx = (idx + 1) & tx->mask;
   579		}
   580	
   581		return 1 + payload_nfrags;
   582	
   583	unmap_drop:
   584		i--;
   585		for (last_mapped = i + seg_idx_bias; last_mapped >= 0; last_mapped--) {
   586			idx = (tx->req + last_mapped) & tx->mask;
   587			gve_tx_unmap_buf(tx->dev, &tx->info[idx]);
   588		}
   589	drop:
   590		tx->dropped_pkt++;
   591		return 0;
   592	}
   593	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 27855 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path
@ 2020-10-02  9:54     ` kernel test robot
  0 siblings, 0 replies; 14+ messages in thread
From: kernel test robot @ 2020-10-02  9:54 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 5111 bytes --]

Hi David,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/David-Awogbemila/GVE-Raw-Addressing/20200924-091514
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 68d4fd30c83b1b208e08c954cd45e6474b148c87
config: riscv-randconfig-r003-20200930 (attached as .config)
compiler: riscv32-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a1e5a942c313ca3513dcf760370e780af10f694d
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review David-Awogbemila/GVE-Raw-Addressing/20200924-091514
        git checkout a1e5a942c313ca3513dcf760370e780af10f694d
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/net/ethernet/google/gve/gve_tx.c: In function 'gve_tx_add_skb_no_copy':
>> drivers/net/ethernet/google/gve/gve_tx.c:511:25: warning: variable 'buf' set but not used [-Wunused-but-set-variable]
     511 |  struct gve_tx_dma_buf *buf;
         |                         ^~~

vim +/buf +511 drivers/net/ethernet/google/gve/gve_tx.c

   501	
   502	static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
   503					  struct sk_buff *skb)
   504	{
   505		const struct skb_shared_info *shinfo = skb_shinfo(skb);
   506		int hlen, payload_nfrags, l4_hdr_offset, seg_idx_bias;
   507		union gve_tx_desc *pkt_desc, *seg_desc;
   508		struct gve_tx_buffer_state *info;
   509		bool is_gso = skb_is_gso(skb);
   510		u32 idx = tx->req & tx->mask;
 > 511		struct gve_tx_dma_buf *buf;
   512		int last_mapped = 0;
   513		u64 addr;
   514		u32 len;
   515		int i;
   516	
   517		info = &tx->info[idx];
   518		pkt_desc = &tx->desc[idx];
   519	
   520		l4_hdr_offset = skb_checksum_start_offset(skb);
   521		/* If the skb is gso, then we want only up to the tcp header in the first segment
   522		 * to efficiently replicate on each segment otherwise we want the linear portion
   523		 * of the skb (which will contain the checksum because skb->csum_start and
   524		 * skb->csum_offset are given relative to skb->head) in the first segment.
   525		 */
   526		hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) :
   527				skb_headlen(skb);
   528		len = skb_headlen(skb);
   529	
   530		info->skb =  skb;
   531	
   532		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
   533		if (unlikely(dma_mapping_error(tx->dev, addr))) {
   534			priv->dma_mapping_error++;
   535			goto drop;
   536		}
   537		buf = &info->buf;
   538		dma_unmap_len_set(buf, len, len);
   539		dma_unmap_addr_set(buf, dma, addr);
   540	
   541		payload_nfrags = shinfo->nr_frags;
   542		if (hlen < len) {
   543			/* For gso the rest of the linear portion of the skb needs to
   544			 * be in its own descriptor.
   545			 */
   546			payload_nfrags++;
   547			gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
   548					     1 + payload_nfrags, hlen, addr);
   549	
   550			len -= hlen;
   551			addr += hlen;
   552			seg_desc = &tx->desc[(tx->req + 1) & tx->mask];
   553			seg_idx_bias = 2;
   554			gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
   555		} else {
   556			seg_idx_bias = 1;
   557			gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
   558					     1 + payload_nfrags, hlen, addr);
   559		}
   560		idx = (tx->req + seg_idx_bias) & tx->mask;
   561	
   562		for (i = 0; i < payload_nfrags - (seg_idx_bias - 1); i++) {
   563			const skb_frag_t *frag = &shinfo->frags[i];
   564	
   565			seg_desc = &tx->desc[idx];
   566			len = skb_frag_size(frag);
   567			addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
   568			if (unlikely(dma_mapping_error(tx->dev, addr))) {
   569				priv->dma_mapping_error++;
   570				goto unmap_drop;
   571			}
   572			buf = &tx->info[idx].buf;
   573			tx->info[idx].skb = NULL;
   574			dma_unmap_len_set(buf, len, len);
   575			dma_unmap_addr_set(buf, dma, addr);
   576	
   577			gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
   578			idx = (idx + 1) & tx->mask;
   579		}
   580	
   581		return 1 + payload_nfrags;
   582	
   583	unmap_drop:
   584		i--;
   585		for (last_mapped = i + seg_idx_bias; last_mapped >= 0; last_mapped--) {
   586			idx = (tx->req + last_mapped) & tx->mask;
   587			gve_tx_unmap_buf(tx->dev, &tx->info[idx]);
   588		}
   589	drop:
   590		tx->dropped_pkt++;
   591		return 0;
   592	}
   593	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 27855 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-10-02  9:59 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-24  1:01 [PATCH net-next v3 0/4] GVE Raw Addressing David Awogbemila
2020-09-24  1:01 ` [PATCH net-next v3 1/4] gve: Add support for raw addressing device option David Awogbemila
2020-09-24 23:03   ` Jakub Kicinski
2020-09-24  1:01 ` [PATCH net-next v3 2/4] gve: Add support for raw addressing to the rx path David Awogbemila
2020-09-24  1:01 ` [PATCH net-next v3 3/4] gve: Rx Buffer Recycling David Awogbemila
2020-09-24 23:00   ` Jakub Kicinski
2020-09-30 16:10     ` David Awogbemila
2020-09-30 21:23       ` Jakub Kicinski
2020-09-24  1:01 ` [PATCH net-next v3 4/4] gve: Add support for raw addressing in the tx path David Awogbemila
2020-09-24 22:51   ` Jakub Kicinski
2020-09-30 16:09     ` David Awogbemila
2020-09-30 21:24       ` Jakub Kicinski
2020-10-02  9:54   ` kernel test robot
2020-10-02  9:54     ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.