* [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: David L Stevens @ 2014-09-29 23:48 UTC
  To: David Miller; +Cc: netdev, Sowmini Varadhan, Raghuram Kothakota

This patch removes pre-allocated transmit buffers and instead directly maps
pending packets on demand. This saves O(n^2) maximum-sized transmit buffers
for n hosts on a vswitch, as well as a copy into those buffers.
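
As a rough illustration (assumed figures derived from the driver's
constants, not measurements): each port pre-allocated VNET_TX_RING_SIZE
(512) buffers of VNET_MAXPACKET + 8 (1526) bytes, roughly 0.75MB per
port, and a fully meshed vswitch with n guests has on the order of
n(n-1) ports, so the pre-allocation grew quadratically with guest count.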

Single-stream TCP throughput from Linux to Solaris dropped ~5% at
1500-byte MTU, but Linux-to-Linux throughput at 1500 bytes increased ~20%.

Signed-off-by: David L Stevens <david.stevens@oracle.com>
---
 drivers/net/ethernet/sun/sunvnet.c |  218 ++++++++++++++++++++++++++++-------
 drivers/net/ethernet/sun/sunvnet.h |    9 ++-
 2 files changed, 182 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index b1abcad..8f5f4e3 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -780,6 +780,117 @@ struct vnet_port *tx_port_find(struct vnet *vp, struct sk_buff *skb)
 	return ret;
 }
 
+static struct sk_buff *vnet_clean_tx_ring(struct vnet_port *port,
+					  unsigned *pending)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct sk_buff *skb = NULL;
+	int i, txi;
+
+	*pending = 0;
+
+	txi = dr->prod-1;
+	if (txi < 0)
+		txi = VNET_TX_RING_SIZE-1;
+
+	for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
+		struct vio_net_desc *d;
+
+		d = vio_dring_entry(dr, txi);
+
+		if (d->hdr.state == VIO_DESC_DONE) {
+			if (port->tx_bufs[txi].skb) {
+				BUG_ON(port->tx_bufs[txi].skb->next);
+
+				port->tx_bufs[txi].skb->next = skb;
+				skb = port->tx_bufs[txi].skb;
+				port->tx_bufs[txi].skb = NULL;
+
+				ldc_unmap(port->vio.lp,
+					  port->tx_bufs[txi].cookies,
+					  port->tx_bufs[txi].ncookies);
+			}
+			d->hdr.state = VIO_DESC_FREE;
+		} else if (d->hdr.state == VIO_DESC_READY) {
+			(*pending)++;
+		} else if (d->hdr.state == VIO_DESC_FREE) {
+			break;
+		}
+		--txi;
+		if (txi < 0)
+			txi = VNET_TX_RING_SIZE-1;
+	}
+	return skb;
+}
+
+static inline void vnet_free_skbs(struct sk_buff *skb)
+{
+	struct sk_buff *next;
+
+	while (skb) {
+		next = skb->next;
+		skb->next = NULL;
+		dev_kfree_skb(skb);
+		skb = next;
+	}
+}
+
+static void vnet_clean_timer_expire(unsigned long port0)
+{
+	struct vnet_port *port = (struct vnet_port *)port0;
+	struct sk_buff *freeskbs;
+	unsigned pending;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->vio.lock, flags);
+	freeskbs = vnet_clean_tx_ring(port, &pending);
+	spin_unlock_irqrestore(&port->vio.lock, flags);
+
+	vnet_free_skbs(freeskbs);
+
+	if (pending)
+		(void)mod_timer(&port->clean_timer,
+				jiffies + VNET_CLEAN_TIMEOUT);
+	else
+		del_timer(&port->clean_timer);
+}
+
+static inline struct sk_buff *vnet_skb_shape(struct sk_buff *skb, void **pstart,
+					     int *plen)
+{
+	struct sk_buff *nskb;
+	int len, pad;
+
+	len = skb->len;
+	pad = 0;
+	if (len < ETH_ZLEN) {
+		pad += ETH_ZLEN - skb->len;
+		len += pad;
+	}
+	len += VNET_PACKET_SKIP;
+	pad += 8 - (len & 7);
+	len += 8 - (len & 7);
+
+	if (((unsigned long)skb->data & 7) != VNET_PACKET_SKIP ||
+	    skb_tailroom(skb) < pad ||
+	    skb_headroom(skb) < VNET_PACKET_SKIP) {
+		nskb = alloc_and_align_skb(skb->dev, skb->len);
+		skb_reserve(nskb, VNET_PACKET_SKIP);
+		if (skb_copy_bits(skb, 0, nskb->data, skb->len)) {
+			dev_kfree_skb(nskb);
+			dev_kfree_skb(skb);
+			return NULL;
+		}
+		(void)skb_put(nskb, skb->len);
+		dev_kfree_skb(skb);
+		skb = nskb;
+	}
+
+	*pstart = skb->data - VNET_PACKET_SKIP;
+	*plen = len;
+	return skb;
+}
+
 static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct vnet *vp = netdev_priv(dev);
@@ -788,12 +899,20 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct vio_net_desc *d;
 	unsigned long flags;
 	unsigned int len;
-	void *tx_buf;
-	int i, err;
+	struct sk_buff *freeskbs = NULL;
+	int i, err, txi;
+	void *start = NULL;
+	int nlen = 0;
+	unsigned pending = 0;
 
 	if (unlikely(!port))
 		goto out_dropped;
 
+	skb = vnet_skb_shape(skb, &start, &nlen);
+
+	if (unlikely(!skb))
+		goto out_dropped;
+
 	spin_lock_irqsave(&port->vio.lock, flags);
 
 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
@@ -811,14 +930,27 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	d = vio_dring_cur(dr);
 
-	tx_buf = port->tx_bufs[dr->prod].buf;
-	skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
+	txi = dr->prod;
+
+	freeskbs = vnet_clean_tx_ring(port, &pending);
+
+	BUG_ON(port->tx_bufs[txi].skb);
 
 	len = skb->len;
-	if (len < ETH_ZLEN) {
+	if (len < ETH_ZLEN)
 		len = ETH_ZLEN;
-		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
+
+	port->tx_bufs[txi].skb = skb;
+	skb = NULL;
+
+	err = ldc_map_single(port->vio.lp, start, nlen,
+			     port->tx_bufs[txi].cookies, 2,
+			     (LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));
+	if (err < 0) {
+		netdev_info(dev, "tx buffer map error %d\n", err);
+		goto out_dropped_unlock;
 	}
+	port->tx_bufs[txi].ncookies = err;
 
 	/* We don't rely on the ACKs to free the skb in vnet_start_xmit(),
 	 * thus it is safe to not set VIO_ACK_ENABLE for each transmission:
@@ -830,9 +962,9 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	d->hdr.ack = VIO_ACK_DISABLE;
 	d->size = len;
-	d->ncookies = port->tx_bufs[dr->prod].ncookies;
+	d->ncookies = port->tx_bufs[txi].ncookies;
 	for (i = 0; i < d->ncookies; i++)
-		d->cookies[i] = port->tx_bufs[dr->prod].cookies[i];
+		d->cookies[i] = port->tx_bufs[txi].cookies[i];
 
 	/* This has to be a non-SMP write barrier because we are writing
 	 * to memory which is shared with the peer LDOM.
@@ -876,7 +1008,7 @@ ldc_start_done:
 	port->start_cons = false;
 
 	dev->stats.tx_packets++;
-	dev->stats.tx_bytes += skb->len;
+	dev->stats.tx_bytes += port->tx_bufs[txi].skb->len;
 
 	dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
 	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
@@ -887,7 +1019,9 @@ ldc_start_done:
 
 	spin_unlock_irqrestore(&port->vio.lock, flags);
 
-	dev_kfree_skb(skb);
+	vnet_free_skbs(freeskbs);
+
+	(void)mod_timer(&port->clean_timer, jiffies + VNET_CLEAN_TIMEOUT);
 
 	return NETDEV_TX_OK;
 
@@ -895,7 +1029,14 @@ out_dropped_unlock:
 	spin_unlock_irqrestore(&port->vio.lock, flags);
 
 out_dropped:
-	dev_kfree_skb(skb);
+	if (skb)
+		dev_kfree_skb(skb);
+	vnet_free_skbs(freeskbs);
+	if (pending)
+		(void)mod_timer(&port->clean_timer,
+				jiffies + VNET_CLEAN_TIMEOUT);
+	else
+		del_timer(&port->clean_timer);
 	dev->stats.tx_dropped++;
 	return NETDEV_TX_OK;
 }
@@ -1097,17 +1238,22 @@ static void vnet_port_free_tx_bufs(struct vnet_port *port)
 	}
 
 	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
-		void *buf = port->tx_bufs[i].buf;
+		struct vio_net_desc *d;
+		void *skb = port->tx_bufs[i].skb;
 
-		if (!buf)
+		if (!skb)
 			continue;
 
+		d = vio_dring_entry(dr, i);
+		if (d->hdr.state == VIO_DESC_READY)
+			pr_warn("active transmit buffers freed\n");
+
 		ldc_unmap(port->vio.lp,
 			  port->tx_bufs[i].cookies,
 			  port->tx_bufs[i].ncookies);
-
-		kfree(buf);
-		port->tx_bufs[i].buf = NULL;
+		dev_kfree_skb(skb);
+		port->tx_bufs[i].skb = NULL;
+		d->hdr.state = VIO_DESC_FREE;
 	}
 }
 
@@ -1118,34 +1264,6 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
 	int i, err, ncookies;
 	void *dring;
 
-	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
-		void *buf = kzalloc(VNET_MAXPACKET + 8, GFP_KERNEL);
-		int map_len = (VNET_MAXPACKET + 7) & ~7;
-
-		err = -ENOMEM;
-		if (!buf)
-			goto err_out;
-
-		err = -EFAULT;
-		if ((unsigned long)buf & (8UL - 1)) {
-			pr_err("TX buffer misaligned\n");
-			kfree(buf);
-			goto err_out;
-		}
-
-		err = ldc_map_single(port->vio.lp, buf, map_len,
-				     port->tx_bufs[i].cookies, 2,
-				     (LDC_MAP_SHADOW |
-				      LDC_MAP_DIRECT |
-				      LDC_MAP_RW));
-		if (err < 0) {
-			kfree(buf);
-			goto err_out;
-		}
-		port->tx_bufs[i].buf = buf;
-		port->tx_bufs[i].ncookies = err;
-	}
-
 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
 
 	len = (VNET_TX_RING_SIZE *
@@ -1172,6 +1290,12 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
 	dr->pending = VNET_TX_RING_SIZE;
 	dr->ncookies = ncookies;
 
+	for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
+		struct vio_net_desc *d;
+
+		d = vio_dring_entry(dr, i);
+		d->hdr.state = VIO_DESC_FREE;
+	}
 	return 0;
 
 err_out:
@@ -1203,6 +1327,8 @@ static struct vnet *vnet_new(const u64 *local_mac)
 	dev = alloc_etherdev(sizeof(*vp));
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
+	dev->needed_headroom = VNET_PACKET_SKIP + 8;
+	dev->needed_tailroom = 8;
 
 	for (i = 0; i < ETH_ALEN; i++)
 		dev->dev_addr[i] = (*local_mac >> (5 - i) * 8) & 0xff;
@@ -1397,6 +1523,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	pr_info("%s: PORT ( remote-mac %pM%s )\n",
 		vp->dev->name, port->raddr, switch_port ? " switch-port" : "");
 
+	setup_timer(&port->clean_timer, vnet_clean_timer_expire,
+		    (unsigned long)port);
+
 	vio_port_up(&port->vio);
 
 	mdesc_release(hp);
@@ -1423,6 +1552,7 @@ static int vnet_port_remove(struct vio_dev *vdev)
 		unsigned long flags;
 
 		del_timer_sync(&port->vio.timer);
+		del_timer_sync(&port->clean_timer);
 
 		spin_lock_irqsave(&vp->lock, flags);
 		list_del(&port->list);
diff --git a/drivers/net/ethernet/sun/sunvnet.h b/drivers/net/ethernet/sun/sunvnet.h
index 986e04b..02f507d 100644
--- a/drivers/net/ethernet/sun/sunvnet.h
+++ b/drivers/net/ethernet/sun/sunvnet.h
@@ -11,6 +11,11 @@
  */
 #define VNET_TX_TIMEOUT			(5 * HZ)
 
+/* length of time (or less) we expect pending descriptors to be marked
+ * as VIO_DESC_DONE and skbs ready to be freed
+ */
+#define	VNET_CLEAN_TIMEOUT		((HZ/100)+1)
+
 #define VNET_MAXPACKET			1518ULL /* ETH_FRAMELEN + VLAN_HDR */
 #define VNET_TX_RING_SIZE		512
 #define VNET_TX_WAKEUP_THRESH(dr)	((dr)->pending / 4)
@@ -22,7 +27,7 @@
 #define VNET_PACKET_SKIP		6
 
 struct vnet_tx_entry {
-	void			*buf;
+	struct sk_buff		*skb;
 	unsigned int		ncookies;
 	struct ldc_trans_cookie	cookies[2];
 };
@@ -46,6 +51,8 @@ struct vnet_port {
 	bool			stop_rx;
 	bool			start_cons;
 
+	struct timer_list	clean_timer;
+
 	u64			rmtu;
 };
 
-- 
1.7.1


* Re: [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: Raghuram Kothakota @ 2014-10-02  5:50 UTC
  To: David L Stevens; +Cc: David Miller, netdev, Sowmini Varadhan

Sorry I am late in providing my comments, but I feel this one is
important to share.

> static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> 	struct vnet *vp = netdev_priv(dev);
> @@ -788,12 +899,20 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> 	struct vio_net_desc *d;
> 	unsigned long flags;
> 	unsigned int len;
> -	void *tx_buf;
> -	int i, err;
> +	struct sk_buff *freeskbs = NULL;
> +	int i, err, txi;
> +	void *start = NULL;
> +	int nlen = 0;
> +	unsigned pending = 0;
> 
> 	if (unlikely(!port))
> 		goto out_dropped;
> 
> +	skb = vnet_skb_shape(skb, &start, &nlen);
> +
> +	if (unlikely(!skb))
> +		goto out_dropped;
> +
> 	spin_lock_irqsave(&port->vio.lock, flags);
> 
> 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
> @@ -811,14 +930,27 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> 
> 	d = vio_dring_cur(dr);
> 
> -	tx_buf = port->tx_bufs[dr->prod].buf;
> -	skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
> +	txi = dr->prod;
> +
> +	freeskbs = vnet_clean_tx_ring(port, &pending);
> +
> +	BUG_ON(port->tx_bufs[txi].skb);
> 
> 	len = skb->len;
> -	if (len < ETH_ZLEN) {
> +	if (len < ETH_ZLEN)
> 		len = ETH_ZLEN;
> -		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
> +
> +	port->tx_bufs[txi].skb = skb;
> +	skb = NULL;
> +
> +	err = ldc_map_single(port->vio.lp, start, nlen,
> +			     port->tx_bufs[txi].cookies, 2,
> +			     (LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));


The LDC sharing protection mechanism operates at page granularity. If I
understand correctly, the vnet_skb_shape() function only addresses the
alignment requirement, but it still leaves the possibility of exporting far
more data to the peer than required. This could be treated as a security
issue; I wonder whether you considered it.
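
To make the page-granularity concern concrete, here is a minimal sketch
(a hypothetical helper, not anything in the patch) of the span a
page-granular export covers for an arbitrary buffer; it assumes the LDC
export unit is PAGE_SIZE and needs <linux/mm.h> for PAGE_MASK/PAGE_ALIGN:

	/* Bytes the peer can see when [start, start + len) is exported
	 * at page granularity: from the start of the first page to the
	 * end of the last page the buffer touches.
	 */
	static inline unsigned long ldc_exported_span(void *start,
						      unsigned long len)
	{
		unsigned long first = (unsigned long)start & PAGE_MASK;
		unsigned long last = PAGE_ALIGN((unsigned long)start + len);

		return last - first;	/* always >= len */
	}

For example, a 1526-byte buffer straddling a page boundary would report
2 * PAGE_SIZE here; everything beyond the 1526 bytes is the oversharing
being described.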


-Raghuram

* RE: [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: David Laight @ 2014-10-02  9:06 UTC
  To: 'Raghuram Kothakota', David L Stevens
  Cc: David Miller, netdev, Sowmini Varadhan

From: Raghuram Kothakota
> Sorry I am late in providing my comments, but I feel this one is
> important to share.

A comment on the original patch...

...
> > @@ -811,14 +930,27 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >
> > 	d = vio_dring_cur(dr);
> >
> > -	tx_buf = port->tx_bufs[dr->prod].buf;
> > -	skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
> > +	txi = dr->prod;
> > +
> > +	freeskbs = vnet_clean_tx_ring(port, &pending);
> > +
> > +	BUG_ON(port->tx_bufs[txi].skb);
> >
> > 	len = skb->len;
> > -	if (len < ETH_ZLEN) {
> > +	if (len < ETH_ZLEN)
> > 		len = ETH_ZLEN;
> > -		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
> > +

Aren't you transmitting 'random' bytes from the end of the data?
Plausibly they might not even be mapped.

Also, for short frames the copy could well be faster - especially on systems
with a non-trivial IOMMU.
It is also worth checking whether the original copy was aligned.
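
One way to get both (a hypothetical sketch, not a concrete proposal;
the name and threshold are made up and would need tuning by measurement)
is a copybreak test: copy small frames into a pre-mapped bounce buffer
and zero-copy map only the larger ones:

	/* Needs <linux/skbuff.h>.  Frames at or below the threshold are
	 * assumed cheaper to memcpy than to map/unmap through the IOMMU.
	 */
	#define VNET_COPYBREAK	256	/* assumed value */

	static inline bool vnet_tx_should_copy(const struct sk_buff *skb)
	{
		return skb->len <= VNET_COPYBREAK;
	}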

	David


* Re: [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: David L Stevens @ 2014-10-02 11:11 UTC
  To: Raghuram Kothakota; +Cc: David Miller, netdev, Sowmini Varadhan



On 10/02/2014 01:50 AM, Raghuram Kothakota wrote:


>> +	err = ldc_map_single(port->vio.lp, start, nlen,
>> +			     port->tx_bufs[txi].cookies, 2,
>> +			     (LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));
> 
> 
> The LDC sharing protection mechanism operates at page granularity. If I
> understand correctly, the vnet_skb_shape() function only addresses the
> alignment requirement, but it still leaves the possibility of exporting far
> more data to the peer than required. This could be treated as a security
> issue; I wonder whether you considered it.

Since the specific offsets and lengths are provided in the API, I didn't
realize it was sharing more than the provided buffer until you pointed that
out before I submitted the patches; I considered it at that point.

The original buffers were each kzalloc'ed at ~1500 bytes, meaning that they
potentially shared a page with arbitrary kernel memory.

This patch-set doesn't increase or decrease any security concerns related to oversharing
with other LDOMs. The extents outside the provided buffers are (now) skbs, and so packet
data, where before they could be any dynamically allocated kernel memory.
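
For scale (assumed figures, not measured): sparc64 uses 8KB base pages,
so each ~1.5KB kzalloc'ed buffer could leave several KB of unrelated
slab allocations visible on the same page, for every one of the 512
ring entries on every port.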

								+-DLS


* Re: [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: David L Stevens @ 2014-10-02 11:33 UTC
  To: David Laight, 'Raghuram Kothakota'
  Cc: David Miller, netdev, Sowmini Varadhan



On 10/02/2014 05:06 AM, David Laight wrote:

>>> 	len = skb->len;
>>> -	if (len < ETH_ZLEN) {
>>> +	if (len < ETH_ZLEN)
>>> 		len = ETH_ZLEN;
>>> -		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
>>> +
> 
> Aren't you transmitting 'random' bytes from the end of the data?
> Plausibly they might not even be mapped.

	No. The checks that decide whether to copy or not make sure the
extra 6-14 bytes at the beginning and 0-8 bytes (for large frames, more
for small) at the end are part of the particular skb's headroom and
tailroom respectively. So, they aren't random bytes-- they are what
was allocated as part of the frame. For TCP streams, the trailing bytes
are likely part of the next packet. They definitely exist and are mapped,
but I don't zero them because they are usually COW and that forces a copy
every time.
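
In other words (a simplified paraphrase of the patch's test, with a
made-up helper name, not new driver code), the skb is mapped in place
only when:

	/* Needs <linux/skbuff.h>; VNET_PACKET_SKIP comes from sunvnet.h.
	 * pad is the trailing padding/alignment byte count computed in
	 * vnet_skb_shape().
	 */
	static bool vnet_tx_in_place_ok(const struct sk_buff *skb, int pad)
	{
		return ((unsigned long)skb->data & 7) == VNET_PACKET_SKIP &&
		       skb_headroom(skb) >= VNET_PACKET_SKIP &&
		       skb_tailroom(skb) >= pad;
	}

otherwise the frame is copied into a freshly aligned skb.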

> 
> Also, for short frames the copy could well be faster - especially on systems
> with non-trivial iommu.
> It is also worth checking whether the original copy was aligned.

For the short frames, sure. I'm not sure what you mean by that last sentence--
the point of the checks that determine whether to copy or not is to enforce
alignment and length restrictions; if they aren't met in the original buffer,
we copy.

						+-DLS


* Re: [PATCHv9 net-next 2/4] sunvnet: make transmit path zero-copy in the kernel
From: Raghuram Kothakota @ 2014-10-02 23:11 UTC
  To: David L Stevens; +Cc: David Miller, netdev, Sowmini Varadhan


On Oct 2, 2014, at 4:11 AM, David L Stevens <david.stevens@oracle.com> wrote:

> 
> 
> On 10/02/2014 01:50 AM, Raghuram Kothakota wrote:
> 
> 
>>> +	err = ldc_map_single(port->vio.lp, start, nlen,
>>> +			     port->tx_bufs[txi].cookies, 2,
>>> +			     (LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));
>> 
>> 
>> The LDC sharing protection mechanism operates at page granularity. If I
>> understand correctly, the vnet_skb_shape() function only addresses the
>> alignment requirement, but it still leaves the possibility of exporting far
>> more data to the peer than required. This could be treated as a security
>> issue; I wonder whether you considered it.
> 
> Since the specific offsets and lengths are provided in the API, I didn't
> realize it was sharing more than the provided buffer until you pointed that
> out before I submitted the patches; I considered it at that point.
> 
> The original buffers were each kzalloc'ed at ~1500 bytes, meaning that they
> potentially shared a page with arbitrary kernel memory.

That is not good either; we do not want to share more than what the other
guest needs to see.

-Raghuram

