All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 10/17] Transmit and receive checksum offload support.
       [not found] <cover.1254667618.git.ssmith@weybridge.uk.xensource.com>
@ 2009-10-04 15:04 ` steven.smith
  2009-10-04 23:09 ` [PATCH 00/17] Netchannel2 for a modern git kernel Jeremy Fitzhardinge
  1 sibling, 0 replies; 13+ messages in thread
From: steven.smith @ 2009-10-04 15:04 UTC (permalink / raw)
  To: xen-devel; +Cc: Steven Smith, keir.frasier, jeremy, joserenato.santos

Signed-off-by: Steven Smith <steven.smith@citrix.com>
---
 drivers/net/xen-netchannel2/Makefile           |    2 +-
 drivers/net/xen-netchannel2/chan.c             |   16 ++++
 drivers/net/xen-netchannel2/netchannel2_core.h |   19 +++++
 drivers/net/xen-netchannel2/offload.c          |   93 ++++++++++++++++++++++++
 drivers/net/xen-netchannel2/recv_packet.c      |   30 ++++++++
 drivers/net/xen-netchannel2/xmit_packet.c      |   15 ++++
 include/xen/interface/io/netchannel2.h         |   44 +++++++++++-
 7 files changed, 215 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/xen-netchannel2/offload.c

diff --git a/drivers/net/xen-netchannel2/Makefile b/drivers/net/xen-netchannel2/Makefile
index d6641a1..565ba89 100644
--- a/drivers/net/xen-netchannel2/Makefile
+++ b/drivers/net/xen-netchannel2/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2.o
 
 netchannel2-objs := chan.o netchan2.o rscb.o util.o \
-	xmit_packet.o recv_packet.o poll.o
+	xmit_packet.o offload.o recv_packet.o poll.o
 
 ifeq ($(CONFIG_XEN_NETDEV2_BACKEND),y)
 netchannel2-objs += netback2.o
diff --git a/drivers/net/xen-netchannel2/chan.c b/drivers/net/xen-netchannel2/chan.c
index bd64e53..e8d3796 100644
--- a/drivers/net/xen-netchannel2/chan.c
+++ b/drivers/net/xen-netchannel2/chan.c
@@ -83,6 +83,9 @@ retry:
 		case NETCHANNEL2_MSG_FINISH_PACKET:
 			nc2_handle_finish_packet_msg(nc, ncrp, &hdr);
 			break;
+		case NETCHANNEL2_MSG_SET_OFFLOAD:
+			nc2_handle_set_offload(nc, ncrp, &hdr);
+			break;
 		case NETCHANNEL2_MSG_PAD:
 			break;
 		default:
@@ -127,6 +130,7 @@ done:
    event channel if necessary. */
 static void flush_rings(struct netchannel2_ring_pair *ncrp)
 {
+	struct netchannel2 *nc = ncrp->interface;
 	int need_kick;
 
 	flush_hypercall_batcher(&ncrp->pending_rx_hypercalls,
@@ -134,6 +138,8 @@ static void flush_rings(struct netchannel2_ring_pair *ncrp)
 	send_finish_packet_messages(ncrp);
 	if (ncrp->need_advertise_max_packets)
 		advertise_max_packets(ncrp);
+	if (nc->need_advertise_offloads)
+		advertise_offloads(nc);
 
 	need_kick = 0;
 	if (nc2_finish_messages(&ncrp->cons_ring)) {
@@ -374,6 +380,9 @@ struct netchannel2 *nc2_new(struct xenbus_device *xd)
 	nc->local_trusted = local_trusted;
 	nc->rings.filter_mac = filter_mac;
 
+	/* Default to RX csum on. */
+	nc->use_rx_csum = 1;
+
 	skb_queue_head_init(&nc->pending_skbs);
 	if (init_ring_pair(&nc->rings, nc) < 0) {
 		nc2_release(nc);
@@ -388,6 +397,7 @@ struct netchannel2 *nc2_new(struct xenbus_device *xd)
 	netdev->features = NETIF_F_LLTX;
 
 	SET_NETDEV_DEV(netdev, &xd->dev);
+	SET_ETHTOOL_OPS(netdev, &nc2_ethtool_ops);
 
 	err = read_mac_address(xd->nodename, "remote-mac",
 			       nc->rings.remote_mac);
@@ -473,6 +483,8 @@ int nc2_attach_rings(struct netchannel2 *nc,
 	_nc2_attach_rings(&nc->rings, cons_sring, cons_payload, cons_size,
 			  prod_sring, prod_payload, prod_size, otherend_id);
 
+	nc->need_advertise_offloads = 1;
+
 	spin_unlock_bh(&nc->rings.lock);
 
 	netif_carrier_on(nc->net_device);
@@ -537,6 +549,10 @@ void nc2_detach_rings(struct netchannel2 *nc)
 	if (nc->rings.irq >= 0)
 		unbind_from_irqhandler(nc->rings.irq, &nc->rings);
 	nc->rings.irq = -1;
+
+	/* Disable all offloads */
+	nc->net_device->features &= ~NETIF_F_IP_CSUM;
+	nc->allow_tx_csum_offload = 0;
 }
 
 #if defined(CONFIG_XEN_NETDEV2_BACKEND)
diff --git a/drivers/net/xen-netchannel2/netchannel2_core.h b/drivers/net/xen-netchannel2/netchannel2_core.h
index 429dd0c..296b606 100644
--- a/drivers/net/xen-netchannel2/netchannel2_core.h
+++ b/drivers/net/xen-netchannel2/netchannel2_core.h
@@ -243,6 +243,19 @@ struct netchannel2 {
 	/* Packets which we need to transmit soon */
 	struct sk_buff_head pending_skbs;
 
+	/* Task offload control.  These are all protected by the
+	 * lock. */
+	/* Ethtool allows us to use RX checksumming */
+	uint8_t use_rx_csum;
+	/* The remote endpoint allows us to use TX checksumming.
+	   Whether we actually use TX checksumming is controlled by
+	   the net device feature bits. */
+	uint8_t allow_tx_csum_offload;
+	/* At some point in the past, we tried to tell the other end
+	   what our current offload policy is and failed.  Try again
+	   as soon as possible. */
+	uint8_t need_advertise_offloads;
+
 	/* Flag to indicate that the interface is stopped
 	   When the interface is stopped we need to run the tasklet
 	   after we receive an interrupt so that we can wake it up */
@@ -355,6 +368,12 @@ void receive_pending_skbs(struct sk_buff_head *rx_queue);
 void nc2_queue_purge(struct netchannel2_ring_pair *ncrp,
 		     struct sk_buff_head *queue);
 
+void advertise_offloads(struct netchannel2 *nc);
+void nc2_handle_set_offload(struct netchannel2 *nc,
+			    struct netchannel2_ring_pair *ncrp,
+			    struct netchannel2_msg_hdr *hdr);
+extern struct ethtool_ops nc2_ethtool_ops;
+
 void nc2_init_poller(struct netchannel2_ring_pair *ncrp);
 void nc2_start_polling(struct netchannel2_ring_pair *ncrp);
 void nc2_stop_polling(struct netchannel2_ring_pair *ncrp);
diff --git a/drivers/net/xen-netchannel2/offload.c b/drivers/net/xen-netchannel2/offload.c
new file mode 100644
index 0000000..90d0a54
--- /dev/null
+++ b/drivers/net/xen-netchannel2/offload.c
@@ -0,0 +1,93 @@
+/* All the bits used to handle enabling and disabling the various
+ * offloads. */
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include "netchannel2_core.h"
+
+static int nc2_set_tx_csum(struct net_device *nd, u32 val);
+
+/* ---------------- Interface to the other domain ----------------------- */
+void nc2_handle_set_offload(struct netchannel2 *nc,
+			    struct netchannel2_ring_pair *ncrp,
+			    struct netchannel2_msg_hdr *hdr)
+{
+	struct netchannel2_msg_set_offload msg;
+	if (hdr->size != sizeof(msg)) {
+		pr_debug("Strange sized offload message: %d\n",
+			 hdr->size);
+		return;
+	}
+	if (ncrp != &nc->rings) {
+		pr_debug("Setting offloads on an ancillary ring!\n");
+		return;
+	}
+	nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size);
+	if (msg.csum != nc->allow_tx_csum_offload) {
+		nc->allow_tx_csum_offload = msg.csum;
+		nc2_set_tx_csum(nc->net_device, msg.csum);
+	}
+}
+
+/* Tell the other end what sort of offloads it's allowed to use. */
+void advertise_offloads(struct netchannel2 *nc)
+{
+	struct netchannel2_msg_set_offload msg;
+
+	memset(&msg, 0, sizeof(msg));
+
+	if (nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) {
+		msg.csum = nc->use_rx_csum;
+		nc2_send_message(&nc->rings.prod_ring,
+				 NETCHANNEL2_MSG_SET_OFFLOAD,
+				 0, &msg, sizeof(msg));
+		nc->need_advertise_offloads = 0;
+		nc->rings.pending_time_sensitive_messages = 1;
+	} else {
+		nc->need_advertise_offloads = 1;
+	}
+}
+
+
+
+/* ---------------------- Ethtool interface ---------------------------- */
+
+static int nc2_set_rx_csum(struct net_device *nd, u32 val)
+{
+	struct netchannel2 *nc = netdev_priv(nd);
+
+	spin_lock_bh(&nc->rings.lock);
+	if (nc->use_rx_csum != val) {
+		nc->use_rx_csum = val;
+		nc->need_advertise_offloads = 1;
+		spin_unlock_bh(&nc->rings.lock);
+		nc2_kick(&nc->rings);
+	} else {
+		spin_unlock_bh(&nc->rings.lock);
+	}
+
+	return 0;
+}
+
+static u32 nc2_get_rx_csum(struct net_device *nd)
+{
+	struct netchannel2 *nc = netdev_priv(nd);
+	return nc->use_rx_csum;
+}
+
+static int nc2_set_tx_csum(struct net_device *nd, u32 val)
+{
+	struct netchannel2 *nc = netdev_priv(nd);
+
+	/* Can't turn on TX csum offload if the other end can't do RX
+	   csum offload. */
+	if (val != 0 && !nc->allow_tx_csum_offload)
+		return -EOPNOTSUPP;
+	return ethtool_op_set_tx_csum(nd, val);
+}
+
+struct ethtool_ops nc2_ethtool_ops = {
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = nc2_set_tx_csum,
+	.get_rx_csum = nc2_get_rx_csum,
+	.set_rx_csum = nc2_set_rx_csum,
+};
diff --git a/drivers/net/xen-netchannel2/recv_packet.c b/drivers/net/xen-netchannel2/recv_packet.c
index 4678c28..0d4e593 100644
--- a/drivers/net/xen-netchannel2/recv_packet.c
+++ b/drivers/net/xen-netchannel2/recv_packet.c
@@ -132,6 +132,36 @@ void nc2_handle_packet_msg(struct netchannel2 *nc,
 			goto err;
 		}
 
+		switch (msg.flags & (NC2_PACKET_FLAG_data_validated |
+				     NC2_PACKET_FLAG_csum_blank)) {
+		case 0:
+			skb->ip_summed = CHECKSUM_NONE;
+			break;
+		case NC2_PACKET_FLAG_data_validated:
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		default:
+			/* csum_blank implies data_validated, so
+			   csum_blank and csum_blank|data_validated
+			   are equivalent. */
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			if (msg.csum_offset + 2 > skb->len) {
+				/* Whoops.  Assuming no bugs in our
+				   receive methods, the other end just
+				   requested checksum calculation
+				   beyond the end of the packet. */
+				if (net_ratelimit())
+					dev_warn(&nc->net_device->dev,
+						 "csum field too far through packet (%d, skb len %d, headlen %d)\n",
+						 msg.csum_offset, skb->len,
+						 skb_headlen(skb));
+				goto err;
+			}
+			skb->csum_start = msg.csum_start + skb_headroom(skb);
+			skb->csum_offset = msg.csum_offset - msg.csum_start;
+			break;
+		}
+
 		__skb_queue_tail(pending_rx_queue, skb);
 
 		if (ncrp->pending_rx_hypercalls.nr_pending_gops >=
diff --git a/drivers/net/xen-netchannel2/xmit_packet.c b/drivers/net/xen-netchannel2/xmit_packet.c
index a693a75..2783754 100644
--- a/drivers/net/xen-netchannel2/xmit_packet.c
+++ b/drivers/net/xen-netchannel2/xmit_packet.c
@@ -90,6 +90,19 @@ int prepare_xmit_allocate_resources(struct netchannel2 *nc,
 	return -1;
 }
 
+static void set_offload_flags(struct sk_buff *skb,
+			      volatile struct netchannel2_msg_packet *msg)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		msg->flags |=
+			NC2_PACKET_FLAG_csum_blank |
+			NC2_PACKET_FLAG_data_validated;
+		msg->csum_start = skb->csum_start - (skb->data - skb->head);
+		msg->csum_offset = msg->csum_start + skb->csum_offset;
+	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+		msg->flags |= NC2_PACKET_FLAG_data_validated;
+}
+
 /* Transmit a packet which has previously been prepared with
    prepare_xmit_allocate_resources(). */
 /* Once this has been called, the ring must not be flushed until the
@@ -139,6 +152,8 @@ int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
 	       skb_co->inline_prefix_size);
 	barrier();
 
+	set_offload_flags(skb, msg);
+
 	switch (skb_co->policy) {
 	case transmit_policy_small:
 		/* Nothing to do */
diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h
index c45963e..5a56eb9 100644
--- a/include/xen/interface/io/netchannel2.h
+++ b/include/xen/interface/io/netchannel2.h
@@ -53,15 +53,31 @@ struct netchannel2_msg_packet {
 	uint8_t pad1;
 	uint16_t prefix_size;
 	uint16_t pad2;
-	uint16_t pad3;
-	uint16_t pad4;
-	/* Variable-size array.	 The number of elements is determined
+	uint16_t csum_start;
+	uint16_t csum_offset;
+	/* Variable-size array.  The number of elements is determined
 	   by the size of the message. */
 	/* Until we support scatter-gather, this will be either 0 or 1
 	   element. */
 	struct netchannel2_fragment frags[0];
 };
 
+/* TX csum offload.  The transmitting domain has skipped a checksum
+ * calculation.  Before forwarding the packet on, the receiving domain
+ * must first perform a 16 bit IP checksum on everything from
+ * csum_start to the end of the packet, and then write the result to
+ * an offset csum_offset in the packet.  This should only be set if
+ * the transmitting domain has previously received a SET_OFFLOAD
+ * message with csum = 1.
+ */
+#define NC2_PACKET_FLAG_csum_blank 1
+/* RX csum offload.  The transmitting domain has already validated the
+ * protocol-level checksum on this packet (i.e. TCP or UDP), so the
+ * receiving domain shouldn't bother.  This does not tell you anything
+ * about the IP-level checksum.  This can be set on any packet,
+ * regardless of any SET_OFFLOAD messages which may or may not have
+ * been sent. */
+#define NC2_PACKET_FLAG_data_validated 2
 /* If set, the transmitting domain requires an event urgently when
  * this packet's finish message is sent.  Otherwise, the event can be
  * delayed. */
@@ -103,4 +119,26 @@ struct netchannel2_msg_finish_packet {
 	uint32_t id;
 };
 
+/* Tell the other end what sort of offloads we're going to let it use.
+ * An endpoint must not use any offload unless it has been enabled
+ * by a previous SET_OFFLOAD message. */
+/* Note that there is no acknowledgement for this message.  This means
+ * that an endpoint can continue to receive PACKET messages which
+ * require offload support for some time after it disables task
+ * offloading.  The endpoint is expected to handle this case correctly
+ * (which may just mean dropping the packet and returning a FINISH
+ * message, if appropriate).
+ */
+#define NETCHANNEL2_MSG_SET_OFFLOAD 4
+struct netchannel2_msg_set_offload {
+	struct netchannel2_msg_hdr hdr;
+	/* Checksum offload.  If this is 0, the other end must
+	 * calculate checksums before sending the packet.  If it is 1,
+	 * the other end does not have to perform the calculation.
+	 */
+	uint8_t csum;
+	uint8_t pad;
+	uint16_t reserved;
+};
+
 #endif /* !__NETCHANNEL2_H__ */
-- 
1.6.3.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
       [not found] <cover.1254667618.git.ssmith@weybridge.uk.xensource.com>
  2009-10-04 15:04 ` [PATCH 10/17] Transmit and receive checksum offload support steven.smith
@ 2009-10-04 23:09 ` Jeremy Fitzhardinge
  2009-10-05  9:29   ` Steven Smith
  1 sibling, 1 reply; 13+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-04 23:09 UTC (permalink / raw)
  To: steven.smith; +Cc: keir.frasier, xen-devel, joserenato.santos

On 10/04/09 08:04, steven.smith@citrix.com wrote:
> This is a forward port of the NC2 patches I posted against so that
> they apply to Jeremy's git tree, rather than the XCI patchqueue.  It's
> identical in most important respects, except that I've not included
> any of the VMQ patches.
>   

Do you have a git tree I can pull these from?  I have
git://xenbits.xensource.com/people/ssmith/netchannel2-pvops.git in my
list of repos, but you don't seem to have updated it.

Thanks,
    J

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-04 23:09 ` [PATCH 00/17] Netchannel2 for a modern git kernel Jeremy Fitzhardinge
@ 2009-10-05  9:29   ` Steven Smith
  2009-10-05 21:22     ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Smith @ 2009-10-05  9:29 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, keir.fraser, xen-devel, joserenato.santos


[-- Attachment #1.1: Type: text/plain, Size: 635 bytes --]

> > This is a forward port of the NC2 patches I posted against so that
> > they apply to Jeremy's git tree, rather than the XCI patchqueue.  It's
> > identical in most important respects, except that I've not included
> > any of the VMQ patches.
> Do you have a git tree I can pull these from?  I have
> git://xenbits.xensource.com/people/ssmith/netchannel2-pvops.git in my
> list of repos, but you don't seem to have updated it.
Oops, sorry about that.  It should all be in now, in the nc2/master
branch.

(Although you may want to wait until the discussion about the new
grant table interface is sorted out before pulling.)

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-05  9:29   ` Steven Smith
@ 2009-10-05 21:22     ` Jeremy Fitzhardinge
  2009-10-06  9:06       ` Steven Smith
  0 siblings, 1 reply; 13+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-05 21:22 UTC (permalink / raw)
  To: Steven Smith; +Cc: Steven Smith, keir.fraser, xen-devel, joserenato.santos

On 10/05/09 02:29, Steven Smith wrote:
>>> This is a forward port of the NC2 patches I posted against so that
>>> they apply to Jeremy's git tree, rather than the XCI patchqueue.  It's
>>> identical in most important respects, except that I've not included
>>> any of the VMQ patches.
>>>       
>> Do you have a git tree I can pull these from?  I have
>> git://xenbits.xensource.com/people/ssmith/netchannel2-pvops.git in my
>> list of repos, but you don't seem to have updated it.
>>     
> Oops, sorry about that.  It should all be in now, in the nc2/master
> branch.
>
> (Although you may want to wait until the discussion about the new
> grant table interface is sorted out before pulling.)
>   

Thanks.  I've pulled it anyway, but not yet merged it into anything yet.

What dependencies does it have on the rest of the xen changes?  In
general I like to avoid basing branches on xen/master, because it makes
them fairly brittle to rebase or otherwise rearrange.  Ideally I like to
base each topic branch on a mainline kernel version (like v2.6.31), or
if there are too many dependencies on other Xen topic branches, the most
specific one that covers the dependencies (xen/core, xen/dom0/core, for
example).

Also, does this include both front and backend parts together?  I think
it would be easier to deal with if you could split it into separate
common, frontend and backend branches.  Or maybe that would be too fiddly...

    J

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-05 21:22     ` Jeremy Fitzhardinge
@ 2009-10-06  9:06       ` Steven Smith
  2009-10-06 16:35         ` Steven Smith
  2009-10-06 17:06         ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 13+ messages in thread
From: Steven Smith @ 2009-10-06  9:06 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, xen-devel, Keir Fraser, joserenato.santos


[-- Attachment #1.1: Type: text/plain, Size: 2421 bytes --]

> >>> This is a forward port of the NC2 patches I posted against so that
> >>> they apply to Jeremy's git tree, rather than the XCI patchqueue.  It's
> >>> identical in most important respects, except that I've not included
> >>> any of the VMQ patches.
> >>>       
> >> Do you have a git tree I can pull these from?  I have
> >> git://xenbits.xensource.com/people/ssmith/netchannel2-pvops.git in my
> >> list of repos, but you don't seem to have updated it.
> >>     
> > Oops, sorry about that.  It should all be in now, in the nc2/master
> > branch.
> >
> > (Although you may want to wait until the discussion about the new
> > grant table interface is sorted out before pulling.)
> Thanks.  I've pulled it anyway, but not yet merged it into anything yet.
Okay.  I'm going to change the interface a bit following the review
comments; would you prefer I shove a fixup patch on the end or edit
history and keep the patches sensibly self-contained?

> What dependencies does it have on the rest of the xen changes?  In
> general I like to avoid basing branches on xen/master, because it makes
> them fairly brittle to rebase or otherwise rearrange.  Ideally I like to
> base each topic branch on a mainline kernel version (like v2.6.31), or
> if there are too many dependencies on other Xen topic branches, the most
> specific one that covers the dependencies (xen/core, xen/dom0/core, for
> example).
I needed to make some changes to netback to make forwarding packets
between NC1 and NC2 interfaces work, but apart from that it's fairly
self-contained.  Would you like me to rebase to
xen/dom0/backend/netback?

> Also, does this include both front and backend parts together?  I think
> it would be easier to deal with if you could split it into separate
> common, frontend and backend branches.  Or maybe that would be too fiddly...
It includes both.  I'm not sure it would be terribly helpful to split
them out: the distinction between frontend and backend is pretty
arbitrary for network devices anyway, and bypass support blurs the
line even further.

It'd certainly be possible to create the three branches you suggest,
but I'd guess you'd end up with almost everything going into the
common branch with the frontend and backend branches containing a
single patch each to add netfront2.c and netback2.c respectively.  I'm
not sure how useful that would be.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-06  9:06       ` Steven Smith
@ 2009-10-06 16:35         ` Steven Smith
  2009-10-06 17:12           ` Jeremy Fitzhardinge
  2009-10-06 17:06         ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 13+ messages in thread
From: Steven Smith @ 2009-10-06 16:35 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, xen-devel, Keir Fraser, joserenato.santos


[-- Attachment #1.1: Type: text/plain, Size: 5828 bytes --]

> > What dependencies does it have on the rest of the xen changes?  In
> > general I like to avoid basing branches on xen/master, because it makes
> > them fairly brittle to rebase or otherwise rearrange.  Ideally I like to
> > base each topic branch on a mainline kernel version (like v2.6.31), or
> > if there are too many dependencies on other Xen topic branches, the most
> > specific one that covers the dependencies (xen/core, xen/dom0/core, for
> > example).
> I needed to make some changes to netback to make forwarding packets
> between NC1 and NC2 interfaces work, but apart from that it's fairly
> self-contained.  Would you like me to rebase to
> xen/dom0/backend/netback?
Hmm... xen/dom0/backend/netback doesn't actually boot on my test box,
so that could be a little tricky:

<6>ACPI: IOAPIC (id[0x02] address[0xfec00000] gsi_base[0])
(XEN) mm.c:777:d0 Non-privileged (0) attempt to map I/O space 000fec00
<3>1 multicall(s) failed: cpu 0
Pid: 0, comm: swapper Not tainted 2.6.30 #27
Call Trace:
 [<c10031a2>] ? xen_mc_flush+0xa2/0x14c
 [<c100457a>] ? xen_set_iomap_pte+0x84/0x98
 [<c101b6a2>] ? set_pte_vaddr+0xdd/0x107
 [<c101a6f4>] ? __native_set_fixmap+0x1e/0x26
 [<c163d58f>] ? mp_register_ioapic+0xf5/0x10f
 [<c163d5d3>] ? acpi_parse_ioapic+0x2a/0x35
 [<c164bc1d>] ? acpi_table_parse_entries+0xa4/0x109
 [<c164bc95>] ? acpi_table_parse_madt+0x13/0x16
 [<c163d5a9>] ? acpi_parse_ioapic+0x0/0x35
 [<c163d833>] ? acpi_boot_init+0x189/0x2d6
 [<c1639716>] ? setup_arch+0x4fb/0x567
 [<c163bd1f>] ? reserve_early+0x36/0x3e
 [<c1637540>] ? start_kernel+0x66/0x257
 [<c1637fc6>] ? xen_start_kernel+0x428/0x430
<7>  call  1/1: op=1 arg=[c167e8e0] result=-22  xen_set_iomap_pte+0x39/0x98
<4>------------[ cut here ]------------
<4>WARNING: at arch/x86/xen/multicalls.c:182 xen_set_iomap_pte+0x84/0x98()
<4>Hardware name: To Be Filled By O.E.M.
<d>Modules linked in:
Pid: 0, comm: swapper Not tainted 2.6.30 #27
Call Trace:
 [<c10286b2>] ? warn_slowpath_common+0x41/0x71
 [<c10286cf>] ? warn_slowpath_common+0x5e/0x71
 [<c10286ec>] ? warn_slowpath_null+0xa/0xc
 [<c100457a>] ? xen_set_iomap_pte+0x84/0x98
 [<c101b6a2>] ? set_pte_vaddr+0xdd/0x107
 [<c101a6f4>] ? __native_set_fixmap+0x1e/0x26
 [<c163d58f>] ? mp_register_ioapic+0xf5/0x10f
 [<c163d5d3>] ? acpi_parse_ioapic+0x2a/0x35
 [<c164bc1d>] ? acpi_table_parse_entries+0xa4/0x109
 [<c164bc95>] ? acpi_table_parse_madt+0x13/0x16
 [<c163d5a9>] ? acpi_parse_ioapic+0x0/0x35
 [<c163d833>] ? acpi_boot_init+0x189/0x2d6
 [<c1639716>] ? setup_arch+0x4fb/0x567
 [<c163bd1f>] ? reserve_early+0x36/0x3e
 [<c1637540>] ? start_kernel+0x66/0x257
 [<c1637fc6>] ? xen_start_kernel+0x428/0x430
<4>---[ end trace 4eaa2a86a8e2da22 ]---
(XEN) d0:v0: unhandled page fault (ec=0002)
(XEN) Pagetable walk from 00000000ffbfa000:
(XEN)  L4[0x000] = 0000000079616027 0000000000001616
(XEN)  L3[0x003] = 0000000079690027 0000000000001690
(XEN)  L2[0x1fd] = 0000000079691067 0000000000001691
(XEN)  L1[0x1fa] = 0000000000000000 ffffffffffffffff
(XEN) domain_crash_sync called from entry.S
(XEN) Domain 0 (vcpu#0) crashed on cpu#0:
(XEN) ----[ Xen-3.5-unstable  x86_64  debug=y  Not tainted ]----
(XEN) CPU:    0
(XEN) RIP:    e019:[<00000000c1642697>]
(XEN) RFLAGS: 0000000000000246   EM: 1   CONTEXT: pv guest
(XEN) rax: 0000000000000000   rbx: 0000000000000000   rcx: 0000000000000000
(XEN) rdx: 0000000000000000   rsi: 00000000c101255f   rdi: 00000000ffbfa000
(XEN) rbp: 0000000000000002   rsp: 00000000c1631ea0   r8:  0000000000000000
(XEN) r9:  0000000000000000   r10: 0000000000000000   r11: 0000000000000000
(XEN) r12: 0000000000000000   r13: 0000000000000000   r14: 0000000000000000
(XEN) r15: 0000000000000000   cr0: 000000008005003b   cr4: 00000000000026f0
(XEN) cr3: 000000007d326000   cr2: 00000000ffbfa000
(XEN) ds: e021   es: e021   fs: e021   gs: e021   ss: e021   cs: e019
(XEN) Guest stack trace from esp=c1631ea0:
(XEN)   00000002 c1642697 0001e019 00010046 00000000 fec0047b c167e060 c1004af2
(XEN)   c1004af2 00000001 c100322e 00000000 79691067 ffbfa000 00000003 00000000
(XEN)   00000000 00000000 00000000 00000000 00000000 00000000 00000000 fec0047b
(XEN)   80000000 ffbfa000 ffbfa000 00000002 00000000 ffa003fc 00000000 80000000
(XEN)   00000000 ffa003fc c163d4f7 ffa003dc 00000001 c153d89b c163d5d3 ffa003dc
(XEN)   c164bc1d 00000001 0000006c ffa00390 00000000 00000000 c162cd20 c1631fb8
(XEN)   c164bc95 c163d5a9 00000040 c163d833 00000000 023ff000 c1639716 016f5d14
(XEN)   c163bd1f 016f5d14 00000000 00000000 c1631fdc c19ef000 00000000 00000000
(XEN)   c1631fdc c19ef000 00000000 c1637540 c1538568 c138d020 c1659360 c1631fe8
(XEN)   c1637fc6 00000000 c1631fec 00000000 00000000 00000000 00000000 1fc89b75
(XEN)   80000281 00020800 000006fb 00000001 00000000 c19ec000 00000000 00000000
(XEN)   c1382aeb c1382af3 c1003900 c16384a0 c1003aa0 c1003bd5 c1003df3 c10040d2
(XEN)   c1004231 c10042b9 c10045c9 c10046fe c100475d c16389b5 c16389bd c1382b45
(XEN)   c1382bce c1382bfd c100582a c10058a2 c1005a79 c1005ff3 c1006035 c1006047
(XEN)   c10064ec c10065de c1006ad4 c1006ae0 c1007e1b c1008572 c1008647 c1638fdc
(XEN)   c1639086 c10092d3 c163958d c1639595 c100a058 c163980a c1639868 c16398be
(XEN)   c1639917 c1639970 c16399c9 c1639a22 c1639a7b c1639ad4 c1639b2d c1639b86
(XEN)   c1639c02 c1639c10 c1639c6f c1639cce c1639d30 c1639d92 c1639df4 c163ab5c
(XEN)   c163ab64 c100ac41 c100ac4f c100acd3 c100ace1 c163c27f c163c292 c100ba3f
(XEN)   c100ba9e c100bd8f c100bdc6 c100be95 c100c02f c100c158 c100c181 c100c20b
(XEN) Domain 0 crashed: rebooting machine in 5 seconds.

This was presumably fixed by one of 5343 changesets which are present
in xen/master but not xen/dom0/backend/netback, so I'm not going to
try to chase it.

(For what it's worth, git rebase --onto seems to do roughly the right
thing, but it's kind of hard to be confident when I can't even boot.)

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-06  9:06       ` Steven Smith
  2009-10-06 16:35         ` Steven Smith
@ 2009-10-06 17:06         ` Jeremy Fitzhardinge
  2009-10-07  8:15           ` Steven Smith
  1 sibling, 1 reply; 13+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-06 17:06 UTC (permalink / raw)
  To: Steven Smith; +Cc: Steven Smith, xen-devel, Keir Fraser, joserenato.santos

On 10/06/09 02:06, Steven Smith wrote:
>> Thanks.  I've pulled it anyway, but not yet merged it into anything yet.
>>     
> Okay.  I'm going to change the interface a bit following the review
> comments; would you prefer I shove a fixup patch on the end or edit
> history and keep the patches sensibly self-contained?
>   

At this point I haven't done anything with the branch, so just rewrite
it to your heart's content.

BTW, do you see this is something as a candidate for merging upstream?

> I needed to make some changes to netback to make forwarding packets
> between NC1 and NC2 interfaces work, but apart from that it's fairly
> self-contained.  Would you like me to rebase to
> xen/dom0/backend/netback?
>   

Yep, sounds good.

An outstanding problem with netback is how we can deal with "foreign"
pages in dom0's network stack.  The current approach isn't viable for
upstream, but there has only been slow movement in coming up with a
better approach.

I haven't looked at nc2 yet, but does it use the same technique for
memory management, or something else?

> It'd certainly be possible to create the three branches you suggest,
> but I'd guess you'd end up with almost everything going into the
> common branch with the frontend and backend branches containing a
> single patch each to add netfront2.c and netback2.c respectively.  I'm
> not sure how useful that would be.
>   

OK, one branch is fine.

    J

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-06 16:35         ` Steven Smith
@ 2009-10-06 17:12           ` Jeremy Fitzhardinge
  2009-10-07 19:17             ` Steven Smith
  0 siblings, 1 reply; 13+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-06 17:12 UTC (permalink / raw)
  To: Steven Smith; +Cc: Steven Smith, xen-devel, Keir Fraser, joserenato.santos

On 10/06/09 09:35, Steven Smith wrote:
>>> What dependencies does it have on the rest of the xen changes?  In
>>> general I like to avoid basing branches on xen/master, because it makes
>>> them fairly brittle to rebase or otherwise rearrange.  Ideally I like to
>>> base each topic branch on a mainline kernel version (like v2.6.31), or
>>> if there are too many dependencies on other Xen topic branches, the most
>>> specific one that covers the dependencies (xen/core, xen/dom0/core, for
>>> example).
>>>       
>> I needed to make some changes to netback to make forwarding packets
>> between NC1 and NC2 interfaces work, but apart from that it's fairly
>> self-contained.  Would you like me to rebase to
>> xen/dom0/backend/netback?
>>     
> Hmm... xen/dom0/backend/netback doesn't actually boot on my test box,
> so that could be a little tricky:
>   

Yeah, I wouldn't expect the netback to boot on its own.  It is
functionally self-contained, but it doesn't have the other functionality
of the other branches.

My normal workflow is:

   1. git checkout -b hackingabout xen/master
   2. git merge xen/dom0/backend/netback
   3. (git tag hackbase)

Hack about (lots of little ad-hoc commits).
When happy:

   1. git rebase -i --onto xen/dom0/backend/netback hackbase (squash all
      the little commits into sensible groups, remove crud, reorder, etc)
   2. make sure it still works
   3. git checkout xen/dom0/backend/netback
   4. git reset --hard hackingabout
   5. git branch -d hackingabout
   6. git tag -d hackbase

    J

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-06 17:06         ` Jeremy Fitzhardinge
@ 2009-10-07  8:15           ` Steven Smith
  2009-10-20  6:00             ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Smith @ 2009-10-07  8:15 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, xen-devel, Keir Fraser, joserenato.santos


[-- Attachment #1.1: Type: text/plain, Size: 2244 bytes --]

> >> Thanks.  I've pulled it anyway, but not yet merged it into anything yet.
> >>     
> > Okay.  I'm going to change the interface a bit following the review
> > comments; would you prefer I shove a fixup patch on the end or edit
> > history and keep the patches sensibly self-contained?
> At this point I haven't done anything with the branch, so just rewrite
> it to your heart's content.
Okay.

> BTW, do you see this is something as a candidate for merging upstream?
I've mostly been defining ``upstream'' as you, but, yes, sending it
further would be good.

> > I needed to make some changes to netback to make forwarding packets
> > between NC1 and NC2 interfaces work, but apart from that it's fairly
> > self-contained.  Would you like me to rebase to
> > xen/dom0/backend/netback?
> An outstanding problem with netback is how we can deal with "foreign"
> pages in dom0's network stack.  The current approach isn't viable for
> upstream, but there has only been slow movement in coming up with a
> better approach.
> 
> I haven't looked at nc2 yet, but does it use the same technique for
> memory management, or something else?
The NC2 approach is basically similar to the NC1 approach, but
generalised so that NC1 and NC2 can cooperate in a reasonably sane
way.  It still uses the PG_foreign bit to identify foreign pages, and
the page->private and page->mapping fields for various bits of
information.

The basic idea is that everything which can map foreign pages and
expose them to the rest of Linux needs to allocate a foreign page
tracker (effectively an array of (domid, grant_ref, void *ctxt)
tuples), and to register mapped pages with that tracker.  It then uses
the top few bits of page->private to identify the tracker, and the
rest to index into the array.  This allows you to forward packets from
a foreign domain without knowing whether it was received by NC1 or
NC2.

Arguably, blkback should be using this mechanism as well, but since
we've gotten away with it so far I thought it'd be best to let
sleeping dogs lie.  The only time it'd make any difference would be
when pages out of a block request somehow get attached to network
packets, which seems unlikely.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-06 17:12           ` Jeremy Fitzhardinge
@ 2009-10-07 19:17             ` Steven Smith
  0 siblings, 0 replies; 13+ messages in thread
From: Steven Smith @ 2009-10-07 19:17 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, xen-devel, Keir Fraser, joserenato.santos


[-- Attachment #1.1: Type: text/plain, Size: 1311 bytes --]

> >>> What dependencies does it have on the rest of the xen changes?  In
> >>> general I like to avoid basing branches on xen/master, because it makes
> >>> them fairly brittle to rebase or otherwise rearrange.  Ideally I like to
> >>> base each topic branch on a mainline kernel version (like v2.6.31), or
> >>> if there are too many dependencies on other Xen topic branches, the most
> >>> specific one that covers the dependencies (xen/core, xen/dom0/core, for
> >>> example).
> >>>       
> >> I needed to make some changes to netback to make forwarding packets
> >> between NC1 and NC2 interfaces work, but apart from that it's fairly
> >> self-contained.  Would you like me to rebase to
> >> xen/dom0/backend/netback?
> >>     
> > Hmm... xen/dom0/backend/netback doesn't actually boot on my test box,
> > so that could be a little tricky:
> Yeah, I wouldn't expect the netback to boot on its own.  It is
> functionally self-contained, but it doesn't have the other functionality
> of the other branches.
Okay.  It took me a little while to figure out how to make git do the
desired thing, but the nc2/rebased-pq branch should now contain the
patch series, including the last set of interface changes, rebased
against xen/dom0/backend/netback.

Thank you for the hints.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-07  8:15           ` Steven Smith
@ 2009-10-20  6:00             ` Jeremy Fitzhardinge
  2009-10-20  9:40               ` Steven Smith
  0 siblings, 1 reply; 13+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-20  6:00 UTC (permalink / raw)
  To: Steven Smith
  Cc: Steven Smith, Ian Campbell, xen-devel, Keir Fraser, joserenato.santos

On 10/07/09 17:15, Steven Smith wrote:
>>>> Thanks.  I've pulled it anyway, but not yet merged it into anything yet.
>>>>     
>>>>         
>>> Okay.  I'm going to change the interface a bit following the review
>>> comments; would you prefer I shove a fixup patch on the end or edit
>>> history and keep the patches sensibly self-contained?
>>>       
>> At this point I haven't done anything with the branch, so just rewrite
>> it to your heart's content.
>>     
> Okay.
>
>   
>> BTW, do you see this is something as a candidate for merging upstream?
>>     
> I've mostly been defining ``upstream'' as you, but, yes, sending it
> further would be good.
>   

OK, but that's a fair bit more work.

> The NC2 approach is basically similar to the NC1 approach, but
> generalised so that NC1 and NC2 can cooperate in a reasonably sane
> way.  It still uses the PG_foreign bit to identify foreign pages, and
> the page->private and page->mapping fields for various bits of
> information.
>   

Unfortunately the PG_foreign approach is a non-starter for upstream,
mainly because adding new page flags is strongly frowned upon unless
there's a very compelling reason.  Unless we can find some other kernel
subsystems which can make use of a page destructor, we probably won't
make the cut.  (It doesn't help that there are no more page flags left
on 32-bit.)

The approach I'm trying at the moment is to use the skb destructor
mechanism to grab the pages out of the skb as its freed.  To deal with
skb_clone, I'm adding a flag to the skb to force a clone to do a
complete copy so there are no more aliases to the pages (skb_clone
should be rare in the common case).

> The basic idea is that everything which can map foreign pages and
> expose them to the rest of Linux needs to allocate a foreign page
> tracker (effectively an array of (domid, grant_ref, void *ctxt)
> tuples), and to register mapped pages with that tracker.  It then uses
> the top few bits of page->private to identify the tracker, and the
> rest to index into the array.  This allows you to forward packets from
> a foreign domain without knowing whether it was received by NC1 or
> NC2.
>   

Well, if its wrapped by a skb, we can get the skb destructor to handle
the cleanup phase.  So long as we get the callback, I don't think it
should affect the rest of the mechanism.

> Arguably, blkback should be using this mechanism as well, but since
> we've gotten away with it so far I thought it'd be best to let
> sleeping dogs lie.  The only time it'd make any difference would be
> when pages out of a block request somehow get attached to network
> packets, which seems unlikely.
>   

Block lifetimes are simpler because there's no cloning and bios have a
end_io callback which is more or less equivalent to the skb destructor.

    J

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-20  6:00             ` Jeremy Fitzhardinge
@ 2009-10-20  9:40               ` Steven Smith
  2009-10-23 22:06                 ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Smith @ 2009-10-20  9:40 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Ian Campbell, xen-devel, Keir Fraser, joserenato.santos


[-- Attachment #1.1: Type: text/plain, Size: 6880 bytes --]

> >> BTW, do you see this is something as a candidate for merging upstream?
> >>     
> > I've mostly been defining ``upstream'' as you, but, yes, sending it
> > further would be good.
> OK, but that's a fair bit more work.
Yes, indeed.  This is very much a long-term goal.

It might make sense to send an initial version which doesn't support
receiver-map mode first, because that avoids the whole PG_foreign
issue.  It'd be a bit slow, but it would work, and it'd be properly
cross-compatible with a receiver-map capable version.

> > The NC2 approach is basically similar to the NC1 approach, but
> > generalised so that NC1 and NC2 can cooperate in a reasonably sane
> > way.  It still uses the PG_foreign bit to identify foreign pages, and
> > the page->private and page->mapping fields for various bits of
> > information.
> Unfortunately the PG_foreign approach is a non-starter for upstream,
> mainly because adding new page flags is strongly frowned upon unless
> there's a very compelling reason.  Unless we can find some other kernel
> subsystems which can make use of a page destructor, we probably won't
> make the cut.  (It doesn't help that there are no more page flags left
> on 32-bit.)
Yeah, I didn't think that was going to go very far.

It might be possible to do something like:

1) Create a special struct address_space somewhere.  This wouldn't
   really do anything, but would just act as a placeholder.
2) Whenever we would normally set PG_foreign, set page->mapping to
   point at the placeholder address_space.
3) Rather than testing PG_foreign, test page->mapping == &placeholder.
4) Somehow move all of the Xen-specific bits which currently use
   ->mapping to use ->private instead.

Then we wouldn't need the page bit.  It's not even that much of an
abuse; foreign memory is arguably a special kind of address space, so
creating a struct address_space for it isn't insane.

> The approach I'm trying at the moment is to use the skb destructor
> mechanism to grab the pages out of the skb as its freed.  To deal with
> skb_clone, I'm adding a flag to the skb to force a clone to do a
> complete copy so there are no more aliases to the pages (skb_clone
> should be rare in the common case).
Yeah, that would work.  There needs to be some way for netback to get
grant references and so forth related to netchannel2-mapped pages, and
vice versa, but that shouldn't be too hard.

> > The basic idea is that everything which can map foreign pages and
> > expose them to the rest of Linux needs to allocate a foreign page
> > tracker (effectively an array of (domid, grant_ref, void *ctxt)
> > tuples), and to register mapped pages with that tracker.  It then uses
> > the top few bits of page->private to identify the tracker, and the
> > rest to index into the array.  This allows you to forward packets from
> > a foreign domain without knowing whether it was received by NC1 or
> > NC2.
> Well, if its wrapped by a skb, we can get the skb destructor to handle
> the cleanup phase.  So long as we get the callback, I don't think it
> should affect the rest of the mechanism.
Cleanup isn't the tricky part.  The problem is that you can't forward
a packet unless you know which domain it came from and the relevant
grant references, because Xen won't let you create grant references on
a mapping of another domain's memory.  You therefore need some way of
translating a struct page in an skb into a (domid_t, grant_ref_t)
pair.  netback currently handles this with some private lookup tables,
but that only works if it's the only thing which can inject foreign
mappings into the stack.  The foreign map tracker stuff was an attempt
to generalise this to work with multiple netback-like drivers.

> > Arguably, blkback should be using this mechanism as well, but since
> > we've gotten away with it so far I thought it'd be best to let
> > sleeping dogs lie.  The only time it'd make any difference would be
> > when pages out of a block request somehow get attached to network
> > packets, which seems unlikely.
> Block lifetimes are simpler because there's no cloning and bios have a
> end_io callback which is more or less equivalent to the skb destructor.
Yes, that's true, the cleanup bit is much easier for block requests,
but you still potentially have a forwarding issue.  There are a couple
of potentially problematic scenarios:

1) You might have nested block devices.  Suppose you have three
domains (domA, domB, and domC), and a physical block device sdX in
domA.  DomA could then be configured to run a blkback exposing sdX to
domB as xvdY.  DomB might then itself run a blkback exposing xvdY to
domC as xvdZ.  This won't work.  Requests issued by domC will be
mapped by domB's blkback and injected into its local storage stack,
and will eventually reach domB's xvdY blkfront.  This will try to
grant domA access to the relevant memory, but, because it doesn't know
about foreign mappings, it'll grant as if the memory was owned by
domB.  Xen will then reject domA's attempts to map these domB grants,
and every request on xvdZ will fail.

Admittedly, that'd be a rather stupid configuration, but it's not
currently blocked by the tools (and it'd be rather difficult to block,
even if we wanted to).

2) I've not actually checked this, but I suspect we have problem if
you're running an iSCSI initiator in dom0 against a target running in
a domU, and then try to expose the SCSI device in dom0 as a block
device in some other domU.  When requests come in from the blkfront,
the dom0 blkback will map them as foreign pages, and then pass them
off to the iSCSI initiator.  It would make sense for the pages in the
block request to get attached to the skb as fragment pages, rather
than copied.  When the skb eventually reaches netback, netback will
try to do a grant copy into the receiving netfront's buffers (because
PG_foreign isn't set), which will fail, because dom0 doesn't actually
own the pages.

As I say, I've not actually checked whether that's how the initiators
work, but it would be a sane implementation if you're talking to a NIC
with jumbogram support.



Thinking some more, there's another variant of this bug which doesn't
involve block devices at all: bridging between a netfront and a
netback.  If you have a single bridge with both netfront and netback
devices attached to it, and you're not in ALWAYS_COPY_SKB mode,
forwarding packets from the netback interface to the netfront one
won't work.  Packets received by netback will be foreign mappings, but
netfront doesn't know that, so when it sends packets to the backend
it'll set up grants as if they were in local memory, which won't work.
I'm not sure what the right fix for that is; probably just copying
the packet in netfront.

Steven.


[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 00/17] Netchannel2 for a modern git kernel
  2009-10-20  9:40               ` Steven Smith
@ 2009-10-23 22:06                 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 13+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-23 22:06 UTC (permalink / raw)
  To: Steven Smith
  Cc: Steven Smith, Ian Campbell, xen-devel, Keir Fraser, joserenato.santos

On 10/20/09 02:40, Steven Smith wrote:
> It might make sense to send an initial version which doesn't support
> receiver-map mode first, because that avoids the whole PG_foreign
> issue.  It'd be a bit slow, but it would work, and it'd be properly
> cross-compatible with a receiver-map capable version.
>   

I don't think there's any particular rush to get it upstream.  Getting
the core stuff up is more important; if there's a set of relatively
self-contained driver-like patches out of tree, then that's fairly easy
to manage (though getting everything upstream is the ultimate goal).

>>> The NC2 approach is basically similar to the NC1 approach, but
>>> generalised so that NC1 and NC2 can cooperate in a reasonably sane
>>> way.  It still uses the PG_foreign bit to identify foreign pages, and
>>> the page->private and page->mapping fields for various bits of
>>> information.
>>>       
>> Unfortunately the PG_foreign approach is a non-starter for upstream,
>> mainly because adding new page flags is strongly frowned upon unless
>> there's a very compelling reason.  Unless we can find some other kernel
>> subsystems which can make use of a page destructor, we probably won't
>> make the cut.  (It doesn't help that there are no more page flags left
>> on 32-bit.)
>>     
> Yeah, I didn't think that was going to go very far.
>
> It might be possible to do something like:
>
> 1) Create a special struct address_space somewhere.  This wouldn't
>    really do anything, but would just act as a placeholder.
> 2) Whenever we would normally set PG_foreign, set page->mapping to
>    point at the placeholder address_space.
> 3) Rather than testing PG_foreign, test page->mapping == &placeholder.
> 4) Somehow move all of the Xen-specific bits which currently use
>    ->mapping to use ->private instead.
>
> Then we wouldn't need the page bit.  It's not even that much of an
> abuse; foreign memory is arguably a special kind of address space, so
> creating a struct address_space for it isn't insane.
>   

Yes, that's an interesting idea.  There are a few instances of
non-usermode-visible filesystems for things like this, so there's
precedent.  (And maybe making it user-visible would be a clean way to
map foreign pages into usermode...).

But it still doesn't give us a callback to reclaim the page once its
done.  We could elevate the refcount to prevent the page from ever being
released, but we'd still have to go out an manually search for them
rather than getting proper notifications.

>> The approach I'm trying at the moment is to use the skb destructor
>> mechanism to grab the pages out of the skb as its freed.  To deal with
>> skb_clone, I'm adding a flag to the skb to force a clone to do a
>> complete copy so there are no more aliases to the pages (skb_clone
>> should be rare in the common case).
>>     
> Yeah, that would work.  There needs to be some way for netback to get
> grant references and so forth related to netchannel2-mapped pages, and
> vice versa, but that shouldn't be too hard.
>   

Large numbers of the struct page fields would be available for borrowing.

> Yes, that's true, the cleanup bit is much easier for block requests,
> but you still potentially have a forwarding issue.  There are a couple
> of potentially problematic scenarios:
>
> 1) You might have nested block devices.  Suppose you have three
> domains (domA, domB, and domC), and a physical block device sdX in
> domA.  DomA could then be configured to run a blkback exposing sdX to
> domB as xvdY.  DomB might then itself run a blkback exposing xvdY to
> domC as xvdZ.  This won't work.  Requests issued by domC will be
> mapped by domB's blkback and injected into its local storage stack,
> and will eventually reach domB's xvdY blkfront.  This will try to
> grant domA access to the relevant memory, but, because it doesn't know
> about foreign mappings, it'll grant as if the memory was owned by
> domB.  Xen will then reject domA's attempts to map these domB grants,
> and every request on xvdZ will fail.
>
> Admittedly, that'd be a rather stupid configuration, but it's not
> currently blocked by the tools (and it'd be rather difficult to block,
> even if we wanted to).
>   

It's not completely outlandish, but its a bit unfortunate that the
control-plane operations also require the page-data to be mapped in the
intermediate domain.  It would be nice to have some kind of bypass mode
so that the data can be mapped directly between A and C.

> 2) I've not actually checked this, but I suspect we have problem if
> you're running an iSCSI initiator in dom0 against a target running in
> a domU, and then try to expose the SCSI device in dom0 as a block
> device in some other domU.  When requests come in from the blkfront,
> the dom0 blkback will map them as foreign pages, and then pass them
> off to the iSCSI initiator.  It would make sense for the pages in the
> block request to get attached to the skb as fragment pages, rather
> than copied.  When the skb eventually reaches netback, netback will
> try to do a grant copy into the receiving netfront's buffers (because
> PG_foreign isn't set), which will fail, because dom0 doesn't actually
> own the pages.
>
> As I say, I've not actually checked whether that's how the initiators
> work, but it would be a sane implementation if you're talking to a NIC
> with jumbogram support.
>
>
>
> Thinking some more, there's another variant of this bug which doesn't
> involve block devices at all: bridging between a netfront and a
> netback.  If you have a single bridge with both netfront and netback
> devices attached to it, and you're not in ALWAYS_COPY_SKB mode,
> forwarding packets from the netback interface to the netfront one
> won't work.  Packets received by netback will be foreign mappings, but
> netfront doesn't know that, so when it sends packets to the backend
> it'll set up grants as if they were in local memory, which won't work.
> I'm not sure what the right fix for that is; probably just copying
> the packet in netfront.
>   

Yeah.  More instances of getting the data mixed up in control
operations.  If the domain is acting as an intermediate between two
other domains, it doesn't need to see or touch the data at all.  Net is
a bit more complex than block because the header is more closely mixed
up with the payload, but just copying the headers and passing
through-mapping references for the pages should do the trick.

And I think your AS idea works for distinguishing
transient-and-perhaps-not-present pages quite well.

    J

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-10-23 22:06 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1254667618.git.ssmith@weybridge.uk.xensource.com>
2009-10-04 15:04 ` [PATCH 10/17] Transmit and receive checksum offload support steven.smith
2009-10-04 23:09 ` [PATCH 00/17] Netchannel2 for a modern git kernel Jeremy Fitzhardinge
2009-10-05  9:29   ` Steven Smith
2009-10-05 21:22     ` Jeremy Fitzhardinge
2009-10-06  9:06       ` Steven Smith
2009-10-06 16:35         ` Steven Smith
2009-10-06 17:12           ` Jeremy Fitzhardinge
2009-10-07 19:17             ` Steven Smith
2009-10-06 17:06         ` Jeremy Fitzhardinge
2009-10-07  8:15           ` Steven Smith
2009-10-20  6:00             ` Jeremy Fitzhardinge
2009-10-20  9:40               ` Steven Smith
2009-10-23 22:06                 ` Jeremy Fitzhardinge

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.