All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] tilegx network driver: initial support
  2012-04-30 14:35                   ` Arnd Bergmann
@ 2001-09-17  4:00                     ` Chris Metcalf
  2012-05-03  5:41                       ` David Miller
  0 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2001-09-17  4:00 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This patch incorporates Arnd's comments about DEFINE_MUTEX(), per_cpu(),
and using module parameters.  I did not choose to break apart
tile_net_tx_tso() because when I did so, I ended up with sub-functions
requiring up to eleven parameters to carry the state around, and at
that point it seemed no better than the "one long function" model we
had before.

 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1949 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1952 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..6f7eaf4
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1949 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+
+#include <gxio/mpipe.h>
+
+/* For TSO */
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+
+#include <arch/sim.h>
+
+
+/* #define USE_SIM_PRINTF */
+
+#ifdef USE_SIM_PRINTF
+
+static __attribute__((unused, format (printf, 1, 2))) void
+sim_printf(const char *format, ...)
+{
+	char buf[1024];
+	const char *p;
+	va_list ap;
+
+	/* Render the message into a bounded local buffer. */
+	va_start(ap, format);
+	(void)vsnprintf(buf, sizeof(buf), format, ap);
+	va_end(ap);
+
+	/* Emit one character at a time.
+	 * NOTE: Copied from "sim_print()".
+	 */
+	p = buf;
+	while (*p != '\0') {
+		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+			     (*p << _SIM_CONTROL_OPERATOR_BITS));
+		p++;
+	}
+
+	/* Finish with the flush marker. */
+	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+		     (SIM_PUTC_FLUSH_BINARY << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+
+/* HACK: Allow use of "sim_printf()" instead of "printk()". */
+#define printk sim_printf
+
+#endif
+
+
+/* First, "tile_net_init_module()" initializes each network cpu to
+ * handle incoming packets, and initializes all the network devices.
+ *
+ * Then, "ifconfig DEVICE up" calls "tile_net_open()", which will
+ * turn on packet processing, if needed.
+ *
+ * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to
+ * stop egress, and possibly turn off packet processing.
+ *
+ * We start out with the ingress IRQ enabled on each CPU.  When it
+ * fires, it is automatically disabled, and we call "napi_schedule()".
+ * This will cause "tile_net_poll()" to be called, which will pull
+ * packets from the netio queue, filtering them out, or passing them
+ * to "netif_receive_skb()".  If our budget is exhausted, we will
+ * return, knowing we will be called again later.  Otherwise, we
+ * reenable the ingress IRQ, and call "napi_complete()".
+ *
+ *
+ * NOTE: Failing to free completions for an arbitrarily long time
+ * (which is defined to be illegal) does in fact cause bizarre problems.
+ *
+ * NOTE: The egress code can be interrupted by the interrupt handler.
+ */
+
+
+/* HACK: Define to support GSO.
+ * ISSUE: This may actually hurt performance of the TCP blaster.
+ */
+#undef TILE_NET_GSO
+
+/* HACK: Define to support TSO. */
+#define TILE_NET_TSO
+
+/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
+#define TILE_NET_TX_QUEUE_LEN 0
+
+/* Define to dump packets (prints out the whole packet on tx and rx). */
+#undef TILE_NET_DUMP_PACKETS
+
+/* Define to use "round robin" distribution. */
+#undef TILE_NET_ROUND_ROBIN
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll".
+ * This is the NAPI weight passed to "netif_napi_add()".
+ */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?
+ */
+#define TILE_NET_MAX_COMPS 64
+
+
+/* Round "n" up to a multiple of "align".  The mask arithmetic is only
+ * correct when "align" is a power of two.
+ */
+#define ROUND_UP(n, align) (((n) + (align) - 1) & -(align))
+
+
+/* NOTE(review): not referenced in this part of the file; presumably an
+ * upper bound on the fragments of a 64KB packet -- confirm against users.
+ */
+#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)
+
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+
+
+/* A "packet fragment" (a chunk of memory).
+ * NOTE(review): presumably "buf" is the start of the chunk and "length"
+ * its size in bytes; no user is visible in this part of the file.
+ */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+
+/* A single completion. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete.
+	 * This is the value handed to "gxio_mpipe_equeue_is_complete()".
+	 */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+
+/* The completions for a given cpu and device.
+ * Entries are pending while "comp_last < comp_next"; "comp_last" is
+ * reduced modulo TILE_NET_MAX_COMPS to index "comp_queue" (see
+ * "tile_net_free_comps()").
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+
+/* Info for a specific cpu. */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid (set only for cpus in "network_cpus_map"). */
+	bool has_iqueue;
+	/* NAPI flags, maintained by "tile_net_update_cpu()". */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel (set up in "tile_net_init_cpus()"). */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+
+/* Info for egress on a particular egress channel.
+ * Both fields are filled in once by "tile_net_init_egress()"; a NULL
+ * "equeue" means the channel has not been initialized yet.
+ */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO (EQUEUE_ENTRIES * HEADER_BYTES bytes). */
+	unsigned char *headers;
+};
+
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel).
+	 * "tile_net_open()" picks "loopify_channel" when it is >= 0.
+	 */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks" (-1 until allocated). */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq.  While this is negative, "tile_net_open()" treats
+ * the mPIPE state as not yet initialized and calls "tile_net_init_cpus()".
+ */
+static int ingress_irq = -1;
+
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+
+/* Parse the "tile_net.cpus" module parameter, which names the cpus
+ * that are dedicated to handling ingress packets.
+ *
+ * The parameter is a cpu list of the form "tile_net.cpus=m-n[,x-y]".
+ * After parsing, the list is intersected with the possible cpus.
+ * NOTE(review): an earlier comment said the listed cpus must be neither
+ * dedicated nor dataplane cpus; nothing here enforces that -- confirm.
+ *
+ * Returns true if "network_cpus_map" ends up holding a non-empty set,
+ * and false if the parameter is absent, malformed, or selects nothing.
+ */
+static bool network_cpus_init(void)
+{
+	char cpus_text[1024];
+
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map) != 0) {
+		pr_warning("tile_net.cpus=%s: malformed cpu list\n",
+			   network_cpus_string);
+		return false;
+	}
+
+	/* Keep only the cpus that are possible on this system. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warning("Ignoring empty tile_net.cpus='%s'.\n",
+			   network_cpus_string);
+		return false;
+	}
+
+	/* Report the final selection. */
+	cpulist_scnprintf(cpus_text, sizeof(cpus_text), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", cpus_text);
+
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+
+#ifdef TILE_NET_DUMP_PACKETS
+/* Dump a packet as rows of up to 16 hex bytes, each row prefixed with
+ * the offset of its first byte.
+ *
+ * "data"/"length" describe the bytes to print; "s" is a short tag
+ * (for example "rx") that is printed in the banner line.
+ */
+static void dump_packet(const unsigned char *data, unsigned long length,
+			const char *s)
+{
+	unsigned long i;
+	static unsigned int count;
+	/* One output row: offset prefix plus up to 16 " xx" groups. */
+	char buf[128];
+	int pos = 0;
+
+	pr_info("Dumping %s packet of 0x%lx bytes at %p [%u]\n",
+		s, length, data, count++);
+
+	pr_info("\n");
+
+	for (i = 0; i < length; i++) {
+		/* Start a new row every 16 bytes. */
+		if ((i & 0xf) == 0)
+			pos = scnprintf(buf, sizeof(buf), "%8.8lx:", i);
+
+		/* Append bounded, tracking the write position ourselves. */
+		pos += scnprintf(buf + pos, sizeof(buf) - pos,
+				 " %02x", data[i]);
+
+		/* Flush a full row, or the final partial row. */
+		if ((i & 0xf) == 0xf || i == length - 1)
+			pr_info("%s\n", buf);
+	}
+
+	pr_info("\n");
+}
+#endif
+
+
+/* Allocate one receive buffer and push it onto an mPIPE buffer stack.
+ *
+ * "small" selects the small stack (128 payload bytes) rather than the
+ * large one (1664 payload bytes).  A pointer to the owning skb is stored
+ * immediately before the aligned data area so that the receive path can
+ * find the skb again.  Returns false if the skb allocation fails.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	/* Buffers must be aligned. */
+	const unsigned long align = 128;
+	struct sk_buff **backptr;
+	struct sk_buff *skb;
+	int stack;
+	int len;
+
+	if (small)
+		stack = small_buffer_stack;
+	else
+		stack = large_buffer_stack;
+
+	/* Back-pointer, plus alignment slack, plus the payload itself.
+	 * Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
+	 * and also "reserves" that many bytes.
+	 */
+	len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
+
+	skb = dev_alloc_skb(len);
+	if (!skb)
+		return false;
+
+	/* Leave room for the back-pointer, then advance to alignment. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Record the owning skb just below the data area. */
+	backptr = (struct sk_buff **)skb->data - 1;
+	*backptr = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+
+/* Replenish mPIPE with the buffers this cpu has consumed.
+ *
+ * Small buffers are provided first, then large ones.  The first failed
+ * allocation stops the whole refill; the "needed" counters keep the
+ * remaining debt so that a later call can try again.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	bool ok = true;
+
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+
+/* Handle one ingress packet descriptor.
+ *
+ * The packet is dropped if mPIPE had no buffer for it ("idesc->be"),
+ * if no device is bound to its channel, if the device is not up, or if
+ * the device is not promiscuous and the frame is a unicast frame for
+ * some other address.  Otherwise the owning skb is recovered through
+ * the back-pointer stored in front of the buffer, passed to
+ * "netif_receive_skb()", and a replacement buffer is marked as needed.
+ *
+ * The idesc is consumed on every path.
+ * Return true if "processed", false if "filtered".
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				    gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+
+	void *va;
+
+	uint8_t l2_offset = gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	void *buf;
+	unsigned long len;
+
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the raw buffer VA. */
+	va = tile_io_addr_to_va((unsigned long)gxio_mpipe_idesc_get_va(idesc));
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = gxio_mpipe_idesc_get_l2_length(idesc);
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	dump_packet(buf, len, "rx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 * FIXME: The classifier will soon detect "multicast".
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address. */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+
+		/* Acquire the associated "skb". */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia. */
+		if (skb->data != va)
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p\n",
+			      buf, skb, skb->data);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update stats.  The counters are "unsigned long", so use
+		 * the long-sized atomic add; an "atomic_t" add is an "int"
+		 * operation and would not update the full counter on a
+		 * 64-bit kernel.
+		 */
+		BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(unsigned long));
+		atomic_long_add(1, (atomic_long_t *)&priv->stats.rx_packets);
+		atomic_long_add(len, (atomic_long_t *)&priv->stats.rx_bytes);
+
+		/* Need a new buffer. */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+
+/* NAPI poll routine: handle some packets for the current CPU.
+ *
+ * Descriptors are peeked in runs; processing stops early once "budget"
+ * packets were accepted, or once TILE_NET_BATCH idescs of a single run
+ * were walked.  If the iqueue drains first, NAPI is completed and the
+ * ring interrupt is re-enabled.  Returns the number of accepted packets.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	gxio_mpipe_idesc_t *idesc;
+	unsigned int work = 0;
+	int avail, k;
+
+	/* Process packets, one peeked run at a time. */
+	for (;;) {
+		avail = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc);
+		if (avail <= 0)
+			break;
+
+		for (k = 0; k < avail; k++) {
+			if (k == TILE_NET_BATCH)
+				goto done;
+
+			/* Filtered packets do not count against the budget. */
+			if (!tile_net_handle_packet(info, &idesc[k]))
+				continue;
+
+			work++;
+			if (work >= budget)
+				goto done;
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+
+	/* Replace whatever buffers were handed to the stack above. */
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+
+/* Ingress interrupt handler: defer all work to this cpu's NAPI poll. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+
+/* Free the skbs of completions that the hardware has finished with.
+ * This must be called with interrupts blocked.
+ *
+ * Pending entries are walked oldest first, stopping at the first one
+ * that is not yet complete or after "limit" entries were freed (a
+ * negative "limit" never matches, i.e. means "no limit").  The "update"
+ * argument of the completion test is set for the first entry, and for
+ * every entry when "force_update" is true.
+ */
+static void tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed = 0;
+
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int slot = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *entry = &comps->comp_queue[slot];
+		bool update = force_update || freed == 0;
+
+		if (!gxio_mpipe_equeue_is_complete(equeue, entry->when,
+						   update))
+			break;
+
+		dev_kfree_skb_irq(entry->skb);
+		comps->comp_last++;
+
+		freed++;
+		if (freed == limit)
+			break;
+	}
+}
+
+
+/* Arm the egress timer for the next jiffy unless it is already armed.
+ *
+ * Note that we use "schedule if not scheduled" logic instead of the more
+ * obvious "reschedule" logic, because "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+
+	mod_timer_pinned(&info->egress_timer, jiffies + 1);
+	info->egress_timer_scheduled = true;
+}
+
+
+/* The "function" for "info->egress_timer"; "arg" is the owning
+ * "struct tile_net_info".
+ *
+ * Frees every completed egress buffer on every channel for this tile,
+ * and re-arms the timer as long as any completion is still pending.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+	unsigned long irqflags;
+	bool pending = false;
+	unsigned int i;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+
+		/* Nothing outstanding on this channel. */
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+
+		tile_net_free_comps(egress_for_echannel[i].equeue,
+				    comps, -1, true);
+
+		if (comps->comp_last < comps->comp_next)
+			pending = true;
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+
+/* Per-cpu preparation: record the cpu number, mark the iqueue as not
+ * yet valid, and set up (but do not arm) the egress timer.
+ */
+static void tile_net_prepare_cpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct timer_list *timer = &info->egress_timer;
+
+	info->my_cpu = smp_processor_id();
+	info->has_iqueue = false;
+
+	/* Initialize the egress timer. */
+	init_timer(timer);
+	timer->function = tile_net_handle_egress_timer;
+	timer->data = (long)info;
+}
+
+
+/* Helper function for "tile_net_update()", run on each cpu.
+ *
+ * "arg" is a net_device to attach NAPI to, or NULL when no device is
+ * active.  Cpus without an iqueue are left untouched.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = arg;
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		/* No active device: quiesce ingress on this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	if (!info->napi_added) {
+		/* FIXME: HACK: We use one of the devices.
+		 * ISSUE: We never call "netif_napi_del()".
+		 */
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ *
+ * Rebuilds the classifier rules from "tile_net_devs_for_channel" (one
+ * rule covering every channel that currently has a device), commits
+ * them, and then runs "tile_net_update_cpu()" on every online cpu with
+ * the first active device (or NULL if there is none).
+ *
+ * Returns 0 on success, or -EIO if the rules could not be committed.
+ */
+static int tile_net_update(void)
+{
+	struct net_device *dev = NULL;
+	int channel;
+	int cpu;
+
+	/* HACK: This is too big for the linux stack. */
+	static gxio_mpipe_rules_t rules;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	/* TODO: Add support for "dmac" splitting? */
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (dev == NULL) {
+			/* First active channel: open the (single) rule. */
+			dev = tile_net_devs_for_channel[channel];
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can happen if there is no classifier.
+	 * ISSUE: Can anything else cause it to happen?
+	 */
+	if (gxio_mpipe_rules_commit(&rules) != 0) {
+		pr_warning("Failed to update classifier rules!\n");
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu, dev, 1);
+
+	/* HACK: Allow packets to flow.  This is keyed on "some channel
+	 * has a device"; the counter that used to guard it was never
+	 * incremented, so the call could not be reached.
+	 */
+	if (dev != NULL)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+
+/* Helper function for "tile_net_init_cpus()".
+ *
+ * Allocates and initializes the small and large mPIPE buffer stacks,
+ * registers client memory for both, and fills each stack with
+ * "network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH)" buffers.
+ * Any failure here is fatal (panic).
+ */
+static void tile_net_init_stacks(int network_cpus_count)
+{
+	int err;
+	int i;
+
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+
+	int num_buffers;
+
+	size_t stack_bytes;
+
+	pte_t pte = { 0 };
+
+	void *mem;
+
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes, honoring the 64KB minimum alignment. */
+	stack_bytes = ROUND_UP(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			       64 * 1024);
+	if (stack_bytes > HPAGE_SIZE)
+		panic("Cannot allocate %d physically contiguous buffers.",
+		      num_buffers);
+
+	/* Allocate two buffer stacks. */
+	small_buffer_stack = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (small_buffer_stack < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buffer_stacks()'");
+	large_buffer_stack = small_buffer_stack + 1;
+
+	/* Allocate the small memory stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					   small_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Allocate the large buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					   large_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Pin all the client memory.  The messages name the function
+	 * that is actually called, so a panic points at the right API.
+	 */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	err = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+	err = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+
+	/* Provide initial buffers. */
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true))
+			panic("Cannot provide initial buffers!");
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false))
+			panic("Cannot provide initial buffers!");
+	}
+}
+
+
+/* Actually initialize the mPIPE state.
+ *
+ * In order: initializes the mPIPE context, sets up the buffer stacks,
+ * allocates one NotifRing per network cpu, gives every online cpu its
+ * "comps" arrays (and every network cpu an iqueue), sets up the
+ * NotifGroup and buckets, and finally creates the per-cpu ingress irq
+ * and binds each iqueue's ring to it.
+ *
+ * Returns 0 on success or -EIO for the early, recoverable failures;
+ * later failures panic.  Nothing is released on the -EIO paths.
+ */
+static int tile_net_init_cpus(void)
+{
+	int network_cpus_count;
+
+	int ring;
+	int group;
+
+	int next_ring;
+
+	int cpu;
+
+	int i;
+
+#ifdef TILE_NET_ROUND_ROBIN
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_ROUND_ROBIN;
+#else
+	/* Use random rebalancing. */
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY;
+#endif
+
+	if (!hash_default) {
+		pr_warning("Networking requires hash_default!\n");
+		goto fail;
+	}
+
+	if (gxio_mpipe_init(&context, 0) != 0) {
+		pr_warning("Failed to initialize mPIPE!\n");
+		goto fail;
+	}
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	/* ISSUE: Handle failures more gracefully. */
+	tile_net_init_stacks(network_cpus_count);
+
+	/* Allocate one NotifRing for each network cpu. */
+	ring = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					    0, 0);
+	if (ring < 0) {
+		pr_warning("Failed to allocate notif rings.\n");
+		goto fail;
+	}
+
+	/* ISSUE: Handle failures below more cleanly. */
+
+	/* Init NotifRings.  Rings are handed out consecutively, starting
+	 * at "ring", to the online cpus that are in "network_cpus_map".
+	 */
+	next_ring = ring;
+
+	for_each_online_cpu(cpu) {
+
+		size_t notif_ring_size =
+			IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		/* ISSUE: This is overkill. */
+		size_t comps_size =
+			TILE_NET_CHANNELS * sizeof(struct tile_net_comps);
+
+		/* Allocate the "comps".  Every online cpu gets these, not
+		 * only the network cpus.
+		 */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate comps memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		/* ISSUE: Is this needed? */
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate iqueue memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+
+		if (gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					   addr, notif_ring_size, 0) != 0)
+			panic("Failure in 'gxio_mpipe_iqueue_init()'.");
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0)
+		panic("Failure in 'gxio_mpipe_alloc_notif_groups()'.");
+
+#ifndef TILE_NET_ROUND_ROBIN
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+#endif
+
+	/* Allocate some buckets. */
+	first_bucket = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (first_bucket < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buckets()'.");
+
+	/* Init group and buckets. */
+	if (gxio_mpipe_init_notif_group_and_buckets(&context, group, ring,
+						    network_cpus_count,
+						    first_bucket, num_buckets,
+						    mode) != 0)
+		panic("Fail in 'gxio_mpipe_init_notif_group_and_buckets().");
+
+
+	/* Create an irq and register it.
+	 * NOTE(review): "request_irq()" is called inside "BUG_ON()", so
+	 * the registration is a side effect of the assertion expression.
+	 */
+	ingress_irq = create_irq();
+	if (ingress_irq < 0)
+		panic("Failed to create irq for ingress.");
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	BUG_ON(request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			   0, NULL, NULL) != 0);
+
+	for_each_online_cpu(cpu) {
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		/* NOTE: this "ring" shadows the function-level "ring". */
+		int ring = info->iqueue.ring;
+
+		if (!info->has_iqueue)
+			continue;
+
+		gxio_mpipe_request_notif_ring_interrupt(&context,
+							cpu_x(cpu), cpu_y(cpu),
+							1, ingress_irq, ring);
+	}
+
+	return 0;
+
+fail:
+	return -EIO;
+}
+
+
+/* Create persistent egress info for a given egress channel.
+ *
+ * Allocates the TSO header area, the eDMA descriptor ring and the
+ * equeue bookkeeping, allocates an eDMA ring, and initializes the
+ * equeue.  The work is done only once per channel: if the channel
+ * already has an equeue, this returns 0 immediately.
+ *
+ * Returns 0 on success, or -EIO (with all pages released) if an
+ * allocation fails.  A failing "gxio_mpipe_equeue_init()" panics.
+ *
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(int echannel)
+{
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct page *headers_page, *edescs_page, *equeue_page;
+	gxio_mpipe_equeue_t *equeue;
+	gxio_mpipe_edesc_t *edescs;
+	unsigned char *headers;
+	size_t headers_order;
+	size_t edescs_size;
+	int edescs_order, equeue_order;
+	int edma;
+
+	/* Only initialize once. */
+	if (egress->equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (!headers_page) {
+		pr_warning("Could not allocate memory for TSO headers.\n");
+		goto err_out;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (!edescs_page) {
+		pr_warning("Could not allocate memory for eDMA ring.\n");
+		goto err_free_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (!equeue_page) {
+		pr_warning("Could not allocate memory for equeue info.\n");
+		goto err_free_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring. */
+	edma = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (edma < 0) {
+		pr_warning("Could not allocate edma ring.\n");
+		goto err_free_equeue;
+	}
+
+	/* Initialize the equeue.  This should not fail. */
+	if (gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				   edescs, edescs_size, 0) != 0)
+		panic("Failure in 'gxio_mpipe_equeue_init()'.");
+
+	/* Done: publish the fully initialized state. */
+	egress->equeue = equeue;
+	egress->headers = headers;
+	return 0;
+
+	/* Unwind in reverse order of allocation. */
+err_free_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+err_free_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+err_free_headers:
+	__free_pages(headers_page, headers_order);
+
+err_out:
+	return -EIO;
+}
+
+
+/* Help the kernel activate the given network interface.
+ *
+ * Everything up to and including tile_net_update() runs with
+ * tile_net_devs_for_channel_mutex held: tile_net_init_cpus() is called
+ * if "ingress_irq" is still negative, the link(s) for this device are
+ * opened if they are not open already, egress is set up for the chosen
+ * channel, and the device is entered in tile_net_devs_for_channel[].
+ * The transmit queue and carrier are switched on only after all of
+ * that has succeeded.
+ *
+ * Returns 0 on success, or a negative error code.  On failure any open
+ * link is closed again and "echannel" is reset to -1.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	bool loopify = false;
+	int result;
+
+	/* Determine if this is the "loopify" device. */
+	if (loopify_link_name != NULL &&
+	    strcmp(dev->name, loopify_link_name) == 0)
+		loopify = true;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	if (ingress_irq < 0) {
+		result = tile_net_init_cpus();
+		if (result != 0)
+			goto fail;
+	}
+
+	/* Open the primary link, unless it is open already. */
+	if (priv->channel < 0) {
+		const char *ln = dev->name;
+
+		if (loopify)
+			ln = "loop0";
+
+		if (gxio_mpipe_link_open(&priv->link, &context, ln, 0) < 0) {
+			netdev_err(dev, "Failed to open '%s'.\n", ln);
+			result = -EIO;
+			goto fail;
+		}
+		priv->channel = gxio_mpipe_link_channel(&priv->link);
+		BUG_ON(priv->channel < 0 ||
+		       priv->channel >= TILE_NET_CHANNELS);
+	}
+
+	/* The "loopify" device also needs a separate egress link. */
+	if (loopify && priv->loopify_channel < 0) {
+		if (gxio_mpipe_link_open(&priv->loopify_link,
+					 &context, "loop1", 0) < 0) {
+			netdev_err(dev, "Failed to open 'loop1'.\n");
+			result = -EIO;
+			goto fail;
+		}
+		priv->loopify_channel =
+			gxio_mpipe_link_channel(&priv->loopify_link);
+		BUG_ON(priv->loopify_channel < 0 ||
+			priv->loopify_channel >= TILE_NET_CHANNELS);
+	}
+
+	/* Egress uses the loopify channel when there is one. */
+	if (priv->loopify_channel >= 0)
+		priv->echannel = priv->loopify_channel;
+	else
+		priv->echannel = priv->channel;
+
+	/* Initialize egress info (if needed). */
+	result = tile_net_init_egress(priv->echannel);
+	if (result != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	result = tile_net_update();
+	if (result != 0)
+		goto fail_channel;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Start our transmit queue. */
+	netif_start_queue(dev);
+
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail_channel:
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+fail:
+	/* A channel is marked closed only if its link really closed. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) == 0)
+			priv->loopify_channel = -1;
+		else
+			pr_warning("Failed to close loopify link!\n");
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) == 0)
+			priv->channel = -1;
+		else
+			pr_warning("Failed to close link!\n");
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+	return result;
+}
+
+
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, then, under
+ * tile_net_devs_for_channel_mutex, removes the device from
+ * tile_net_devs_for_channel[], calls tile_net_update() (ignoring its
+ * result), and closes whichever links are open.  Unlike the failure
+ * path of tile_net_open(), the channels are marked closed even when
+ * the close call reports an error.
+ *
+ * Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Stop our transmit queue. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+	(void)tile_net_update();
+
+	if (priv->loopify_channel >= 0) {
+		int err = gxio_mpipe_link_close(&priv->loopify_link);
+
+		priv->loopify_channel = -1;
+		if (err != 0)
+			pr_warning("Failed to close loopify link!\n");
+	}
+
+	if (priv->channel >= 0) {
+		int err = gxio_mpipe_link_close(&priv->link);
+
+		priv->channel = -1;
+		if (err != 0)
+			pr_warning("Failed to close link!\n");
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+
+/* Determine the VA for a fragment: the kernel address of the
+ * fragment's page, advanced by the fragment's offset into that page.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	void *page_va = pfn_to_kaddr(page_to_pfn(skb_frag_page(f)));
+
+	return page_va + f->page_offset;
+}
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * The work is done in three passes over the segments, each walking the
+ * fragment list in the same way: the first pass counts the edescs
+ * needed, the second builds a private copy of the headers for each
+ * segment, and the third posts the edescs to the equeue.
+ *
+ * Returns NETDEV_TX_OK, or NETDEV_TX_BUSY if the equeue cannot supply
+ * enough slots.
+ *
+ * NOTE(review): the "frags being empty" case described above appears
+ * to be rejected by the BUG_ON() checks below ("nr_frags == 0" and
+ * "skb_headlen(skb) != sh_len") -- confirm which one is intended.
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh" and "ih" refer to the same header, by definition. */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check": the template's checksum, total length
+	 * and id are backed out here, and the per-segment length and id
+	 * are added back in the second pass.
+	 */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check": the template's length is backed out
+	 * here, and the per-segment length is added back in the second pass.
+	 */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	int cid;
+
+	/* Empty packets (etc) would cause trouble below. */
+	BUG_ON(skb->data_len == 0);
+	BUG_ON(sh->nr_frags == 0);
+	BUG_ON(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	BUG_ON(skb_headlen(skb) != sh_len);
+	BUG_ON(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	BUG_ON(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	BUG_ON(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	BUG_ON(skb->protocol != htons(ETH_P_IP));
+	BUG_ON(ih->protocol != IPPROTO_TCP);
+	BUG_ON(skb->ip_summed != CHECKSUM_PARTIAL);
+	BUG_ON(csum_start != eh_len + ih_len);
+
+	/* NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+	edesc_head.stack_idx = large_buffer_stack;
+
+	/* Prepare to egress the body. */
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset.  With both "f_used" and "f_size" at -1, the first
+	 * "Advance as needed" loop below steps to fragment 0.
+	 */
+	f_id = f_size = f_used = -1;
+
+	/* Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	BUG_ON(f_id + 1 != sh->nr_frags);
+	BUG_ON(f_used != f_size);
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment: each equeue
+		 * slot owns HEADER_BYTES of the "headers" area.
+		 */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum". */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jh->check = csum_long(jsum) ^ 0xffff;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags. */
+		if (!final)
+			uh->fin = uh->psh = 0;
+
+		/* Compute the tcp pseudo-header checksum. */
+		uh->check = csum_long(tsum_hack + htons(s_len));
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot".  The pass above advanced it once per edesc, so
+	 * this rewinds it to the first reserved slot.
+	 */
+	slot -= num_edescs;
+
+	/* Flush the headers. */
+	wmb();
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload.  Only the last
+			 * piece of a segment is marked "bound".
+			 */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array.  At this point "slot" is one past
+	 * the last edesc posted for this skb.
+	 */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats.
+	 * NOTE(review): the counters are accessed through an "atomic_t *"
+	 * cast; confirm that this matches the width of the stats fields.
+	 */
+	atomic_add(tx_packets, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(tx_bytes, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Analyze the body and frags for a transmit request.
+ *
+ * Fills "frags" with one entry for the body ("b_data"/"b_len", skipped
+ * when "b_len" is zero) followed by one entry per page fragment of
+ * "skb".  The caller must supply room for "nr_frags" + 1 entries.
+ *
+ * Returns the number of entries written.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int count = 0;
+	unsigned int idx;
+
+	/* The body, if any, goes first. */
+	if (b_len != 0) {
+		frags[count].buf = b_data;
+		frags[count].length = b_len;
+		count++;
+	}
+
+	/* Then each page fragment, in order. */
+	for (idx = 0; idx < sh->nr_frags; idx++) {
+		skb_frag_t *f = &sh->frags[idx];
+
+		frags[count].buf = tile_net_frag_buf(f);
+		frags[count].length = skb_frag_size(f);
+		count++;
+	}
+
+	return count;
+}
+
+
+/* Help the kernel transmit a packet.
+ *
+ * Packets with a non-zero "gso_size" are handed to tile_net_tx_tso().
+ * Otherwise one edesc is built per piece of the skb (the linear data,
+ * then each page fragment), slots are reserved in the equeue, the
+ * edescs are posted, and the skb is recorded in this cpu's completions
+ * array for the egress channel.
+ *
+ * Returns NETDEV_TX_OK, or NETDEV_TX_BUSY if the equeue cannot supply
+ * enough slots.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	gxio_mpipe_equeue_t *equeue =
+		egress_for_echannel[priv->echannel].equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned char *data = skb->data;
+	unsigned int len = skb->len;
+	unsigned int stat_bytes;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned int num_frags;
+	unsigned int i;
+	unsigned long irqflags;
+	s64 slot;
+	int cid;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	/* ISSUE: Does not dump the "frags". */
+	dump_packet(data, skb_headlen(skb), "tx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	/* Segmented sends take the TSO path. */
+	if (sh->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* Prepare the edescs; only the last one is marked "bound".
+	 * NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+	for (i = 0; i < num_frags; i++) {
+		gxio_mpipe_edesc_t edesc = { { 0 } };
+
+		edesc.stack_idx = large_buffer_stack;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edesc.xfer_size = frags[i].length;
+		edesc.bound = (i == num_frags - 1);
+
+		edescs[i] = edesc;
+	}
+
+	/* Add checksum info if needed; it rides on the first edesc. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
+	stat_bytes = (len < ETH_ZLEN) ? ETH_ZLEN : len;
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Post the edescs into consecutive reserved slots. */
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array; "when" is one past our last slot. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot + num_frags;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats. */
+	atomic_add(1, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(stat_bytes, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Deal with a transmit timeout.
+ *
+ * No recovery is attempted here; the transmit queue of "dev" is simply
+ * woken again.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+
+/* Ioctl commands.
+ *
+ * No device-private ioctls are implemented: every "cmd" is rejected
+ * with -EOPNOTSUPP, and "dev" and "rq" are not examined.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+
+/* Get System Network Statistics.
+ *
+ * Returns the address of the statistics structure embedded in the
+ * private data of "dev"; nothing is copied or recomputed.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct net_device_stats *stats = &priv->stats;
+
+	return stats;
+}
+
+
+/* Change the "mtu".
+ *
+ * Values from 68 to 1500 inclusive are accepted and stored in "dev".
+ *
+ * Returns 0 on success, or -EINVAL if "new_mtu" is out of range.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* Accept the value only if it is in range. */
+	if ((new_mtu >= 68) && (new_mtu <= 1500)) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/* Change the Ethernet Address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points to a "struct sockaddr" carrying the new address, which is
+ * copied into "dev" only if it is a valid ethernet address.
+ *
+ * Returns 0 on success, or -EINVAL if the address is not valid.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (is_valid_ether_addr(sa->sa_data)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress irq is disabled on this cpu, the ingress handler is
+ * called directly, and the irq is enabled again.  "dev" is not used.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	/* Run the handler by hand, as if the irq had fired. */
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+
+/* The net_device operations implemented by this driver; installed on
+ * each device by tile_net_setup().
+ */
+static const struct net_device_ops tile_net_ops = {
+	/* Bringing the interface up and down. */
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+	/* Transmit path. */
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+	/* Configuration and statistics. */
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_do_ioctl		= tile_net_ioctl,
+	.ndo_get_stats		= tile_net_get_stats,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* The setup function.
+ *
+ * This uses ether_setup() to assign various fields in dev, including
+ * setting IFF_BROADCAST and IFF_MULTICAST, then installs our
+ * operations table, timeouts, queue length, mtu and feature flags.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = TILE_NET_TX_QUEUE_LEN;
+	dev->mtu = 1500;
+
+	/* We want lockless xmit, and we support hardware tx checksums
+	 * and scatter/gather.
+	 */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG;
+
+#ifdef TILE_NET_GSO
+	/* We support GSO. */
+	dev->features |= NETIF_F_GSO;
+#endif
+
+#ifdef TILE_NET_TSO
+	/* We support TSO. */
+	dev->features |= NETIF_F_TSO;
+#endif
+}
+
+
+/* Allocate the device structure, set its MAC address, and register
+ * the device.
+ *
+ * "name" is used as the interface name; links whose name starts with
+ * "loop" are skipped.  "mac" points to the 6-byte address supplied by
+ * the caller for this link; if it is all zeroes, a random address is
+ * used instead.
+ *
+ * Failures are logged and otherwise ignored: the device is simply not
+ * created.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	int ret;
+	int i;
+	int nz_addr = 0;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	priv = netdev_priv(dev);
+
+	/* Initialize "priv". */
+
+	memset(priv, 0, sizeof(*priv));
+
+	priv->dev = dev;
+
+	/* No link is open yet. */
+	priv->channel = priv->loopify_channel = priv->echannel = -1;
+
+	/* Get the MAC address and set it in the device struct; this must
+	 * be done before the device is opened.  The device can be opened
+	 * as soon as it has been registered, so the address is set up
+	 * before register_netdev() is called.  If the MAC is all zeroes,
+	 * we use a random address, since we're probably on the simulator.
+	 */
+	for (i = 0; i < 6; i++)
+		nz_addr |= mac[i];
+
+	if (nz_addr) {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+		return;
+	}
+}
+
+
+/* Module initialization.
+ *
+ * Prepares every cpu, creates a network device for each link the
+ * mPIPE enumerates, and finally settles "network_cpus_map": the value
+ * from the module parameter if network_cpus_init() accepts it,
+ * otherwise all online cpus.
+ *
+ * Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int i = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them.  The
+	 * enumeration ends at the first index that is rejected.
+	 */
+	while (gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		i++;
+	}
+
+	/* Without a usable cpu list, use every online cpu. */
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v10] tilegx network driver: initial support
  2012-06-06 18:54                                                 ` David Miller
@ 2001-09-17  4:00                                                   ` Chris Metcalf
  2012-04-06 20:42                                                   ` Chris Metcalf
  2012-06-07 20:45                                                   ` Chris Metcalf
  2 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2001-09-17  4:00 UTC (permalink / raw)
  To: David Miller, eric.dumazet, bhutchings, arnd, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version makes the driver multi-queued and support non-zero
tx_queue_len.  I also made a couple of magic numbers into #defines.
I skimmed the tg3.c driver, but didn't see any other obvious
changes that would be appropriate.

 drivers/net/ethernet/tile/Kconfig  |    2 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1898 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1902 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..098b1c4 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,8 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
+	select HIGH_RES_TIMERS if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..ee7556a
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1898 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ * NOTE(review): the budget above does not mention the IP header, which
+ * presumably also has to fit in each slot's header area -- confirm.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+/* Maximum number of pieces in one skb: every page fragment, plus one
+ * more entry.
+ */
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate: one "struct tile_net_comps"
+ * for each possible channel.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate: one idesc per iqueue entry. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+/* Timeout to wake the per-device TX timer after we stop the queue.
+ * We don't want the timeout too short (adds overhead, and might end
+ * up causing stop/wake/stop/wake cycles) or too long (affects performance).
+ * For the 10 Gb NIC, 30 usec means roughly 30+ 1500-byte packets.
+ */
+#define TX_TIMER_DELAY_USEC 30
+
+/* Timeout to wake the per-cpu egress timer to free completions. */
+#define EGRESS_TIMER_DELAY_USEC 1000
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	/* Start of the chunk. */
+	void *buf;
+	/* Size of the chunk, in bytes. */
+	size_t length;
+};
+
+/* A single completion: an skb together with the point at which it may
+ * be freed.
+ */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and echannel.
+ *
+ * NOTE(review): "comp_next" and "comp_last" look like running totals
+ * rather than array indices, with "comp_queue" presumably indexed
+ * modulo TILE_NET_MAX_COMPS -- confirm against the users.
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+/* The transmit wake timer for a given cpu and echannel. */
+struct tile_net_tx_wake {
+	/* The timer itself. */
+	struct hrtimer timer;
+	/* The device associated with this timer. */
+	struct net_device *dev;
+};
+
+/* Info for a specific cpu.
+ *
+ * One instance exists per cpu (see "per_cpu_info"); the two arrays at
+ * the end are indexed by egress channel.
+ */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct hrtimer egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+	/* Transmit wake timer for each egress channel. */
+	struct tile_net_tx_wake tx_wake[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel.
+ *
+ * Instances live in "egress_for_echannel".
+ */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device.
+ *
+ * The three channel fields use -1 to mean "none".
+ */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The single mPIPE "context" shared by all devices. */
+static gxio_mpipe_context_t context;
+
+/* Buffer sizes and mpipe enum codes for buffer stacks.
+ * See arch/tile/include/gxio/mpipe.h for the set of possible values.
+ * Each byte count below must agree with the enum code next to it.
+ */
+#define BUFFER_SIZE_SMALL_ENUM GXIO_MPIPE_BUFFER_SIZE_128
+#define BUFFER_SIZE_SMALL 128
+#define BUFFER_SIZE_LARGE_ENUM GXIO_MPIPE_BUFFER_SIZE_1664
+#define BUFFER_SIZE_LARGE 1664
+
+/* The small/large "buffer stacks" (both start out as -1). */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets: the first one (starts out as -1), and how many. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq (starts out as -1). */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus", as a cpumask. */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* Parse the "tile_net.cpus" module parameter, which names the cpus
+ * that are dedicated to handling ingress packets.
+ *
+ * The parameter has the form "tile_net.cpus=m-n[,x-y]", where m, n,
+ * x, y are cpu numbers.  The named cpus can be neither dedicated
+ * cpus nor dataplane cpus.
+ *
+ * Returns true if "network_cpus_map" now holds a non-empty set of
+ * cpus, else false (no parameter given, malformed list, or nothing
+ * left after masking against cpu_possible_mask).
+ */
+static bool network_cpus_init(void)
+{
+	char cpulist[1024];
+
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map) != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Keep only the cpus that are also in cpu_possible_mask. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Report the final set in cpulist form. */
+	cpulist_scnprintf(cpulist, sizeof(cpulist), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", cpulist);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and Linux, passing
+ * (some) packets along to Linux, and forwarding (some) packets sent
+ * out by Linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset" (an offset of
+ * zero is used instead when handling ingress packets).
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically add "value" to the statistics counter at "field".
+ * Note that on TILE-Gx, this operation is fire-and-forget on the
+ * issuing core (single-cycle dispatch) and takes only a few cycles
+ * longer than a regular store when the request reaches the home cache.
+ * No expensive bus management overhead is required.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	/* The cast above is only valid if the two types match in size. */
+	BUILD_BUG_ON(sizeof(*counter) != sizeof(*field));
+	atomic_long_add(value, counter);
+}
+
+/* Allocate an skb-backed buffer and push it onto the small or large
+ * mPIPE buffer stack.  Returns false if no skb could be allocated.
+ *
+ * The skb data pointer is aligned to 128 bytes, and a back-pointer
+ * to the skb is stored immediately below it so that
+ * mpipe_buf_to_skb() can recover the skb later.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	const unsigned long align = 128;
+	struct sk_buff **backptr;
+	struct sk_buff *skb;
+	int stack, len;
+
+	if (small) {
+		stack = small_buffer_stack;
+		len = BUFFER_SIZE_SMALL;
+	} else {
+		stack = large_buffer_stack;
+		len = BUFFER_SIZE_LARGE;
+	}
+
+	/* Leave room for the back-pointer and for alignment slack. */
+	len += sizeof(*backptr) + align;
+	skb = dev_alloc_skb(len);
+	if (!skb)
+		return false;
+
+	/* Step past the back-pointer slot, then round "data" up. */
+	skb_reserve(skb, sizeof(*backptr));
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Store the back-pointer just below the aligned data. */
+	backptr = (struct sk_buff **)skb->data - 1;
+	*backptr = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Map a raw mpipe buffer address back to the skb that owns it, using
+ * the back-pointer stored just below the buffer by
+ * tile_net_provide_buffer().
+ */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	struct sk_buff *skb = ((struct sk_buff **)va)[-1];
+
+	/* Paranoia.  Panic on a mismatch, since there's a reasonable
+	 * chance that corrupt buffers means generic memory corruption,
+	 * with unpredictable system effects.
+	 */
+	if (skb->data != va)
+		panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
+		      va, skb, skb->data);
+
+	return skb;
+}
+
+/* Pop buffers from the given buffer stack until a pop returns zero,
+ * freeing the skb behind each popped buffer.
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	tile_io_addr_t io_addr;
+
+	while ((io_addr = (tile_io_addr_t)
+			gxio_mpipe_pop_buffer(&context, stack)) != 0) {
+		void *va = tile_io_addr_to_va(io_addr);
+
+		dev_kfree_skb_irq(mpipe_buf_to_skb(va));
+	}
+}
+
+/* Give mPIPE as many fresh buffers as this cpu has recorded needing,
+ * small ones first.  Stops at the first allocation failure, leaving
+ * the remaining counts in place for a later call.
+ */
+static void tile_net_provide_needed_buffers(void)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	for (; info->num_needed_small_buffers != 0;
+	     info->num_needed_small_buffers--) {
+		if (!tile_net_provide_buffer(true))
+			goto still_short;
+	}
+
+	for (; info->num_needed_large_buffers != 0;
+	     info->num_needed_large_buffers--) {
+		if (!tile_net_provide_buffer(false))
+			goto still_short;
+	}
+
+	return;
+
+still_short:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Decide whether an ingress packet should be filtered (dropped).
+ * "buf" points at the start of the packet, i.e. its destination
+ * ethernet address.  Returns true to filter, false to accept.
+ */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	bool for_us;
+
+	/* Filter packets received before we're up. */
+	if (dev == NULL || !(dev->flags & IFF_UP))
+		return true;
+
+	/* Accept everything in promiscuous mode, all multicast
+	 * packets, and packets addressed to this device.
+	 */
+	for_us = (dev->flags & IFF_PROMISC) ||
+		 is_multicast_ether_addr(buf) ||
+		 compare_ether_addr(dev->dev_addr, buf) == 0;
+
+	return !for_us;
+}
+
+/* Hand a received packet of "len" bytes to the network stack on
+ * behalf of "dev", update the rx stats, and record that this cpu
+ * needs a replacement buffer of the size that "idesc" used.
+ */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device_stats *stats = &priv->stats;
+
+	/* Encode the actual packet length. */
+	skb_put(skb, len);
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Acknowledge "good" hardware checksums. */
+	if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	netif_receive_skb(skb);
+
+	/* Update stats. */
+	tile_net_stats_add(1, &stats->rx_packets);
+	tile_net_stats_add(len, &stats->rx_bytes);
+
+	/* Need a new buffer of the same kind. */
+	if (idesc->size != BUFFER_SIZE_SMALL_ENUM)
+		info->num_needed_large_buffers++;
+	else
+		info->num_needed_small_buffers++;
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered".
+ *
+ * "idesc" is an ingress descriptor from this cpu's iqueue; it is
+ * always consumed before returning.  Descriptors flagged "be" (no
+ * buffer was available) are counted as rx_dropped and reported as
+ * not processed.  Packets rejected by filter_packet() are dropped;
+ * all others are passed to tile_net_receive_skb().
+ */
+static bool tile_net_handle_packet(gxio_mpipe_idesc_t *idesc)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		/* The channel's device entry can be NULL (e.g. after
+		 * ifconfig down), in which case there is no device
+		 * whose stats could be updated.
+		 */
+		if (dev != NULL) {
+			struct tile_net_priv *priv = netdev_priv(dev);
+			tile_net_stats_add(1, &priv->stats.rx_dropped);
+		}
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ * NOTE(review): the TILE_NET_BATCH check below compares the index
+ * within the current peek result, so the limit applies per peek;
+ * confirm that this matches the intent stated above.
+ *
+ * The "napi" argument is not used; the per-cpu "info->napi" is used
+ * instead.  The return value is the number of packets for which
+ * tile_net_handle_packet() returned true, so filtered and dropped
+ * packets do not count against "budget".
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets.  Jumping to "done" skips napi_complete() and
+	 * the interrupt re-enable below, leaving this napi scheduled.
+	 */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem by re-checking the
+	 * queue now that the interrupt has been re-enabled.
+	 */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	tile_net_provide_needed_buffers();
+
+	return work;
+}
+
+/* Ingress interrupt handler: schedule the current cpu's napi. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+	return IRQ_HANDLED;
+}
+
+/* Free completed entries from the front of "comps", stopping at the
+ * first entry that "equeue" does not report as complete, or after
+ * "limit" entries (a negative "limit" never matches, so there is then
+ * no cap).  Returns the number of entries freed.
+ * This must be called with interrupts blocked.
+ */
+static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed;
+
+	for (freed = 0; comps->comp_last < comps->comp_next; ) {
+		unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[cid];
+		/* Always ask for an update on the first check. */
+		bool update = force_update || freed == 0;
+
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
+						   update))
+			break;
+
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		freed++;
+		if (freed == limit)
+			break;
+	}
+
+	return freed;
+}
+
+/* Record a pending completion for "skb" at the tail of "comps".
+ * This must be called with interrupts blocked.
+ * tile_net_equeue_try_reserve() will have ensured a free completion entry.
+ * The "equeue" argument is not used here.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	int tail = comps->comp_next % TILE_NET_MAX_COMPS;
+	struct tile_net_comp *entry = &comps->comp_queue[tail];
+
+	entry->when = when;
+	entry->skb = skb;
+	comps->comp_next++;
+}
+
+/* Start this cpu's tx_wake timer for the egress channel of "dev". */
+static void tile_net_schedule_tx_wake_timer(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct hrtimer *timer = &info->tx_wake[priv->echannel].timer;
+
+	hrtimer_start(timer, ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+/* The "function" for a tx_wake timer: wake the transmit subqueue
+ * that corresponds to the current cpu.  The timer is not restarted.
+ */
+static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
+{
+	struct tile_net_tx_wake *wake;
+
+	wake = container_of(t, struct tile_net_tx_wake, timer);
+	netif_wake_subqueue(wake->dev, smp_processor_id());
+
+	return HRTIMER_NORESTART;
+}
+
+/* Start this cpu's egress timer unless it is already scheduled. */
+static void tile_net_schedule_egress_timer(void)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (info->egress_timer_scheduled)
+		return;
+
+	hrtimer_start(&info->egress_timer,
+		      ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL_PINNED);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * Frees whatever completions it can on every egress channel of this
+ * tile, and reschedules the timer (via
+ * tile_net_schedule_egress_timer()) as long as any completions are
+ * still pending afterwards.
+ */
+static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	bool more_pending = false;
+	unsigned long flags;
+	int ch;
+
+	local_irq_save(flags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (ch = 0; ch < TILE_NET_CHANNELS; ch++) {
+		struct tile_net_comps *comps = info->comps_for_echannel[ch];
+
+		if (comps->comp_last < comps->comp_next) {
+			tile_net_free_comps(egress_for_echannel[ch].equeue,
+					    comps, -1, true);
+			if (comps->comp_last < comps->comp_next)
+				more_pending = true;
+		}
+	}
+
+	/* Reschedule timer if needed. */
+	if (more_pending)
+		tile_net_schedule_egress_timer();
+
+	local_irq_restore(flags);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Per-cpu helper for "tile_net_update()".
+ * "arg" is the device being brought up or down, or NULL if all
+ * devices are now down.  Cpus without an iqueue do nothing.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		/* Everything is down: quiesce ingress on this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	/* Something is up: add the napi once, and enable it if needed. */
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ *
+ * Rebuilds the classifier rules from the channels that currently
+ * have a device, commits them, and then runs tile_net_update_cpu()
+ * on every online cpu.  Returns 0, or -EIO if the commit fails.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	static gxio_mpipe_rules_t rules;  /* too big to fit on the stack */
+	bool any_channel = false;
+	void *cpu_arg;
+	int ch, cpu, err;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	for (ch = 0; ch < TILE_NET_CHANNELS; ch++) {
+		if (tile_net_devs_for_channel[ch] != NULL) {
+			/* Begin the rule on the first active channel. */
+			if (!any_channel) {
+				gxio_mpipe_rules_begin(&rules, first_bucket,
+						       num_buckets, NULL);
+				gxio_mpipe_rules_set_headroom(&rules,
+							      NET_IP_ALIGN);
+				any_channel = true;
+			}
+			gxio_mpipe_rules_add_channel(&rules, ch);
+		}
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	err = gxio_mpipe_rules_commit(&rules);
+	if (err != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", err);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	cpu_arg = any_channel ? dev : NULL;
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 cpu_arg, 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (any_channel)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* Allocate the memory for one buffer stack (small or large),
+ * initialize the stack with it, and register the stack's client
+ * memory with mPIPE.  The allocation is recorded in
+ * small_buffer_stack_va / large_buffer_stack_va before anything else
+ * can fail, so that tile_net_init_mpipe_fail() can free it.
+ * Returns 0, or a negative error code.
+ * This routine supports init_buffer_stacks(), below.
+ */
+static int init_one_buffer_stack(struct net_device *dev, bool small)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+	void **va_ptr = small ? &small_buffer_stack_va :
+				&large_buffer_stack_va;
+	int rc;
+
+	*va_ptr = alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (*va_ptr == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+
+	rc = gxio_mpipe_init_buffer_stack(&context, stack,
+					  small ? BUFFER_SIZE_SMALL_ENUM :
+						  BUFFER_SIZE_LARGE_ENUM,
+					  *va_ptr, buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	rc = gxio_mpipe_register_client_memory(&context, stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.
+ * "num_buffers" is the number of buffers each stack must hold.
+ * Returns 0, or a negative error code; on failure, partially set up
+ * state is left in the globals for the caller's failure path.
+ * This routine supports tile_net_init_mpipe(), below.
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact() so we get the required 64KB alignment
+	 * as well.
+	 */
+	buffer_stack_size =
+		ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+		      64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Set up the small buffer stack, then the large one. */
+	rc = init_one_buffer_stack(dev, true);
+	if (rc != 0)
+		return rc;
+
+	return init_one_buffer_stack(dev, false);
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs)
+ * for "cpu".  "ring" is the next unused NotifRing index; it is only
+ * consumed if "cpu" is a network cpu.
+ * Returns the next unused ring index on success, or a negative
+ * error code.
+ * This routine supports tile_net_init_mpipe(), below.
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+					int cpu, int ring)
+{
+	struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+	struct tile_net_comps *comps;
+	struct page *page;
+	void *ring_mem;
+	int i, rc;
+
+	/* Allocate and zero the "comps", one per egress channel. */
+	page = homecache_alloc_pages(GFP_KERNEL, get_order(COMPS_SIZE), cpu);
+	if (page == NULL) {
+		netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+			   COMPS_SIZE);
+		return -ENOMEM;
+	}
+	comps = pfn_to_kaddr(page_to_pfn(page));
+	memset(comps, 0, COMPS_SIZE);
+	for (i = 0; i < TILE_NET_CHANNELS; i++)
+		info->comps_for_echannel[i] = &comps[i];
+
+	/* Only network cpus get an iqueue. */
+	if (!cpu_isset(cpu, network_cpus_map))
+		return ring;
+
+	page = homecache_alloc_pages(GFP_KERNEL,
+				     get_order(NOTIF_RING_SIZE), cpu);
+	if (page == NULL) {
+		netdev_err(dev,
+			   "Failed to alloc %zd bytes iqueue memory\n",
+			   NOTIF_RING_SIZE);
+		return -ENOMEM;
+	}
+	ring_mem = pfn_to_kaddr(page_to_pfn(page));
+
+	rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring,
+				    ring_mem, NOTIF_RING_SIZE, 0);
+	if (rc < 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+		return rc;
+	}
+	info->has_iqueue = true;
+
+	/* This cpu used up one ring. */
+	return ring + 1;
+}
+
+/* Allocate one NotifGroup and the buckets, then bind them to the
+ * "network_cpus_count" NotifRings that start at "ring".
+ * Sets the global "num_buckets" and "first_bucket" values.
+ * Returns 0, or a negative error code.
+ * This routine supports tile_net_init_mpipe(), below.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group, rc;
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   group);
+		return group;
+	}
+
+	/* Pick the global num_buckets value from the network cpu count
+	 * (it keeps its existing value for a single network cpu).
+	 */
+	if (network_cpus_count > 1)
+		num_buckets = (network_cpus_count > 4) ? 256 : 16;
+
+	/* Allocate some buckets, and set global first_bucket value. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		return rc;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (rc == 0)
+		return 0;
+
+	netdev_err(
+		dev,
+		"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+		rc);
+	return rc;
+}
+
+/* Create an irq and register it, then activate the irq and request
+ * interrupts on all cores.  Note that "ingress_irq" being initialized
+ * is how we know not to call tile_net_init_mpipe() again; it is
+ * therefore reset to -1 if registering the handler fails.
+ * Returns 0, or a negative error code.
+ * This routine supports tile_net_init_mpipe(), below.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu, rc;
+
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		return rc;
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+
+	/* Give the irq a real name rather than NULL, so that it is
+	 * identifiable wherever the kernel reports irq names.
+	 */
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, "tile_net", NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return rc;
+	}
+
+	/* Route each iqueue's NotifRing interrupt to its own cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to
+ * tile_net_init_mpipe(): return buffers, destroy the mpipe context,
+ * free the per-cpu and buffer stack memory, and reset the globals.
+ */
+static void tile_net_init_mpipe_fail(void)
+{
+	const int comps_order = get_order(COMPS_SIZE);
+	const int ring_order = get_order(NOTIF_RING_SIZE);
+	int cpu;
+
+	/* Do cleanups that require the mpipe context first. */
+	if (small_buffer_stack >= 0)
+		tile_net_pop_all_buffers(small_buffer_stack);
+	if (large_buffer_stack >= 0)
+		tile_net_pop_all_buffers(large_buffer_stack);
+
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	/* Release the per-cpu comps and iqueue memory. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		unsigned long comps_mem =
+			(unsigned long)info->comps_for_echannel[0];
+		unsigned long ring_mem = (unsigned long)info->iqueue.idescs;
+
+		free_pages(comps_mem, comps_order);
+		info->comps_for_echannel[0] = NULL;
+		free_pages(ring_mem, ring_order);
+		info->iqueue.idescs = NULL;
+	}
+
+	/* Release the buffer stack memory, if any. */
+	if (small_buffer_stack_va) {
+		free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+		small_buffer_stack_va = NULL;
+	}
+	if (large_buffer_stack_va) {
+		free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+		large_buffer_stack_va = NULL;
+	}
+
+	/* Back to the "nothing allocated" markers. */
+	small_buffer_stack = -1;
+	large_buffer_stack = -1;
+	first_bucket = -1;
+}
+
+/* One-time initialization of the global mpipe state, done the first
+ * time any tilegx network device is opened.  If this step fails, we
+ * fail to open the device, but if it succeeds, we never need to do
+ * it again, and since tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ *
+ * Returns 0, or a negative error code.
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int network_cpus_count = cpus_weight(network_cpus_map);
+	int num_buffers, first_ring, next_ring;
+	int i, pass, cpu, rc;
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Set up the buffer stacks. */
+	num_buffers = network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers: all the small ones, then all the
+	 * large ones.
+	 */
+	rc = -ENOMEM;
+	for (pass = 0; pass < 2; pass++) {
+		bool small = (pass == 0);
+
+		for (i = 0; i < num_buffers; i++) {
+			if (tile_net_provide_buffer(small))
+				continue;
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	first_ring = gxio_mpipe_alloc_notif_rings(&context,
+						  network_cpus_count, 0, 0);
+	if (first_ring < 0) {
+		rc = first_ring;
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init NotifRings per-cpu. */
+	next_ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, next_ring);
+		if (rc < 0)
+			goto fail;
+		next_ring = rc;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts. */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ *
+ * Returns 0 on success (including when "echannel" was already
+ * initialized), or a negative error code.  On failure, the pages
+ * allocated here are freed again and "egress_for_echannel" is left
+ * untouched.
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct page *headers_page, *edescs_page, *equeue_page;
+	gxio_mpipe_edesc_t *edescs;
+	gxio_mpipe_equeue_t *equeue;
+	unsigned char *headers;
+	int headers_order, edescs_order, equeue_order;
+	size_t edescs_size;
+	int edma;
+	int rc = -ENOMEM;
+
+	/* Only initialize once. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 * (From here on "rc" holds gxio return values rather than the
+	 * initial -ENOMEM.)
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue.  Nothing on the failure path below
+	 * releases the edma ring allocated above.
+	 */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done.  Publishing "equeue" marks this echannel as initialized. */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Open the mPIPE link named "link_name" into "link".
+ * Returns the channel number of the newly-opened link, or a negative
+ * error code.  If the channel number is out of range for
+ * TILE_NET_CHANNELS, the link is closed again and -EINVAL returned.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc;
+
+	rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Performs the one-time mpipe initialization if needed, opens the
+ * link(s) for "dev", sets up egress for its egress channel, publishes
+ * the device in "tile_net_devs_for_channel", and updates the
+ * classifier rules and per-cpu state.  Returns 0, or a negative
+ * error code; on failure any links opened here are closed again and
+ * the device is unpublished.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int cpu, rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize the transmit wake timer for this device for each cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		struct tile_net_tx_wake *tx_wake =
+			&info->tx_wake[priv->echannel];
+
+		hrtimer_init(&tx_wake->timer, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL);
+		tx_wake->timer.function = tile_net_handle_tx_wake_timer;
+		tx_wake->dev = dev;
+	}
+
+	for_each_online_cpu(cpu)
+		netif_start_subqueue(dev, cpu);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Unpublish the device while "priv->channel" is still a
+		 * valid index; doing so after the reset to -1 would
+		 * write outside the array.  Only clear the entry if it
+		 * is ours, since we may have failed before setting it.
+		 */
+		if (tile_net_devs_for_channel[priv->channel] == dev)
+			tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Cancels the per-cpu tx_wake timers and stops the subqueues, then
+ * (under the channel mutex) unpublishes the device, updates the
+ * classifier rules and per-cpu state, and closes the link(s).
+ * Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		hrtimer_cancel(&info->tx_wake[priv->echannel].timer);
+		netif_stop_subqueue(dev, cpu);
+	}
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Unpublish the device; a failed update is deliberately ignored. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	/* Close the "loopify" link first, then the primary link. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel VA of the data described by fragment "f": the
+ * address of its page plus its offset within that page.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	struct page *page = skb_frag_page(f);
+
+	return pfn_to_kaddr(page_to_pfn(page)) + f->page_offset;
+}
+
+/* Acquire a completion entry and "num_edescs" egress slots.
+ * Returns the reserved slot (>= 0) on success.  If that is not
+ * possible, stops this cpu's transmit subqueue, schedules the
+ * tx_wake timer, and returns -1.
+ */
+static s64 tile_net_equeue_try_reserve(struct net_device *dev,
+				       struct tile_net_comps *comps,
+				       gxio_mpipe_equeue_t *equeue,
+				       int num_edescs)
+{
+	bool have_comp;
+	s64 slot;
+
+	/* A completion entry is available if the queue has room, or
+	 * if at least one old completion could be freed.
+	 */
+	have_comp =
+		comps->comp_next - comps->comp_last < TILE_NET_MAX_COMPS - 1 ||
+		tile_net_free_comps(equeue, comps, 32, false) != 0;
+	if (!have_comp)
+		goto stop_queue;
+
+	/* Try to acquire an egress slot. */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot >= 0)
+		return slot;
+
+	/* Freeing some completions gives the equeue time to drain. */
+	tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
+
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot >= 0)
+		return slot;
+
+stop_queue:
+	/* Still nothing; give up and stop the queue for a short while. */
+	netif_stop_subqueue(dev, smp_processor_id());
+	tile_net_schedule_tx_wake_timer(dev);
+	return -1;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care.
+ *
+ * The count is, for each of the "gso_segs" segments, one edesc for
+ * the header plus one edesc for every piece of a fragment that the
+ * segment's payload is cut into.
+ *
+ * NOTE(review): the walk below takes payload only from "sh->frags"
+ * and never compares "f_id" against the number of fragments, so it
+ * relies on the fragments covering all of the payload it expects;
+ * confirm how the empty-"frags" case described above is handled.
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int num_edescs = 0;
+	int segment;
+
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		unsigned int p_used = 0;
+
+		/* One edesc for header and for each piece of the payload. */
+		for (num_edescs++; p_used < p_len; num_edescs++) {
+
+			/* Advance as needed (this also steps onto fragment 0
+			 * the first time, since f_used and f_size both
+			 * start at -1).
+			 */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+		}
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	return num_edescs;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ *
+ * For each of the sh->gso_segs segments, the original headers (the
+ * first sh_len bytes of skb->data) are copied into the per-slot header
+ * area inside "headers", and the copy's IP length/id/checksum and TCP
+ * sequence/checksum/flags are patched for that segment.
+ *
+ * "slot" is the first egress slot reserved for this skb.  It is advanced
+ * by one for each header and by one for each piece of payload, i.e. in
+ * the same way tso_egress() advances it, so that each header copy lands
+ * in the area indexed by the slot its edesc will occupy.
+ *
+ * FIXME: add support for IPv6.
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+				s64 slot)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct iphdr *ih;
+	struct tcphdr *th;
+	unsigned int data_len = skb->data_len;
+	unsigned char *data = skb->data;
+	unsigned int ih_off, th_off, sh_len, p_len;
+	unsigned int isum_seed, tsum_seed, id, seq;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int segment;
+
+	/* Locate original headers and compute various lengths.  sh_len
+	 * covers everything from skb->data through the end of the TCP
+	 * header.
+	 */
+	ih = ip_hdr(skb);
+	th = tcp_hdr(skb);
+	ih_off = skb_network_offset(skb);
+	th_off = skb_transport_offset(skb);
+	sh_len = th_off + tcp_hdrlen(skb);
+	p_len = sh->gso_size;
+
+	/* Set up seed values for IP and TCP csum and initialize id and seq.
+	 * In ones'-complement terms the IP seed is the original checksum
+	 * negated, with the original tot_len and id backed out, so the
+	 * per-segment values can simply be added in below.  The TCP seed
+	 * likewise backs htons(skb->len) out of the original check field.
+	 */
+	isum_seed = ((0xFFFF - ih->check) +
+		     (0xFFFF - ih->tot_len) +
+		     (0xFFFF - ih->id));
+	tsum_seed = th->check + (0xFFFF ^ htons(skb->len));
+	id = ntohs(ih->id);
+	seq = ntohl(th->seq);
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Copy to the header memory for this segment. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		memcpy(buf, data, sh_len);
+
+		/* Update copied ip header. */
+		ih = (struct iphdr *)(buf + ih_off);
+		ih->tot_len = htons(sh_len + p_len - ih_off);
+		ih->id = htons(id);
+		ih->check = csum_long(isum_seed + ih->tot_len +
+				      ih->id) ^ 0xffff;
+
+		/* Update copied tcp header.  FIN and PSH are kept only on
+		 * the final segment.
+		 */
+		th = (struct tcphdr *)(buf + th_off);
+		th->seq = htonl(seq);
+		th->check = csum_long(tsum_seed + htons(sh_len + p_len));
+		if (segment != sh->gso_segs - 1) {
+			th->fin = 0;
+			th->psh = 0;
+		}
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload, one slot per piece. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = skb_frag_size(&sh->frags[f_id]);
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Flush the headers so they are ready for hardware DMA. */
+	wmb();
+}
+
+/* Pass all the data to mpipe for egress.
+ *
+ * Starting at "slot", posts for every segment one header edesc, which
+ * points at that slot's header area inside "headers", followed by one
+ * body edesc per piece of payload taken from the skb's fragments.  The
+ * last body edesc of each segment is marked "bound".  Finally the
+ * device's tx_packets and tx_bytes statistics are updated.
+ */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+		       struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	unsigned int csum_start, sh_len;
+	int segment;
+
+	/* Prepare to egress the headers: set up header edesc.  Everything
+	 * but the buffer address is the same for every segment.
+	 */
+	csum_start = skb_checksum_start_offset(skb);
+	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		void *va;
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Egress the header. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = skb_frag_size(&sh->frags[f_id]);
+				f_used = 0;
+			}
+
+			/* Address of the first unused byte of the fragment;
+			 * must be taken before f_used is advanced below.
+			 */
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload; "bound" is set on
+			 * the piece that completes the segment.
+			 */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += sh_len + p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Do "TSO" handling for egress.
+ *
+ * Normally drivers set NETIF_F_TSO only to support hardware TSO;
+ * otherwise the stack uses scatter-gather to implement GSO in software.
+ * In our testing, enabling GSO support (via NETIF_F_SG) drops network
+ * performance down to around 7.5 Gbps on the 10G interfaces, although
+ * also dropping cpu utilization way down, to under 8%.  But
+ * implementing "TSO" in the driver brings performance back up to line
+ * rate, while dropping cpu usage even further, to less than 4%.  In
+ * practice, profiling of GSO shows that skb_segment() is what causes
+ * the performance overheads; we benefit in the driver from using
+ * preallocated memory to duplicate the TCP/IP headers.
+ *
+ * Returns NETDEV_TX_BUSY if no completion entry or egress slots could
+ * be reserved, and NETDEV_TX_OK once the skb has been handed to the
+ * hardware.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int channel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[channel];
+	struct tile_net_comps *comps = info->comps_for_echannel[channel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned long irqflags;
+	int num_edescs;
+	s64 slot;
+
+	/* Determine how many mpipe edesc's are needed. */
+	num_edescs = tso_count_edescs(skb);
+
+	local_irq_save(irqflags);
+
+	/* Try to acquire a completion entry and an egress slot.  On
+	 * failure the helper has already stopped this cpu's subqueue.
+	 */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Set up copies of header data properly. */
+	tso_headers_prepare(skb, egress->headers, slot);
+
+	/* Actually pass the data to the network hardware. */
+	tso_egress(dev, equeue, skb, egress->headers, slot);
+
+	/* Add a completion record, keyed on the last slot used by this
+	 * skb.
+	 */
+	add_comp(equeue, comps, slot + num_edescs - 1, skb);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer();
+
+	return NETDEV_TX_OK;
+}
+
+/* Flatten a transmit request into an array of (buffer, length) pieces.
+ *
+ * The linear body (b_data/b_len), if non-empty, becomes the first
+ * entry; it is followed by one entry per page fragment of the skb, in
+ * order.  Returns the number of entries written to "frags".
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int count = 0;
+	unsigned int idx;
+
+	if (b_len != 0) {
+		frags[count].buf = b_data;
+		frags[count].length = b_len;
+		count++;
+	}
+
+	for (idx = 0; idx < sh->nr_frags; idx++) {
+		skb_frag_t *f = &sh->frags[idx];
+
+		frags[count].buf = tile_net_frag_buf(f);
+		frags[count].length = skb_frag_size(f);
+		count++;
+	}
+
+	return count;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO skbs are handed to tile_net_tx_tso().  Anything else is turned
+ * into one edesc for the linear body (if non-empty) plus one per page
+ * fragment, and posted to this device's egress queue.
+ *
+ * Returns NETDEV_TX_BUSY if no completion entry or egress slots could
+ * be reserved, NETDEV_TX_OK otherwise.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+	unsigned int num_edescs;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned long irqflags;
+	gxio_mpipe_edesc_t edesc = { { 0 } };
+	unsigned int i;
+	s64 slot;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	num_edescs = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* This is only used to specify the TLB. */
+	edesc.stack_idx = large_buffer_stack;
+
+	/* Prepare the edescs on the stack first, so that the section
+	 * with interrupts disabled below only has to post them.
+	 */
+	for (i = 0; i < num_edescs; i++) {
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edescs[i] = edesc;
+	}
+
+	/* Mark the final edesc.
+	 * NOTE(review): this indexes edescs[-1] if num_edescs is 0 (no
+	 * linear data and no fragments); confirm that cannot happen.  It
+	 * also assumes MAX_FRAGS >= 1 + nr_frags -- TODO confirm.
+	 */
+	edescs[num_edescs - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Try to acquire a completion entry and an egress slot.  On
+	 * failure the helper has already stopped this cpu's subqueue.
+	 */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_edescs; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);
+
+	/* Add a completion record, keyed on the last slot just used. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer();
+
+	return NETDEV_TX_OK;
+}
+
+/* Return subqueue id on this core (one per core): each cpu transmits on
+ * the subqueue whose index is its own smp_processor_id().
+ */
+static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	return smp_processor_id();
+}
+
+/* Deal with a transmit timeout by waking the subqueue of every online
+ * cpu; nothing else is reset.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		netif_wake_subqueue(dev, cpu);
+}
+
+/* Ioctl commands: none are implemented, so every request is answered
+ * with -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Get system network statistics for device: returns a pointer to the
+ * stats block embedded in the device's private data.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	return &priv->stats;
+}
+
+/* Change the MTU.
+ *
+ * Only values from 68 through 1500 (inclusive) are accepted; anything
+ * else is rejected with -EINVAL and the current MTU is left untouched.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points to a struct sockaddr whose sa_data holds the new address;
+ * dev->addr_len bytes of it are copied into dev->dev_addr.
+ *
+ * Returns 0 on success, or -EINVAL if is_valid_ether_addr() rejects the
+ * address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EINVAL;
+	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+	return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress handler is invoked directly, with the per-cpu ingress irq
+ * disabled around the call.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* Net device operations implemented by this driver. */
+static const struct net_device_ops tile_net_ops = {
+	/* Bring-up and tear-down. */
+	.ndo_open = tile_net_open,
+	.ndo_stop = tile_net_stop,
+	/* Transmit path. */
+	.ndo_select_queue = tile_net_select_queue,
+	.ndo_start_xmit = tile_net_tx,
+	.ndo_tx_timeout = tile_net_tx_timeout,
+	/* Configuration and statistics. */
+	.ndo_change_mtu = tile_net_change_mtu,
+	.ndo_set_mac_address = tile_net_set_mac_address,
+	.ndo_do_ioctl = tile_net_ioctl,
+	.ndo_get_stats = tile_net_get_stats,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* Device setup callback, passed to alloc_netdev_mqs().
+ *
+ * ether_setup() assigns various fields in dev, including setting
+ * IFF_BROADCAST and IFF_MULTICAST; the remaining assignments are the
+ * driver-specific extras.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->mtu = 1500;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->netdev_ops = &tile_net_ops;
+
+	/* Advertise everything the transmit path implements at once. */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG |
+			 NETIF_F_TSO;
+}
+
+/* Allocate and register the net device for one mPIPE link.
+ *
+ * "name" is used verbatim as the interface name, and "mac" is the
+ * link's 6-byte MAC address as supplied by the caller.  Links whose
+ * name starts with "loop" are ignored.  Failures are logged and the
+ * link is simply skipped; nothing is reported back to the caller.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	int ret;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev_mqs(sizeof(*priv), name, tile_net_setup,
+			       NR_CPUS, 1);
+	if (!dev) {
+		pr_err("alloc_netdev_mqs(%s) failed\n", name);
+		return;
+	}
+
+	/* Initialize "priv"; -1 marks each channel as not yet opened. */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* Set the MAC address in the device struct; this must be done
+	 * before the device is opened.  If the MAC is all zeroes, we use
+	 * a random address, since we're probably on the simulator.  The
+	 * address length is the same on both paths.
+	 */
+	dev->addr_len = ETH_ALEN;
+	if (is_zero_ether_addr(mac))
+		random_ether_addr(dev->dev_addr);
+	else
+		memcpy(dev->dev_addr, mac, ETH_ALEN);
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+	}
+}
+
+/* Per-cpu module initialization; tile_net_init_module() runs this on
+ * every cpu via on_each_cpu().  "unused" is ignored.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct hrtimer *timer = &info->egress_timer;
+
+	info->my_cpu = smp_processor_id();
+	info->has_iqueue = false;
+
+	/* Set up (but do not start) the egress timer. */
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer->function = tile_net_handle_egress_timer;
+}
+
+/* Module initialization.
+ *
+ * Sets up the per-cpu state on every cpu, creates a net device for each
+ * link that gxio_mpipe_link_enumerate_mac() reports, and falls back to
+ * all online cpus for network_cpus_map when network_cpus_init() returns
+ * zero.  Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	int i;
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU, waiting for all of them to finish. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them.  The scan
+	 * stops at the first index for which enumeration fails.
+	 */
+	for (i = 0; gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0; i++)
+		tile_net_dev_init(name, mac);
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 0/6] arch/tile: provide tilegx networking support
@ 2012-04-04 20:39 Chris Metcalf
  2012-04-04 20:39 ` [PATCH 1/6] arch/tile: introduce GXIO IORPC framework for tilegx Chris Metcalf
                   ` (5 more replies)
  0 siblings, 6 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-04 20:39 UTC (permalink / raw)
  To: linux-kernel, netdev

This change set provides support for the on-chip networking hardware
on the TILE-Gx chip.  The actual network driver is layered on top of
the mPIPE hardware support, which is layered on top of the GXIO IORPC
facility (and its DMA layer).  The change set breaks this down into
a series of patches, each progressively adding another component of
functionality.  There are also a couple of simple changes to add
support for MMIO memory mappings for readb/writeb, and for installing
MMIO PTEs, which are necessary for this change.

 arch/tile/Kconfig                         |    2 +
 arch/tile/Makefile                        |    1 +
 arch/tile/gxio/Kconfig                    |   25 +
 arch/tile/gxio/Makefile                   |    7 +
 arch/tile/gxio/dma_queue.c                |  236 ++++
 arch/tile/gxio/iorpc_globals.c            |  102 ++
 arch/tile/gxio/iorpc_mpipe.c              |  571 ++++++++
 arch/tile/gxio/iorpc_mpipe_info.c         |   95 ++
 arch/tile/gxio/kiorpc.c                   |   60 +
 arch/tile/gxio/mpipe.c                    |  631 +++++++++
 arch/tile/include/arch/mpipe.h            |  321 +++++
 arch/tile/include/arch/mpipe_constants.h  |   43 +
 arch/tile/include/arch/mpipe_def.h        |   39 +
 arch/tile/include/arch/mpipe_shm.h        |  421 ++++++
 arch/tile/include/arch/mpipe_shm_def.h    |   23 +
 arch/tile/include/asm/io.h                |  110 ++-
 arch/tile/include/gxio/common.h           |   40 +
 arch/tile/include/gxio/dma_queue.h        |   59 +
 arch/tile/include/gxio/iorpc_globals.h    |   38 +
 arch/tile/include/gxio/iorpc_mpipe.h      |  124 ++
 arch/tile/include/gxio/iorpc_mpipe_info.h |   46 +
 arch/tile/include/gxio/kiorpc.h           |   29 +
 arch/tile/include/gxio/mpipe.h            | 1986 ++++++++++++++++++++++++++++
 arch/tile/include/hv/drv_mpipe_intf.h     |  602 +++++++++
 arch/tile/include/hv/iorpc.h              |  716 ++++++++++
 arch/tile/mm/pgtable.c                    |   16 +-
 drivers/net/ethernet/tile/Kconfig         |    1 +
 drivers/net/ethernet/tile/Makefile        |    4 +-
 drivers/net/ethernet/tile/tilegx.c        | 2045 +++++++++++++++++++++++++++++
 29 files changed, 8359 insertions(+), 34 deletions(-)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH 1/6] arch/tile: introduce GXIO IORPC framework for tilegx
  2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
@ 2012-04-04 20:39 ` Chris Metcalf
  2012-04-04 20:58 ` [PATCH 4/6] arch/tile: common DMA code for the GXIO IORPC subsystem Chris Metcalf
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-04 20:39 UTC (permalink / raw)
  To: linux-kernel

The GXIO I/O RPC subsystem handles exporting I/O hardware resources to
Linux and to applications running under Linux.

For instance, memory which is made available for I/O DMA must be mapped
by an I/O TLB; that means that such memory must be locked down by Linux,
so that it is not swapped or otherwise reused, as long as those I/O
TLB entries are active. Similarly, configuring direct hardware access
introduces new validation requirements. If a user application registers
memory, Linux must ensure that the supplied virtual addresses are valid,
and turn them into client physical addresses. Similarly, when Linux then
supplies those client physical addresses to the Tilera hypervisor, it
must in turn validate those before turning them into the real physical
addresses which are required by the hardware.

To the extent that these sorts of activities were required on previous
TILE architecture processors, they were implemented in a device-specific
fashion. This meant that every I/O device had its own Tilera hypervisor
driver, its own Linux driver, and in some cases its own user-level
library support. There was a large amount of more-or-less functionally
identical code in different places, particularly in the different Linux
drivers. For TILE-Gx, this support has been generalized into a common
framework, known as the I/O RPC framework or just IORPC.

The two "gxio" directories (one for headers, one for sources) start
with just a few files in each with this infrastructure commit, but
after adding support for the on-board I/O shims for networking, PCI,
USB, crypto, compression, I2CS, etc., there end up being about 20 files
in each directory.

More information on the IORPC framework is in the <hv/iorpc.h> header,
included in this commit.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/Kconfig                      |    2 +
 arch/tile/Makefile                     |    1 +
 arch/tile/gxio/Kconfig                 |    7 +
 arch/tile/gxio/Makefile                |    5 +
 arch/tile/gxio/iorpc_globals.c         |  102 +++++
 arch/tile/gxio/kiorpc.c                |   60 +++
 arch/tile/include/gxio/common.h        |   40 ++
 arch/tile/include/gxio/iorpc_globals.h |   38 ++
 arch/tile/include/gxio/kiorpc.h        |   29 ++
 arch/tile/include/hv/iorpc.h           |  716 ++++++++++++++++++++++++++++++++
 10 files changed, 1000 insertions(+), 0 deletions(-)
 create mode 100644 arch/tile/gxio/Kconfig
 create mode 100644 arch/tile/gxio/Makefile
 create mode 100644 arch/tile/gxio/iorpc_globals.c
 create mode 100644 arch/tile/gxio/kiorpc.c
 create mode 100644 arch/tile/include/gxio/common.h
 create mode 100644 arch/tile/include/gxio/iorpc_globals.h
 create mode 100644 arch/tile/include/gxio/kiorpc.h
 create mode 100644 arch/tile/include/hv/iorpc.h

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 6599679..533820c 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -346,6 +346,8 @@ config KERNEL_PL
 	  kernel will be built to run at.  Generally you should use
 	  the default value here.
 
+source "arch/tile/gxio/Kconfig"
+
 endmenu  # Tilera-specific configuration
 
 menu "Bus options"
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 17acce7..ea5966e 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -53,6 +53,7 @@ libs-y		+= $(LIBGCC_PATH)
 # See arch/tile/Kbuild for content of core part of the kernel
 core-y		+= arch/tile/
 
+core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
 core-$(CONFIG_KVM) += arch/tile/kvm/
 
 ifdef TILERA_ROOT
diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
new file mode 100644
index 0000000..c2bae7d
--- /dev/null
+++ b/arch/tile/gxio/Kconfig
@@ -0,0 +1,7 @@
+config TILE_GXIO
+	bool "Tilera Gx I/O support"
+	depends on TILEGX
+	---help---
+	  This option supports direct access to TILE-Gx hardware from
+	  user space, via the gxio library, or from kernel space, via
+	  kernel IORPC support.
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
new file mode 100644
index 0000000..db1ee28
--- /dev/null
+++ b/arch/tile/gxio/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Tile-Gx device access support.
+#
+
+obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
diff --git a/arch/tile/gxio/iorpc_globals.c b/arch/tile/gxio/iorpc_globals.c
new file mode 100644
index 0000000..3d60a1f
--- /dev/null
+++ b/arch/tile/gxio/iorpc_globals.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_globals.h"
+
+typedef struct {
+	iorpc_pollfd_t pollfd;
+} arm_pollfd_param_t;	/* argument block for IORPC_OP_ARM_POLLFD */
+
+int __iorpc_arm_pollfd(int fd, int pollfd_cookie)
+{
+	uint64_t __offset;	/* opcode; last hv_dev_pwrite() argument */
+	int __result;		/* hv_dev_pwrite() result, returned as-is */
+	arm_pollfd_param_t temp;
+	arm_pollfd_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	__offset = IORPC_OP_ARM_POLLFD;
+	__result = hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, __size, __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(__iorpc_arm_pollfd);
+
+typedef struct {
+	iorpc_pollfd_t pollfd;
+} close_pollfd_param_t;	/* argument block for IORPC_OP_CLOSE_POLLFD */
+
+int __iorpc_close_pollfd(int fd, int pollfd_cookie)
+{
+	uint64_t __offset;	/* opcode; last hv_dev_pwrite() argument */
+	int __result;		/* hv_dev_pwrite() result, returned as-is */
+	close_pollfd_param_t temp;
+	close_pollfd_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	__offset = IORPC_OP_CLOSE_POLLFD;
+	__result = hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, __size, __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(__iorpc_close_pollfd);
+
+typedef struct {
+	HV_PTE base;
+} get_mmio_base_param_t;	/* reply block for IORPC_OP_GET_MMIO_BASE */
+
+int __iorpc_get_mmio_base(int fd, HV_PTE *base)
+{
+	uint64_t __offset;	/* opcode; last hv_dev_pread() argument */
+	int __result;		/* hv_dev_pread() result, returned as-is */
+	get_mmio_base_param_t temp;	/* not initialized before the read */
+	get_mmio_base_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	__offset = IORPC_OP_GET_MMIO_BASE;
+	__result = hv_dev_pread(fd, 0, (HV_VirtAddr) params, __size, __offset);
+	*base = params->base;	/* NOTE(review): stored even on failure */
+
+	return __result;
+}
+
+EXPORT_SYMBOL(__iorpc_get_mmio_base);
+
+typedef struct {
+	unsigned long offset;
+	unsigned long size;
+} check_mmio_offset_param_t;	/* args for IORPC_OP_CHECK_MMIO_OFFSET */
+
+int __iorpc_check_mmio_offset(int fd, unsigned long offset, unsigned long size)
+{
+	uint64_t __offset;	/* opcode; last hv_dev_pwrite() argument */
+	int __result;		/* hv_dev_pwrite() result, returned as-is */
+	check_mmio_offset_param_t temp;
+	check_mmio_offset_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->offset = offset;
+	params->size = size;
+
+	__offset = IORPC_OP_CHECK_MMIO_OFFSET;
+	__result = hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, __size, __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(__iorpc_check_mmio_offset);
diff --git a/arch/tile/gxio/kiorpc.c b/arch/tile/gxio/kiorpc.c
new file mode 100644
index 0000000..1ad01c1
--- /dev/null
+++ b/arch/tile/gxio/kiorpc.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx IORPC support for kernel I/O drivers.
+ */
+
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/kiorpc.h>
+
+#ifdef DEBUG_IORPC
+#define TRACE(FMT, ...) pr_info(SIMPLE_MSG_LINE FMT, ## __VA_ARGS__)
+#else
+#define TRACE(...)
+#endif
+
+/* Create kernel-VA-space MMIO mapping for an on-chip IO device.
+ *
+ * "hv_fd" identifies the device to the hypervisor, and "offset" and
+ * "size" select the range within the device's MMIO space.  Returns the
+ * result of ioremap_prot(), or NULL if either hypervisor query fails.
+ */
+void __iomem *iorpc_ioremap(int hv_fd, resource_size_t offset,
+			    unsigned long size)
+{
+	pgprot_t mmio_base, prot = { 0 };
+	unsigned long pfn;
+	int err;
+
+	/* Look up the shim's lotar and base PA. */
+	err = __iorpc_get_mmio_base(hv_fd, &mmio_base);
+	if (err) {
+		TRACE("get_mmio_base() failure: %d\n", err);
+		return NULL;
+	}
+
+	/* Make sure the HV driver approves of our offset and size. */
+	err = __iorpc_check_mmio_offset(hv_fd, offset, size);
+	if (err) {
+		TRACE("check_mmio_offset() failure: %d\n", err);
+		return NULL;
+	}
+
+	/*
+	 * mmio_base contains a base pfn and homing coordinates.  Turn
+	 * it into an MMIO pgprot and offset pfn.
+	 *
+	 * NOTE(review): PFN_DOWN() discards the sub-page bits of
+	 * "offset", so the mapping starts at the beginning of the page
+	 * containing it; confirm callers only pass page-aligned offsets.
+	 */
+	prot = hv_pte_set_lotar(prot, hv_pte_get_lotar(mmio_base));
+	pfn = pte_pfn(mmio_base) + PFN_DOWN(offset);
+
+	return ioremap_prot(PFN_PHYS(pfn), size, prot);
+}
+EXPORT_SYMBOL(iorpc_ioremap);
diff --git a/arch/tile/include/gxio/common.h b/arch/tile/include/gxio/common.h
new file mode 100644
index 0000000..e5ef1e2
--- /dev/null
+++ b/arch/tile/include/gxio/common.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_COMMON_H_
+#define _GXIO_COMMON_H_
+
+/*
+ * Routines shared between the various GXIO device components.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/io.h>
+
+/* Define the standard gxio MMIO functions using kernel functions.
+ * Note the argument order: the gxio write macros take (addr, val) and
+ * pass them to the kernel helpers as (val, addr).
+ */
+#define __gxio_mmio_read8(addr)		readb(addr)
+#define __gxio_mmio_read16(addr)	readw(addr)
+#define __gxio_mmio_read32(addr)	readl(addr)
+#define __gxio_mmio_read64(addr)	readq(addr)
+#define __gxio_mmio_write8(addr, val)	writeb((val), (addr))
+#define __gxio_mmio_write16(addr, val)	writew((val), (addr))
+#define __gxio_mmio_write32(addr, val)	writel((val), (addr))
+#define __gxio_mmio_write64(addr, val)	writeq((val), (addr))
+
+/* Default size is 64-bit: the unsuffixed macros are the 64-bit ones. */
+#define __gxio_mmio_read(addr) __gxio_mmio_read64(addr)
+#define __gxio_mmio_write(addr, val) __gxio_mmio_write64(addr, val)
+
+#endif /* !_GXIO_COMMON_H_ */
diff --git a/arch/tile/include/gxio/iorpc_globals.h b/arch/tile/include/gxio/iorpc_globals.h
new file mode 100644
index 0000000..3a8c07e
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_globals.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __IORPC_LINUX_RPC_H__
+#define __IORPC_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define IORPC_OP_ARM_POLLFD            IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9000)
+#define IORPC_OP_CLOSE_POLLFD          IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9001)
+#define IORPC_OP_GET_MMIO_BASE         IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define IORPC_OP_CHECK_MMIO_OFFSET     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int __iorpc_arm_pollfd(int fd, int pollfd_cookie);
+
+int __iorpc_close_pollfd(int fd, int pollfd_cookie);
+
+int __iorpc_get_mmio_base(int fd, HV_PTE *base);
+
+int __iorpc_check_mmio_offset(int fd, unsigned long offset, unsigned long size);
+
+#endif /* !__IORPC_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/kiorpc.h b/arch/tile/include/gxio/kiorpc.h
new file mode 100644
index 0000000..44df801
--- /dev/null
+++ b/arch/tile/include/gxio/kiorpc.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Support routines for kernel IORPC drivers.
+ */
+
+#ifndef _GXIO_KIORPC_H
+#define _GXIO_KIORPC_H
+
+#include <linux/types.h>
+#include <asm/page.h>
+#include <arch/chip.h>
+
+#if CHIP_HAS_MMIO()
+void __iomem *iorpc_ioremap(int hv_fd, resource_size_t offset,
+                            unsigned long size);
+#endif
+
+#endif /* _GXIO_KIORPC_H */
diff --git a/arch/tile/include/hv/iorpc.h b/arch/tile/include/hv/iorpc.h
new file mode 100644
index 0000000..08db918
--- /dev/null
+++ b/arch/tile/include/hv/iorpc.h
@@ -0,0 +1,716 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _HV_IORPC_H_
+#define _HV_IORPC_H_
+
+/**
+ *
+ * Error codes and struct definitions for the IO RPC library.
+ *
+ * The hypervisor's IO RPC component provides a convenient way for
+ * driver authors to proxy system calls between user space, linux, and
+ * the hypervisor driver.  The core of the system is a set of Python
+ * files that take ".idl" files as input and generate the following
+ * source code:
+ *
+ * - _rpc_call() routines for use in userspace IO libraries.  These
+ * routines take an argument list specified in the .idl file, pack the
+ * arguments into a buffer, and read or write that buffer via the
+ * Linux iorpc driver.
+ *
+ * - dispatch_read() and dispatch_write() routines that hypervisor
+ * drivers can use to implement most of their dev_pread() and
+ * dev_pwrite() methods.  These routines decode the incoming parameter
+ * blob, permission check and translate parameters where appropriate,
+ * and then invoke a callback routine for whichever RPC call has
+ * arrived.  The driver simply implements the set of callback
+ * routines.
+ *
+ * The IO RPC system also includes the Linux 'iorpc' driver, which
+ * proxies calls between the userspace library and the hypervisor
+ * driver.  The Linux driver is almost entirely device agnostic; it
+ * watches for special flags indicating cases where a memory buffer
+ * address might need to be translated, etc.  As a result, driver
+ * writers can avoid many of the problem cases related to registering
+ * hardware resources like memory pages or interrupts.  However, the
+ * drivers must be careful to obey the conventions documented below in
+ * order to work properly with the generic Linux iorpc driver.
+ *
+ * @section iorpc_domains Service Domains
+ *
+ * All iorpc-based drivers must support a notion of service domains.
+ * A service domain is basically an application context - state
+ * indicating resources that are allocated to that particular app
+ * which it may access and (perhaps) other applications may not
+ * access.  Drivers can support any number of service domains they
+ * choose.  In some cases the design is limited by the number of service
+ * domains supported by the IO hardware; in other cases the service
+ * domains are a purely software concept and the driver chooses a
+ * maximum number of domains based on how much state memory it is
+ * willing to preallocate.
+ *
+ * For example, the mPIPE driver only supports as many service domains
+ * as are supported by the mPIPE hardware.  This limitation is
+ * required because the hardware implements its own MMIO protection
+ * scheme to allow large MMIO mappings while still protecting small
+ * register ranges within the page that should only be accessed by the
+ * hypervisor.
+ *
+ * In contrast, drivers with no hardware service domain limitations
+ * (for instance the TRIO shim) can implement an arbitrary number of
+ * service domains.  In these cases, each service domain is limited to
+ * a carefully restricted set of legal MMIO addresses if necessary to
+ * keep one application from corrupting another application's state.
+ *
+ * @section iorpc_conventions System Call Conventions
+ *
+ * The driver's open routine is responsible for allocating a new
+ * service domain for each hv_dev_open() call.  By convention, the
+ * return value from open() should be the service domain number on
+ * success, or GXIO_ERR_NO_SVC_DOM if no more service domains are
+ * available.
+ *
+ * The implementations of hv_dev_pread() and hv_dev_pwrite() are
+ * responsible for validating the devhdl value passed up by the
+ * client.  Since the device handle returned by hv_dev_open() should
+ * embed the positive service domain number, drivers should make sure
+ * that DRV_HDL2BITS(devhdl) is a legal service domain.  If the client
+ * passes an illegal service domain number, the routine should return
+ * GXIO_ERR_INVAL_SVC_DOM.  Once the service domain number has been
+ * validated, the driver can copy to/from the client buffer and call
+ * the dispatch_read() or dispatch_write() methods created by the RPC
+ * generator.
+ *
+ * The hv_dev_close() implementation should reset all service domain
+ * state and put the service domain back on a free list for
+ * reallocation by a future application.  In most cases, this will
+ * require executing a hardware reset or drain flow and denying any
+ * MMIO regions that were created for the service domain.
+ *
+ * @section iorpc_data Special Data Types
+ *
+ * The .idl file syntax allows the creation of syscalls with special
+ * parameters that require permission checks or translations as part
+ * of the system call path.  Because of limitations in the code
+ * generator, APIs are generally limited to just one of these special
+ * parameters per system call, and they are sometimes required to be
+ * the first or last parameter to the call.  Special parameters
+ * include:
+ *
+ * @subsection iorpc_mem_buffer MEM_BUFFER
+ *
+ * The MEM_BUFFER() datatype allows user space to "register" memory
+ * buffers with a device.  Registering memory accomplishes two tasks:
+ * Linux keeps track of all buffers that might be modified by a
+ * hardware device, and the hardware device drivers bind registered
+ * buffers to particular hardware resources like ingress NotifRings.
+ * The MEM_BUFFER() idl syntax can take extra flags like ALIGN_64KB,
+ * ALIGN_SELF_SIZE, and FLAGS indicating that memory buffers must have
+ * certain alignment or that the user should be able to pass a "memory
+ * flags" word specifying attributes like nt_hint or IO cache pinning.
+ * The parser will accept multiple MEM_BUFFER() flags.
+ *
+ * Implementations must obey the following conventions when
+ * registering memory buffers via the iorpc flow.  These rules are a
+ * result of the Linux driver implementation, which needs to keep
+ * track of how many times a particular page has been registered with
+ * the hardware so that it can release the page when all those
+ * registrations are cleared.
+ *
+ * - Memory registrations that refer to a resource which has already
+ * been bound must return GXIO_ERR_ALREADY_INIT.  Thus, it is an
+ * error to register memory twice without resetting (i.e. closing) the
+ * resource in between.  This convention keeps the Linux driver from
+ * having to track which particular devices a page is bound to.
+ *
+ * - At present, a memory registration is only cleared when the
+ * service domain is reset.  In this case, the Linux driver simply
+ * closes the HV device file handle and then decrements the reference
+ * counts of all pages that were previously registered with the
+ * device.
+ *
+ * - In the future, we may add a mechanism for unregistering memory.
+ * One possible implementation would require that the user specify
+ * which buffer is currently registered.  The HV would then verify
+ * that that page was actually the one currently mapped and return
+ * success or failure to Linux, which would then only decrement the
+ * page reference count if the addresses were mapped.  Another scheme
+ * might allow Linux to pass a token to the HV to be returned when the
+ * resource is unmapped.
+ *
+ * @subsection iorpc_interrupt INTERRUPT
+ *
+ * The INTERRUPT .idl datatype allows the client to bind hardware
+ * interrupts to a particular combination of IPI parameters - CPU, IPI
+ * PL, and event bit number.  This data is passed via a special
+ * datatype so that the Linux driver can validate the CPU and PL and
+ * the HV generic iorpc code can translate client CPUs to real CPUs.
+ *
+ * @subsection iorpc_pollfd_setup POLLFD_SETUP
+ *
+ * The POLLFD_SETUP .idl datatype allows the client to set up hardware
+ * interrupt bindings which are received by Linux but which are made
+ * visible to user processes as state transitions on a file descriptor;
+ * this allows user processes to use Linux primitives, such as poll(), to
+ * await particular hardware events.  This data is passed via a special
+ * datatype so that the Linux driver may recognize the pollable file
+ * descriptor and translate it to a set of interrupt target information,
+ * and so that the HV generic iorpc code can translate client CPUs to real
+ * CPUs.
+ *
+ * @subsection iorpc_pollfd POLLFD
+ *
+ * The POLLFD .idl datatype allows manipulation of hardware interrupt
+ * bindings set up via the POLLFD_SETUP datatype; common operations are
+ * resetting the state of the requested interrupt events, and unbinding any
+ * bound interrupts.  This data is passed via a special datatype so that
+ * the Linux driver may recognize the pollable file descriptor and
+ * translate it to an interrupt identifier previously supplied by the
+ * hypervisor as the result of an earlier pollfd_setup operation.
+ *
+ * @subsection iorpc_blob BLOB
+ *
+ * The BLOB .idl datatype allows the client to write an arbitrary
+ * length string of bytes up to the hypervisor driver.  This can be
+ * useful for passing up large, arbitrarily structured data like
+ * classifier programs.  The iorpc stack takes care of validating the
+ * buffer VA and CPA as the data passes up to the hypervisor.  Unlike
+ * MEM_BUFFER(), the buffer is not registered - Linux does not bump
+ * page refcounts and the HV driver should not reuse the buffer once
+ * the system call is complete.
+ *
+ * @section iorpc_translation Translating User Space Calls
+ *
+ * The ::iorpc_offset_t structure describes the formatting of the offset
+ * that is passed to pread() or pwrite() as part of the generated RPC code.
+ * When the user calls up to Linux, the rpc code fills in all the fields of
+ * the offset, including a 16-bit code, a 16-bit format indicator, and 32
+ * bits of user-specified "sub-offset".  The code indicates which syscall
+ * is being requested.  The format indicates whether there is a "prefix
+ * struct" at the start of the memory buffer passed to pwrite(), and if so
+ * what data is in that prefix struct.  These prefix structs are used to
+ * implement special datatypes like MEM_BUFFER() and INTERRUPT - we arrange
+ * to put data that needs translation and permission checks at the start of
+ * the buffer so that the Linux driver and generic portions of the HV iorpc
+ * code can easily access the data.  The 32 bits of user-specified
+ * "sub-offset" are most useful for pread() calls where the user needs to
+ * also pass in a few bits indicating which register to read, etc.
+ *
+ * The Linux iorpc driver watches for system calls that contain prefix
+ * structs so that it can translate parameters and bump reference
+ * counts as appropriate.  It does not (currently) have any knowledge
+ * of the per-device opcodes - it doesn't care what operation you're
+ * doing to mPIPE, so long as it can do all the generic book-keeping.
+ * The hv/iorpc.h header file defines all of the generic encoding bits
+ * needed to translate iorpc calls without knowing which particular
+ * opcode is being issued.
+ *
+ * @section iorpc_globals Global iorpc Calls
+ *
+ * Implementing mmap() required adding some special iorpc syscalls
+ * that are only called by the Linux driver, never by userspace.
+ * These include get_mmio_base() and check_mmio_offset().  These
+ * routines are described in globals.idl and must be included in every
+ * iorpc driver.  By providing these routines in every driver, Linux's
+ * mmap implementation can easily get the PTE bits it needs and
+ * validate the PA offset without needing to know the per-device
+ * opcodes to perform those tasks.
+ *
+ * @section iorpc_kernel Supporting gxio APIs in the Kernel
+ *
+ * The iorpc code generator also supports generation of kernel code
+ * implementing the gxio APIs.  This capability is currently used by
+ * the mPIPE network driver, and will likely be used by the TRIO root
+ * complex and endpoint drivers and perhaps an in-kernel crypto
+ * driver.  Each driver that wants to instantiate iorpc calls in the
+ * kernel needs to generate a kernel version of the generated rpc code
+ * and (probably) copy any related gxio source files into the kernel.
+ * The mPIPE driver provides a good example of this pattern.
+ */
+
+#ifdef __KERNEL__
+#include <linux/stddef.h>
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__HV__)
+#include <hv/hypervisor.h>
+#elif defined(__KERNEL__)
+#include "hypervisor.h"
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+
+/** Format codes naming the translation an RPC call requires: whether a
+ * prefix struct starts the RPC buffer and what it holds.  Values are
+ * implicit and feed IORPC_OPCODE(), so enumerator order is significant.
+ */
+enum iorpc_format_e
+{
+  /** No translation required, no prefix struct. */
+  IORPC_FORMAT_NONE,
+
+  /** No translation required, no prefix struct, no access to this
+   *  operation from user space. */
+  IORPC_FORMAT_NONE_NOUSER,
+
+  /** Prefix struct contains user VA and size. */
+  IORPC_FORMAT_USER_MEM,
+
+  /** Prefix struct contains CPA, size, and homing bits. */
+  IORPC_FORMAT_KERNEL_MEM,
+
+  /** Prefix struct contains interrupt. */
+  IORPC_FORMAT_KERNEL_INTERRUPT,
+
+  /** Prefix struct contains user-level interrupt. */
+  IORPC_FORMAT_USER_INTERRUPT,
+
+  /** Prefix struct contains pollfd_setup (interrupt information). */
+  IORPC_FORMAT_KERNEL_POLLFD_SETUP,
+
+  /** Prefix struct contains user-level pollfd_setup (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD_SETUP,
+
+  /** Prefix struct contains pollfd (interrupt cookie). */
+  IORPC_FORMAT_KERNEL_POLLFD,
+
+  /** Prefix struct contains user-level pollfd (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD,
+};
+
+
+/** Build an opcode: FORMAT shifted left 16 bits, OR'ed with CODE. */
+#define IORPC_OPCODE(FORMAT, CODE) (((FORMAT) << 16) | (CODE))
+
+/** The offset passed through the pread() and pwrite() system calls
+    combines an opcode with 32 bits of user-specified offset. */
+typedef union
+{
+#ifndef __BIG_ENDIAN__
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint16_t code;              /**< RPC code. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint32_t sub_offset;        /**< caller-specified offset. */
+  };
+
+  uint32_t opcode;              /**< Opcode combines code & format. */
+#else
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint32_t sub_offset;        /**< caller-specified offset. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint16_t code;              /**< RPC code. */
+  };
+
+  struct
+  {
+    uint32_t padding;             /**< Overlays sub_offset. */
+    uint32_t opcode;              /**< Opcode combines code & format. */
+  };
+#endif
+}
+iorpc_offset_t;
+
+
+/** Homing and cache hinting bit-fields that can be used by IO devices. */
+typedef struct
+{
+  unsigned int lotar_x:4;       /**< lotar X bits (or Gx page_mask). */
+  unsigned int lotar_y:4;       /**< lotar Y bits (or Gx page_offset). */
+  unsigned int hfh:1;           /**< Uses hash-for-home. */
+  unsigned int nt_hint:1;       /**< Non-temporal hint. */
+  unsigned int io_pin:1;        /**< Only fill 'IO' cache ways. */
+}
+iorpc_mem_attr_t;
+
+/** Bit in a memory buffer "flags" word: set the nt_hint bit. */
+#define IORPC_MEM_BUFFER_FLAG_NT_HINT (1 << 0)
+
+/** Bit in a memory buffer "flags" word: set the IO pin bit. */
+#define IORPC_MEM_BUFFER_FLAG_IO_PIN (1 << 1)
+
+
+/** A union used to describe memory registration.  Different
+    protection levels describe memory differently, so this union
+    contains all the different possible descriptions.  As a request
+    moves up the call chain, each layer translates from one
+    description format to the next.  In particular, the Linux iorpc
+    driver translates user VAs into CPAs and homing parameters. */
+typedef union
+{
+  struct
+  {
+    uint64_t va;                /**< User virtual address. */
+    uint64_t size;              /**< Buffer size. */
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  user;                         /**< Buffer as described by user apps. */
+
+  struct
+  {
+    unsigned long long cpa;     /**< Client physical address. */
+#if defined(__KERNEL__) || defined(__HV__)
+    size_t size;                /**< Buffer size. */
+    HV_PTE pte;                 /**< PTE describing memory homing. */
+#else
+    uint64_t size;              /**< Buffer size. */
+    uint64_t pte;               /**< PTE (homing), as a plain 64-bit word. */
+#endif
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  kernel;                       /**< Buffer as described by kernel. */
+
+  struct
+  {
+    unsigned long long pa;      /**< Physical address. */
+    size_t size;                /**< Buffer size. */
+    iorpc_mem_attr_t attr;      /**< Homing and locality hint bits. */
+  }
+  hv;                           /**< Buffer parameters for HV driver. */
+} iorpc_mem_buffer_t;
+
+
+/** A union used to describe interrupts.  The format differs slightly
+ *  for user and kernel interrupts.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+typedef union
+{
+  struct
+  {
+    int cpu;   /**< CPU. */
+    int event; /**< Event bit number (evt_num). */
+  }
+  user;        /**< Interrupt as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< Event bit number (evt_num). */
+  }
+  kernel;      /**< Interrupt as described by the kernel. */
+
+} iorpc_interrupt_t;
+
+
+/** A union used to describe interrupts used with poll().  The format
+ *  differs significantly for requests from user to kernel, and kernel to
+ *  hypervisor.  As with the mem_buffer_t, translation between the formats
+ *  is done at each level. */
+typedef union
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd_setup as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< pollfd_setup as described by the kernel. */
+
+} iorpc_pollfd_setup_t;
+
+
+/** A union used to describe previously set up interrupts used with
+ *  poll().  The format differs significantly for requests from user to
+ *  kernel, and kernel to hypervisor.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+typedef union
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd as described by user applications. */
+
+  struct
+  {
+    int cookie; /**< hv cookie returned by the pollfd_setup operation. */
+  }
+  kernel;      /**< pollfd as described by the kernel. */
+
+} iorpc_pollfd_t;
+
+
+/** The various iorpc devices use error codes from -1100 to -1299;
+ * the enum itself spans GXIO_ERR_MAX (-1101) to GXIO_ERR_MIN (-1299).
+ * This range is distinct from netio (-700 to -799), the hypervisor
+ * (-800 to -899), tilepci (-900 to -999), ilib (-1000 to -1099),
+ * gxcr (-1300 to -1399) and gxpci (-1400 to -1499).
+ */
+enum gxio_err_e {
+
+  /** Largest iorpc error number (same value as GXIO_ERR_OPCODE). */
+  GXIO_ERR_MAX = -1101,
+
+
+  /********************************************************/
+  /*                   Generic Error Codes                */
+  /********************************************************/
+
+  /** Bad RPC opcode - possible version incompatibility. */
+  GXIO_ERR_OPCODE = -1101,
+
+  /** Invalid parameter. */
+  GXIO_ERR_INVAL = -1102,
+
+  /** Memory buffer did not meet alignment requirements. */
+  GXIO_ERR_ALIGNMENT = -1103,
+
+  /** Memory buffers must be coherent and cacheable. */
+  GXIO_ERR_COHERENCE = -1104,
+
+  /** Resource already initialized. */
+  GXIO_ERR_ALREADY_INIT = -1105,
+
+  /** No service domains available. */
+  GXIO_ERR_NO_SVC_DOM = -1106,
+
+  /** Illegal service domain number. */
+  GXIO_ERR_INVAL_SVC_DOM = -1107,
+
+  /** Illegal MMIO address. */
+  GXIO_ERR_MMIO_ADDRESS = -1108,
+
+  /** Illegal interrupt binding. */
+  GXIO_ERR_INTERRUPT = -1109,
+
+  /** Unreasonable client memory. */
+  GXIO_ERR_CLIENT_MEMORY = -1110,
+
+  /** No more IOTLB entries. */
+  GXIO_ERR_IOTLB_ENTRY = -1111,
+
+  /** Invalid memory size. */
+  GXIO_ERR_INVAL_MEMORY_SIZE = -1112,
+
+  /** Unsupported operation. */
+  GXIO_ERR_UNSUPPORTED_OP = -1113,
+
+  /** Insufficient DMA credits. */
+  GXIO_ERR_DMA_CREDITS = -1114,
+
+  /** Operation timed out. */
+  GXIO_ERR_TIMEOUT = -1115,
+
+  /** No such device or object. */
+  GXIO_ERR_NO_DEVICE = -1116,
+
+  /** Device or resource busy. */
+  GXIO_ERR_BUSY = -1117,
+
+  /** I/O error. */
+  GXIO_ERR_IO = -1118,
+
+  /** Permissions error. */
+  GXIO_ERR_PERM = -1119,
+
+
+
+  /********************************************************/
+  /*                 Test Device Error Codes              */
+  /********************************************************/
+
+  /** Illegal register number. */
+  GXIO_TEST_ERR_REG_NUMBER = -1120,
+
+  /** Illegal buffer slot. */
+  GXIO_TEST_ERR_BUFFER_SLOT = -1121,
+
+
+  /********************************************************/
+  /*                    MPIPE Error Codes                 */
+  /********************************************************/
+
+
+  /** Invalid buffer size. */
+  GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE = -1131,
+
+  /** Cannot allocate buffer stack. */
+  GXIO_MPIPE_ERR_NO_BUFFER_STACK = -1140,
+
+  /** Invalid buffer stack number. */
+  GXIO_MPIPE_ERR_BAD_BUFFER_STACK = -1141,
+
+  /** Cannot allocate NotifRing. */
+  GXIO_MPIPE_ERR_NO_NOTIF_RING = -1142,
+
+  /** Invalid NotifRing number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_RING = -1143,
+
+  /** Cannot allocate NotifGroup. */
+  GXIO_MPIPE_ERR_NO_NOTIF_GROUP = -1144,
+
+  /** Invalid NotifGroup number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_GROUP = -1145,
+
+  /** Cannot allocate bucket. */
+  GXIO_MPIPE_ERR_NO_BUCKET = -1146,
+
+  /** Invalid bucket number. */
+  GXIO_MPIPE_ERR_BAD_BUCKET = -1147,
+
+  /** Cannot allocate eDMA ring. */
+  GXIO_MPIPE_ERR_NO_EDMA_RING = -1148,
+
+  /** Invalid eDMA ring number. */
+  GXIO_MPIPE_ERR_BAD_EDMA_RING = -1149,
+
+  /** Invalid channel number. */
+  GXIO_MPIPE_ERR_BAD_CHANNEL = -1150,
+
+  /** Bad configuration. */
+  GXIO_MPIPE_ERR_BAD_CONFIG = -1151,
+
+  /** Empty iqueue. */
+  GXIO_MPIPE_ERR_IQUEUE_EMPTY = -1152,
+
+  /** Empty rules. */
+  GXIO_MPIPE_ERR_RULES_EMPTY = -1160,
+
+  /** Full rules. */
+  GXIO_MPIPE_ERR_RULES_FULL = -1161,
+
+  /** Corrupt rules. */
+  GXIO_MPIPE_ERR_RULES_CORRUPT = -1162,
+
+  /** Invalid rules. */
+  GXIO_MPIPE_ERR_RULES_INVALID = -1163,
+
+  /** Classifier is too big. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_BIG = -1170,
+
+  /** Classifier is too complex. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_COMPLEX = -1171,
+
+  /** Classifier has bad header. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_HEADER = -1172,
+
+  /** Classifier has bad contents. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_CONTENTS = -1173,
+
+  /** Classifier encountered invalid symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_SYMBOL = -1174,
+
+  /** Classifier encountered invalid bounds. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_BOUNDS = -1175,
+
+  /** Classifier encountered invalid relocation. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_RELOCATION = -1176,
+
+  /** Classifier encountered undefined symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_UNDEF_SYMBOL = -1177,
+
+
+  /********************************************************/
+  /*                    TRIO  Error Codes                 */
+  /********************************************************/
+
+  /** Cannot allocate memory map region. */
+  GXIO_TRIO_ERR_NO_MEMORY_MAP = -1180,
+
+  /** Invalid memory map region number. */
+  GXIO_TRIO_ERR_BAD_MEMORY_MAP = -1181,
+
+  /** Cannot allocate scatter queue. */
+  GXIO_TRIO_ERR_NO_SCATTER_QUEUE = -1182,
+
+  /** Invalid scatter queue number. */
+  GXIO_TRIO_ERR_BAD_SCATTER_QUEUE = -1183,
+
+  /** Cannot allocate push DMA ring. */
+  GXIO_TRIO_ERR_NO_PUSH_DMA_RING = -1184,
+
+  /** Invalid push DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PUSH_DMA_RING = -1185,
+
+  /** Cannot allocate pull DMA ring. */
+  GXIO_TRIO_ERR_NO_PULL_DMA_RING = -1186,
+
+  /** Invalid pull DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PULL_DMA_RING = -1187,
+
+  /** Cannot allocate PIO region. */
+  GXIO_TRIO_ERR_NO_PIO = -1188,
+
+  /** Invalid PIO region index. */
+  GXIO_TRIO_ERR_BAD_PIO = -1189,
+
+  /** Cannot allocate ASID. */
+  GXIO_TRIO_ERR_NO_ASID = -1190,
+
+  /** Invalid ASID. */
+  GXIO_TRIO_ERR_BAD_ASID = -1191,
+
+
+  /********************************************************/
+  /*                    MICA Error Codes                  */
+  /********************************************************/
+
+  /** No such accelerator type. */
+  GXIO_MICA_ERR_BAD_ACCEL_TYPE = -1220,
+
+  /** Cannot allocate context. */
+  GXIO_MICA_ERR_NO_CONTEXT = -1221,
+
+  /** PKA command queue is full, can't add another command. */
+  GXIO_MICA_ERR_PKA_CMD_QUEUE_FULL = -1222,
+
+  /** PKA result queue is empty, can't get a result from the queue. */
+  GXIO_MICA_ERR_PKA_RESULT_QUEUE_EMPTY = -1223,
+
+  /********************************************************/
+  /*                    GPIO Error Codes                  */
+  /********************************************************/
+
+  /** Pin not available.  Either the physical pin does not exist, or
+   *  it is reserved by the hypervisor for system usage. */
+  GXIO_GPIO_ERR_PIN_UNAVAILABLE = -1240,
+
+  /** Pin busy.  The pin exists, and is available for use via GXIO, but
+   *  it has been attached by some other process or driver. */
+  GXIO_GPIO_ERR_PIN_BUSY = -1241,
+
+  /** Cannot access unattached pin.  One or more of the pins being
+   *  manipulated by this call are not attached to the requesting
+   *  context. */
+  GXIO_GPIO_ERR_PIN_UNATTACHED = -1242,
+
+  /** Invalid I/O mode for pin.  The wiring of the pin in the system
+   *  is such that the I/O mode or electrical control parameters
+   *  requested could cause damage. */
+  GXIO_GPIO_ERR_PIN_INVALID_MODE = -1243,
+
+  /** Smallest iorpc error number. */
+  GXIO_ERR_MIN = -1299
+};
+
+
+#endif /* !_HV_IORPC_H_ */
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v2 0/6] arch/tile: networking support for tilegx
  2012-04-28 22:07             ` Chris Metcalf
@ 2012-04-04 20:39               ` Chris Metcalf
  2012-04-04 20:39                 ` [PATCH v2 1/6] arch/tile: introduce GXIO IORPC framework " Chris Metcalf
                                   ` (5 more replies)
  2012-04-29 11:15               ` [PATCH 6/6] tilegx network driver: initial support Arnd Bergmann
  1 sibling, 6 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-04 20:39 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel, netdev

This patch series addresses the feedback from the community for the
first patch series.  Changes include:

- Clean up network driver to not keep two arrays of net_device pointers
- Use explicit tile_io_addr type and conversion functions, not __pa/__va
- Avoid tile-specific __insn_mf() in favor of wmb()
- Clean up checksumming in tilegx.c to use new csum_long() routine
- Use inline assembly for readb/writeb etc to avoid splitting accesses
- Remove fences after mmio writes since they aren't necessary
- Fix Kconfig options to be appropriately hidden
- Export symbols with EXPORT_SYMBOL_GPL, nestled up against the functions
- Make auto-generated code much terser and more readable
- Avoid typedef'ing structures, and remove "_t" suffix from struct names
- Convert remaining C99 comments to C89
- Fix driver multiline comment style to match davem's preferred style

 arch/tile/Kconfig                         |    2 +
 arch/tile/Makefile                        |    2 +
 arch/tile/gxio/Kconfig                    |   17 +
 arch/tile/gxio/Makefile                   |    7 +
 arch/tile/gxio/dma_queue.c                |  230 ++++
 arch/tile/gxio/iorpc_globals.c            |   89 ++
 arch/tile/gxio/iorpc_mpipe.c              |  463 +++++++
 arch/tile/gxio/iorpc_mpipe_info.c         |   85 ++
 arch/tile/gxio/kiorpc.c                   |   61 +
 arch/tile/gxio/mpipe.c                    |  500 ++++++++
 arch/tile/include/arch/mpipe.h            |  359 ++++++
 arch/tile/include/arch/mpipe_constants.h  |   42 +
 arch/tile/include/arch/mpipe_def.h        |   39 +
 arch/tile/include/arch/mpipe_shm.h        |  509 ++++++++
 arch/tile/include/arch/mpipe_shm_def.h    |   23 +
 arch/tile/include/asm/checksum.h          |   18 +
 arch/tile/include/asm/io.h                |  144 ++-
 arch/tile/include/gxio/common.h           |   38 +
 arch/tile/include/gxio/dma_queue.h        |   59 +
 arch/tile/include/gxio/iorpc_globals.h    |   38 +
 arch/tile/include/gxio/iorpc_mpipe.h      |  124 ++
 arch/tile/include/gxio/iorpc_mpipe_info.h |   46 +
 arch/tile/include/gxio/kiorpc.h           |   29 +
 arch/tile/include/gxio/mpipe.h            | 1653 ++++++++++++++++++++++++
 arch/tile/include/hv/drv_mpipe_intf.h     |  602 +++++++++
 arch/tile/include/hv/iorpc.h              |  714 +++++++++++
 arch/tile/lib/checksum.c                  |   15 +-
 drivers/net/ethernet/tile/Kconfig         |    1 +
 drivers/net/ethernet/tile/Makefile        |    4 +-
 drivers/net/ethernet/tile/tilegx.c        | 1952 +++++++++++++++++++++++++++++
 30 files changed, 7821 insertions(+), 44 deletions(-)

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v2 1/6] arch/tile: introduce GXIO IORPC framework for tilegx
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
@ 2012-04-04 20:39                 ` Chris Metcalf
  2012-04-04 20:58                 ` [PATCH v2 3/6] arch/tile: common DMA code for the GXIO IORPC subsystem Chris Metcalf
                                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-04 20:39 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel

The GXIO I/O RPC subsystem handles exporting I/O hardware resources to
Linux and to applications running under Linux.

For instance, memory which is made available for I/O DMA must be mapped
by an I/O TLB; that means that such memory must be locked down by Linux,
so that it is not swapped or otherwise reused, as long as those I/O
TLB entries are active. Similarly, configuring direct hardware access
introduces new validation requirements. If a user application registers
memory, Linux must ensure that the supplied virtual addresses are valid,
and turn them into client physical addresses. Similarly, when Linux then
supplies those client physical addresses to the Tilera hypervisor, it
must in turn validate those before turning them into the real physical
addresses which are required by the hardware.

To the extent that these sorts of activities were required on previous
TILE architecture processors, they were implemented in a device-specific
fashion. This meant that every I/O device had its own Tilera hypervisor
driver, its own Linux driver, and in some cases its own user-level
library support. There was a large amount of more-or-less functionally
identical code in different places, particularly in the different Linux
drivers. For TILE-Gx, this support has been generalized into a common
framework, known as the I/O RPC framework or just IORPC.

The two "gxio" directories (one for headers, one for sources) start
with just a few files in each with this infrastructure commit, but
after adding support for the on-board I/O shims for networking, PCI,
USB, crypto, compression, I2CS, etc., there end up being about 20 files
in each directory.

More information on the IORPC framework is in the <hv/iorpc.h> header,
included in this commit.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/Kconfig                      |    2 +
 arch/tile/Makefile                     |    2 +
 arch/tile/gxio/Kconfig                 |    5 +
 arch/tile/gxio/Makefile                |    5 +
 arch/tile/gxio/iorpc_globals.c         |   89 ++++
 arch/tile/gxio/kiorpc.c                |   61 +++
 arch/tile/include/gxio/common.h        |   38 ++
 arch/tile/include/gxio/iorpc_globals.h |   38 ++
 arch/tile/include/gxio/kiorpc.h        |   29 ++
 arch/tile/include/hv/iorpc.h           |  714 ++++++++++++++++++++++++++++++++
 10 files changed, 983 insertions(+), 0 deletions(-)
 create mode 100644 arch/tile/gxio/Kconfig
 create mode 100644 arch/tile/gxio/Makefile
 create mode 100644 arch/tile/gxio/iorpc_globals.c
 create mode 100644 arch/tile/gxio/kiorpc.c
 create mode 100644 arch/tile/include/gxio/common.h
 create mode 100644 arch/tile/include/gxio/iorpc_globals.h
 create mode 100644 arch/tile/include/gxio/kiorpc.h
 create mode 100644 arch/tile/include/hv/iorpc.h

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 2408a26..8a85f94 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -350,6 +350,8 @@ config KERNEL_PL
 	  kernel will be built to run at.  Generally you should use
 	  the default value here.
 
+source "arch/tile/gxio/Kconfig"
+
 endmenu  # Tilera-specific configuration
 
 menu "Bus options"
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 9520bc5..5015144 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -54,6 +54,8 @@ libs-y		+= $(LIBGCC_PATH)
 # See arch/tile/Kbuild for content of core part of the kernel
 core-y		+= arch/tile/
 
+core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
+
 ifdef TILERA_ROOT
 INSTALL_PATH ?= $(TILERA_ROOT)/tile/boot
 endif
diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
new file mode 100644
index 0000000..8eff47f
--- /dev/null
+++ b/arch/tile/gxio/Kconfig
@@ -0,0 +1,5 @@
+# Support direct access to TILE-Gx hardware from user space, via the
+# gxio library, or from kernel space, via kernel IORPC support.
+config TILE_GXIO
+	bool
+	depends on TILEGX
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
new file mode 100644
index 0000000..db1ee28
--- /dev/null
+++ b/arch/tile/gxio/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Tile-Gx device access support.
+#
+
+obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
diff --git a/arch/tile/gxio/iorpc_globals.c b/arch/tile/gxio/iorpc_globals.c
new file mode 100644
index 0000000..e178e90
--- /dev/null
+++ b/arch/tile/gxio/iorpc_globals.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_globals.h"
+
+struct arm_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+int __iorpc_arm_pollfd(int fd, int pollfd_cookie)
+{
+	struct arm_pollfd_param temp;
+	struct arm_pollfd_param *params = &temp;
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			     IORPC_OP_ARM_POLLFD);
+}
+
+EXPORT_SYMBOL(__iorpc_arm_pollfd);
+
+struct close_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+int __iorpc_close_pollfd(int fd, int pollfd_cookie)
+{
+	struct close_pollfd_param temp;
+	struct close_pollfd_param *params = &temp;
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			     IORPC_OP_CLOSE_POLLFD);
+}
+
+EXPORT_SYMBOL(__iorpc_close_pollfd);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int __iorpc_get_mmio_base(int fd, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 IORPC_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(__iorpc_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int __iorpc_check_mmio_offset(int fd, unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			     IORPC_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(__iorpc_check_mmio_offset);
diff --git a/arch/tile/gxio/kiorpc.c b/arch/tile/gxio/kiorpc.c
new file mode 100644
index 0000000..c8096aa
--- /dev/null
+++ b/arch/tile/gxio/kiorpc.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx IORPC support for kernel I/O drivers.
+ */
+
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/kiorpc.h>
+
+#ifdef DEBUG_IORPC
+#define TRACE(FMT, ...) pr_info(SIMPLE_MSG_LINE FMT, ## __VA_ARGS__)
+#else
+#define TRACE(...)
+#endif
+
+/* Create kernel-VA-space MMIO mapping for an on-chip IO device. */
+void __iomem *iorpc_ioremap(int hv_fd, resource_size_t offset,
+			    unsigned long size)
+{
+	pgprot_t mmio_base, prot = { 0 };
+	unsigned long pfn;
+	int err;
+
+	/* Look up the shim's lotar and base PA. */
+	err = __iorpc_get_mmio_base(hv_fd, &mmio_base);
+	if (err) {
+		TRACE("get_mmio_base() failure: %d\n", err);
+		return NULL;
+	}
+
+	/* Make sure the HV driver approves of our offset and size. */
+	err = __iorpc_check_mmio_offset(hv_fd, offset, size);
+	if (err) {
+		TRACE("check_mmio_offset() failure: %d\n", err);
+		return NULL;
+	}
+
+	/*
+	 * mmio_base contains a base pfn and homing coordinates.  Turn
+	 * it into an MMIO pgprot and offset pfn.
+	 */
+	prot = hv_pte_set_lotar(prot, hv_pte_get_lotar(mmio_base));
+	pfn = pte_pfn(mmio_base) + PFN_DOWN(offset);
+
+	return ioremap_prot(PFN_PHYS(pfn), size, prot);
+}
+
+EXPORT_SYMBOL(iorpc_ioremap);
diff --git a/arch/tile/include/gxio/common.h b/arch/tile/include/gxio/common.h
new file mode 100644
index 0000000..29f2f68
--- /dev/null
+++ b/arch/tile/include/gxio/common.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_COMMON_H_
+#define _GXIO_COMMON_H_
+
+/*
+ * Routines shared between the various GXIO device components.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/io.h>
+
+/* Define the standard gxio MMIO functions using kernel functions. */
+#define __gxio_mmio_read8(addr)		readb(addr)
+#define __gxio_mmio_read16(addr)	readw(addr)
+#define __gxio_mmio_read32(addr)	readl(addr)
+#define __gxio_mmio_read64(addr)	readq(addr)
+#define __gxio_mmio_write8(addr, val)	writeb((val), (addr))
+#define __gxio_mmio_write16(addr, val)	writew((val), (addr))
+#define __gxio_mmio_write32(addr, val)	writel((val), (addr))
+#define __gxio_mmio_write64(addr, val)	writeq((val), (addr))
+#define __gxio_mmio_read(addr)		__gxio_mmio_read64(addr)
+#define __gxio_mmio_write(addr, val)	__gxio_mmio_write64((addr), (val))
+
+#endif /* !_GXIO_COMMON_H_ */
diff --git a/arch/tile/include/gxio/iorpc_globals.h b/arch/tile/include/gxio/iorpc_globals.h
new file mode 100644
index 0000000..52c721f
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_globals.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __IORPC_LINUX_RPC_H__
+#define __IORPC_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define IORPC_OP_ARM_POLLFD            IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9000)
+#define IORPC_OP_CLOSE_POLLFD          IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9001)
+#define IORPC_OP_GET_MMIO_BASE         IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define IORPC_OP_CHECK_MMIO_OFFSET     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int __iorpc_arm_pollfd(int fd, int pollfd_cookie);
+
+int __iorpc_close_pollfd(int fd, int pollfd_cookie);
+
+int __iorpc_get_mmio_base(int fd, HV_PTE *base);
+
+int __iorpc_check_mmio_offset(int fd, unsigned long offset, unsigned long size);
+
+#endif /* !__IORPC_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/kiorpc.h b/arch/tile/include/gxio/kiorpc.h
new file mode 100644
index 0000000..ee58209
--- /dev/null
+++ b/arch/tile/include/gxio/kiorpc.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Support routines for kernel IORPC drivers.
+ */
+
+#ifndef _GXIO_KIORPC_H
+#define _GXIO_KIORPC_H
+
+#include <linux/types.h>
+#include <asm/page.h>
+#include <arch/chip.h>
+
+#if CHIP_HAS_MMIO()
+void __iomem *iorpc_ioremap(int hv_fd, resource_size_t offset,
+			    unsigned long size);
+#endif
+
+#endif /* _GXIO_KIORPC_H */
diff --git a/arch/tile/include/hv/iorpc.h b/arch/tile/include/hv/iorpc.h
new file mode 100644
index 0000000..89c72a5
--- /dev/null
+++ b/arch/tile/include/hv/iorpc.h
@@ -0,0 +1,714 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _HV_IORPC_H_
+#define _HV_IORPC_H_
+
+/**
+ *
+ * Error codes and struct definitions for the IO RPC library.
+ *
+ * The hypervisor's IO RPC component provides a convenient way for
+ * driver authors to proxy system calls between user space, Linux, and
+ * the hypervisor driver.  The core of the system is a set of Python
+ * files that take ".idl" files as input and generate the following
+ * source code:
+ *
+ * - _rpc_call() routines for use in userspace IO libraries.  These
+ * routines take an argument list specified in the .idl file, pack the
+ * arguments into a buffer, and read or write that buffer via the
+ * Linux iorpc driver.
+ *
+ * - dispatch_read() and dispatch_write() routines that hypervisor
+ * drivers can use to implement most of their dev_pread() and
+ * dev_pwrite() methods.  These routines decode the incoming parameter
+ * blob, permission check and translate parameters where appropriate,
+ * and then invoke a callback routine for whichever RPC call has
+ * arrived.  The driver simply implements the set of callback
+ * routines.
+ *
+ * The IO RPC system also includes the Linux 'iorpc' driver, which
+ * proxies calls between the userspace library and the hypervisor
+ * driver.  The Linux driver is almost entirely device agnostic; it
+ * watches for special flags indicating cases where a memory buffer
+ * address might need to be translated, etc.  As a result, driver
+ * writers can avoid many of the problem cases related to registering
+ * hardware resources like memory pages or interrupts.  However, the
+ * drivers must be careful to obey the conventions documented below in
+ * order to work properly with the generic Linux iorpc driver.
+ *
+ * @section iorpc_domains Service Domains
+ *
+ * All iorpc-based drivers must support a notion of service domains.
+ * A service domain is basically an application context - state
+ * indicating resources that are allocated to that particular app
+ * which it may access and (perhaps) other applications may not
+ * access.  Drivers can support any number of service domains they
+ * choose.  In some cases the design is limited by a number of service
+ * domains supported by the IO hardware; in other cases the service
+ * domains are a purely software concept and the driver chooses a
+ * maximum number of domains based on how much state memory it is
+ * willing to preallocate.
+ *
+ * For example, the mPIPE driver only supports as many service domains
+ * as are supported by the mPIPE hardware.  This limitation is
+ * required because the hardware implements its own MMIO protection
+ * scheme to allow large MMIO mappings while still protecting small
+ * register ranges within the page that should only be accessed by the
+ * hypervisor.
+ *
+ * In contrast, drivers with no hardware service domain limitations
+ * (for instance the TRIO shim) can implement an arbitrary number of
+ * service domains.  In these cases, each service domain is limited to
+ * a carefully restricted set of legal MMIO addresses if necessary to
+ * keep one application from corrupting another application's state.
+ *
+ * @section iorpc_conventions System Call Conventions
+ *
+ * The driver's open routine is responsible for allocating a new
+ * service domain for each hv_dev_open() call.  By convention, the
+ * return value from open() should be the service domain number on
+ * success, or GXIO_ERR_NO_SVC_DOM if no more service domains are
+ * available.
+ *
+ * The implementations of hv_dev_pread() and hv_dev_pwrite() are
+ * responsible for validating the devhdl value passed up by the
+ * client.  Since the device handle returned by hv_dev_open() should
+ * embed the positive service domain number, drivers should make sure
+ * that DRV_HDL2BITS(devhdl) is a legal service domain.  If the client
+ * passes an illegal service domain number, the routine should return
+ * GXIO_ERR_INVAL_SVC_DOM.  Once the service domain number has been
+ * validated, the driver can copy to/from the client buffer and call
+ * the dispatch_read() or dispatch_write() methods created by the RPC
+ * generator.
+ *
+ * The hv_dev_close() implementation should reset all service domain
+ * state and put the service domain back on a free list for
+ * reallocation by a future application.  In most cases, this will
+ * require executing a hardware reset or drain flow and denying any
+ * MMIO regions that were created for the service domain.
+ *
+ * @section iorpc_data Special Data Types
+ *
+ * The .idl file syntax allows the creation of syscalls with special
+ * parameters that require permission checks or translations as part
+ * of the system call path.  Because of limitations in the code
+ * generator, APIs are generally limited to just one of these special
+ * parameters per system call, and they are sometimes required to be
+ * the first or last parameter to the call.  Special parameters
+ * include:
+ *
+ * @subsection iorpc_mem_buffer MEM_BUFFER
+ *
+ * The MEM_BUFFER() datatype allows user space to "register" memory
+ * buffers with a device.  Registering memory accomplishes two tasks:
+ * Linux keeps track of all buffers that might be modified by a
+ * hardware device, and the hardware device drivers bind registered
+ * buffers to particular hardware resources like ingress NotifRings.
+ * The MEM_BUFFER() idl syntax can take extra flags like ALIGN_64KB,
+ * ALIGN_SELF_SIZE, and FLAGS indicating that memory buffers must have
+ * certain alignment or that the user should be able to pass a "memory
+ * flags" word specifying attributes like nt_hint or IO cache pinning.
+ * The parser will accept multiple MEM_BUFFER() flags.
+ *
+ * Implementations must obey the following conventions when
+ * registering memory buffers via the iorpc flow.  These rules are a
+ * result of the Linux driver implementation, which needs to keep
+ * track of how many times a particular page has been registered with
+ * the hardware so that it can release the page when all those
+ * registrations are cleared.
+ *
+ * - Memory registrations that refer to a resource which has already
+ * been bound must return GXIO_ERR_ALREADY_INIT.  Thus, it is an
+ * error to register memory twice without resetting (i.e. closing) the
+ * resource in between.  This convention keeps the Linux driver from
+ * having to track which particular devices a page is bound to.
+ *
+ * - At present, a memory registration is only cleared when the
+ * service domain is reset.  In this case, the Linux driver simply
+ * closes the HV device file handle and then decrements the reference
+ * counts of all pages that were previously registered with the
+ * device.
+ *
+ * - In the future, we may add a mechanism for unregistering memory.
+ * One possible implementation would require that the user specify
+ * which buffer is currently registered.  The HV would then verify
+ * that that page was actually the one currently mapped and return
+ * success or failure to Linux, which would then only decrement the
+ * page reference count if the addresses were mapped.  Another scheme
+ * might allow Linux to pass a token to the HV to be returned when the
+ * resource is unmapped.
+ *
+ * @subsection iorpc_interrupt INTERRUPT
+ *
+ * The INTERRUPT .idl datatype allows the client to bind hardware
+ * interrupts to a particular combination of IPI parameters - CPU, IPI
+ * PL, and event bit number.  This data is passed via a special
+ * datatype so that the Linux driver can validate the CPU and PL and
+ * the HV generic iorpc code can translate client CPUs to real CPUs.
+ *
+ * @subsection iorpc_pollfd_setup POLLFD_SETUP
+ *
+ * The POLLFD_SETUP .idl datatype allows the client to set up hardware
+ * interrupt bindings which are received by Linux but which are made
+ * visible to user processes as state transitions on a file descriptor;
+ * this allows user processes to use Linux primitives, such as poll(), to
+ * await particular hardware events.  This data is passed via a special
+ * datatype so that the Linux driver may recognize the pollable file
+ * descriptor and translate it to a set of interrupt target information,
+ * and so that the HV generic iorpc code can translate client CPUs to real
+ * CPUs.
+ *
+ * @subsection iorpc_pollfd POLLFD
+ *
+ * The POLLFD .idl datatype allows manipulation of hardware interrupt
+ * bindings set up via the POLLFD_SETUP datatype; common operations are
+ * resetting the state of the requested interrupt events, and unbinding any
+ * bound interrupts.  This data is passed via a special datatype so that
+ * the Linux driver may recognize the pollable file descriptor and
+ * translate it to an interrupt identifier previously supplied by the
+ * hypervisor as the result of an earlier pollfd_setup operation.
+ *
+ * @subsection iorpc_blob BLOB
+ *
+ * The BLOB .idl datatype allows the client to write an arbitrary
+ * length string of bytes up to the hypervisor driver.  This can be
+ * useful for passing up large, arbitrarily structured data like
+ * classifier programs.  The iorpc stack takes care of validating the
+ * buffer VA and CPA as the data passes up to the hypervisor.  Unlike
+ * MEM_BUFFER(), the buffer is not registered - Linux does not bump
+ * page refcounts and the HV driver should not reuse the buffer once
+ * the system call is complete.
+ *
+ * @section iorpc_translation Translating User Space Calls
+ *
+ * The ::iorpc_offset structure describes the formatting of the offset
+ * that is passed to pread() or pwrite() as part of the generated RPC code.
+ * When the user calls up to Linux, the rpc code fills in all the fields of
+ * the offset, including a 16-bit opcode, a 16-bit format indicator, and 32
+ * bits of user-specified "sub-offset".  The opcode indicates which syscall
+ * is being requested.  The format indicates whether there is a "prefix
+ * struct" at the start of the memory buffer passed to pwrite(), and if so
+ * what data is in that prefix struct.  These prefix structs are used to
+ * implement special datatypes like MEM_BUFFER() and INTERRUPT - we arrange
+ * to put data that needs translation and permission checks at the start of
+ * the buffer so that the Linux driver and generic portions of the HV iorpc
+ * code can easily access the data.  The 32 bits of user-specified
+ * "sub-offset" are most useful for pread() calls where the user needs to
+ * also pass in a few bits indicating which register to read, etc.
+ *
+ * The Linux iorpc driver watches for system calls that contain prefix
+ * structs so that it can translate parameters and bump reference
+ * counts as appropriate.  It does not (currently) have any knowledge
+ * of the per-device opcodes - it doesn't care what operation you're
+ * doing to mPIPE, so long as it can do all the generic book-keeping.
+ * The hv/iorpc.h header file defines all of the generic encoding bits
+ * needed to translate iorpc calls without knowing which particular
+ * opcode is being issued.
+ *
+ * @section iorpc_globals Global iorpc Calls
+ *
+ * Implementing mmap() required adding some special iorpc syscalls
+ * that are only called by the Linux driver, never by userspace.
+ * These include get_mmio_base() and check_mmio_offset().  These
+ * routines are described in globals.idl and must be included in every
+ * iorpc driver.  By providing these routines in every driver, Linux's
+ * mmap implementation can easily get the PTE bits it needs and
+ * validate the PA offset without needing to know the per-device
+ * opcodes to perform those tasks.
+ *
+ * @section iorpc_kernel Supporting gxio APIs in the Kernel
+ *
+ * The iorpc code generator also supports generation of kernel code
+ * implementing the gxio APIs.  This capability is currently used by
+ * the mPIPE network driver, and will likely be used by the TRIO root
+ * complex and endpoint drivers and perhaps an in-kernel crypto
+ * driver.  Each driver that wants to instantiate iorpc calls in the
+ * kernel needs to generate a kernel version of the generated rpc code
+ * and (probably) copy any related gxio source files into the kernel.
+ * The mPIPE driver provides a good example of this pattern.
+ */
+
+#ifdef __KERNEL__
+#include <linux/stddef.h>
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__HV__)
+#include <hv/hypervisor.h>
+#elif defined(__KERNEL__)
+#include "hypervisor.h"
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+
+/** Code indicating translation services required within the RPC path.
+ * These indicate whether there is a translatable struct at the start
+ * of the RPC buffer and what information that struct contains.
+ */
+enum iorpc_format_e
+{
+  /** No translation required, no prefix struct. */
+  IORPC_FORMAT_NONE,
+
+  /** No translation required, no prefix struct, no access to this
+   *  operation from user space. */
+  IORPC_FORMAT_NONE_NOUSER,
+
+  /** Prefix struct contains user VA and size. */
+  IORPC_FORMAT_USER_MEM,
+
+  /** Prefix struct contains CPA, size, and homing bits. */
+  IORPC_FORMAT_KERNEL_MEM,
+
+  /** Prefix struct contains interrupt. */
+  IORPC_FORMAT_KERNEL_INTERRUPT,
+
+  /** Prefix struct contains user-level interrupt. */
+  IORPC_FORMAT_USER_INTERRUPT,
+
+  /** Prefix struct contains pollfd_setup (interrupt information). */
+  IORPC_FORMAT_KERNEL_POLLFD_SETUP,
+
+  /** Prefix struct contains user-level pollfd_setup (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD_SETUP,
+
+  /** Prefix struct contains pollfd (interrupt cookie). */
+  IORPC_FORMAT_KERNEL_POLLFD,
+
+  /** Prefix struct contains user-level pollfd (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD,
+};
+
+
+/** Generate an opcode given format and code. */
+#define IORPC_OPCODE(FORMAT, CODE) (((FORMAT) << 16) | (CODE))
+
+/** The offset passed through the read() and write() system calls
+    combines an opcode with 32 bits of user-specified offset. */
+union iorpc_offset
+{
+#ifndef __BIG_ENDIAN__
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint16_t code;              /**< RPC code. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint32_t sub_offset;        /**< caller-specified offset. */
+  };
+
+  uint32_t opcode;              /**< Opcode combines code & format. */
+#else
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint32_t sub_offset;        /**< caller-specified offset. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint16_t code;              /**< RPC code. */
+  };
+
+  struct
+  {
+    uint32_t padding;
+    uint32_t opcode;              /**< Opcode combines code & format. */
+  };
+#endif
+};
+
+
+/** Homing and cache hinting bits that can be used by IO devices. */
+struct iorpc_mem_attr
+{
+  unsigned int lotar_x:4;       /**< lotar X bits (or Gx page_mask). */
+  unsigned int lotar_y:4;       /**< lotar Y bits (or Gx page_offset). */
+  unsigned int hfh:1;           /**< Uses hash-for-home. */
+  unsigned int nt_hint:1;       /**< Non-temporal hint. */
+  unsigned int io_pin:1;        /**< Only fill 'IO' cache ways. */
+};
+
+/** Set the nt_hint bit. */
+#define IORPC_MEM_BUFFER_FLAG_NT_HINT (1 << 0)
+
+/** Set the IO pin bit. */
+#define IORPC_MEM_BUFFER_FLAG_IO_PIN (1 << 1)
+
+
+/** A structure used to describe memory registration.  Different
+    protection levels describe memory differently, so this union
+    contains all the different possible descriptions.  As a request
+    moves up the call chain, each layer translates from one
+    description format to the next.  In particular, the Linux iorpc
+    driver translates user VAs into CPAs and homing parameters. */
+union iorpc_mem_buffer
+{
+  struct
+  {
+    uint64_t va;                /**< User virtual address. */
+    uint64_t size;              /**< Buffer size. */
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  user;                         /**< Buffer as described by user apps. */
+
+  struct
+  {
+    unsigned long long cpa;     /**< Client physical address. */
+#if defined(__KERNEL__) || defined(__HV__)
+    size_t size;                /**< Buffer size. */
+    HV_PTE pte;                 /**< PTE describing memory homing. */
+#else
+    uint64_t size;
+    uint64_t pte;
+#endif
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  kernel;                       /**< Buffer as described by kernel. */
+
+  struct
+  {
+    unsigned long long pa;      /**< Physical address. */
+    size_t size;                /**< Buffer size. */
+    struct iorpc_mem_attr attr;      /**< Homing and locality hint bits. */
+  }
+  hv;                           /**< Buffer parameters for HV driver. */
+};
+
+
+/** A structure used to describe interrupts.  The format differs slightly
+ *  for user and kernel interrupts.  As with union iorpc_mem_buffer,
+ *  translation between the formats is done at each level. */
+union iorpc_interrupt
+{
+  struct
+  {
+    int cpu;   /**< CPU. */
+    int event; /**< evt_num */
+  }
+  user;        /**< Interrupt as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< Interrupt as described by the kernel. */
+
+};
+
+
+/** A structure used to describe interrupts used with poll().  The format
+ *  differs significantly for requests from user to kernel, and kernel to
+ *  hypervisor.  As with union iorpc_mem_buffer, translation between the
+ *  formats is done at each level. */
+union iorpc_pollfd_setup
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd_setup as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< pollfd_setup as described by the kernel. */
+
+};
+
+
+/** A structure used to describe previously set up interrupts used with
+ *  poll().  The format differs significantly for requests from user to
+ *  kernel, and kernel to hypervisor.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+union iorpc_pollfd
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd as described by user applications. */
+
+  struct
+  {
+    int cookie; /**< hv cookie returned by the pollfd_setup operation. */
+  }
+  kernel;      /**< pollfd as described by the kernel. */
+
+};
+
+
+/** The various iorpc devices use error codes from -1100 to -1299.
+ *
+ * This range is distinct from netio (-700 to -799), the hypervisor
+ * (-800 to -899), tilepci (-900 to -999), ilib (-1000 to -1099),
+ * gxcr (-1300 to -1399) and gxpci (-1400 to -1499).
+ */
+enum gxio_err_e {
+
+  /** Largest iorpc error number. */
+  GXIO_ERR_MAX = -1101,
+
+
+  /********************************************************/
+  /*                   Generic Error Codes                */
+  /********************************************************/
+
+  /** Bad RPC opcode - possible version incompatibility. */
+  GXIO_ERR_OPCODE = -1101,
+
+  /** Invalid parameter. */
+  GXIO_ERR_INVAL = -1102,
+
+  /** Memory buffer did not meet alignment requirements. */
+  GXIO_ERR_ALIGNMENT = -1103,
+
+  /** Memory buffers must be coherent and cacheable. */
+  GXIO_ERR_COHERENCE = -1104,
+
+  /** Resource already initialized. */
+  GXIO_ERR_ALREADY_INIT = -1105,
+
+  /** No service domains available. */
+  GXIO_ERR_NO_SVC_DOM = -1106,
+
+  /** Illegal service domain number. */
+  GXIO_ERR_INVAL_SVC_DOM = -1107,
+
+  /** Illegal MMIO address. */
+  GXIO_ERR_MMIO_ADDRESS = -1108,
+
+  /** Illegal interrupt binding. */
+  GXIO_ERR_INTERRUPT = -1109,
+
+  /** Unreasonable client memory. */
+  GXIO_ERR_CLIENT_MEMORY = -1110,
+
+  /** No more IOTLB entries. */
+  GXIO_ERR_IOTLB_ENTRY = -1111,
+
+  /** Invalid memory size. */
+  GXIO_ERR_INVAL_MEMORY_SIZE = -1112,
+
+  /** Unsupported operation. */
+  GXIO_ERR_UNSUPPORTED_OP = -1113,
+
+  /** Insufficient DMA credits. */
+  GXIO_ERR_DMA_CREDITS = -1114,
+
+  /** Operation timed out. */
+  GXIO_ERR_TIMEOUT = -1115,
+
+  /** No such device or object. */
+  GXIO_ERR_NO_DEVICE = -1116,
+
+  /** Device or resource busy. */
+  GXIO_ERR_BUSY = -1117,
+
+  /** I/O error. */
+  GXIO_ERR_IO = -1118,
+
+  /** Permissions error. */
+  GXIO_ERR_PERM = -1119,
+
+
+
+  /********************************************************/
+  /*                 Test Device Error Codes              */
+  /********************************************************/
+
+  /** Illegal register number. */
+  GXIO_TEST_ERR_REG_NUMBER = -1120,
+
+  /** Illegal buffer slot. */
+  GXIO_TEST_ERR_BUFFER_SLOT = -1121,
+
+
+  /********************************************************/
+  /*                    MPIPE Error Codes                 */
+  /********************************************************/
+
+
+  /** Invalid buffer size. */
+  GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE = -1131,
+
+  /** Cannot allocate buffer stack. */
+  GXIO_MPIPE_ERR_NO_BUFFER_STACK = -1140,
+
+  /** Invalid buffer stack number. */
+  GXIO_MPIPE_ERR_BAD_BUFFER_STACK = -1141,
+
+  /** Cannot allocate NotifRing. */
+  GXIO_MPIPE_ERR_NO_NOTIF_RING = -1142,
+
+  /** Invalid NotifRing number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_RING = -1143,
+
+  /** Cannot allocate NotifGroup. */
+  GXIO_MPIPE_ERR_NO_NOTIF_GROUP = -1144,
+
+  /** Invalid NotifGroup number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_GROUP = -1145,
+
+  /** Cannot allocate bucket. */
+  GXIO_MPIPE_ERR_NO_BUCKET = -1146,
+
+  /** Invalid bucket number. */
+  GXIO_MPIPE_ERR_BAD_BUCKET = -1147,
+
+  /** Cannot allocate eDMA ring. */
+  GXIO_MPIPE_ERR_NO_EDMA_RING = -1148,
+
+  /** Invalid eDMA ring number. */
+  GXIO_MPIPE_ERR_BAD_EDMA_RING = -1149,
+
+  /** Invalid channel number. */
+  GXIO_MPIPE_ERR_BAD_CHANNEL = -1150,
+
+  /** Bad configuration. */
+  GXIO_MPIPE_ERR_BAD_CONFIG = -1151,
+
+  /** Empty iqueue. */
+  GXIO_MPIPE_ERR_IQUEUE_EMPTY = -1152,
+
+  /** Empty rules. */
+  GXIO_MPIPE_ERR_RULES_EMPTY = -1160,
+
+  /** Full rules. */
+  GXIO_MPIPE_ERR_RULES_FULL = -1161,
+
+  /** Corrupt rules. */
+  GXIO_MPIPE_ERR_RULES_CORRUPT = -1162,
+
+  /** Invalid rules. */
+  GXIO_MPIPE_ERR_RULES_INVALID = -1163,
+
+  /** Classifier is too big. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_BIG = -1170,
+
+  /** Classifier is too complex. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_COMPLEX = -1171,
+
+  /** Classifier has bad header. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_HEADER = -1172,
+
+  /** Classifier has bad contents. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_CONTENTS = -1173,
+
+  /** Classifier encountered invalid symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_SYMBOL = -1174,
+
+  /** Classifier encountered invalid bounds. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_BOUNDS = -1175,
+
+  /** Classifier encountered invalid relocation. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_RELOCATION = -1176,
+
+  /** Classifier encountered undefined symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_UNDEF_SYMBOL = -1177,
+
+
+  /********************************************************/
+  /*                    TRIO  Error Codes                 */
+  /********************************************************/
+
+  /** Cannot allocate memory map region. */
+  GXIO_TRIO_ERR_NO_MEMORY_MAP = -1180,
+
+  /** Invalid memory map region number. */
+  GXIO_TRIO_ERR_BAD_MEMORY_MAP = -1181,
+
+  /** Cannot allocate scatter queue. */
+  GXIO_TRIO_ERR_NO_SCATTER_QUEUE = -1182,
+
+  /** Invalid scatter queue number. */
+  GXIO_TRIO_ERR_BAD_SCATTER_QUEUE = -1183,
+
+  /** Cannot allocate push DMA ring. */
+  GXIO_TRIO_ERR_NO_PUSH_DMA_RING = -1184,
+
+  /** Invalid push DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PUSH_DMA_RING = -1185,
+
+  /** Cannot allocate pull DMA ring. */
+  GXIO_TRIO_ERR_NO_PULL_DMA_RING = -1186,
+
+  /** Invalid pull DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PULL_DMA_RING = -1187,
+
+  /** Cannot allocate PIO region. */
+  GXIO_TRIO_ERR_NO_PIO = -1188,
+
+  /** Invalid PIO region index. */
+  GXIO_TRIO_ERR_BAD_PIO = -1189,
+
+  /** Cannot allocate ASID. */
+  GXIO_TRIO_ERR_NO_ASID = -1190,
+
+  /** Invalid ASID. */
+  GXIO_TRIO_ERR_BAD_ASID = -1191,
+
+
+  /********************************************************/
+  /*                    MICA Error Codes                  */
+  /********************************************************/
+
+  /** No such accelerator type. */
+  GXIO_MICA_ERR_BAD_ACCEL_TYPE = -1220,
+
+  /** Cannot allocate context. */
+  GXIO_MICA_ERR_NO_CONTEXT = -1221,
+
+  /** PKA command queue is full, can't add another command. */
+  GXIO_MICA_ERR_PKA_CMD_QUEUE_FULL = -1222,
+
+  /** PKA result queue is empty, can't get a result from the queue. */
+  GXIO_MICA_ERR_PKA_RESULT_QUEUE_EMPTY = -1223,
+
+  /********************************************************/
+  /*                    GPIO Error Codes                  */
+  /********************************************************/
+
+  /** Pin not available.  Either the physical pin does not exist, or
+   *  it is reserved by the hypervisor for system usage. */
+  GXIO_GPIO_ERR_PIN_UNAVAILABLE = -1240,
+
+  /** Pin busy.  The pin exists, and is available for use via GXIO, but
+   *  it has been attached by some other process or driver. */
+  GXIO_GPIO_ERR_PIN_BUSY = -1241,
+
+  /** Cannot access unattached pin.  One or more of the pins being
+   *  manipulated by this call are not attached to the requesting
+   *  context. */
+  GXIO_GPIO_ERR_PIN_UNATTACHED = -1242,
+
+  /** Invalid I/O mode for pin.  The wiring of the pin in the system
+   *  is such that the I/O mode or electrical control parameters
+   *  requested could cause damage. */
+  GXIO_GPIO_ERR_PIN_INVALID_MODE = -1243,
+
+  /** Smallest iorpc error number. */
+  GXIO_ERR_MIN = -1299
+};
+
+
+#endif /* !_HV_IORPC_H_ */
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 4/6] arch/tile: common DMA code for the GXIO IORPC subsystem
  2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
  2012-04-04 20:39 ` [PATCH 1/6] arch/tile: introduce GXIO IORPC framework for tilegx Chris Metcalf
@ 2012-04-04 20:58 ` Chris Metcalf
  2012-04-06 17:41 ` [PATCH 2/6] arch/tile: fix set_pte() to properly handle kernel MMIO mappings Chris Metcalf
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-04 20:58 UTC (permalink / raw)
  To: linux-kernel

The dma_queue support is used by both the mPipe (networking)
and Trio (PCI) hardware shims on tilegx.  This common code is
selected when either of those drivers is built.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/gxio/Kconfig             |    9 ++
 arch/tile/gxio/Makefile            |    1 +
 arch/tile/gxio/dma_queue.c         |  236 ++++++++++++++++++++++++++++++++++++
 arch/tile/include/gxio/dma_queue.h |   59 +++++++++
 4 files changed, 305 insertions(+), 0 deletions(-)
 create mode 100644 arch/tile/gxio/dma_queue.c
 create mode 100644 arch/tile/include/gxio/dma_queue.h

diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
index c2bae7d..b5cd898 100644
--- a/arch/tile/gxio/Kconfig
+++ b/arch/tile/gxio/Kconfig
@@ -5,3 +5,12 @@ config TILE_GXIO
 	  This option supports direct access to TILE-Gx hardware from
 	  user space, via the gxio library, or from kernel space, via
 	  kernel IORPC support.
+
+config TILE_GXIO_DMA
+	bool "Tilera Gx I/O DMA support"
+	depends on TILE_GXIO
+	---help---
+	  This option supports direct access to the common I/O DMA facility
+	  within the TILE-Gx mPIPE and Trio hardware.  It is not required
+	  in order to use the gxio library to access mPIPE or Trio from
+	  user space.
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
index db1ee28..97ab468 100644
--- a/arch/tile/gxio/Makefile
+++ b/arch/tile/gxio/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
+obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
diff --git a/arch/tile/gxio/dma_queue.c b/arch/tile/gxio/dma_queue.c
new file mode 100644
index 0000000..12cb5df
--- /dev/null
+++ b/arch/tile/gxio/dma_queue.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/io.h>
+#include <linux/atomic.h>
+#include <gxio/dma_queue.h>
+#include <hv/iorpc.h>
+
+/* Wait for a memory read to complete. */
+#define wait_for_value(val) \
+	__asm__ __volatile__("move %0, %0" :: "r"(val))
+
+/* The credit counter lives in the high 32 bits. */
+#define DMA_QUEUE_CREDIT_SHIFT 32
+
+/* The index is in the low 16. */
+#define DMA_QUEUE_INDEX_MASK ((1 << 16) - 1)
+
+/*
+ * The hardware descriptor-ring type.
+ * This matches the types used by mpipe (MPIPE_EDMA_POST_REGION_VAL_t)
+ * and trio (TRIO_PUSH_DMA_REGION_VAL_t or TRIO_PULL_DMA_REGION_VAL_t).
+ * See those types for more documentation on the individual fields.
+ */
+typedef union {
+	struct {
+#ifndef __BIG_ENDIAN__
+		uint64_t ring_idx   : 16;
+		uint64_t count      : 16;
+		uint64_t gen        : 1;
+		uint64_t __reserved : 31;
+#else
+		uint64_t __reserved : 31;
+		uint64_t gen        : 1;
+		uint64_t count      : 16;
+		uint64_t ring_idx   : 16;
+#endif
+	};
+	uint64_t word;
+} __gxio_ring_t;
+
+
+void __gxio_dma_queue_init(__gxio_dma_queue_t *dma_queue,
+			   void *post_region_addr,
+			   unsigned int num_entries)
+{
+	/*
+	 * Limit 65536 entry rings to 65535 credits because we only have a
+	 * 16 bit completion counter.
+	 */
+	int64_t credits = (num_entries < 65536) ? num_entries : 65535;
+
+	memset(dma_queue, 0, sizeof(*dma_queue));
+
+	dma_queue->post_region_addr = post_region_addr;
+	dma_queue->hw_complete_count = 0;
+	dma_queue->credits_and_next_index = credits << DMA_QUEUE_CREDIT_SHIFT;
+}
+
+
+static void __gxio_dma_queue_update_credits(__gxio_dma_queue_t *dma_queue)
+{
+	__gxio_ring_t val;
+	uint64_t count;
+	uint64_t delta;
+	uint64_t new_count;
+
+	/*
+	 * Read the 64-bit completion count without touching the cache, so
+	 * we later avoid having to evict any sharers of this cache line
+	 * when we update it below.
+	 */
+	uint64_t orig_hw_complete_count =
+		cmpxchg(&dma_queue->hw_complete_count, -1, -1);
+
+	/* Make sure the load completes before we access the hardware. */
+	wait_for_value(orig_hw_complete_count);
+
+	/* Read the 16-bit count of how many packets it has completed. */
+	val.word = __gxio_mmio_read(dma_queue->post_region_addr);
+	count = val.count;
+
+	/*
+	 * Calculate the number of completions since we last updated the
+	 * 64-bit counter.  It's safe to ignore the high bits because the
+	 * maximum credit value is 65535.
+	 */
+	delta = (count - orig_hw_complete_count) & 0xffff;
+	if (delta == 0)
+		return;
+
+	/*
+	 * Try to write back the count, advanced by delta.  If we race with
+	 * another thread, this might fail, in which case we return
+	 * immediately on the assumption that some credits are (or at least
+	 * were) available.
+	 */
+	new_count = orig_hw_complete_count + delta;
+	if (cmpxchg(&dma_queue->hw_complete_count,
+		    orig_hw_complete_count, new_count) !=
+	    orig_hw_complete_count)
+		return;
+
+	/*
+	 * We succeeded in advancing the completion count; add back the
+	 * corresponding number of egress credits.
+	 */
+	__insn_fetchadd(&dma_queue->credits_and_next_index,
+			(delta << DMA_QUEUE_CREDIT_SHIFT));
+}
+
+
+/*
+ * A separate 'blocked' method for put() so that backtraces and
+ * profiles will clearly indicate that we're wasting time spinning on
+ * egress availability rather than actually posting commands.
+ */
+int64_t __gxio_dma_queue_wait_for_credits(__gxio_dma_queue_t *dma_queue,
+					  int64_t modifier)
+{
+	int backoff = 16;
+	int64_t old;
+
+	do {
+		int i;
+		/* Back off to avoid spamming memory networks. */
+		for (i = backoff; i > 0; i--)
+			__insn_mfspr(SPR_PASS);
+
+		/* Check credits again. */
+		__gxio_dma_queue_update_credits(dma_queue);
+		old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+
+		/* Calculate bounded exponential backoff for next iteration. */
+		if (backoff < 256)
+			backoff *= 2;
+	} while (old + modifier < 0);
+
+	return old;
+}
+
+
+int64_t __gxio_dma_queue_reserve_aux(__gxio_dma_queue_t *dma_queue,
+				     unsigned int num, int wait)
+{
+	uint64_t slot;
+	uint64_t complete;
+
+	/*
+	 * Try to reserve 'num' egress command slots.  Build one constant
+	 * that subtracts N credits and adds N to the index, and apply it
+	 * with fetchaddgez so it only takes effect if the credits count
+	 * doesn't go negative.  The shift is unsigned: no signed overflow.
+	 */
+	int64_t modifier = ((uint64_t)(-num) << DMA_QUEUE_CREDIT_SHIFT) | num;
+	int64_t old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+
+	if (unlikely(old + modifier < 0)) {
+		/*
+		 * We're out of credits.  Try once to get more by checking for
+		 * completed egress commands.  If that fails, wait or fail.
+		 */
+		__gxio_dma_queue_update_credits(dma_queue);
+		old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+		if (old + modifier < 0) {
+			if (wait)
+				old = __gxio_dma_queue_wait_for_credits(
+					dma_queue, modifier);
+			else
+				return GXIO_ERR_DMA_CREDITS;
+		}
+	}
+
+	/*
+	 * Compute the value for "slot" which will correspond to the
+	 * eventual value of "hw_complete_count".  We combine the low 24
+	 * bits of "old" with the high 40 bits of "hw_complete_count", and
+	 * if the result is LESS than "hw_complete_count", then we handle
+	 * wrapping by adding "1 << 24".  TODO: As a future optimization,
+	 * whenever "hw_complete_count" is modified, we could store the high
+	 * 41 bits of "hw_complete_count" in a separate field, but only when
+	 * they change, and use it instead of "hw_complete_count" above.
+	 * This will reduce the chance of "inval storms".  TODO: As a future
+	 * optimization, we could make a version of this function that simply
+	 * returns "old & 0xffffff", which is "good enough" for many uses.
+	 */
+	complete = ACCESS_ONCE(dma_queue->hw_complete_count);
+	slot = (complete & 0xffffffffff000000) | (old & 0xffffff);
+	if (slot < complete)
+		slot += 0x1000000;
+
+	/*
+	 * If any of our indexes mod 256 were equivalent to 0, go ahead and
+	 * collect some egress credits, and update "hw_complete_count".
+	 */
+	if (unlikely(((slot + num) & 0xff) < num)) {
+		__gxio_dma_queue_update_credits(dma_queue);
+
+		/* Make sure the index doesn't overflow into the credits. */
+#ifdef __BIG_ENDIAN__
+		*(((uint8_t *)&dma_queue->credits_and_next_index) + 4) = 0;
+#else
+		*(((uint8_t *)&dma_queue->credits_and_next_index) + 3) = 0;
+#endif
+	}
+
+	return slot;
+}
+
+
+int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue, int64_t slot,
+				 int update)
+{
+	if (update) {
+		if (ACCESS_ONCE(dma_queue->hw_complete_count) > slot)
+			return 1;
+
+		__gxio_dma_queue_update_credits(dma_queue);
+	}
+
+	return ACCESS_ONCE(dma_queue->hw_complete_count) > slot;
+}
diff --git a/arch/tile/include/gxio/dma_queue.h b/arch/tile/include/gxio/dma_queue.h
new file mode 100644
index 0000000..6e18b88
--- /dev/null
+++ b/arch/tile/include/gxio/dma_queue.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_DMA_QUEUE_H_
+#define _GXIO_DMA_QUEUE_H_
+
+/*
+ * DMA queue management APIs shared between TRIO and mPIPE.
+ */
+
+#include "common.h"
+
+/* State object that tracks a DMA queue's head and tail indices, as
+    well as the number of commands posted and completed.  The
+    structure is accessed via a thread-safe, lock-free algorithm. */
+typedef struct {
+  /* Address of a MPIPE_EDMA_POST_REGION_VAL_t,
+      TRIO_PUSH_DMA_REGION_VAL_t, or TRIO_PULL_DMA_REGION_VAL_t
+      register.  These registers have identical encodings and provide
+      information about how many commands have been processed. */
+	void *post_region_addr;
+
+  /* A lazily-updated count of how many commands the hardware has
+      completed. */
+	uint64_t hw_complete_count __attribute__ ((aligned(64)));
+
+  /* High 32 bits are a count of available egress command credits,
+      low 32 bits are the next command index. */
+	int64_t credits_and_next_index;
+} __gxio_dma_queue_t;
+
+/* Initialize a dma queue. */
+void __gxio_dma_queue_init(__gxio_dma_queue_t *dma_queue,
+			   void *post_region_addr, unsigned int num_entries);
+
+/* Try to reserve credits, potentially blocking. */
+int64_t __gxio_dma_queue_reserve_aux(__gxio_dma_queue_t *dma_queue,
+				     unsigned int num, int wait);
+
+/* Wait for credits to become available. */
+int64_t __gxio_dma_queue_wait_for_credits(__gxio_dma_queue_t *dma_queue,
+					  int64_t modifier);
+
+/* Check whether a particular slot has completed. */
+int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue, int64_t slot,
+				 int update);
+
+#endif /* !_GXIO_DMA_QUEUE_H_ */
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v2 3/6] arch/tile: common DMA code for the GXIO IORPC subsystem
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
  2012-04-04 20:39                 ` [PATCH v2 1/6] arch/tile: introduce GXIO IORPC framework " Chris Metcalf
@ 2012-04-04 20:58                 ` Chris Metcalf
  2012-04-06 17:52                 ` [PATCH v2 2/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
                                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-04 20:58 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel

The dma_queue support is used by both the mPipe (networking)
and Trio (PCI) hardware shims on tilegx.  This common code is
selected when either of those drivers is built.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/gxio/Kconfig             |    6 +
 arch/tile/gxio/Makefile            |    1 +
 arch/tile/gxio/dma_queue.c         |  230 ++++++++++++++++++++++++++++++++++++
 arch/tile/include/gxio/dma_queue.h |   59 +++++++++
 4 files changed, 296 insertions(+), 0 deletions(-)
 create mode 100644 arch/tile/gxio/dma_queue.c
 create mode 100644 arch/tile/include/gxio/dma_queue.h

diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
index 8eff47f..ecd076c 100644
--- a/arch/tile/gxio/Kconfig
+++ b/arch/tile/gxio/Kconfig
@@ -3,3 +3,9 @@
 config TILE_GXIO
 	bool
 	depends on TILEGX
+
+# Support direct access to the common I/O DMA facility within the
+# TILE-Gx mPIPE and Trio hardware from kernel space.
+config TILE_GXIO_DMA
+	bool
+	select TILE_GXIO
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
index db1ee28..97ab468 100644
--- a/arch/tile/gxio/Makefile
+++ b/arch/tile/gxio/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
+obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
diff --git a/arch/tile/gxio/dma_queue.c b/arch/tile/gxio/dma_queue.c
new file mode 100644
index 0000000..2921167
--- /dev/null
+++ b/arch/tile/gxio/dma_queue.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/io.h>
+#include <linux/atomic.h>
+#include <gxio/dma_queue.h>
+#include <hv/iorpc.h>
+
+/* Wait for a memory read to complete. */
+#define wait_for_value(val)                             \
+  __asm__ __volatile__("move %0, %0" :: "r"(val))
+
+/* The credit counter lives in the high 32 bits. */
+#define DMA_QUEUE_CREDIT_SHIFT 32
+
+/* The index is in the low 16. */
+#define DMA_QUEUE_INDEX_MASK ((1 << 16) - 1)
+
+/*
+ * The hardware descriptor-ring type.
+ * This matches the types used by mpipe (MPIPE_EDMA_POST_REGION_VAL_t)
+ * and trio (TRIO_PUSH_DMA_REGION_VAL_t or TRIO_PULL_DMA_REGION_VAL_t).
+ * See those types for more documentation on the individual fields.
+ */
+typedef union {
+	struct {
+#ifndef __BIG_ENDIAN__
+		uint64_t ring_idx:16;
+		uint64_t count:16;
+		uint64_t gen:1;
+		uint64_t __reserved:31;
+#else
+		uint64_t __reserved:31;
+		uint64_t gen:1;
+		uint64_t count:16;
+		uint64_t ring_idx:16;
+#endif
+	};
+	uint64_t word;
+} __gxio_ring_t;
+
+void __gxio_dma_queue_init(__gxio_dma_queue_t *dma_queue,
+			   void *post_region_addr, unsigned int num_entries)
+{
+	/*
+	 * Limit 65536 entry rings to 65535 credits because we only have a
+	 * 16 bit completion counter.
+	 */
+	int64_t credits = (num_entries < 65536) ? num_entries : 65535;
+
+	memset(dma_queue, 0, sizeof(*dma_queue));
+
+	dma_queue->post_region_addr = post_region_addr;
+	dma_queue->hw_complete_count = 0;
+	dma_queue->credits_and_next_index = credits << DMA_QUEUE_CREDIT_SHIFT;
+}
+
+static void __gxio_dma_queue_update_credits(__gxio_dma_queue_t *dma_queue)
+{
+	__gxio_ring_t val;
+	uint64_t count;
+	uint64_t delta;
+	uint64_t new_count;
+
+	/*
+	 * Read the 64-bit completion count without touching the cache, so
+	 * we later avoid having to evict any sharers of this cache line
+	 * when we update it below.
+	 */
+	uint64_t orig_hw_complete_count =
+		cmpxchg(&dma_queue->hw_complete_count, -1, -1);
+
+	/* Make sure the load completes before we access the hardware. */
+	wait_for_value(orig_hw_complete_count);
+
+	/* Read the 16-bit count of how many packets it has completed. */
+	val.word = __gxio_mmio_read(dma_queue->post_region_addr);
+	count = val.count;
+
+	/*
+	 * Calculate the number of completions since we last updated the
+	 * 64-bit counter.  It's safe to ignore the high bits because the
+	 * maximum credit value is 65535.
+	 */
+	delta = (count - orig_hw_complete_count) & 0xffff;
+	if (delta == 0)
+		return;
+
+	/*
+	 * Try to write back the count, advanced by delta.  If we race with
+	 * another thread, this might fail, in which case we return
+	 * immediately on the assumption that some credits are (or at least
+	 * were) available.
+	 */
+	new_count = orig_hw_complete_count + delta;
+	if (cmpxchg(&dma_queue->hw_complete_count,
+		    orig_hw_complete_count, new_count) !=
+	    orig_hw_complete_count)
+		return;
+
+	/*
+	 * We succeeded in advancing the completion count; add back the
+	 * corresponding number of egress credits.
+	 */
+	__insn_fetchadd(&dma_queue->credits_and_next_index,
+			(delta << DMA_QUEUE_CREDIT_SHIFT));
+}
+
+/*
+ * A separate 'blocked' method for put() so that backtraces and
+ * profiles will clearly indicate that we're wasting time spinning on
+ * egress availability rather than actually posting commands.
+ */
+int64_t __gxio_dma_queue_wait_for_credits(__gxio_dma_queue_t *dma_queue,
+					  int64_t modifier)
+{
+	int backoff = 16;
+	int64_t old;
+
+	do {
+		int i;
+		/* Back off to avoid spamming memory networks. */
+		for (i = backoff; i > 0; i--)
+			__insn_mfspr(SPR_PASS);
+
+		/* Check credits again. */
+		__gxio_dma_queue_update_credits(dma_queue);
+		old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+
+		/* Calculate bounded exponential backoff for next iteration. */
+		if (backoff < 256)
+			backoff *= 2;
+	} while (old + modifier < 0);
+
+	return old;
+}
+
+int64_t __gxio_dma_queue_reserve_aux(__gxio_dma_queue_t *dma_queue,
+				     unsigned int num, int wait)
+{
+	uint64_t slot;
+	uint64_t complete;
+
+	/*
+	 * Try to reserve 'num' egress command slots.  Build one constant
+	 * that subtracts N credits and adds N to the index, and apply it
+	 * with fetchaddgez so it only takes effect if the credits count
+	 * doesn't go negative.  The shift is unsigned: no signed overflow.
+	 */
+	int64_t modifier = ((uint64_t)(-num) << DMA_QUEUE_CREDIT_SHIFT) | num;
+	int64_t old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+
+	if (unlikely(old + modifier < 0)) {
+		/*
+		 * We're out of credits.  Try once to get more by checking for
+		 * completed egress commands.  If that fails, wait or fail.
+		 */
+		__gxio_dma_queue_update_credits(dma_queue);
+		old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+		if (old + modifier < 0) {
+			if (wait)
+				old = __gxio_dma_queue_wait_for_credits
+					(dma_queue, modifier);
+			else
+				return GXIO_ERR_DMA_CREDITS;
+		}
+	}
+
+	/*
+	 * Compute the value for "slot" which will correspond to the
+	 * eventual value of "hw_complete_count".  We combine the low 24
+	 * bits of "old" with the high 40 bits of "hw_complete_count", and
+	 * if the result is LESS than "hw_complete_count", then we handle
+	 * wrapping by adding "1 << 24".  TODO: As a future optimization,
+	 * whenever "hw_complete_count" is modified, we could store the high
+	 * 41 bits of "hw_complete_count" in a separate field, but only when
+	 * they change, and use it instead of "hw_complete_count" above.
+	 * This will reduce the chance of "inval storms".  TODO: As a future
+	 * optimization, we could make a version of this function that simply
+	 * returns "old & 0xffffff", which is "good enough" for many uses.
+	 */
+	complete = ACCESS_ONCE(dma_queue->hw_complete_count);
+	slot = (complete & 0xffffffffff000000) | (old & 0xffffff);
+	if (slot < complete)
+		slot += 0x1000000;
+
+	/*
+	 * If any of our indexes mod 256 were equivalent to 0, go ahead and
+	 * collect some egress credits, and update "hw_complete_count".
+	 */
+	if (unlikely(((slot + num) & 0xff) < num)) {
+		__gxio_dma_queue_update_credits(dma_queue);
+
+		/* Make sure the index doesn't overflow into the credits. */
+#ifdef __BIG_ENDIAN__
+		*(((uint8_t *)&dma_queue->credits_and_next_index) + 4) = 0;
+#else
+		*(((uint8_t *)&dma_queue->credits_and_next_index) + 3) = 0;
+#endif
+	}
+
+	return slot;
+}
+
+int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue, int64_t slot,
+				 int update)
+{
+	if (update) {
+		if (ACCESS_ONCE(dma_queue->hw_complete_count) > slot)
+			return 1;
+
+		__gxio_dma_queue_update_credits(dma_queue);
+	}
+
+	return ACCESS_ONCE(dma_queue->hw_complete_count) > slot;
+}
diff --git a/arch/tile/include/gxio/dma_queue.h b/arch/tile/include/gxio/dma_queue.h
new file mode 100644
index 0000000..3a61a6d
--- /dev/null
+++ b/arch/tile/include/gxio/dma_queue.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_DMA_QUEUE_H_
+#define _GXIO_DMA_QUEUE_H_
+
+/*
+ * DMA queue management APIs shared between TRIO and mPIPE.
+ */
+
+#include "common.h"
+
+/* State object that tracks a DMA queue's head and tail indices, as
+    well as the number of commands posted and completed.  The
+    structure is accessed via a thread-safe, lock-free algorithm. */
+typedef struct {
+	/* Address of a MPIPE_EDMA_POST_REGION_VAL_t,
+	   TRIO_PUSH_DMA_REGION_VAL_t, or TRIO_PULL_DMA_REGION_VAL_t
+	   register.  These registers have identical encodings and provide
+	   information about how many commands have been processed. */
+	void *post_region_addr;
+
+	/* A lazily-updated count of how many commands the hardware has
+	   completed. */
+	uint64_t hw_complete_count __attribute__ ((aligned(64)));
+
+	/* High 32 bits are a count of available egress command credits,
+	   low 32 bits are the next command index. */
+	int64_t credits_and_next_index;
+} __gxio_dma_queue_t;
+
+/* Initialize a dma queue. */
+void __gxio_dma_queue_init(__gxio_dma_queue_t *dma_queue,
+			   void *post_region_addr, unsigned int num_entries);
+
+/* Try to reserve credits, potentially blocking. */
+int64_t __gxio_dma_queue_reserve_aux(__gxio_dma_queue_t *dma_queue,
+				     unsigned int num, int wait);
+
+/* Wait for credits to become available. */
+int64_t __gxio_dma_queue_wait_for_credits(__gxio_dma_queue_t *dma_queue,
+					  int64_t modifier);
+
+/* Check whether a particular slot has completed. */
+int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue, int64_t slot,
+				 int update);
+
+#endif /* !_GXIO_DMA_QUEUE_H_ */
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 2/6] arch/tile: fix set_pte() to properly handle kernel MMIO mappings
  2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
  2012-04-04 20:39 ` [PATCH 1/6] arch/tile: introduce GXIO IORPC framework for tilegx Chris Metcalf
  2012-04-04 20:58 ` [PATCH 4/6] arch/tile: common DMA code for the GXIO IORPC subsystem Chris Metcalf
@ 2012-04-06 17:41 ` Chris Metcalf
  2012-04-06 17:52 ` [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 17:41 UTC (permalink / raw)
  To: linux-kernel

We can't look at the 'struct page' for such mappings.  And while we're
at it, also avoid trying to look up the 'struct page' for memory
mappings that are not being managed by Linux, or non-present PTEs.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/mm/pgtable.c |   16 ++++++++++++----
 1 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 211558e..591621f 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -453,10 +453,18 @@ void __set_pte(pte_t *ptep, pte_t pte)
 
 void set_pte(pte_t *ptep, pte_t pte)
 {
-	struct page *page = pfn_to_page(pte_pfn(pte));
-
-	/* Update the home of a PTE if necessary */
-	pte = pte_set_home(pte, page_home(page));
+	if (pte_present(pte) &&
+	    (!CHIP_HAS_MMIO() || hv_pte_get_mode(pte) != HV_PTE_MODE_MMIO)) {
+		/* The PTE actually references physical memory. */
+		unsigned long pfn = pte_pfn(pte);
+		if (pfn_valid(pfn)) {
+			/* Update the home of the PTE from the struct page. */
+			pte = pte_set_home(pte, page_home(pfn_to_page(pfn)));
+		} else if (hv_pte_get_mode(pte) == 0) {
+			/* remap_pfn_range(), etc, must supply PTE mode. */
+			panic("set_pte(): out-of-range PFN and mode 0\n");
+		}
+	}
 
 	__set_pte(ptep, pte);
 }
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc.
  2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
                   ` (2 preceding siblings ...)
  2012-04-06 17:41 ` [PATCH 2/6] arch/tile: fix set_pte() to properly handle kernel MMIO mappings Chris Metcalf
@ 2012-04-06 17:52 ` Chris Metcalf
  2012-04-09 13:24   ` Arnd Bergmann
  2012-04-06 20:38 ` [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
  2012-04-06 20:42 ` [PATCH 6/6] tilegx network driver: initial support Chris Metcalf
  5 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 17:52 UTC (permalink / raw)
  To: linux-kernel

Add support for MMIO read/write on tilegx to support GXIO IORPC access.
Similar to the asm-generic version, but we include memory fences on
the writes to be conservative.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/include/asm/io.h |  110 ++++++++++++++++++++++++++++++++-----------
 1 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index d2152de..d923841 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -62,6 +62,58 @@ extern void iounmap(volatile void __iomem *addr);
 #define mm_ptov(addr)		((void *)phys_to_virt(addr))
 #define mm_vtop(addr)		((unsigned long)virt_to_phys(addr))
 
+#if CHIP_HAS_MMIO()
+
+static inline u8 __raw_readb(const volatile void __iomem *addr)
+{
+	return *(const volatile u8 __force *)addr;
+}
+
+static inline u16 __raw_readw(const volatile void __iomem *addr)
+{
+	return le16_to_cpu(*(const volatile u16 __force *)addr);
+}
+
+static inline u32 __raw_readl(const volatile void __iomem *addr)
+{
+	return le32_to_cpu(*(const volatile u32 __force *)addr);
+}
+
+static inline u64 __raw_readq(const volatile void __iomem *addr)
+{
+	return le64_to_cpu(*(const volatile u64 __force *)addr);
+}
+
+static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
+{
+	__insn_mf();
+	*(volatile u8 __force *)addr = val;
+	__insn_mf();
+}
+
+static inline void __raw_writew(u16 val, volatile void __iomem *addr)
+{
+	__insn_mf();
+	*(volatile u16 __force *)addr = cpu_to_le16(val);
+	__insn_mf();
+}
+
+static inline void __raw_writel(u32 val, volatile void __iomem *addr)
+{
+	__insn_mf();
+	*(volatile u32 __force *)addr = cpu_to_le32(val);
+	__insn_mf();
+}
+
+static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
+{
+	__insn_mf();
+	*(volatile u64 __force *)addr = cpu_to_le64(val);
+	__insn_mf();
+}
+
+#else /* CHIP_HAS_MMIO() */
+
 #ifdef CONFIG_PCI
 
 extern u8 _tile_readb(unsigned long addr);
@@ -73,10 +125,19 @@ extern void _tile_writew(u16 val, unsigned long addr);
 extern void _tile_writel(u32 val, unsigned long addr);
 extern void _tile_writeq(u64 val, unsigned long addr);
 
-#else
+#define __raw_readb(addr) _tile_readb((unsigned long)addr)
+#define __raw_readw(addr) _tile_readw((unsigned long)addr)
+#define __raw_readl(addr) _tile_readl((unsigned long)addr)
+#define __raw_readq(addr) _tile_readq((unsigned long)addr)
+#define __raw_writeb(val, addr) _tile_writeb(val, (unsigned long)addr)
+#define __raw_writew(val, addr) _tile_writew(val, (unsigned long)addr)
+#define __raw_writel(val, addr) _tile_writel(val, (unsigned long)addr)
+#define __raw_writeq(val, addr) _tile_writeq(val, (unsigned long)addr)
+
+#else /* CONFIG_PCI */
 
 /*
- * The Tile architecture does not support IOMEM unless PCI is enabled.
+ * The tilepro architecture does not support IOMEM unless PCI is enabled.
  * Unfortunately we can't yet simply not declare these methods,
  * since some generic code that compiles into the kernel, but
  * we never run, uses them unconditionally.
@@ -88,65 +149,58 @@ static inline int iomem_panic(void)
 	return 0;
 }
 
-static inline u8 _tile_readb(unsigned long addr)
+static inline u8 readb(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u16 _tile_readw(unsigned long addr)
+static inline u16 readw(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u32 _tile_readl(unsigned long addr)
+static inline u32 readl(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u64 _tile_readq(unsigned long addr)
+static inline u64 readq(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline void _tile_writeb(u8  val, unsigned long addr)
+static inline void writeb(u8  val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writew(u16 val, unsigned long addr)
+static inline void writew(u16 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writel(u32 val, unsigned long addr)
+static inline void writel(u32 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writeq(u64 val, unsigned long addr)
+static inline void writeq(u64 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-#endif
+#endif /* CONFIG_PCI */
+
+#endif /* CHIP_HAS_MMIO() */
 
-#define readb(addr) _tile_readb((unsigned long)addr)
-#define readw(addr) _tile_readw((unsigned long)addr)
-#define readl(addr) _tile_readl((unsigned long)addr)
-#define readq(addr) _tile_readq((unsigned long)addr)
-#define writeb(val, addr) _tile_writeb(val, (unsigned long)addr)
-#define writew(val, addr) _tile_writew(val, (unsigned long)addr)
-#define writel(val, addr) _tile_writel(val, (unsigned long)addr)
-#define writeq(val, addr) _tile_writeq(val, (unsigned long)addr)
-
-#define __raw_readb readb
-#define __raw_readw readw
-#define __raw_readl readl
-#define __raw_readq readq
-#define __raw_writeb writeb
-#define __raw_writew writew
-#define __raw_writel writel
-#define __raw_writeq writeq
+#define readb __raw_readb
+#define readw __raw_readw
+#define readl __raw_readl
+#define readq __raw_readq
+#define writeb __raw_writeb
+#define writew __raw_writew
+#define writel __raw_writel
+#define writeq __raw_writeq
 
 #define readb_relaxed readb
 #define readw_relaxed readw
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v2 2/6] arch/tile: support MMIO-based readb/writeb etc.
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
  2012-04-04 20:39                 ` [PATCH v2 1/6] arch/tile: introduce GXIO IORPC framework " Chris Metcalf
  2012-04-04 20:58                 ` [PATCH v2 3/6] arch/tile: common DMA code for the GXIO IORPC subsystem Chris Metcalf
@ 2012-04-06 17:52                 ` Chris Metcalf
  2012-04-06 20:38                 ` [PATCH v2 4/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
                                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 17:52 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel

Add support for MMIO read/write on tilegx to support GXIO IORPC access.
Similar to the asm-generic version, but we include memory fences on
the writes to be conservative.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/include/asm/io.h |  144 +++++++++++++++++++++++++++++++++++---------
 1 files changed, 116 insertions(+), 28 deletions(-)

diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index d2152de..2a9b293 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -62,6 +62,92 @@ extern void iounmap(volatile void __iomem *addr);
 #define mm_ptov(addr)		((void *)phys_to_virt(addr))
 #define mm_vtop(addr)		((unsigned long)virt_to_phys(addr))
 
+#if CHIP_HAS_MMIO()
+
+/*
+ * We use inline assembly to guarantee that the compiler does not
+ * split an access into multiple byte-sized accesses as it might
+ * sometimes do if a register data structure is marked "packed".
+ * Obviously on tile we can't tolerate such an access being
+ * actually unaligned, but we want to avoid the case where the
+ * compiler conservatively would generate multiple accesses even
+ * for an aligned read or write.
+ */
+
+static inline u8 __raw_readb(const volatile void __iomem *addr)
+{
+	return *(const volatile u8 __force *)addr;
+}
+
+static inline u16 __raw_readw(const volatile void __iomem *addr)
+{
+	u16 ret;
+	asm volatile("ld2u %0, %1" : "=r" (ret) : "r" (addr));
+	barrier();
+	return le16_to_cpu(ret);
+}
+
+static inline u32 __raw_readl(const volatile void __iomem *addr)
+{
+	u32 ret;
+	/* Sign-extend to conform to u32 ABI sign-extension convention. */
+	asm volatile("ld4s %0, %1" : "=r" (ret) : "r" (addr));
+	barrier();
+	return le32_to_cpu(ret);
+}
+
+static inline u64 __raw_readq(const volatile void __iomem *addr)
+{
+	u64 ret;
+	asm volatile("ld %0, %1" : "=r" (ret) : "r" (addr));
+	barrier();
+	return le64_to_cpu(ret);
+}
+
+static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
+{
+	*(volatile u8 __force *)addr = val;
+}
+
+static inline void __raw_writew(u16 val, volatile void __iomem *addr)
+{
+	asm volatile("st2 %0, %1" :: "r" (addr), "r" (cpu_to_le16(val)));
+}
+
+static inline void __raw_writel(u32 val, volatile void __iomem *addr)
+{
+	asm volatile("st4 %0, %1" :: "r" (addr), "r" (cpu_to_le32(val)));
+}
+
+static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
+{
+	asm volatile("st %0, %1" :: "r" (addr), "r" (cpu_to_le64(val)));
+}
+
+/*
+ * The on-chip I/O hardware on tilegx is configured with VA=PA for the
+ * kernel's PA range.  The low-level APIs and field names use "va" and
+ * "void *" nomenclature, to be consistent with the general notion
+ * that the addresses in question are virtualizable, but in the kernel
+ * context we are actually manipulating PA values.  (In other contexts,
+ * e.g. access from user space, we do in fact use real virtual addresses
+ * in the va fields.)  To allow readers of the code to understand what's
+ * happening, we direct their attention to this comment by using the
+ * following two functions that just duplicate __va() and __pa().
+ */
+typedef unsigned long tile_io_addr_t;
+static inline tile_io_addr_t va_to_tile_io_addr(void *va)
+{
+	BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(tile_io_addr_t));
+	return __pa(va);
+}
+static inline void *tile_io_addr_to_va(tile_io_addr_t tile_io_addr)
+{
+	return __va(tile_io_addr);
+}
+
+#else /* CHIP_HAS_MMIO() */
+
 #ifdef CONFIG_PCI
 
 extern u8 _tile_readb(unsigned long addr);
@@ -73,10 +159,19 @@ extern void _tile_writew(u16 val, unsigned long addr);
 extern void _tile_writel(u32 val, unsigned long addr);
 extern void _tile_writeq(u64 val, unsigned long addr);
 
-#else
+#define __raw_readb(addr) _tile_readb((unsigned long)addr)
+#define __raw_readw(addr) _tile_readw((unsigned long)addr)
+#define __raw_readl(addr) _tile_readl((unsigned long)addr)
+#define __raw_readq(addr) _tile_readq((unsigned long)addr)
+#define __raw_writeb(val, addr) _tile_writeb(val, (unsigned long)addr)
+#define __raw_writew(val, addr) _tile_writew(val, (unsigned long)addr)
+#define __raw_writel(val, addr) _tile_writel(val, (unsigned long)addr)
+#define __raw_writeq(val, addr) _tile_writeq(val, (unsigned long)addr)
+
+#else /* CONFIG_PCI */
 
 /*
- * The Tile architecture does not support IOMEM unless PCI is enabled.
+ * The tilepro architecture does not support IOMEM unless PCI is enabled.
  * Unfortunately we can't yet simply not declare these methods,
  * since some generic code that compiles into the kernel, but
  * we never run, uses them unconditionally.
@@ -88,65 +183,58 @@ static inline int iomem_panic(void)
 	return 0;
 }
 
-static inline u8 _tile_readb(unsigned long addr)
+static inline u8 readb(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u16 _tile_readw(unsigned long addr)
+static inline u16 readw(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u32 _tile_readl(unsigned long addr)
+static inline u32 readl(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u64 _tile_readq(unsigned long addr)
+static inline u64 readq(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline void _tile_writeb(u8  val, unsigned long addr)
+static inline void writeb(u8  val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writew(u16 val, unsigned long addr)
+static inline void writew(u16 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writel(u32 val, unsigned long addr)
+static inline void writel(u32 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writeq(u64 val, unsigned long addr)
+static inline void writeq(u64 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-#endif
+#endif /* CONFIG_PCI */
+
+#endif /* CHIP_HAS_MMIO() */
 
-#define readb(addr) _tile_readb((unsigned long)addr)
-#define readw(addr) _tile_readw((unsigned long)addr)
-#define readl(addr) _tile_readl((unsigned long)addr)
-#define readq(addr) _tile_readq((unsigned long)addr)
-#define writeb(val, addr) _tile_writeb(val, (unsigned long)addr)
-#define writew(val, addr) _tile_writew(val, (unsigned long)addr)
-#define writel(val, addr) _tile_writel(val, (unsigned long)addr)
-#define writeq(val, addr) _tile_writeq(val, (unsigned long)addr)
-
-#define __raw_readb readb
-#define __raw_readw readw
-#define __raw_readl readl
-#define __raw_readq readq
-#define __raw_writeb writeb
-#define __raw_writew writew
-#define __raw_writel writel
-#define __raw_writeq writeq
+#define readb __raw_readb
+#define readw __raw_readw
+#define readl __raw_readl
+#define readq __raw_readq
+#define writeb __raw_writeb
+#define writew __raw_writew
+#define writel __raw_writel
+#define writeq __raw_writeq
 
 #define readb_relaxed readb
 #define readw_relaxed readw
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim
  2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
                   ` (3 preceding siblings ...)
  2012-04-06 17:52 ` [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
@ 2012-04-06 20:38 ` Chris Metcalf
  2012-04-09 13:34   ` Arnd Bergmann
  2012-04-06 20:42 ` [PATCH 6/6] tilegx network driver: initial support Chris Metcalf
  5 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 20:38 UTC (permalink / raw)
  To: linux-kernel

The TILE-Gx chip includes a packet-processing network engine called
mPIPE ("Multicore Programmable Intelligent Packet Engine").  This
change adds support for using the mPIPE engine from within the
kernel.  The engine has more functionality than is exposed here,
but to keep the kernel code and binary simpler, this is a subset
of the full API designed to enable standard Linux networking only.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/gxio/Kconfig                    |    9 +
 arch/tile/gxio/Makefile                   |    1 +
 arch/tile/gxio/iorpc_mpipe.c              |  571 +++++++++
 arch/tile/gxio/iorpc_mpipe_info.c         |   95 ++
 arch/tile/gxio/mpipe.c                    |  631 +++++++++
 arch/tile/include/arch/mpipe.h            |  321 +++++
 arch/tile/include/arch/mpipe_constants.h  |   43 +
 arch/tile/include/arch/mpipe_def.h        |   39 +
 arch/tile/include/arch/mpipe_shm.h        |  421 ++++++
 arch/tile/include/arch/mpipe_shm_def.h    |   23 +
 arch/tile/include/gxio/iorpc_mpipe.h      |  124 ++
 arch/tile/include/gxio/iorpc_mpipe_info.h |   46 +
 arch/tile/include/gxio/mpipe.h            | 1986 +++++++++++++++++++++++++++++
 arch/tile/include/hv/drv_mpipe_intf.h     |  602 +++++++++
 14 files changed, 4912 insertions(+), 0 deletions(-)
 create mode 100644 arch/tile/gxio/iorpc_mpipe.c
 create mode 100644 arch/tile/gxio/iorpc_mpipe_info.c
 create mode 100644 arch/tile/gxio/mpipe.c
 create mode 100644 arch/tile/include/arch/mpipe.h
 create mode 100644 arch/tile/include/arch/mpipe_constants.h
 create mode 100644 arch/tile/include/arch/mpipe_def.h
 create mode 100644 arch/tile/include/arch/mpipe_shm.h
 create mode 100644 arch/tile/include/arch/mpipe_shm_def.h
 create mode 100644 arch/tile/include/gxio/iorpc_mpipe.h
 create mode 100644 arch/tile/include/gxio/iorpc_mpipe_info.h
 create mode 100644 arch/tile/include/gxio/mpipe.h
 create mode 100644 arch/tile/include/hv/drv_mpipe_intf.h

diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
index b5cd898..ec20e8c 100644
--- a/arch/tile/gxio/Kconfig
+++ b/arch/tile/gxio/Kconfig
@@ -14,3 +14,12 @@ config TILE_GXIO_DMA
 	  within the TILE-Gx mPIPE and Trio hardware.  It is not required
 	  in order to use the gxio library to access mPIPE or Trio from
 	  user space.
+
+config TILE_GXIO_MPIPE
+	bool "Tilera Gx mPIPE I/O support"
+	select TILE_GXIO
+	select TILE_GXIO_DMA
+	---help---
+	  This option supports direct access to the TILE-Gx mPIPE hardware
+	  from kernel space.  It is not required in order to use the gxio
+	  library to access mPIPE from user space.
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
index 97ab468..130eec4 100644
--- a/arch/tile/gxio/Makefile
+++ b/arch/tile/gxio/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
 obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
+obj-$(CONFIG_TILE_GXIO_MPIPE) += mpipe.o iorpc_mpipe.o iorpc_mpipe_info.o
diff --git a/arch/tile/gxio/iorpc_mpipe.c b/arch/tile/gxio/iorpc_mpipe.c
new file mode 100644
index 0000000..da43e62
--- /dev/null
+++ b/arch/tile/gxio/iorpc_mpipe.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_mpipe.h"
+
+typedef struct {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+} alloc_buffer_stacks_param_t;
+
+int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t * context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	alloc_buffer_stacks_param_t temp;
+	alloc_buffer_stacks_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_buffer_stacks);
+
+typedef struct {
+	iorpc_mem_buffer_t buffer;
+	unsigned int stack;
+	unsigned int buffer_size_enum;
+} init_buffer_stack_aux_param_t;
+
+int gxio_mpipe_init_buffer_stack_aux(gxio_mpipe_context_t * context,
+				     void *mem_va, size_t mem_size,
+				     unsigned int mem_flags, unsigned int stack,
+				     unsigned int buffer_size_enum)
+{
+	uint64_t __offset;
+	int __result;
+	unsigned long long __cpa;
+	pte_t __pte;
+	init_buffer_stack_aux_param_t temp;
+	init_buffer_stack_aux_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	__result = va_to_cpa_and_pte(mem_va, &__cpa, &__pte);
+	if (__result != 0)
+		return __result;
+	params->buffer.kernel.cpa = __cpa;
+	params->buffer.kernel.size = mem_size;
+	params->buffer.kernel.pte = __pte;
+	params->buffer.kernel.flags = mem_flags;
+	params->stack = stack;
+	params->buffer_size_enum = buffer_size_enum;
+
+	__offset = GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_buffer_stack_aux);
+
+
+typedef struct {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+} alloc_notif_rings_param_t;
+
+int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t * context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	alloc_notif_rings_param_t temp;
+	alloc_notif_rings_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_notif_rings);
+
+typedef struct {
+	iorpc_mem_buffer_t buffer;
+	unsigned int ring;
+} init_notif_ring_aux_param_t;
+
+int gxio_mpipe_init_notif_ring_aux(gxio_mpipe_context_t * context, void *mem_va,
+				   size_t mem_size, unsigned int mem_flags,
+				   unsigned int ring)
+{
+	uint64_t __offset;
+	int __result;
+	unsigned long long __cpa;
+	pte_t __pte;
+	init_notif_ring_aux_param_t temp;
+	init_notif_ring_aux_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	__result = va_to_cpa_and_pte(mem_va, &__cpa, &__pte);
+	if (__result != 0)
+		return __result;
+	params->buffer.kernel.cpa = __cpa;
+	params->buffer.kernel.size = mem_size;
+	params->buffer.kernel.pte = __pte;
+	params->buffer.kernel.flags = mem_flags;
+	params->ring = ring;
+
+	__offset = GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_notif_ring_aux);
+
+typedef struct {
+	iorpc_interrupt_t interrupt;
+	unsigned int ring;
+} request_notif_ring_interrupt_param_t;
+
+int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t * context,
+					    int inter_x, int inter_y,
+					    int inter_ipi, int inter_event,
+					    unsigned int ring)
+{
+	uint64_t __offset;
+	int __result;
+	request_notif_ring_interrupt_param_t temp;
+	request_notif_ring_interrupt_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+	params->ring = ring;
+
+	__offset = GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_request_notif_ring_interrupt);
+
+typedef struct {
+	unsigned int ring;
+} enable_notif_ring_interrupt_param_t;
+
+int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t * context,
+					   unsigned int ring)
+{
+	uint64_t __offset;
+	int __result;
+	enable_notif_ring_interrupt_param_t temp;
+	enable_notif_ring_interrupt_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->ring = ring;
+
+	__offset = GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_enable_notif_ring_interrupt);
+
+typedef struct {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+} alloc_notif_groups_param_t;
+
+int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t * context,
+				  unsigned int count, unsigned int first,
+				  unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	alloc_notif_groups_param_t temp;
+	alloc_notif_groups_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_notif_groups);
+
+typedef struct {
+	unsigned int group;
+	gxio_mpipe_notif_group_bits_t bits;
+} init_notif_group_param_t;
+
+int gxio_mpipe_init_notif_group(gxio_mpipe_context_t * context,
+				unsigned int group,
+				gxio_mpipe_notif_group_bits_t bits)
+{
+	uint64_t __offset;
+	int __result;
+	init_notif_group_param_t temp;
+	init_notif_group_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->group = group;
+	params->bits = bits;
+
+	__offset = GXIO_MPIPE_OP_INIT_NOTIF_GROUP;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_notif_group);
+
+typedef struct {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+} alloc_buckets_param_t;
+
+int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t * context, unsigned int count,
+			     unsigned int first, unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	alloc_buckets_param_t temp;
+	alloc_buckets_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_ALLOC_BUCKETS;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_buckets);
+
+typedef struct {
+	unsigned int bucket;
+	MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info;
+} init_bucket_param_t;
+
+int gxio_mpipe_init_bucket(gxio_mpipe_context_t * context, unsigned int bucket,
+			   MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info)
+{
+	uint64_t __offset;
+	int __result;
+	init_bucket_param_t temp;
+	init_bucket_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->bucket = bucket;
+	params->bucket_info = bucket_info;
+
+	__offset = GXIO_MPIPE_OP_INIT_BUCKET;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_bucket);
+
+typedef struct {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+} alloc_edma_rings_param_t;
+
+int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t * context,
+				unsigned int count, unsigned int first,
+				unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	alloc_edma_rings_param_t temp;
+	alloc_edma_rings_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_ALLOC_EDMA_RINGS;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_edma_rings);
+
+typedef struct {
+	iorpc_mem_buffer_t buffer;
+	unsigned int ring;
+	unsigned int channel;
+} init_edma_ring_aux_param_t;
+
+int gxio_mpipe_init_edma_ring_aux(gxio_mpipe_context_t * context, void *mem_va,
+				  size_t mem_size, unsigned int mem_flags,
+				  unsigned int ring, unsigned int channel)
+{
+	uint64_t __offset;
+	int __result;
+	unsigned long long __cpa;
+	pte_t __pte;
+	init_edma_ring_aux_param_t temp;
+	init_edma_ring_aux_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	__result = va_to_cpa_and_pte(mem_va, &__cpa, &__pte);
+	if (__result != 0)
+		return __result;
+	params->buffer.kernel.cpa = __cpa;
+	params->buffer.kernel.size = mem_size;
+	params->buffer.kernel.pte = __pte;
+	params->buffer.kernel.flags = mem_flags;
+	params->ring = ring;
+	params->channel = channel;
+
+	__offset = GXIO_MPIPE_OP_INIT_EDMA_RING_AUX;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_edma_ring_aux);
+
+
+int gxio_mpipe_commit_rules(gxio_mpipe_context_t * context, const void *blob,
+			    size_t blob_size)
+{
+	uint64_t __offset;
+	int __result;
+	size_t __size = blob_size;
+	const void *params = blob;
+
+	__offset = GXIO_MPIPE_OP_COMMIT_RULES;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_commit_rules);
+
+typedef struct {
+	unsigned int iotlb;
+	HV_PTE pte;
+	unsigned int flags;
+} register_client_memory_param_t;
+
+int gxio_mpipe_register_client_memory(gxio_mpipe_context_t * context,
+				      unsigned int iotlb, HV_PTE pte,
+				      unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	register_client_memory_param_t temp;
+	register_client_memory_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->iotlb = iotlb;
+	params->pte = pte;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_register_client_memory);
+
+typedef struct {
+	_gxio_mpipe_link_name_t name;
+	unsigned int flags;
+} link_open_aux_param_t;
+
+int gxio_mpipe_link_open_aux(gxio_mpipe_context_t * context,
+			     _gxio_mpipe_link_name_t name, unsigned int flags)
+{
+	uint64_t __offset;
+	int __result;
+	link_open_aux_param_t temp;
+	link_open_aux_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->name = name;
+	params->flags = flags;
+
+	__offset = GXIO_MPIPE_OP_LINK_OPEN_AUX;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_open_aux);
+
+typedef struct {
+	int mac;
+} link_close_aux_param_t;
+
+int gxio_mpipe_link_close_aux(gxio_mpipe_context_t * context, int mac)
+{
+	uint64_t __offset;
+	int __result;
+	link_close_aux_param_t temp;
+	link_close_aux_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->mac = mac;
+
+	__offset = GXIO_MPIPE_OP_LINK_CLOSE_AUX;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_close_aux);
+
+
+typedef struct {
+	iorpc_pollfd_t pollfd;
+} arm_pollfd_param_t;
+
+int gxio_mpipe_arm_pollfd(gxio_mpipe_context_t * context, int pollfd_cookie)
+{
+	uint64_t __offset;
+	int __result;
+	arm_pollfd_param_t temp;
+	arm_pollfd_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	__offset = GXIO_MPIPE_OP_ARM_POLLFD;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_arm_pollfd);
+
+typedef struct {
+	iorpc_pollfd_t pollfd;
+} close_pollfd_param_t;
+
+int gxio_mpipe_close_pollfd(gxio_mpipe_context_t * context, int pollfd_cookie)
+{
+	uint64_t __offset;
+	int __result;
+	close_pollfd_param_t temp;
+	close_pollfd_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	__offset = GXIO_MPIPE_OP_CLOSE_POLLFD;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_close_pollfd);
+
+typedef struct {
+	HV_PTE base;
+} get_mmio_base_param_t;
+
+int gxio_mpipe_get_mmio_base(gxio_mpipe_context_t * context, HV_PTE *base)
+{
+	uint64_t __offset;
+	int __result;
+	get_mmio_base_param_t temp;
+	get_mmio_base_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	__offset = GXIO_MPIPE_OP_GET_MMIO_BASE;
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, __size,
+			 __offset);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_get_mmio_base);
+
+typedef struct {
+	unsigned long offset;
+	unsigned long size;
+} check_mmio_offset_param_t;
+
+int gxio_mpipe_check_mmio_offset(gxio_mpipe_context_t * context,
+				 unsigned long offset, unsigned long size)
+{
+	uint64_t __offset;
+	int __result;
+	check_mmio_offset_param_t temp;
+	check_mmio_offset_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->offset = offset;
+	params->size = size;
+
+	__offset = GXIO_MPIPE_OP_CHECK_MMIO_OFFSET;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_mpipe_info.c b/arch/tile/gxio/iorpc_mpipe_info.c
new file mode 100644
index 0000000..d3fe0c8
--- /dev/null
+++ b/arch/tile/gxio/iorpc_mpipe_info.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_mpipe_info.h"
+
+
+/* Result block filled in by hv_dev_pread() for the ENUMERATE_AUX request. */
+typedef struct {
+	_gxio_mpipe_link_name_t name;
+	_gxio_mpipe_link_mac_t mac;
+} enumerate_aux_param_t;
+
+/*
+ * Issue the GXIO_MPIPE_INFO_OP_ENUMERATE_AUX request for entry "idx" on
+ * context->fd, and copy the returned name and MAC into "*name" and "*mac".
+ *
+ * Returns the value returned by hv_dev_pread(), unchanged.
+ *
+ * NOTE(review): "*name" and "*mac" are assigned regardless of the result,
+ * from a buffer that is not initialized here; callers should check the
+ * return value before using them.
+ */
+int gxio_mpipe_info_enumerate_aux(gxio_mpipe_info_context_t * context,
+				  unsigned int idx,
+				  _gxio_mpipe_link_name_t * name,
+				  _gxio_mpipe_link_mac_t * mac)
+{
+	uint64_t __offset;
+	int __result;
+	enumerate_aux_param_t temp;
+	enumerate_aux_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	/* "idx" travels in the upper 32 bits, the operation code below it. */
+	__offset = (((uint64_t) idx << 32) | GXIO_MPIPE_INFO_OP_ENUMERATE_AUX);
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, __size,
+			 __offset);
+	*name = params->name;
+	*mac = params->mac;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_enumerate_aux);
+
+/* Result block filled in by hv_dev_pread() for the GET_MMIO_BASE request. */
+typedef struct {
+	HV_PTE base;
+} get_mmio_base_param_t;
+
+/*
+ * Issue the GXIO_MPIPE_INFO_OP_GET_MMIO_BASE request on context->fd and
+ * copy the returned HV_PTE into "*base".
+ *
+ * Returns the value returned by hv_dev_pread(), unchanged.
+ *
+ * NOTE(review): "*base" is assigned regardless of the result, from a
+ * buffer that is not initialized here; callers should check the return
+ * value before using "*base".
+ */
+int gxio_mpipe_info_get_mmio_base(gxio_mpipe_info_context_t * context,
+				  HV_PTE *base)
+{
+	uint64_t __offset;
+	int __result;
+	get_mmio_base_param_t temp;
+	get_mmio_base_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	/* The operation code is passed as the "offset" of the read. */
+	__offset = GXIO_MPIPE_INFO_OP_GET_MMIO_BASE;
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, __size,
+			 __offset);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_get_mmio_base);
+
+/* Parameter block passed by gxio_mpipe_info_check_mmio_offset(). */
+typedef struct {
+	unsigned long offset;
+	unsigned long size;
+} check_mmio_offset_param_t;
+
+/*
+ * Issue the GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET request on context->fd
+ * for the range described by "offset" and "size".
+ *
+ * Returns the value returned by hv_dev_pwrite(), unchanged.
+ */
+int gxio_mpipe_info_check_mmio_offset(gxio_mpipe_info_context_t * context,
+				      unsigned long offset, unsigned long size)
+{
+	uint64_t __offset;
+	int __result;
+	check_mmio_offset_param_t temp;
+	check_mmio_offset_param_t *params = &temp;
+	size_t __size = sizeof(*params);
+
+	params->offset = offset;
+	params->size = size;
+
+	/* The operation code is passed as the "offset" of the write. */
+	__offset = GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET;
+	__result =
+	    hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, __size,
+			  __offset);
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_check_mmio_offset);
diff --git a/arch/tile/gxio/mpipe.c b/arch/tile/gxio/mpipe.c
new file mode 100644
index 0000000..524bfd1
--- /dev/null
+++ b/arch/tile/gxio/mpipe.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of mpipe gxio calls.
+ */
+
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_mpipe.h>
+#include <gxio/iorpc_mpipe_info.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+
+/* HACK: Avoid pointless "shadow" warnings. */
+#define link link_shadow
+
+/*
+ * Open the device "mpipe/<mpipe_index>/iorpc" and map its config and
+ * fast MMIO regions into "context".
+ *
+ * On success returns 0 with context->fd, context->mmio_cfg_base and
+ * context->mmio_fast_base set, and every entry of context->__stacks
+ * set to 255 (the value gxio_mpipe_rules_begin() treats as "no stack").
+ *
+ * On failure returns the hv_dev_open() error if it lies within
+ * [GXIO_ERR_MIN, GXIO_ERR_MAX], otherwise -ENODEV.  If a mapping fails,
+ * the device is closed again and any earlier mapping is undone.
+ */
+int gxio_mpipe_init(gxio_mpipe_context_t * context, unsigned int mpipe_index)
+{
+	char file[32];
+
+	int fd;
+	int i;
+
+	/* "mpipe_index" is unsigned, so it must be formatted with %u. */
+	snprintf(file, sizeof(file), "mpipe/%u/iorpc", mpipe_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		/* Pass GXIO error codes through; map anything else. */
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	/* Map in the MMIO space. */
+	context->mmio_cfg_base = (void __force *)
+	    iorpc_ioremap(fd, HV_MPIPE_CONFIG_MMIO_OFFSET,
+			  HV_MPIPE_CONFIG_MMIO_SIZE);
+	if (context->mmio_cfg_base == NULL)
+		goto cfg_failed;
+
+	context->mmio_fast_base = (void __force *)
+	    iorpc_ioremap(fd, HV_MPIPE_FAST_MMIO_OFFSET,
+			  HV_MPIPE_FAST_MMIO_SIZE);
+	if (context->mmio_fast_base == NULL)
+		goto fast_failed;
+
+	/* Initialize the stacks: 255 marks a buffer size with no stack. */
+	for (i = 0; i < 8; i++)
+		context->__stacks.stacks[i] = 255;
+
+	return 0;
+
+fast_failed:
+	iounmap((void __force __iomem *)(context->mmio_cfg_base));
+cfg_failed:
+	hv_dev_close(context->fd);
+	return -ENODEV;
+}
+
+/*
+ * Buffer size in bytes for each buffer size enum value, indexed by the
+ * enum value (0..7).  The table is only ever read, so it is const.
+ */
+static const int16_t gxio_mpipe_buffer_sizes[8] =
+    { 128, 256, 512, 1024, 1664, 4096, 10368, 16384 };
+
+/*
+ * Map a size in bytes to the first buffer size enum whose table entry
+ * is at least "size".  Sizes larger than entry 6 map to enum 7.
+ */
+gxio_mpipe_buffer_size_enum_t
+gxio_mpipe_buffer_size_to_buffer_size_enum(size_t size)
+{
+	int idx = 0;
+
+	/* Advance past every entry too small for "size", stopping at 7. */
+	while (idx < 7 && size > gxio_mpipe_buffer_sizes[idx])
+		idx++;
+
+	return idx;
+}
+
+/*
+ * Return the buffer size in bytes for "buffer_size_enum".  Values above
+ * 7 are treated as 7 (the largest size).
+ */
+size_t
+gxio_mpipe_buffer_size_enum_to_buffer_size(gxio_mpipe_buffer_size_enum_t
+					   buffer_size_enum)
+{
+	/* Clamp to the last table entry before indexing. */
+	return gxio_mpipe_buffer_sizes[buffer_size_enum > 7 ?
+				       7 : buffer_size_enum];
+}
+
+/*
+ * Return the number of bytes needed for a buffer stack holding
+ * "buffers" buffers: 12 buffers are counted per cacheline, rounded up
+ * to a whole number of cachelines of CHIP_L2_LINE_SIZE() bytes each.
+ */
+size_t gxio_mpipe_calc_buffer_stack_bytes(unsigned long buffers)
+{
+	const unsigned long per_line = 12;
+
+	/* Round up so a partially filled cacheline is still counted. */
+	unsigned long rounded = buffers + per_line - 1;
+
+	return (rounded / per_line) * CHIP_L2_LINE_SIZE();
+}
+
+/*
+ * Zero "mem", register it as buffer stack "stack" for buffers of size
+ * "buffer_size_enum", and record the stack in the context.
+ *
+ * Returns 0 on success, or the negative result of
+ * gxio_mpipe_init_buffer_stack_aux().
+ *
+ * NOTE(review): "buffer_size_enum" indexes context->__stacks.stacks[]
+ * without a range check here; presumably the aux call rejects values
+ * above 7 -- confirm.
+ */
+int
+gxio_mpipe_init_buffer_stack(gxio_mpipe_context_t * context,
+			     unsigned int stack,
+			     gxio_mpipe_buffer_size_enum_t buffer_size_enum,
+			     void *mem, size_t mem_size, unsigned int mem_flags)
+{
+	int rc;
+
+	/* Clear the backing memory before handing it over. */
+	memset(mem, 0, mem_size);
+
+	rc = gxio_mpipe_init_buffer_stack_aux(context, mem, mem_size,
+					      mem_flags, stack,
+					      buffer_size_enum);
+	if (rc >= 0) {
+		/* Remember which stack serves this buffer size. */
+		context->__stacks.stacks[buffer_size_enum] = stack;
+		rc = 0;
+	}
+
+	return rc;
+}
+
+/*
+ * Register "mem" as the memory for NotifRing "ring".  This only
+ * reorders the arguments for gxio_mpipe_init_notif_ring_aux() and
+ * returns its result.
+ */
+int
+gxio_mpipe_init_notif_ring(gxio_mpipe_context_t * context,
+			   unsigned int ring,
+			   void *mem, size_t mem_size, unsigned int mem_flags)
+{
+	int rc;
+
+	rc = gxio_mpipe_init_notif_ring_aux(context, mem, mem_size,
+					    mem_flags, ring);
+	return rc;
+}
+
+/*
+ * Initialize NotifGroup "group" to contain rings "ring" through
+ * "ring + num_rings - 1", then initialize buckets "bucket" through
+ * "bucket + num_buckets - 1" to use that group and "mode", assigning
+ * the rings to the buckets round-robin.
+ *
+ * Returns 0 on success, -EINVAL if buckets are requested while
+ * "num_rings" is zero, or the first non-zero result returned by
+ * gxio_mpipe_init_notif_group() or gxio_mpipe_init_bucket().
+ */
+int
+gxio_mpipe_init_notif_group_and_buckets(gxio_mpipe_context_t * context,
+					unsigned int group,
+					unsigned int ring,
+					unsigned int num_rings,
+					unsigned int bucket,
+					unsigned int num_buckets,
+					gxio_mpipe_bucket_mode_t mode)
+{
+	unsigned int i;
+	int result;
+
+	/* ISSUE: Use "gxio_mpipe_bucket_info_t"? */
+	MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info = { {
+						       .group = group,
+						       .mode = mode,
+						       }
+	};
+
+	gxio_mpipe_notif_group_bits_t bits = { {0} };
+
+	/*
+	 * The bucket loop below computes "i % num_rings", which would
+	 * divide by zero if buckets were requested without any rings.
+	 */
+	if (num_buckets != 0 && num_rings == 0)
+		return -EINVAL;
+
+	for (i = 0; i < num_rings; i++)
+		gxio_mpipe_notif_group_add_ring(&bits, ring + i);
+
+	result = gxio_mpipe_init_notif_group(context, group, bits);
+	if (result != 0)
+		return result;
+
+	for (i = 0; i < num_buckets; i++) {
+		/* Spread the buckets evenly over the group's rings. */
+		bucket_info.notifring = ring + (i % num_rings);
+
+		result =
+		    gxio_mpipe_init_bucket(context, bucket + i, bucket_info);
+		if (result != 0)
+			return result;
+	}
+
+	return 0;
+}
+
+/*
+ * Zero "mem" and register it as eDMA ring "ring" on "channel".
+ * Returns the result of gxio_mpipe_init_edma_ring_aux().
+ */
+int
+gxio_mpipe_init_edma_ring(gxio_mpipe_context_t * context,
+			  unsigned int ring, unsigned int channel,
+			  void *mem, size_t mem_size, unsigned int mem_flags)
+{
+	int rc;
+
+	/* Clear the ring memory before registering it. */
+	memset(mem, 0, mem_size);
+
+	rc = gxio_mpipe_init_edma_ring_aux(context, mem, mem_size, mem_flags,
+					   ring, channel);
+	return rc;
+}
+
+/*
+ * Prepare "rules" for use with "context": the rule list is cleared
+ * (so list->tail is 0, meaning no rule has been begun) and the
+ * context pointer is recorded.
+ */
+void
+gxio_mpipe_rules_init(gxio_mpipe_rules_t * rules,
+		      gxio_mpipe_context_t * context)
+{
+	memset(&rules->list, 0, sizeof(rules->list));
+	rules->context = context;
+}
+
+/*
+ * Begin a new rule in "rules", placed at the current tail of the rule
+ * list after alignment padding.
+ *
+ * bucket, num_buckets: stored as bucket_first and as the mask
+ *     "num_buckets - 1"; num_buckets must be a power of 2.
+ * stacks: per-buffer-size stack table to use; if NULL, the table in
+ *     rules->context->__stacks is used instead.
+ *
+ * Returns 0 on success, GXIO_MPIPE_ERR_RULES_FULL if the rule does not
+ * fit in the list, or GXIO_MPIPE_ERR_RULES_INVALID if num_buckets is
+ * not a power of 2 or if no entry of the stack table is set.
+ *
+ * Note that the "no stack" error is detected after list->head and the
+ * previous rule's size have already been updated.
+ */
+int
+gxio_mpipe_rules_begin(gxio_mpipe_rules_t * rules,
+		       unsigned int bucket, unsigned int num_buckets,
+		       gxio_mpipe_rules_stacks_t * stacks)
+{
+	int i;
+	int stack = 255;
+
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	/* Current rule. */
+	gxio_mpipe_rules_rule_t *rule =
+	    (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	unsigned int head = list->tail;
+
+	/*
+	 * Align next rule properly.
+	 * Note that "dmacs_and_vlans" will also be aligned.
+	 */
+	unsigned int pad = 0;
+	while (((head + pad) % __alignof__(gxio_mpipe_rules_rule_t)) != 0)
+		pad++;
+
+	/*
+	 * Verify room.
+	 * ISSUE: Mark rules as broken on error?
+	 */
+	if (head + pad + sizeof(*rule) >= sizeof(list->rules))
+		return GXIO_MPIPE_ERR_RULES_FULL;
+
+	/* Verify num_buckets is a power of 2. */
+	if (__builtin_popcount(num_buckets) != 1)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* Add padding to previous rule. */
+	rule->size += pad;
+
+	/* Start a new rule. */
+	list->head = head + pad;
+
+	rule = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	/* Default some values. */
+	rule->headroom = 2;
+	rule->tailroom = 0;
+	rule->capacity = 16384;
+
+	/* Save the bucket info. */
+	rule->bucket_mask = num_buckets - 1;
+	rule->bucket_first = bucket;
+
+	/*
+	 * Walk from the largest buffer size down to the smallest.  Each
+	 * entry receives the most recently seen stack that is not 255,
+	 * i.e. the one configured at the nearest index at or above it.
+	 */
+	for (i = 8 - 1; i >= 0; i--) {
+		int maybe =
+		    stacks ? stacks->stacks[i] : rules->context->__stacks.
+		    stacks[i];
+		if (maybe != 255)
+			stack = maybe;
+		rule->stacks.stacks[i] = stack;
+	}
+
+	/* Every entry was 255: there is no stack to use at all. */
+	if (stack == 255)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/*
+	 * NOTE: Only entries at the end of the array can be 255.
+	 * Each such entry is pointed at "stack", and the capacity is
+	 * lowered to the buffer size of the next smaller enum value.
+	 */
+	for (i = 8 - 1; i > 0; i--) {
+		if (rule->stacks.stacks[i] == 255) {
+			rule->stacks.stacks[i] = stack;
+			rule->capacity =
+			    gxio_mpipe_buffer_size_enum_to_buffer_size(i - 1);
+		}
+	}
+
+	/* The new rule starts with no dmacs or vlans appended. */
+	rule->size = sizeof(*rule);
+	list->tail = list->head + rule->size;
+
+	return 0;
+}
+
+/*
+ * Add "channel" to the channel bitmask of the current rule.
+ *
+ * Returns 0 on success, GXIO_MPIPE_ERR_RULES_INVALID if "channel" is
+ * 32 or more, or GXIO_MPIPE_ERR_RULES_EMPTY if no rule has been begun.
+ */
+int
+gxio_mpipe_rules_add_channel(gxio_mpipe_rules_t * rules, unsigned int channel)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* Channels must be below 32. */
+	if (channel > 31)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* A zero tail means gxio_mpipe_rules_begin() has not succeeded. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+	cur->channel_bits |= (1UL << channel);
+
+	return 0;
+}
+
+/*
+ * Set the priority of the current rule.
+ *
+ * Returns 0 on success, or GXIO_MPIPE_ERR_RULES_EMPTY if no rule has
+ * been begun.
+ */
+int gxio_mpipe_rules_set_priority(gxio_mpipe_rules_t * rules, int priority)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* A zero tail means gxio_mpipe_rules_begin() has not succeeded. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+	cur->priority = priority;
+
+	return 0;
+}
+
+/*
+ * Set the headroom of the current rule.
+ *
+ * Returns 0 on success, or GXIO_MPIPE_ERR_RULES_EMPTY if no rule has
+ * been begun.
+ */
+int gxio_mpipe_rules_set_headroom(gxio_mpipe_rules_t * rules, uint8_t headroom)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* A zero tail means gxio_mpipe_rules_begin() has not succeeded. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+	cur->headroom = headroom;
+
+	return 0;
+}
+
+/*
+ * Set the tailroom of the current rule.
+ *
+ * Returns 0 on success, or GXIO_MPIPE_ERR_RULES_EMPTY if no rule has
+ * been begun.
+ */
+int gxio_mpipe_rules_set_tailroom(gxio_mpipe_rules_t * rules, uint8_t tailroom)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* A zero tail means gxio_mpipe_rules_begin() has not succeeded. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+	cur->tailroom = tailroom;
+
+	return 0;
+}
+
+/*
+ * Set the capacity of the current rule, replacing the value chosen by
+ * gxio_mpipe_rules_begin().
+ *
+ * Returns 0 on success, or GXIO_MPIPE_ERR_RULES_EMPTY if no rule has
+ * been begun.
+ */
+int gxio_mpipe_rules_set_capacity(gxio_mpipe_rules_t * rules, uint16_t capacity)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* A zero tail means gxio_mpipe_rules_begin() has not succeeded. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+	cur->capacity = capacity;
+
+	return 0;
+}
+
+/*
+ * Append "dmac" to the current rule's list of dmacs, unless an
+ * identical entry is already present.
+ *
+ * The rule's "dmacs_and_vlans" area holds all dmacs first, followed by
+ * all vlans, so any existing vlans are moved up by one dmac entry to
+ * make room.
+ *
+ * Returns 0 on success (including the duplicate case),
+ * GXIO_MPIPE_ERR_RULES_EMPTY if no rule has been begun, or
+ * GXIO_MPIPE_ERR_RULES_FULL if there is no room left in the list.
+ */
+int
+gxio_mpipe_rules_add_dmac(gxio_mpipe_rules_t * rules,
+			  gxio_mpipe_rules_dmac_t dmac)
+{
+	int i;
+	uint8_t *base;
+	uint8_t *ptr;
+
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	gxio_mpipe_rules_rule_t *rule =
+	    (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	/* Verify begun. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	base = rule->dmacs_and_vlans;
+
+	/* Collapse duplicates. */
+	for (i = 0; i < rule->num_dmacs; i++) {
+		uint8_t *old = base + i * sizeof(dmac);
+		if (memcmp(old, &dmac, sizeof(dmac)) == 0)
+			return 0;
+	}
+
+	/*
+	 * Verify room.
+	 * ISSUE: Mark rules as broken on error?
+	 */
+	if (list->tail + sizeof(dmac) >= sizeof(list->rules))
+		return GXIO_MPIPE_ERR_RULES_FULL;
+
+	/* The new dmac goes where the first vlan (if any) currently is. */
+	ptr = base + rule->num_dmacs * sizeof(dmac);
+
+	/*
+	 * Slide down any vlans.  The byte count uses the vlan entry type
+	 * so that it matches the stride gxio_mpipe_rules_add_vlan() uses.
+	 */
+	if (rule->num_vlans != 0)
+		memmove(ptr + sizeof(dmac), ptr,
+			rule->num_vlans * sizeof(gxio_mpipe_rules_vlan_t));
+
+	*(gxio_mpipe_rules_dmac_t *) ptr = dmac;
+
+	list->tail += sizeof(dmac);
+	rule->size += sizeof(dmac);
+	rule->num_dmacs++;
+
+	return 0;
+}
+
+/*
+ * Append "vlan" to the current rule's list of vlans, unless an
+ * identical entry is already present.
+ *
+ * The rule's "dmacs_and_vlans" area holds all dmacs first, followed by
+ * all vlans, so the vlan list starts right after the last dmac.
+ *
+ * Returns 0 on success (including the duplicate case),
+ * GXIO_MPIPE_ERR_RULES_EMPTY if no rule has been begun, or
+ * GXIO_MPIPE_ERR_RULES_FULL if there is no room left in the list.
+ */
+int
+gxio_mpipe_rules_add_vlan(gxio_mpipe_rules_t * rules,
+			  gxio_mpipe_rules_vlan_t vlan)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	gxio_mpipe_rules_rule_t *rule =
+	    (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	uint8_t *base;
+	int i;
+	uint8_t *ptr;
+
+	/* Verify begun. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	/*
+	 * Skip over the dmacs.  The stride uses the dmac entry type so
+	 * that it matches the layout gxio_mpipe_rules_add_dmac() writes.
+	 */
+	base = rule->dmacs_and_vlans +
+	    rule->num_dmacs * sizeof(gxio_mpipe_rules_dmac_t);
+
+	/* Collapse duplicates. */
+	for (i = 0; i < rule->num_vlans; i++) {
+		uint8_t *old = base + i * sizeof(vlan);
+		if (*(gxio_mpipe_rules_vlan_t *) old == vlan)
+			return 0;
+	}
+
+	/*
+	 * Verify room.
+	 * ISSUE: Mark rules as broken on error?
+	 */
+	if (list->tail + sizeof(vlan) >= sizeof(list->rules))
+		return GXIO_MPIPE_ERR_RULES_FULL;
+
+	ptr = base + rule->num_vlans * sizeof(vlan);
+
+	*(gxio_mpipe_rules_vlan_t *) ptr = vlan;
+
+	list->tail += sizeof(vlan);
+	rule->size += sizeof(vlan);
+	rule->num_vlans++;
+
+	return 0;
+}
+
+/*
+ * Hand the accumulated rule list to gxio_mpipe_commit_rules() and
+ * return its result.  Only the used part of the list is passed: the
+ * bytes before the "rules" array plus list->tail bytes of rules.
+ */
+int gxio_mpipe_rules_commit(gxio_mpipe_rules_t * rules)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	unsigned int used = offsetof(gxio_mpipe_rules_list_t, rules);
+
+	used += list->tail;
+
+	return gxio_mpipe_commit_rules(rules->context, list, used);
+}
+
+/*
+ * Initialize "iqueue" to manage NotifRing "ring", using "mem" as the
+ * array of ingress descriptors, and register that memory for the ring.
+ *
+ * Returns the result of gxio_mpipe_init_notif_ring(); the iqueue fields
+ * and the first word of "mem" are written before that call is made.
+ *
+ * NOTE(review): __builtin_ctz() is undefined for 0, which is what
+ * "mem_size" smaller than one descriptor would produce here, before
+ * the size has been validated -- confirm callers never pass that.
+ */
+int
+gxio_mpipe_iqueue_init(gxio_mpipe_iqueue_t * iqueue,
+		       gxio_mpipe_context_t * context,
+		       unsigned int ring,
+		       void *mem, size_t mem_size, unsigned int mem_flags)
+{
+	/* The init call below will verify that "mem_size" is legal. */
+	const unsigned int entries = mem_size / sizeof(gxio_mpipe_idesc_t);
+	int rc;
+
+	/* Where the queue lives. */
+	iqueue->context = context;
+	iqueue->ring = ring;
+	iqueue->idescs = (gxio_mpipe_idesc_t *) mem;
+
+	/* How big it is, in the three forms kept in the iqueue. */
+	iqueue->num_entries = entries;
+	iqueue->log2_num_entries = __builtin_ctz(entries);
+	iqueue->mask_num_entries = entries - 1;
+
+	iqueue->head = 1;
+#ifdef __BIG_ENDIAN__
+	iqueue->swapped = 0;
+#endif
+
+	/* Initialize the "tail". */
+	__gxio_mmio_write(mem, iqueue->head);
+
+	rc = gxio_mpipe_init_notif_ring(context, ring, mem, mem_size,
+					mem_flags);
+	return rc;
+}
+
+/*
+ * Register "mem" as eDMA ring "edma_ring_id" on "channel", then
+ * initialize "equeue" to manage that ring.
+ *
+ * Returns 0 on success, or the negative result of
+ * gxio_mpipe_init_edma_ring(), in which case "equeue" is left untouched.
+ */
+int
+gxio_mpipe_equeue_init(gxio_mpipe_equeue_t * equeue,
+		       gxio_mpipe_context_t * context,
+		       unsigned int edma_ring_id,
+		       unsigned int channel,
+		       void *mem, unsigned int mem_size, unsigned int mem_flags)
+{
+	/* The init call below will verify that "mem_size" is legal. */
+	unsigned int entries = mem_size / sizeof(gxio_mpipe_edesc_t);
+
+	/* Offset used to read number of completed commands. */
+	MPIPE_EDMA_POST_REGION_ADDR_t post_addr;
+
+	int rc;
+
+	rc = gxio_mpipe_init_edma_ring(context, edma_ring_id, channel,
+				       mem, mem_size, mem_flags);
+	if (rc < 0)
+		return rc;
+
+	memset(equeue, 0, sizeof(*equeue));
+
+	/*
+	 * Build the offset of this ring's post region.  The region field
+	 * is set to the difference between the EDMA and IDMA region
+	 * values rather than to the EDMA value itself.
+	 */
+	post_addr.word = 0;
+	post_addr.region =
+	    MPIPE_MMIO_ADDR__REGION_VAL_EDMA - MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	post_addr.ring = edma_ring_id;
+
+	__gxio_dma_queue_init(&equeue->dma_queue,
+			      context->mmio_fast_base + post_addr.word,
+			      entries);
+
+	equeue->edescs = mem;
+	equeue->log2_num_entries = __builtin_ctz(entries);
+	equeue->mask_num_entries = entries - 1;
+
+	return 0;
+}
+
+/*
+ * Return the internal context used for link name access, opening it on
+ * the first call.  This context is special in that it is not associated
+ * with an mPIPE service domain.
+ *
+ * The open is attempted only once; if it failed, every call returns
+ * NULL.  A mutex serializes the first-call initialization.
+ */
+static gxio_mpipe_context_t *_gxio_get_link_context(void)
+{
+	static gxio_mpipe_context_t link_ctx;
+	static gxio_mpipe_context_t *link_ctxp;
+	static int probed;
+	static DEFINE_MUTEX(lock);
+
+	mutex_lock(&lock);
+
+	if (!probed) {
+		int shim;
+
+		probed = 1;
+
+		/*
+		 * "4" here is the maximum possible number of mPIPE shims; it's
+		 * an exaggeration but we shouldn't ever go beyond 2 anyway.
+		 * The first shim whose info device opens is the one we keep.
+		 */
+		for (shim = 0; shim < 4 && !link_ctxp; shim++) {
+			char path[80];
+
+			snprintf(path, sizeof(path), "mpipe/%d/iorpc_info",
+				 shim);
+			link_ctx.fd = hv_dev_open((HV_VirtAddr) path, 0);
+			if (link_ctx.fd >= 0)
+				link_ctxp = &link_ctx;
+		}
+	}
+
+	mutex_unlock(&lock);
+
+	return link_ctxp;
+}
+
+/*
+ * Look up link number "idx" and return its name and MAC address.
+ *
+ * link_name: output buffer; it must be at least as large as the "name"
+ *            field of _gxio_mpipe_link_name_t, because that many bytes
+ *            are written on success.  The result is always
+ *            NUL-terminated.
+ * link_mac:  output buffer, at least as large as the "mac" field of
+ *            _gxio_mpipe_link_mac_t.
+ *
+ * Returns GXIO_ERR_NO_DEVICE if the link-info context could not be
+ * opened, otherwise the result of gxio_mpipe_info_enumerate_aux().
+ * The output buffers are written only when that result is >= 0.
+ */
+int gxio_mpipe_link_enumerate_mac(int idx, char *link_name, uint8_t * link_mac)
+{
+	int rv;
+	_gxio_mpipe_link_name_t name;
+	_gxio_mpipe_link_mac_t mac;
+
+	gxio_mpipe_context_t *context = _gxio_get_link_context();
+	if (!context)
+		return GXIO_ERR_NO_DEVICE;
+
+	rv = gxio_mpipe_info_enumerate_aux(context, idx, &name, &mac);
+	if (rv >= 0) {
+		strncpy(link_name, name.name, sizeof(name.name));
+		/*
+		 * strncpy() leaves the destination unterminated when the
+		 * source fills the whole field, so terminate explicitly,
+		 * as gxio_mpipe_link_open() does for its copy.
+		 */
+		link_name[sizeof(name.name) - 1] = '\0';
+		memcpy(link_mac, mac.mac, sizeof(mac.mac));
+	}
+
+	return rv;
+}
+
+/*
+ * Open the link called "link_name" on "context" and fill in "link".
+ *
+ * The name is copied into a fixed-size field and truncated to
+ * GXIO_MPIPE_LINK_NAME_LEN - 1 characters if it is longer.
+ *
+ * Returns 0 on success, or the negative result of
+ * gxio_mpipe_link_open_aux(), in which case "link" is left untouched.
+ */
+int
+gxio_mpipe_link_open(gxio_mpipe_link_t * link, gxio_mpipe_context_t * context,
+		     const char *link_name, unsigned int flags)
+{
+	_gxio_mpipe_link_name_t req;
+	int packed;
+
+	/* strncpy() may not terminate the copy, so do it by hand. */
+	strncpy(req.name, link_name, sizeof(req.name));
+	req.name[GXIO_MPIPE_LINK_NAME_LEN - 1] = '\0';
+
+	packed = gxio_mpipe_link_open_aux(context, req, flags);
+	if (packed >= 0) {
+		/* Low byte is the mac; the bits above it are the channel. */
+		link->context = context;
+		link->channel = packed >> 8;
+		link->mac = packed & 0xFF;
+		packed = 0;
+	}
+
+	return packed;
+}
+
+/*
+ * Close "link" by passing its context and mac to
+ * gxio_mpipe_link_close_aux(); returns that call's result.
+ */
+int gxio_mpipe_link_close(gxio_mpipe_link_t * link)
+{
+	int rc;
+
+	rc = gxio_mpipe_link_close_aux(link->context, link->mac);
+	return rc;
+}
+
+/*
+ * Export every non-static function defined in this file.  The static
+ * helper _gxio_get_link_context() is deliberately not exported.
+ */
+EXPORT_SYMBOL(gxio_mpipe_init);
+EXPORT_SYMBOL(gxio_mpipe_buffer_size_to_buffer_size_enum);
+EXPORT_SYMBOL(gxio_mpipe_buffer_size_enum_to_buffer_size);
+EXPORT_SYMBOL(gxio_mpipe_calc_buffer_stack_bytes);
+EXPORT_SYMBOL(gxio_mpipe_init_buffer_stack);
+EXPORT_SYMBOL(gxio_mpipe_init_notif_ring);
+EXPORT_SYMBOL(gxio_mpipe_init_notif_group_and_buckets);
+EXPORT_SYMBOL(gxio_mpipe_rules_init);
+EXPORT_SYMBOL(gxio_mpipe_rules_begin);
+EXPORT_SYMBOL(gxio_mpipe_rules_add_channel);
+EXPORT_SYMBOL(gxio_mpipe_rules_set_priority);
+EXPORT_SYMBOL(gxio_mpipe_rules_set_headroom);
+EXPORT_SYMBOL(gxio_mpipe_rules_set_tailroom);
+EXPORT_SYMBOL(gxio_mpipe_rules_set_capacity);
+EXPORT_SYMBOL(gxio_mpipe_rules_add_dmac);
+EXPORT_SYMBOL(gxio_mpipe_rules_add_vlan);
+EXPORT_SYMBOL(gxio_mpipe_rules_commit);
+EXPORT_SYMBOL(gxio_mpipe_iqueue_init);
+EXPORT_SYMBOL(gxio_mpipe_equeue_init);
+EXPORT_SYMBOL(gxio_mpipe_link_enumerate_mac);
+EXPORT_SYMBOL(gxio_mpipe_link_open);
+EXPORT_SYMBOL(gxio_mpipe_link_close);
diff --git a/arch/tile/include/arch/mpipe.h b/arch/tile/include/arch/mpipe.h
new file mode 100644
index 0000000..5199534
--- /dev/null
+++ b/arch/tile/include/arch/mpipe.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_H__
+#define __ARCH_MPIPE_H__
+
+#include <arch/abi.h>
+#include <arch/mpipe_def.h>
+
+#ifndef __ASSEMBLER__
+
+// MMIO Ingress DMA Release Region Address.
+// This is a description of the physical addresses used to manipulate ingress
+// credit counters.  Accesses to this address space should use an address of
+// this form and a value like that specified in IDMA_RELEASE_REGION_VAL.
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// address can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // Reserved.
+    uint_reg_t __reserved_0  : 3;
+    // NotifRing to be released
+    uint_reg_t ring          : 8;
+    // Bucket to be released
+    uint_reg_t bucket        : 13;
+    // Enable NotifRing release
+    uint_reg_t ring_enable   : 1;
+    // Enable Bucket release
+    uint_reg_t bucket_enable : 1;
+    // This field of the address selects the region (address space) to be
+    // accessed.  For the iDMA release region, this field must be 4.
+    uint_reg_t region        : 3;
+    // Reserved.
+    uint_reg_t __reserved_1  : 6;
+    // This field of the address indexes the 32 entry service domain table.
+    uint_reg_t svc_dom       : 5;
+    // Reserved.
+    uint_reg_t __reserved_2  : 24;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t __reserved_2  : 24;
+    uint_reg_t svc_dom       : 5;
+    uint_reg_t __reserved_1  : 6;
+    uint_reg_t region        : 3;
+    uint_reg_t bucket_enable : 1;
+    uint_reg_t ring_enable   : 1;
+    uint_reg_t bucket        : 13;
+    uint_reg_t ring          : 8;
+    uint_reg_t __reserved_0  : 3;
+#endif
+  };
+
+  // The whole address as one value, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_IDMA_RELEASE_REGION_ADDR_t;
+
+// MMIO Ingress DMA Release Region Value - Release NotifRing and/or Bucket.
+// Provides release of the associated NotifRing.  The address of the MMIO
+// operation is described in IDMA_RELEASE_REGION_ADDR.
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// value can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // Number of packets being released.  The load balancer's count of
+    // inflight packets will be decremented by this amount for the associated
+    // Bucket and/or NotifRing
+    uint_reg_t count      : 16;
+    // Reserved.
+    uint_reg_t __reserved : 48;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t __reserved : 48;
+    uint_reg_t count      : 16;
+#endif
+  };
+
+  // The whole value as one word, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_IDMA_RELEASE_REGION_VAL_t;
+
+// MMIO Buffer Stack Manager Region Address.
+// This MMIO region is used for posting or fetching buffers to/from the
+// buffer stack manager.  On an MMIO load, this pops a buffer descriptor from
+// the top of stack if one is available.  On an MMIO store, this pushes a
+// buffer to the stack.  The value read or written is described in
+// BSM_REGION_VAL.
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// address can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // Reserved.
+    uint_reg_t __reserved_0 : 3;
+    // BufferStack being accessed.
+    uint_reg_t stack        : 5;
+    // Reserved.
+    uint_reg_t __reserved_1 : 18;
+    // This field of the address selects the region (address space) to be
+    // accessed.  For the buffer stack manager region, this field must be 6.
+    uint_reg_t region       : 3;
+    // Reserved.
+    uint_reg_t __reserved_2 : 6;
+    // This field of the address indexes the 32 entry service domain table.
+    uint_reg_t svc_dom      : 5;
+    // Reserved.
+    uint_reg_t __reserved_3 : 24;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t __reserved_3 : 24;
+    uint_reg_t svc_dom      : 5;
+    uint_reg_t __reserved_2 : 6;
+    uint_reg_t region       : 3;
+    uint_reg_t __reserved_1 : 18;
+    uint_reg_t stack        : 5;
+    uint_reg_t __reserved_0 : 3;
+#endif
+  };
+
+  // The whole address as one value, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_BSM_REGION_ADDR_t;
+
+// MMIO Buffer Stack Manager Region Value.
+// This MMIO region is used for posting or fetching buffers to/from the
+// buffer stack manager.  On an MMIO load, this pops a buffer descriptor from
+// the top of stack if one is available. On an MMIO store, this pushes a
+// buffer to the stack.  The address of the MMIO operation is described in
+// BSM_REGION_ADDR.
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// value can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // Reserved.
+    uint_reg_t __reserved_0 : 7;
+    // Base virtual address of the buffer.  Must be sign extended by consumer.
+    // Declared with the signed int_reg_t type, unlike the other fields.
+    int_reg_t va           : 35;
+    // Reserved.
+    uint_reg_t __reserved_1 : 6;
+    // Index of the buffer stack to which this buffer belongs.  Ignored on
+    // writes since the offset bits specify the stack being accessed.
+    uint_reg_t stack_idx    : 5;
+    // Reserved.
+    uint_reg_t __reserved_2 : 5;
+    // Reads as one to indicate that this is a hardware managed buffer.
+    // Ignored on writes since all buffers on a given stack are the same size.
+    uint_reg_t hwb          : 1;
+    // Encoded size of buffer (ignored on writes):
+    // 0 = 128 bytes
+    // 1 = 256 bytes
+    // 2 = 512 bytes
+    // 3 = 1024 bytes
+    // 4 = 1664 bytes
+    // 5 = 4096 bytes
+    // 6 = 10368 bytes
+    // 7 = 16384 bytes
+    uint_reg_t size         : 3;
+    // Valid indication for the buffer.  Ignored on writes.
+    // 0 : Valid buffer descriptor popped from stack.
+    // 3 : Could not pop a buffer from the stack.  Either the stack is empty,
+    // or the hardware's prefetch buffer is empty for this stack.
+    uint_reg_t c            : 2;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t __reserved_2 : 5;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_1 : 6;
+    int_reg_t va           : 35;
+    uint_reg_t __reserved_0 : 7;
+#endif
+  };
+
+  // The whole value as one word, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_BSM_REGION_VAL_t;
+
+// MMIO Egress DMA Post Region Address.
+// Used to post descriptor locations to the eDMA descriptor engine.  The
+// value to be written is described in EDMA_POST_REGION_VAL
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// address can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // Reserved.
+    uint_reg_t __reserved_0 : 3;
+    // eDMA ring being accessed
+    uint_reg_t ring         : 5;
+    // Reserved.
+    uint_reg_t __reserved_1 : 18;
+    // This field of the address selects the region (address space) to be
+    // accessed.  For the egress DMA post region, this field must be 5.
+    uint_reg_t region       : 3;
+    // Reserved.
+    uint_reg_t __reserved_2 : 6;
+    // This field of the address indexes the 32 entry service domain table.
+    uint_reg_t svc_dom      : 5;
+    // Reserved.
+    uint_reg_t __reserved_3 : 24;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t __reserved_3 : 24;
+    uint_reg_t svc_dom      : 5;
+    uint_reg_t __reserved_2 : 6;
+    uint_reg_t region       : 3;
+    uint_reg_t __reserved_1 : 18;
+    uint_reg_t ring         : 5;
+    uint_reg_t __reserved_0 : 3;
+#endif
+  };
+
+  // The whole address as one value, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_EDMA_POST_REGION_ADDR_t;
+
+// MMIO Egress DMA Post Region Value.
+// Used to post descriptor locations to the eDMA descriptor engine.  The
+// address is described in EDMA_POST_REGION_ADDR.
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// value can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // For writes, this specifies the current ring tail pointer prior to any
+    // post.  For example, to post 1 or more descriptors starting at location
+    // 23, this would contain 23 (not 24).  On writes, this index must be
+    // masked based on the ring size.  The new tail pointer after this post
+    // is COUNT+RING_IDX (masked by the ring size).
+    //
+    // For reads, this provides the hardware descriptor fetcher's head
+    // pointer.  The descriptors prior to the head pointer, however, may not
+    // yet have been processed so this indicator is only used to determine
+    // how full the ring is and if software may post more descriptors.
+    uint_reg_t ring_idx   : 16;
+    // For writes, this specifies number of contiguous descriptors that are
+    // being posted.  Software may post up to RingSize descriptors with a
+    // single MMIO store.  A zero in this field on a write will "wake up" an
+    // eDMA ring and cause it to fetch descriptors regardless of the
+    // hardware's current view of the state of the tail pointer.
+    //
+    // For reads, this field provides a rolling count of the number of
+    // descriptors that have been completely processed.  This may be used by
+    // software to determine when buffers associated with a descriptor may be
+    // returned or reused.  When the ring's flush bit is cleared by software
+    // (after having been set by HW or SW), the COUNT will be cleared.
+    uint_reg_t count      : 16;
+    // For writes, this specifies the generation number of the tail being
+    // posted. Note that if tail+cnt wraps to the beginning of the ring, the
+    // eDMA hardware assumes that the descriptors posted at the beginning of
+    // the ring are also valid so it is okay to post around the wrap point.
+    //
+    // For reads, this is the current generation number.  Valid descriptors
+    // will have the inverse of this generation number.
+    uint_reg_t gen        : 1;
+    // Reserved.
+    uint_reg_t __reserved : 31;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t __reserved : 31;
+    uint_reg_t gen        : 1;
+    uint_reg_t count      : 16;
+    uint_reg_t ring_idx   : 16;
+#endif
+  };
+
+  // The whole value as one word, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_EDMA_POST_REGION_VAL_t;
+
+// Load Balancer Bucket Status Data.
+// Read/Write data for load balancer Bucket-Status Table. 4160 entries
+// indexed by LBL_INIT_CTL.IDX when LBL_INIT_CTL.STRUCT_SEL is BSTS_TBL
+//
+// The bitfields total 64 bits; "word" overlays them so that the whole
+// entry can be read or written as a single uint_reg_t.
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    // NotifRing currently assigned to this bucket.
+    uint_reg_t notifring  : 8;
+    // Current reference count.
+    uint_reg_t count      : 16;
+    // Group associated with this bucket.
+    uint_reg_t group      : 5;
+    // Mode select for this bucket.
+    uint_reg_t mode       : 3;
+    // Reserved.
+    uint_reg_t __reserved : 32;
+#else   // __BIG_ENDIAN__
+    // Same fields as above, declared in the reverse order.
+    uint_reg_t __reserved : 32;
+    uint_reg_t mode       : 3;
+    uint_reg_t group      : 5;
+    uint_reg_t count      : 16;
+    uint_reg_t notifring  : 8;
+#endif
+  };
+
+  // The whole entry as one word, overlaying the bitfields.
+  uint_reg_t word;
+} MPIPE_LBL_INIT_DAT_BSTS_TBL_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_MPIPE_H__) */
diff --git a/arch/tile/include/arch/mpipe_constants.h b/arch/tile/include/arch/mpipe_constants.h
new file mode 100644
index 0000000..315d80f
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_constants.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+
+
+#ifndef __ARCH_MPIPE_CONSTANTS_H__
+#define __ARCH_MPIPE_CONSTANTS_H__
+
+/*
+ * Fixed counts and sizes for the mPIPE hardware.
+ * NOTE(review): the meanings below are read off the constant names;
+ * confirm against the hardware documentation.
+ */
+
+/* Classifiers: how many there are and, presumably, their clock in MHz. */
+#define MPIPE_NUM_CLASSIFIERS 10
+#define MPIPE_CLS_MHZ 1200
+
+/* Number of eDMA rings. */
+#define MPIPE_NUM_EDMA_RINGS 32
+
+/* MAC and channel counts. */
+#define MPIPE_NUM_SGMII_MACS 16
+#define MPIPE_NUM_XAUI_MACS 4
+#define MPIPE_NUM_LOOPBACK_CHANNELS 4
+#define MPIPE_NUM_NON_LB_CHANNELS 28
+
+#define MPIPE_NUM_IPKT_BLOCKS 1536
+
+/* Number of load balancer buckets. */
+#define MPIPE_NUM_BUCKETS 4160
+
+/* Number of NotifRings. */
+#define MPIPE_NUM_NOTIF_RINGS 256
+
+/* Number of NotifGroups. */
+#define MPIPE_NUM_NOTIF_GROUPS 32
+
+/* TLB entries per ASID, and the bit width of an index into them. */
+#define MPIPE_NUM_TLBS_PER_ASID 16
+#define MPIPE_TLB_IDX_WIDTH 4
+
+/* Number of entries in the MMIO service domain table. */
+#define MPIPE_MMIO_NUM_SVC_DOM 32
+
+#endif /* __ARCH_MPIPE_CONSTANTS_H__ */
diff --git a/arch/tile/include/arch/mpipe_def.h b/arch/tile/include/arch/mpipe_def.h
new file mode 100644
index 0000000..4998850
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_def.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_DEF_H__
+#define __ARCH_MPIPE_DEF_H__
+#define MPIPE_MMIO_ADDR__REGION_SHIFT 26
+#define MPIPE_MMIO_ADDR__REGION_VAL_CFG 0x0
+#define MPIPE_MMIO_ADDR__REGION_VAL_IDMA 0x4
+#define MPIPE_MMIO_ADDR__REGION_VAL_EDMA 0x5
+#define MPIPE_MMIO_ADDR__REGION_VAL_BSM 0x6
+#define MPIPE_BSM_REGION_VAL__VA_SHIFT 7
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_128 0x0
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_256 0x1
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_512 0x2
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1024 0x3
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1664 0x4
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_4096 0x5
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_10368 0x6
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_16384 0x7
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_DFA 0x0
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_FIXED 0x1
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_ALWAYS_PICK 0x2
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY 0x3
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY_RAND 0x7
+#define MPIPE_LBL_NR_STATE__FIRST_WORD 0x2138
+#endif /* !defined(__ARCH_MPIPE_DEF_H__) */
diff --git a/arch/tile/include/arch/mpipe_shm.h b/arch/tile/include/arch/mpipe_shm.h
new file mode 100644
index 0000000..deeda1d
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_shm.h
@@ -0,0 +1,421 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+
+#ifndef __ARCH_MPIPE_SHM_H__
+#define __ARCH_MPIPE_SHM_H__
+
+#include <arch/abi.h>
+#include <arch/mpipe_shm_def.h>
+
+#ifndef __ASSEMBLER__
+//! MPIPE eDMA Descriptor.
+//! The eDMA descriptor is written by software and consumed by hardware.  It
+//! is used to specify the location of egress packet data to be sent out of
+//! the chip via one of the packet interfaces.
+
+__extension__
+typedef union
+{
+  struct
+  {
+    // Word 0
+
+#ifndef __BIG_ENDIAN__
+    //! Generation number.  Used to indicate a valid descriptor in ring.  When
+    //! a new descriptor is written into the ring, software must toggle this
+    //! bit.  The net effect is that the GEN bit being written into new
+    //! descriptors toggles each time the ring tail pointer wraps.
+    uint_reg_t gen        : 1;
+    //! Reserved.  Must be zero.
+    uint_reg_t r0         : 7;
+    //! Checksum generation enabled for this transfer.
+    uint_reg_t csum       : 1;
+    //! Nothing to be sent.  Used, for example, when software has dropped a
+    //! packet but still wishes to return all of the associated buffers.
+    uint_reg_t ns         : 1;
+    //! Notification interrupt will be delivered when packet has been egressed.
+    uint_reg_t notif      : 1;
+    //! Boundary indicator.  When 1, this transfer includes the EOP for this
+    //! command.  Must be clear on all but the last descriptor for an egress
+    //! packet.
+    uint_reg_t bound      : 1;
+    //! Reserved.  Must be zero.
+    uint_reg_t r1         : 4;
+    //! Number of bytes to be sent for this descriptor.  When zero, no data
+    //! will be moved and the buffer descriptor will be ignored.  If the
+    //! buffer descriptor indicates that it is chained, the low 7 bits of the
+    //! VA indicate the offset within the first buffer (e.g. 127 bytes is the
+    //! maximum offset into the first buffer).  If the size exceeds a single
+    //! buffer, subsequent buffer descriptors will be fetched prior to
+    //! processing the next eDMA descriptor in the ring.
+    uint_reg_t xfer_size  : 14;
+    //! Reserved.  Must be zero.
+    uint_reg_t r2         : 2;
+    //! Destination of checksum relative to CSUM_START relative to the first
+    //! byte moved by this descriptor.  Must be zero if CSUM=0 in this
+    //! descriptor.  Must be less than XFER_SIZE (e.g. the first byte of the
+    //! CSUM_DEST must be within the span of this descriptor).
+    uint_reg_t csum_dest  : 8;
+    //! Start byte of checksum relative to the first byte moved by this
+    //! descriptor.  If this is not the first descriptor for the egress
+    //! packet, CSUM_START is still relative to the first byte in this
+    //! descriptor.  Must be zero if CSUM=0 in this descriptor.
+    uint_reg_t csum_start : 8;
+    //! Initial value for 16-bit 1's complement checksum if enabled via CSUM.
+    //! Specified in network order.  That is, bits[7:0] will be added to the
+    //! byte pointed to by CSUM_START and bits[15:8] will be added to the byte
+    //! pointed to by CSUM_START+1 (with appropriate 1's complement carries).
+    //! Must be zero if CSUM=0 in this descriptor.
+    uint_reg_t csum_seed  : 16;
+#else   // __BIG_ENDIAN__
+    uint_reg_t csum_seed  : 16;
+    uint_reg_t csum_start : 8;
+    uint_reg_t csum_dest  : 8;
+    uint_reg_t r2         : 2;
+    uint_reg_t xfer_size  : 14;
+    uint_reg_t r1         : 4;
+    uint_reg_t bound      : 1;
+    uint_reg_t notif      : 1;
+    uint_reg_t ns         : 1;
+    uint_reg_t csum       : 1;
+    uint_reg_t r0         : 7;
+    uint_reg_t gen        : 1;
+#endif
+
+    // Word 1
+
+#ifndef __BIG_ENDIAN__
+    //! Virtual address.  Must be sign extended by consumer.
+    int_reg_t va           : 42;
+    //! Reserved.
+    uint_reg_t __reserved_0 : 6;
+    //! Index of the buffer stack to which this buffer belongs.
+    uint_reg_t stack_idx    : 5;
+    //! Reserved.
+    uint_reg_t __reserved_1 : 3;
+    //! Instance ID.  For devices that support more than one mPIPE instance,
+    //! this field indicates the buffer owner.  If the INST field does not
+    //! match the mPIPE's instance number when a packet is egressed, buffers
+    //! with HWB set will be returned to the other mPIPE instance.
+    uint_reg_t inst         : 1;
+    //! Reserved.
+    uint_reg_t __reserved_2 : 1;
+    //! Always set to one by hardware in iDMA packet descriptors.  For eDMA,
+    //! indicates whether the buffer will be released to the buffer stack
+    //! manager.  When 0, software is responsible for releasing the buffer.
+    uint_reg_t hwb          : 1;
+    //! Encoded size of buffer.  Set by the ingress hardware for iDMA packet
+    //! descriptors.  For eDMA descriptors, indicates the buffer size if .c
+    //! indicates a chained packet.  If an eDMA descriptor is not chained and
+    //! the .hwb bit is not set, this field is ignored and the size is
+    //! specified by the .xfer_size field.
+    //! 0 = 128 bytes
+    //! 1 = 256 bytes
+    //! 2 = 512 bytes
+    //! 3 = 1024 bytes
+    //! 4 = 1664 bytes
+    //! 5 = 4096 bytes
+    //! 6 = 10368 bytes
+    //! 7 = 16384 bytes
+    uint_reg_t size         : 3;
+    //! Chaining configuration for the buffer.  Indicates that an ingress
+    //! packet or egress command is chained across multiple buffers, with each
+    //! buffer's size indicated by the .size field.
+    uint_reg_t c            : 2;
+#else   // __BIG_ENDIAN__
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t __reserved_2 : 1;
+    uint_reg_t inst         : 1;
+    uint_reg_t __reserved_1 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_0 : 6;
+    int_reg_t va           : 42;
+#endif
+
+  };
+
+  //! Word access
+  uint_reg_t words[2];
+} MPIPE_EDMA_DESC_t;
+
+//! MPIPE Packet Descriptor.
+//! The packet descriptor is filled by the mPIPE's classification,
+//! load-balancing, and buffer management services.  Some fields are consumed
+//! by mPIPE hardware, and others are consumed by Tile software.
+
+__extension__
+typedef union
+{
+  struct
+  {
+    // Word 0
+
+#ifndef __BIG_ENDIAN__
+    //! Notification ring into which this packet descriptor is written.
+    //! Typically written by load balancer, but can be overridden by
+    //! classification program if NR is asserted.
+    uint_reg_t notif_ring   : 8;
+    //! Source channel for this packet.  Written by mPIPE DMA hardware.
+    uint_reg_t channel      : 5;
+    //! Reserved.
+    uint_reg_t __reserved_0 : 1;
+    //! MAC Error.
+    //! Generated by the MAC interface.  Asserted if there was an overrun of
+    //! the MAC's receive FIFO.  This condition generally only occurs if the
+    //! mPIPE clock is running too slowly.
+    uint_reg_t me           : 1;
+    //! Truncation Error.
+    //! Written by the iDMA hardware.  Asserted if packet was truncated due to
+    //! insufficient space in iPkt buffer.
+    uint_reg_t tr           : 1;
+    //! Written by the iDMA hardware.  Indicates the number of bytes written
+    //! to Tile memory.  In general, this is the actual size of the packet as
+    //! received from the MAC.  But if the packet is truncated due to running
+    //! out of buffers or due to the iPkt buffer filling up, then the L2_SIZE
+    //! will be reduced to reflect the actual number of valid bytes written to
+    //! Tile memory.
+    uint_reg_t l2_size      : 14;
+    //! CRC Error.
+    //! Generated by the MAC.  Asserted if MAC indicated an L2 CRC error or
+    //! other L2 error (bad length etc.) on the packet.
+    uint_reg_t ce           : 1;
+    //! Cut Through.
+    //! Written by the iDMA hardware.  Asserted if packet was not completely
+    //! received before being sent to classifier.  L2_SIZE will indicate
+    //! number of bytes received so far.
+    uint_reg_t ct           : 1;
+    //! Written by the classification program.  Used by the load balancer to
+    //! select the ring into which this packet descriptor is written.
+    uint_reg_t bucket_id    : 13;
+    //! Reserved.
+    uint_reg_t __reserved_1 : 3;
+    //! Checksum.
+    //! Written by classification program.  When 1, the checksum engine will
+    //! perform checksum based on the CSUM_SEED, CSUM_START, and CSUM_BYTES
+    //! fields.  The result will be placed in CSUM_VAL.
+    uint_reg_t cs           : 1;
+    //! Notification Ring Select.
+    //! Written by the classification program.  When 1, NOTIF_RING is set by
+    //! the classification program rather than by the load balancer.
+    uint_reg_t nr           : 1;
+    //! Written by classification program.  Indicates whether packet and
+    //! descriptor should both be dropped, both be delivered, or only the
+    //! descriptor should be delivered.
+    uint_reg_t dest         : 2;
+    //! General Purpose Sequence Number Enable.
+    //! Written by the classification program.  When 1, the GP_SQN_SEL field
+    //! contains the sequence number selector and the GP_SQN field will be
+    //! replaced with the associated sequence number.  When clear, the GP_SQN
+    //! field is left intact and may be used as "Custom" bytes.
+    uint_reg_t sq           : 1;
+    //! TimeStamp Enable.
+    //! Enable TimeStamp insertion.  When clear, timestamp field may be filled
+    //! with custom data by classifier.  When set, hardware inserts the
+    //! timestamp when the start of packet is received from the MAC.
+    uint_reg_t ts           : 1;
+    //! Packet Sequence Number Enable.
+    //! Enable PacketSQN insertion.  When clear, PacketSQN field may be filled
+    //! with custom data by classifier.  When set, hardware inserts the packet
+    //! sequence number when the packet descriptor is written to a
+    //! notification ring.
+    uint_reg_t ps           : 1;
+    //! Buffer Error.
+    //! Written by the iDMA hardware.  Asserted if iDMA ran out of buffers
+    //! while writing the packet. Software must still return any buffer
+    //! descriptors whose C field indicates a valid descriptor was consumed.
+    uint_reg_t be           : 1;
+    //! Written by the classification program.  The associated counter is
+    //! incremented when the packet is sent.
+    uint_reg_t ctr0         : 5;
+    //! Reserved.
+    uint_reg_t __reserved_2 : 3;
+#else   // __BIG_ENDIAN__
+    uint_reg_t __reserved_2 : 3;
+    uint_reg_t ctr0         : 5;
+    uint_reg_t be           : 1;
+    uint_reg_t ps           : 1;
+    uint_reg_t ts           : 1;
+    uint_reg_t sq           : 1;
+    uint_reg_t dest         : 2;
+    uint_reg_t nr           : 1;
+    uint_reg_t cs           : 1;
+    uint_reg_t __reserved_1 : 3;
+    uint_reg_t bucket_id    : 13;
+    uint_reg_t ct           : 1;
+    uint_reg_t ce           : 1;
+    uint_reg_t l2_size      : 14;
+    uint_reg_t tr           : 1;
+    uint_reg_t me           : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t channel      : 5;
+    uint_reg_t notif_ring   : 8;
+#endif
+
+    // Word 1
+
+#ifndef __BIG_ENDIAN__
+    //! Written by the classification program.  The associated counter is
+    //! incremented when the packet is sent.
+    uint_reg_t ctr1          : 5;
+    //! Reserved.
+    uint_reg_t __reserved_3  : 3;
+    //! Written by classification program.  Indicates the start byte for
+    //! checksum.  Relative to 1st byte received from MAC.
+    uint_reg_t csum_start    : 8;
+    //! Checksum seed written by classification program.  Overwritten with
+    //! resultant checksum if CS bit is asserted.  The endianness of the CSUM
+    //! value bits when viewed by Tile software match the packet byte order.
+    //! That is, bits[7:0] of the resulting checksum value correspond to
+    //! earlier (more significant) bytes in the packet.  To avoid classifier
+    //! software from having to byte swap the CSUM_SEED, the iDMA checksum
+    //! engine byte swaps the classifier's result before seeding the checksum
+    //! calculation.  Thus, the CSUM_START byte of packet data is added to
+    //! bits[15:8] of the CSUM_SEED field generated by the classifier.  This
+    //! byte swap will be visible to Tile software if the CS bit is clear.
+    uint_reg_t csum_seed_val : 16;
+    //! Written by the classification program.  Not interpreted by mPIPE
+    //! hardware.
+    uint_reg_t custom0       : 32;
+#else   // __BIG_ENDIAN__
+    uint_reg_t custom0       : 32;
+    uint_reg_t csum_seed_val : 16;
+    uint_reg_t csum_start    : 8;
+    uint_reg_t __reserved_3  : 3;
+    uint_reg_t ctr1          : 5;
+#endif
+
+    // Word 2
+
+#ifndef __BIG_ENDIAN__
+    //! Written by the classification program.  Not interpreted by mPIPE
+    //! hardware.
+    uint_reg_t custom1 : 64;
+#else   // __BIG_ENDIAN__
+    uint_reg_t custom1 : 64;
+#endif
+
+    // Word 3
+
+#ifndef __BIG_ENDIAN__
+    //! Written by the classification program.  Not interpreted by mPIPE
+    //! hardware.
+    uint_reg_t custom2 : 64;
+#else   // __BIG_ENDIAN__
+    uint_reg_t custom2 : 64;
+#endif
+
+    // Word 4
+
+#ifndef __BIG_ENDIAN__
+    //! Written by the classification program.  Not interpreted by mPIPE
+    //! hardware.
+    uint_reg_t custom3 : 64;
+#else   // __BIG_ENDIAN__
+    uint_reg_t custom3 : 64;
+#endif
+
+    // Word 5
+
+#ifndef __BIG_ENDIAN__
+    //! Sequence number applied when packet is distributed.  Classifier
+    //! selects which sequence number is to be applied by writing the 13-bit
+    //! SQN-selector into this field.
+    uint_reg_t gp_sqn     : 16;
+    //! Written by notification hardware.  The packet sequence number is
+    //! incremented for each packet that wasn't dropped.
+    uint_reg_t packet_sqn : 48;
+#else   // __BIG_ENDIAN__
+    uint_reg_t packet_sqn : 48;
+    uint_reg_t gp_sqn     : 16;
+#endif
+
+    // Word 6
+
+#ifndef __BIG_ENDIAN__
+    //! Written by hardware when the start-of-packet is received by the mPIPE
+    //! from the MAC.  This is the nanoseconds part of the packet timestamp.
+    uint_reg_t time_stamp_ns  : 32;
+    //! Written by hardware when the start-of-packet is received by the mPIPE
+    //! from the MAC.  This is the seconds part of the packet timestamp.
+    uint_reg_t time_stamp_sec : 32;
+#else   // __BIG_ENDIAN__
+    uint_reg_t time_stamp_sec : 32;
+    uint_reg_t time_stamp_ns  : 32;
+#endif
+
+    // Word 7
+
+#ifndef __BIG_ENDIAN__
+    //! Virtual address.  Must be sign extended by consumer.
+    int_reg_t va           : 42;
+    //! Reserved.
+    uint_reg_t __reserved_4 : 6;
+    //! Index of the buffer stack to which this buffer belongs.
+    uint_reg_t stack_idx    : 5;
+    //! Reserved.
+    uint_reg_t __reserved_5 : 3;
+    //! Instance ID.  For devices that support more than one mPIPE instance,
+    //! this field indicates the buffer owner.  If the INST field does not
+    //! match the mPIPE's instance number when a packet is egressed, buffers
+    //! with HWB set will be returned to the other mPIPE instance.
+    uint_reg_t inst         : 1;
+    //! Reserved.
+    uint_reg_t __reserved_6 : 1;
+    //! Always set to one by hardware in iDMA packet descriptors.  For eDMA,
+    //! indicates whether the buffer will be released to the buffer stack
+    //! manager.  When 0, software is responsible for releasing the buffer.
+    uint_reg_t hwb          : 1;
+    //! Encoded size of buffer.  Set by the ingress hardware for iDMA packet
+    //! descriptors.  For eDMA descriptors, indicates the buffer size if .c
+    //! indicates a chained packet.  If an eDMA descriptor is not chained and
+    //! the .hwb bit is not set, this field is ignored and the size is
+    //! specified by the .xfer_size field.
+    //! 0 = 128 bytes
+    //! 1 = 256 bytes
+    //! 2 = 512 bytes
+    //! 3 = 1024 bytes
+    //! 4 = 1664 bytes
+    //! 5 = 4096 bytes
+    //! 6 = 10368 bytes
+    //! 7 = 16384 bytes
+    uint_reg_t size         : 3;
+    //! Chaining configuration for the buffer.  Indicates that an ingress
+    //! packet or egress command is chained across multiple buffers, with each
+    //! buffer's size indicated by the .size field.
+    uint_reg_t c            : 2;
+#else   // __BIG_ENDIAN__
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t __reserved_6 : 1;
+    uint_reg_t inst         : 1;
+    uint_reg_t __reserved_5 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_4 : 6;
+    int_reg_t va           : 42;
+#endif
+
+  };
+
+  //! Word access
+  uint_reg_t words[8];
+} MPIPE_PDESC_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_MPIPE_SHM_H__) */
diff --git a/arch/tile/include/arch/mpipe_shm_def.h b/arch/tile/include/arch/mpipe_shm_def.h
new file mode 100644
index 0000000..5ede122
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_shm_def.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_SHM_DEF_H__
+#define __ARCH_MPIPE_SHM_DEF_H__
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_UNCHAINED 0x0
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_CHAINED 0x1
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_NOT_RDY 0x2
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_INVALID 0x3
+#endif /* !defined(__ARCH_MPIPE_SHM_DEF_H__) */
diff --git a/arch/tile/include/gxio/iorpc_mpipe.h b/arch/tile/include/gxio/iorpc_mpipe.h
new file mode 100644
index 0000000..98d79bc
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_mpipe.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_MPIPE_LINUX_RPC_H__
+#define __GXIO_MPIPE_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1200)
+#define GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x1201)
+
+#define GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1203)
+#define GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x1204)
+#define GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1205)
+#define GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1206)
+#define GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1207)
+#define GXIO_MPIPE_OP_INIT_NOTIF_GROUP IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1208)
+#define GXIO_MPIPE_OP_ALLOC_BUCKETS    IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1209)
+#define GXIO_MPIPE_OP_INIT_BUCKET      IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120a)
+#define GXIO_MPIPE_OP_ALLOC_EDMA_RINGS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120b)
+#define GXIO_MPIPE_OP_INIT_EDMA_RING_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x120c)
+
+#define GXIO_MPIPE_OP_COMMIT_RULES     IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120f)
+#define GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1210)
+#define GXIO_MPIPE_OP_LINK_OPEN_AUX    IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1211)
+#define GXIO_MPIPE_OP_LINK_CLOSE_AUX   IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1212)
+
+#define GXIO_MPIPE_OP_ARM_POLLFD       IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9000)
+#define GXIO_MPIPE_OP_CLOSE_POLLFD     IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9001)
+#define GXIO_MPIPE_OP_GET_MMIO_BASE    IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_MPIPE_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t *context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags);
+
+int gxio_mpipe_init_buffer_stack_aux(gxio_mpipe_context_t *context,
+				     void *mem_va, size_t mem_size,
+				     unsigned int mem_flags, unsigned int stack,
+				     unsigned int buffer_size_enum);
+
+
+int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t *context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags);
+
+int gxio_mpipe_init_notif_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				   size_t mem_size, unsigned int mem_flags,
+				   unsigned int ring);
+
+int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					    int inter_x, int inter_y,
+					    int inter_ipi, int inter_event,
+					    unsigned int ring);
+
+int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					   unsigned int ring);
+
+int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t *context,
+				  unsigned int count, unsigned int first,
+				  unsigned int flags);
+
+int gxio_mpipe_init_notif_group(gxio_mpipe_context_t *context,
+				unsigned int group,
+				gxio_mpipe_notif_group_bits_t bits);
+
+int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t *context, unsigned int count,
+			     unsigned int first, unsigned int flags);
+
+int gxio_mpipe_init_bucket(gxio_mpipe_context_t *context, unsigned int bucket,
+			   MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info);
+
+int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags);
+
+int gxio_mpipe_init_edma_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				  size_t mem_size, unsigned int mem_flags,
+				  unsigned int ring, unsigned int channel);
+
+
+int gxio_mpipe_commit_rules(gxio_mpipe_context_t *context, const void *blob,
+			    size_t blob_size);
+
+int gxio_mpipe_register_client_memory(gxio_mpipe_context_t *context,
+				      unsigned int iotlb, HV_PTE pte,
+				      unsigned int flags);
+
+int gxio_mpipe_link_open_aux(gxio_mpipe_context_t *context,
+			     _gxio_mpipe_link_name_t name, unsigned int flags);
+
+int gxio_mpipe_link_close_aux(gxio_mpipe_context_t *context, int mac);
+
+
+int gxio_mpipe_arm_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie);
+
+int gxio_mpipe_close_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie);
+
+int gxio_mpipe_get_mmio_base(gxio_mpipe_context_t *context, HV_PTE *base);
+
+int gxio_mpipe_check_mmio_offset(gxio_mpipe_context_t *context,
+				 unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_MPIPE_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_mpipe_info.h b/arch/tile/include/gxio/iorpc_mpipe_info.h
new file mode 100644
index 0000000..965a94d
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_mpipe_info.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_MPIPE_INFO_LINUX_RPC_H__
+#define __GXIO_MPIPE_INFO_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+
+#define GXIO_MPIPE_INFO_OP_ENUMERATE_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1251)
+#define GXIO_MPIPE_INFO_OP_GET_MMIO_BASE IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+
+int gxio_mpipe_info_enumerate_aux(gxio_mpipe_info_context_t *context,
+				  unsigned int idx,
+				  _gxio_mpipe_link_name_t *name,
+				  _gxio_mpipe_link_mac_t *mac);
+
+int gxio_mpipe_info_get_mmio_base(gxio_mpipe_info_context_t *context,
+				  HV_PTE *base);
+
+int gxio_mpipe_info_check_mmio_offset(gxio_mpipe_info_context_t *context,
+				      unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_MPIPE_INFO_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/mpipe.h b/arch/tile/include/gxio/mpipe.h
new file mode 100644
index 0000000..7c9df46
--- /dev/null
+++ b/arch/tile/include/gxio/mpipe.h
@@ -0,0 +1,1986 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_MPIPE_H_
+#define _GXIO_MPIPE_H_
+
+/*
+ *
+ * An API for allocating, configuring, and manipulating mPIPE hardware
+ * resources.
+ */
+
+#include "common.h"
+#include "dma_queue.h"
+
+#include <arch/mpipe_def.h>
+#include <arch/mpipe_shm.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * The TILE-Gx mPIPE&tm; shim provides Ethernet connectivity, packet
+ * classification, and packet load balancing services.  The
+ * gxio_mpipe_ API, declared in <gxio/mpipe.h>, allows applications to
+ * allocate mPIPE IO channels, configure packet distribution
+ * parameters, and send and receive Ethernet packets.  The API is
+ * designed to be a minimal wrapper around the mPIPE hardware, making
+ * system calls only where necessary to preserve inter-process
+ * protection guarantees.
+ *
+ * The APIs described below allow the programmer to allocate and
+ * configure mPIPE resources.  As described below, the mPIPE is a
+ * single shared hardware device that provides partitionable resources
+ * that are shared between all applications in the system.  The
+ * gxio_mpipe_ API allows userspace code to make resource request
+ * calls to the hypervisor, which in turn keeps track of the
+ * resources in use by all applications, maintains protection
+ * guarantees, and resets resources upon application shutdown.
+ *
+ * We strongly recommend reading the mPIPE section of the IO Device
+ * Guide (UG404) before working with this API.  Most functions in the
+ * gxio_mpipe_ API are directly analogous to hardware interfaces and
+ * the documentation assumes that the reader understands those
+ * hardware interfaces.
+ *
+ * @section mpipe__ingress mPIPE Ingress Hardware Resources
+ *
+ * The mPIPE ingress hardware provides extensive hardware offload for
+ * tasks like packet header parsing, load balancing, and memory
+ * management.  This section provides a brief introduction to the
+ * hardware components and the gxio_mpipe_ calls used to manage them;
+ * see the IO Device Guide for a much more detailed description of the
+ * mPIPE's capabilities.
+ *
+ * When a packet arrives at one of the mPIPE's Ethernet MACs, it is
+ * assigned a channel number indicating which MAC received it.  It
+ * then proceeds through the following hardware pipeline:
+ *
+ * @subsection mpipe__classification Classification
+ *
+ * A set of classification processors run header parsing code on each
+ * incoming packet, extracting information including the destination
+ * MAC address, VLAN, Ethernet type, and five-tuple hash.  Some of
+ * this information is then used to choose which buffer stack will be
+ * used to hold the packet, and which bucket will be used by the load
+ * balancer to determine which application will receive the packet.
+ *
+ * The rules by which the buffer stack and bucket are chosen can be
+ * configured via the @ref gxio_mpipe_classifier API.  A given app can
+ * specify multiple rules, each one specifying a bucket range, and a
+ * set of buffer stacks, to be used for packets matching the rule.
+ * Each rule can optionally specify a restricted set of channels,
+ * VLANs, and/or dMACs, in which it is interested.  By default, a
+ * given rule starts out matching all channels associated with the
+ * mPIPE context's set of open links; all VLANs; and all dMACs.
+ * Subsequent restrictions can then be added.
+ *
+ * @subsection mpipe__load_balancing Load Balancing
+ *
+ * The mPIPE load balancer is responsible for choosing the NotifRing
+ * to which the packet will be delivered.  This decision is based on
+ * the bucket number indicated by the classification program.  In
+ * general, the bucket number is based on some number of low bits of
+ * the packet's flow hash (applications that aren't interested in flow
+ * hashing use a single bucket).  Each load balancer bucket keeps a
+ * record of the NotifRing to which packets directed to that bucket
+ * are currently being delivered.  Based on the bucket's load
+ * balancing mode (@ref gxio_mpipe_bucket_mode_t), the load balancer
+ * either forwards the packet to the previously assigned NotifRing or
+ * decides to choose a new NotifRing.  If a new NotifRing is required,
+ * the load balancer chooses the least loaded ring in the NotifGroup
+ * associated with the bucket.
+ *
+ * The load balancer is a shared resource.  Each application needs to
+ * explicitly allocate NotifRings, NotifGroups, and buckets, using
+ * gxio_mpipe_alloc_notif_rings(), gxio_mpipe_alloc_notif_groups(),
+ * and gxio_mpipe_alloc_buckets().  Then the application needs to
+ * configure them using gxio_mpipe_init_notif_ring() and
+ * gxio_mpipe_init_notif_group_and_buckets().
+ *
+ * @subsection mpipe__buffers Buffer Selection and Packet Delivery
+ *
+ * Once the load balancer has chosen the destination NotifRing, the
+ * mPIPE DMA engine pops at least one buffer off of the 'buffer stack'
+ * chosen by the classification program and DMAs the packet data into
+ * that buffer.  Each buffer stack provides a hardware-accelerated
+ * stack of data buffers with the same size.  If the packet data is
+ * larger than the buffers provided by the chosen buffer stack, the
+ * mPIPE hardware pops off multiple buffers and chains the packet data
+ * through a multi-buffer linked list.  Once the packet data is
+ * delivered to the buffer(s), the mPIPE hardware writes the
+ * ::gxio_mpipe_idesc_t metadata object (calculated by the classifier)
+ * into the NotifRing and increments the number of packets delivered
+ * to that ring.
+ *
+ * Applications can push buffers onto a buffer stack by calling
+ * gxio_mpipe_push_buffer() or by egressing a packet with the
+ * ::gxio_mpipe_edesc_t::hwb bit set, indicating that the egressed
+ * buffers should be returned to the stack.
+ *
+ * Applications can allocate and initialize buffer stacks with the
+ * gxio_mpipe_alloc_buffer_stacks() and gxio_mpipe_init_buffer_stack()
+ * APIs.
+ *
+ * The application must also register the memory pages that will hold
+ * packets.  This requires calling gxio_mpipe_register_page() for each
+ * memory page that will hold packets allocated by the application for
+ * a given buffer stack.  Since each buffer stack is limited to 16
+ * registered pages, it may be necessary to use huge pages, or even
+ * extremely huge pages, to hold all the buffers.
+ *
+ * @subsection mpipe__iqueue NotifRings
+ *
+ * Each NotifRing is a region of shared memory, allocated by the
+ * application, to which the mPIPE delivers packet descriptors
+ * (::gxio_mpipe_idesc_t).  The application can allocate them via
+ * gxio_mpipe_alloc_notif_rings().  The application can then either
+ * explicitly initialize them with gxio_mpipe_init_notif_ring() and
+ * then read from them manually, or can make use of the convenience
+ * wrappers provided by @ref gxio_mpipe_wrappers.
+ *
+ * @section mpipe__egress mPIPE Egress Hardware
+ *
+ * Applications use eDMA rings to queue packets for egress.  The
+ * application can allocate them via gxio_mpipe_alloc_edma_rings().
+ * The application can then either explicitly initialize them with
+ * gxio_mpipe_init_edma_ring() and then write to them manually, or
+ * can make use of the convenience wrappers provided by
+ * @ref gxio_mpipe_wrappers.
+ *
+ * @section gxio__shortcomings Plans for Future API Revisions
+ *
+ * The API defined here is only an initial version of the mPIPE API.
+ * Future plans include:
+ *
+ * - Higher level wrapper functions to provide common initialization
+ * patterns.  This should help users start writing mPIPE programs
+ * without having to learn the details of the hardware.
+ *
+ * - Support for reset and deallocation of resources, including
+ * cleanup upon application shutdown.
+ *
+ * - Support for calling these APIs in the BME.
+ *
+ * - Support for IO interrupts.
+ *
+ * - Clearer definitions of thread safety guarantees.
+ *
+ * @section gxio__mpipe_examples Examples
+ *
+ * See the following mPIPE example programs for more information about
+ * allocating mPIPE resources and using them in real applications:
+ *
+ * - @ref mpipe/ingress/app.c : Receiving packets.
+ *
+ * - @ref mpipe/forward/app.c : Forwarding packets.
+ *
+ * Note that there are several more examples.
+ */
+
+/* Flags that can be passed to resource allocation functions. */
+enum gxio_mpipe_alloc_flags_e {
+  /* Require an allocation to start at a specified resource index. */
+	GXIO_MPIPE_ALLOC_FIXED = HV_MPIPE_ALLOC_FIXED,
+};
+
+/* Flags that can be passed to memory registration functions. */
+enum gxio_mpipe_mem_flags_e {
+  /* Do not fill L3 when writing, and invalidate lines upon egress. */
+	GXIO_MPIPE_MEM_FLAG_NT_HINT = IORPC_MEM_BUFFER_FLAG_NT_HINT,
+
+  /* L3 cache fills should only populate IO cache ways. */
+	GXIO_MPIPE_MEM_FLAG_IO_PIN = IORPC_MEM_BUFFER_FLAG_IO_PIN,
+};
+
+/* An ingress packet descriptor.  When a packet arrives, the mPIPE
+ * hardware generates this structure and writes it into a
+ * NotifRing.
+ */
+typedef MPIPE_PDESC_t gxio_mpipe_idesc_t;
+
+/* An egress packet descriptor.  Applications write this structure
+ * into eDMA rings and the hardware performs the indicated egress
+ * command.
+ */
+typedef MPIPE_EDMA_DESC_t gxio_mpipe_edesc_t;
+
+/* Get the "va" field from an "idesc".
+ *
+ * This is the address at which the ingress hardware copied the first
+ * byte of the packet.
+ *
+ * If the classifier detected a custom header, then this will point to
+ * the custom header, and gxio_mpipe_idesc_get_l2_start() will point
+ * to the actual L2 header.
+ *
+ * Note that this value may be misleading if "idesc->be" is set.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_va(gxio_mpipe_idesc_t * idesc)
+{
+	return (unsigned char *)(long)idesc->va;
+}
+
+/* Get the "xfer_size" from an "idesc".
+ *
+ * This is the actual number of packet bytes transferred into memory
+ * by the hardware.
+ *
+ * Note that this value may be misleading if "idesc->be" is set.
+ *
+ * @param idesc An ingress packet descriptor.
+ *
+ * ISSUE: Is this the best name for this?
+ * FIXME: Add more docs about chaining, clipping, etc.
+ */
+static inline unsigned int
+gxio_mpipe_idesc_get_xfer_size(gxio_mpipe_idesc_t * idesc)
+{
+	return idesc->l2_size;
+}
+
+/* Get the "flow_hash" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * The default classifier uses a five-tuple hash (src/dest IP addr,
+ * src/dest port, protocol) for IPv4/IPv6 TCP/UDP packets, a two-tuple
+ * hash (src/dest IP addr) for other IPv4/IPv6 packets, and otherwise
+ * a two-tuple hash (dest/src mac addr).
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint32_t
+gxio_mpipe_idesc_get_flow_hash(gxio_mpipe_idesc_t * idesc)
+{
+	return idesc->custom0;
+}
+
+/* Get the "vlan" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * The default classifier uses a vlan of 0xFFFF unless it processed
+ * a known vlan-encapsulating ethertype, in which case this will be
+ * one of the encapsulated vlans.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint16_t gxio_mpipe_idesc_get_vlan(gxio_mpipe_idesc_t * idesc)
+{
+	return (idesc->custom1 >> 0) & 0xFFFF;
+}
+
+/* Get the "ethertype" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * This is normally the final ethertype detected by the classifier.
+ *
+ * The default classifier only understands a few basic ethertypes.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint16_t
+gxio_mpipe_idesc_get_ethertype(gxio_mpipe_idesc_t * idesc)
+{
+	return (idesc->custom1 >> 16) & 0xFFFF;
+}
+
+/* Get the "l2_offset" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * This is the number of bytes between the "va" and the L2 header.
+ *
+ * The L2 header consists of a destination mac address, a source mac
+ * address, and an initial ethertype.  Various initial ethertypes
+ * allow encoding extra information in the L2 header, often including
+ * a vlan, and/or a new ethertype.
+ *
+ * Note that the "l2_offset" will be non-zero if (and only if) the
+ * classifier processed a custom header for the packet.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint8_t gxio_mpipe_idesc_get_l2_offset(gxio_mpipe_idesc_t * idesc)
+{
+	return (idesc->custom1 >> 32) & 0xFF;
+}
+
+/* Get the "l2_start" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_va() plus
+ * gxio_mpipe_idesc_get_l2_offset().
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_l2_start(gxio_mpipe_idesc_t *
+							   idesc)
+{
+	unsigned char *va = gxio_mpipe_idesc_get_va(idesc);
+	return va + gxio_mpipe_idesc_get_l2_offset(idesc);
+}
+
+/* Get the "l2_length" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_xfer_size() minus
+ * gxio_mpipe_idesc_get_l2_offset().
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned int
+gxio_mpipe_idesc_get_l2_length(gxio_mpipe_idesc_t * idesc)
+{
+	unsigned int xfer_size = idesc->l2_size;
+	return xfer_size - gxio_mpipe_idesc_get_l2_offset(idesc);
+}
+
+/* Get the "l3_offset" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * This is the number of bytes between the "va" and the L3 header,
+ * or, if the classifier was unable to identify the L3 header, then
+ * the first byte after the last known ethertype.
+ *
+ * The default classifier knows that ethertype 0x0800 indicates an
+ * IPv4 L3 header and ethertype 0x86DD indicates an IPv6 L3 header.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint8_t gxio_mpipe_idesc_get_l3_offset(gxio_mpipe_idesc_t * idesc)
+{
+	return (idesc->custom1 >> 40) & 0xFF;
+}
+
+/* Get the "l3_start" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_va() plus
+ * gxio_mpipe_idesc_get_l3_offset().
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_l3_start(gxio_mpipe_idesc_t *
+							   idesc)
+{
+	unsigned char *va = gxio_mpipe_idesc_get_va(idesc);
+	return va + gxio_mpipe_idesc_get_l3_offset(idesc);
+}
+
+/* Get the "l3_length" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_xfer_size() minus
+ * gxio_mpipe_idesc_get_l3_offset().
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned int
+gxio_mpipe_idesc_get_l3_length(gxio_mpipe_idesc_t * idesc)
+{
+	unsigned int xfer_size = idesc->l2_size;
+	return xfer_size - gxio_mpipe_idesc_get_l3_offset(idesc);
+}
+
+/* Get the "l4_offset" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * This is the number of bytes between the "va" and the first byte of
+ * the L4 header, or, if the classifier was unable to properly parse
+ * the L3 header, then zero.
+ *
+ * The default classifier sets "l4_offset" (only) for IPv4/IPv6 packets.
+ *
+ * For IPv4/IPv6 TCP packets, the "l4_start" points at the TCP header.
+ *
+ * For IPv4/IPv6 UDP packets, the "l4_start" points at the UDP header.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint8_t gxio_mpipe_idesc_get_l4_offset(gxio_mpipe_idesc_t * idesc)
+{
+	return (idesc->custom1 >> 48) & 0xFF;
+}
+
+/* Get the "l4_start" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_va() plus
+ * gxio_mpipe_idesc_get_l4_offset().
+ *
+ * Note that if "l4_offset" is zero, then this function is pointless.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_l4_start(gxio_mpipe_idesc_t *
+							   idesc)
+{
+	unsigned char *va = gxio_mpipe_idesc_get_va(idesc);
+	return va + gxio_mpipe_idesc_get_l4_offset(idesc);
+}
+
+/* Get the "l4_length" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_xfer_size() minus
+ * gxio_mpipe_idesc_get_l4_offset().
+ *
+ * Note that if "l4_offset" is zero, then this function is pointless.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned int
+gxio_mpipe_idesc_get_l4_length(gxio_mpipe_idesc_t * idesc)
+{
+	unsigned int xfer_size = idesc->l2_size;
+	return xfer_size - gxio_mpipe_idesc_get_l4_offset(idesc);
+}
+
+/* Get the "status" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * The "status" is a set of bit flags.
+ *
+ * Bit 0x80 indicates that the packet is "bad".
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint8_t gxio_mpipe_idesc_get_status(gxio_mpipe_idesc_t * idesc)
+{
+	return (idesc->custom1 >> 56) & 0xFF;
+}
+
+/* Determine if a packet descriptor is "bad".
+ *
+ * This function checks for three "bad" conditions.
+ *
+ * (1) The packet has a bad IPv4 header checksum, as evidenced by
+ * "(gxio_mpipe_idesc_get_status(idesc) & 0x80) != 0".
+ *
+ * (2) The packet has a bad TCP/UDP checksum, as evidenced by
+ * "idesc->cs && idesc->csum_seed_val != 0xFFFF".
+ *
+ * (3) The packet has a buffer error, as evidenced by "idesc->be".
+ *
+ * See also gxio_mpipe_iqueue_drop_if_bad().
+ *
+ * @param idesc An ingress packet descriptor.
+ * @return 1 if packet is "bad", or 0 otherwise.
+ */
+static inline int gxio_mpipe_idesc_is_bad(gxio_mpipe_idesc_t * idesc)
+{
+	return ((gxio_mpipe_idesc_get_status(idesc) & 0x80) ||
+		(idesc->cs && (idesc->csum_seed_val != 0xFFFF)) || idesc->be);
+}
+
+/* Initialize an "edesc" from some fields in an "idesc".
+ *
+ * In particular, this function zeros out the edesc, and then sets
+ * "edesc->bound" to one, and "edesc->xfer_size" to "idesc->l2_size",
+ * and then copies the "va", "stack_idx", "inst", "hwb", "size", and
+ * "c" fields from "idesc", in a fairly efficient manner.
+ *
+ * If "idesc->be" is set, then this function will set "edesc->ns" to
+ * one, and "edesc->xfer_size" to zero.
+ *
+ * @param edesc An egress packet descriptor.
+ * @param idesc An ingress packet descriptor.
+ */
+static inline void
+gxio_mpipe_edesc_copy_idesc(gxio_mpipe_edesc_t * edesc,
+			    gxio_mpipe_idesc_t * idesc)
+{
+	edesc->words[0] = 0;
+	edesc->words[1] = idesc->words[7];
+	edesc->ns = idesc->be;
+	edesc->bound = 1;
+	edesc->xfer_size = idesc->be ? 0 : idesc->l2_size;
+}
+
+/* Set the "va" field in an "edesc".
+ *
+ * @param edesc An egress packet descriptor.
+ * @param va The address of some packet data.
+ */
+static inline void gxio_mpipe_edesc_set_va(gxio_mpipe_edesc_t * edesc, void *va)
+{
+	edesc->va = (unsigned long)va;
+}
+
+/* A context object used to manage mPIPE hardware resources. */
+typedef struct {
+
+  /* File descriptor for calling up to Linux (and thus the HV). */
+	int fd;
+
+  /* The VA at which configuration registers are mapped. */
+	char *mmio_cfg_base;
+
+  /* The VA at which IDMA, EDMA, and buffer manager are mapped. */
+	char *mmio_fast_base;
+
+  /* The "initialized" buffer stacks. */
+	gxio_mpipe_rules_stacks_t __stacks;
+
+} gxio_mpipe_context_t;
+
+/* This is only used internally, but it's most easily made visible here. */
+typedef gxio_mpipe_context_t gxio_mpipe_info_context_t;
+
+/* Initialize an mPIPE context.
+ *
+ * This function allocates an mPIPE "service domain" and maps the MMIO
+ * registers into the caller's VA space.
+ *
+ * @param context Context object to be initialized.
+ * @param mpipe_instance Instance number of mPIPE shim to be controlled via
+ *  context.
+ */
+extern int
+gxio_mpipe_init(gxio_mpipe_context_t * context, unsigned int mpipe_instance);
+
+/*****************************************************************
+ *                         Buffer Stacks                          *
+ ******************************************************************/
+
+/* Allocate a set of buffer stacks.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of stacks required.
+ * @param first Index of first stack if ::GXIO_MPIPE_ALLOC_FIXED flag is set,
+ *   otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer stack, or
+ * ::GXIO_MPIPE_ERR_NO_BUFFER_STACK if allocation failed.
+ */
+extern int
+gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t * context,
+			       unsigned int count, unsigned int first,
+			       unsigned int flags);
+
+/* Enum codes for buffer sizes supported by mPIPE. */
+typedef enum {
+  /* 128 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_128 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_128,
+  /* 256 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_256 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_256,
+  /* 512 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_512 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_512,
+  /* 1024 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_1024 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1024,
+  /* 1664 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_1664 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1664,
+  /* 4096 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_4096 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_4096,
+  /* 10368 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_10368 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_10368,
+  /* 16384 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_16384 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_16384
+} gxio_mpipe_buffer_size_enum_t;
+
+/* Convert a buffer size in bytes into a buffer size enum. */
+extern gxio_mpipe_buffer_size_enum_t
+gxio_mpipe_buffer_size_to_buffer_size_enum(size_t size);
+
+/* Convert a buffer size enum into a buffer size in bytes. */
+extern size_t
+gxio_mpipe_buffer_size_enum_to_buffer_size(gxio_mpipe_buffer_size_enum_t
+					   buffer_size_enum);
+
+/* Calculate the number of bytes required to store a given number of
+ * buffers in the memory registered with a buffer stack via
+ * gxio_mpipe_init_buffer_stack().
+ */
+extern size_t gxio_mpipe_calc_buffer_stack_bytes(unsigned long buffers);
+
+/* Initialize a buffer stack.  This function binds a region of memory
+ * to be used by the hardware for storing buffer addresses pushed via
+ * gxio_mpipe_push_buffer() or as the result of sending a buffer out
+ * the egress with the 'push to stack when done' bit set.  Once this
+ * function returns, the memory region's contents may be arbitrarily
+ * modified by the hardware at any time and software should not access
+ * the memory region again.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @param buffer_size_enum The size of each buffer in the buffer stack,
+ * as an enum.
+ * @param mem The address of the buffer stack.  This memory must be
+ * physically contiguous and aligned to a 64kB boundary.
+ * @param mem_size The size of the buffer stack, in bytes.
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ * @return Zero on success, ::GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE if
+ * buffer_size_enum is invalid, ::GXIO_MPIPE_ERR_BAD_BUFFER_STACK if
+ * stack has not been allocated.
+ */
+extern int
+gxio_mpipe_init_buffer_stack(gxio_mpipe_context_t * context,
+			     unsigned int stack,
+			     gxio_mpipe_buffer_size_enum_t buffer_size_enum,
+			     void *mem, size_t mem_size,
+			     unsigned int mem_flags);
+
+/* Push a buffer onto a previously initialized buffer stack.
+ *
+ * The size of the buffer being pushed must match the size that was
+ * registered with gxio_mpipe_init_buffer_stack().  All packet buffer
+ * addresses are 128-byte aligned; the low 7 bits of the specified
+ * buffer address will be ignored.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @param buffer The buffer (the low seven bits are ignored).
+ */
+static inline void
+gxio_mpipe_push_buffer(gxio_mpipe_context_t * context,
+		       unsigned int stack, void *buffer)
+{
+	MPIPE_BSM_REGION_ADDR_t offset = { {0} };
+	MPIPE_BSM_REGION_VAL_t val = { {0} };
+
+	/*
+	 * The mmio_fast_base region starts at the IDMA region, so subtract
+	 * off that initial offset.
+	 */
+	offset.region =
+	    MPIPE_MMIO_ADDR__REGION_VAL_BSM - MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.stack = stack;
+
+#if __SIZEOF_POINTER__ == 4
+	val.va = ((unsigned long)buffer) >> MPIPE_BSM_REGION_VAL__VA_SHIFT;
+#else
+	val.va = ((long)buffer) >> MPIPE_BSM_REGION_VAL__VA_SHIFT;
+#endif
+
+	__gxio_mmio_write(context->mmio_fast_base + offset.word, val.word);
+}
+
+/* Pop a buffer off of a previously initialized buffer stack.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @return The buffer, or NULL if the stack is empty.
+ */
+static inline void *gxio_mpipe_pop_buffer(gxio_mpipe_context_t * context,
+					  unsigned int stack)
+{
+	MPIPE_BSM_REGION_ADDR_t offset = { {0} };
+
+	/*
+	 * The mmio_fast_base region starts at the IDMA region, so subtract
+	 * off that initial offset.
+	 */
+	offset.region =
+	    MPIPE_MMIO_ADDR__REGION_VAL_BSM - MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.stack = stack;
+
+	while (1) {
+		/*
+		 * Case 1: val.c == ..._UNCHAINED, va is non-zero.
+		 * Case 2: val.c == ..._INVALID, va is zero.
+		 * Case 3: val.c == ..._NOT_RDY, va is zero.
+		 */
+		MPIPE_BSM_REGION_VAL_t val;
+		val.word =
+		    __gxio_mmio_read(context->mmio_fast_base + offset.word);
+
+		/*
+		 * Handle case 1 and 2 by returning the buffer (or NULL).
+		 * Handle case 3 by waiting for the prefetch buffer to refill.
+		 */
+		if (val.c != MPIPE_EDMA_DESC_WORD1__C_VAL_NOT_RDY)
+			return (void *)((unsigned long)val.
+					va << MPIPE_BSM_REGION_VAL__VA_SHIFT);
+	}
+}
+
+/*****************************************************************
+ *                          NotifRings                            *
+ ******************************************************************/
+
+/* Allocate a set of NotifRings.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * Note that NotifRings are allocated in chunks, so allocating one at
+ * a time is much less efficient than allocating several at once.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of NotifRings required.
+ * @param first Index of first NotifRing if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated NotifRing, or
+ * ::GXIO_MPIPE_ERR_NO_NOTIF_RING if allocation failed.
+ */
+extern int
+gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t * context,
+			     unsigned int count, unsigned int first,
+			     unsigned int flags);
+
+/* Initialize a NotifRing, using the given memory and size.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_idesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 128, 512,
+ * 2048, or 65536 times sizeof(gxio_mpipe_idesc_t).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_NOTIF_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int
+gxio_mpipe_init_notif_ring(gxio_mpipe_context_t * context,
+			   unsigned int ring,
+			   void *mem, size_t mem_size, unsigned int mem_flags);
+
+/* Configure an interrupt to be sent to a tile on incoming NotifRing
+ *  traffic.  Once an interrupt is sent for a particular ring, no more
+ *  will be sent until gxio_mpipe_enable_notif_ring_interrupt() is called.
+ *
+ * @param context An initialized mPIPE context.
+ * @param x X coordinate of interrupt target tile.
+ * @param y Y coordinate of interrupt target tile.
+ * @param i Index of the IPI register which will receive the interrupt.
+ * @param e Specific event which will be set in the target IPI register when
+ * the interrupt occurs.
+ * @param ring The NotifRing index.
+ * @return Zero on success, GXIO_ERR_INVAL if params are out of range.
+ */
+extern int
+gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t * context,
+					int x, int y, int i, int e,
+					unsigned int ring);
+
+/* Enable an interrupt on incoming NotifRing traffic.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index.
+ * @return Zero on success, GXIO_ERR_INVAL if params are out of range.
+ */
+extern int
+gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t * context,
+				       unsigned int ring);
+
+/* Map all of a client's memory via the given IOTLB.
+ * @param context An initialized mPIPE context.
+ * @param iotlb IOTLB index.
+ * @param pte Page table entry.
+ * @param flags Flags.
+ * @return Zero on success, or a negative error code.
+ */
+extern int
+gxio_mpipe_register_client_memory(gxio_mpipe_context_t * context,
+				  unsigned int iotlb, HV_PTE pte,
+				  unsigned int flags);
+
+/*****************************************************************
+ *                        Notif Groups                            *
+ ******************************************************************/
+
+/* Allocate a set of NotifGroups.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of NotifGroups required.
+ * @param first Index of first NotifGroup if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated NotifGroup, or
+ * ::GXIO_MPIPE_ERR_NO_NOTIF_GROUP if allocation failed.
+ */
+extern int
+gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t * context,
+			      unsigned int count, unsigned int first,
+			      unsigned int flags);
+
+/* Add a NotifRing to a NotifGroup.  This only sets a bit in the
+ * application's 'bits' object; the hardware NotifGroup can be
+ * initialized by passing 'bits' to gxio_mpipe_init_notif_group() or
+ * gxio_mpipe_init_notif_group_and_buckets().
+ */
+static inline void
+gxio_mpipe_notif_group_add_ring(gxio_mpipe_notif_group_bits_t * bits, int ring)
+{
+	bits->ring_mask[ring / 64] |= (1ull << (ring % 64));
+}
+
+/* Set a particular NotifGroup bitmask.  Since the load balancer
+ * makes decisions based on both bucket and NotifGroup state, most
+ * applications should use gxio_mpipe_init_notif_group_and_buckets()
+ * rather than using this function to configure just a NotifGroup.
+ */
+extern int
+gxio_mpipe_init_notif_group(gxio_mpipe_context_t * context, unsigned int group,
+			    gxio_mpipe_notif_group_bits_t bits);
+
+/*****************************************************************
+ *                         Load Balancer                          *
+ ******************************************************************/
+
+/* Allocate a set of load balancer buckets.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * Note that buckets are allocated in chunks, so allocating one at
+ * a time is much less efficient than allocating several at once.
+ *
+ * Note that the buckets are actually divided into two sub-ranges, of
+ * different sizes, and different chunk sizes, and the range you get
+ * by default is determined by the size of the request.  Allocations
+ * cannot span the two sub-ranges.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of buckets required.
+ * @param first Index of first bucket if ::GXIO_MPIPE_ALLOC_FIXED flag is set,
+ *   otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated bucket, or
+ * ::GXIO_MPIPE_ERR_NO_BUCKET if allocation failed.
+ */
+extern int
+gxio_mpipe_alloc_buckets(gxio_mpipe_context_t * context,
+			 unsigned int count, unsigned int first,
+			 unsigned int flags);
+
+/* The legal modes for gxio_mpipe_bucket_info_t and
+ * gxio_mpipe_init_notif_group_and_buckets().
+ *
+ * All modes except ::GXIO_MPIPE_BUCKET_ROUND_ROBIN expect that the user
+ * will allocate a power-of-two number of buckets and initialize them
+ * to the same mode.  The classifier program then uses the appropriate
+ * number of low bits from the incoming packet's flow hash to choose a
+ * load balancer bucket.  Based on that bucket's load balancing mode,
+ * reference count, and currently active NotifRing, the load balancer
+ * chooses the NotifRing to which the packet will be delivered.
+ */
+typedef enum {
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, in which case packets will be dropped.  If
+	 * the bucket reference count ever reaches zero, a new NotifRing may
+	 * be chosen.
+	 */
+	GXIO_MPIPE_BUCKET_DYNAMIC_FLOW_AFFINITY
+	    = MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_DFA,
+
+	/* All packets for a bucket always go to the same NotifRing.
+	 */
+	GXIO_MPIPE_BUCKET_STATIC_FLOW_AFFINITY
+	    = MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_FIXED,
+
+	/* All packets for a bucket go to the least full NotifRing in the
+	 * group, providing load balancing round-robin behavior.
+	 */
+	GXIO_MPIPE_BUCKET_ROUND_ROBIN
+	    = MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_ALWAYS_PICK,
+
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, at which point the bucket starts using the
+	 * least full NotifRing in the group.  If all NotifRings in the
+	 * group are full, packets will be dropped.
+	 */
+	GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY
+	    = MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY,
+
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, or a random timer fires, at which point the
+	 * bucket starts using the least full NotifRing in the group.  If
+	 * all NotifRings in the group are full, packets will be dropped.
+	 * WARNING: This mode is BROKEN on chips with fewer than 64 tiles.
+	 */
+	GXIO_MPIPE_BUCKET_PREFER_FLOW_LOCALITY
+	    = MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY_RAND,
+
+} gxio_mpipe_bucket_mode_t;
+
+/* Copy a set of bucket initialization values into the mPIPE
+ * hardware.  Since the load balancer makes decisions based on both
+ * bucket and NotifGroup state, most applications should use
+ * gxio_mpipe_init_notif_group_and_buckets() rather than using this
+ * function to configure a single bucket.
+ *
+ * @param context An initialized mPIPE context.
+ * @param bucket Bucket index to be initialized.
+ * @param bucket_info Initial reference count, NotifRing index, and mode.
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_BUCKET on failure.
+ */
+extern int
+gxio_mpipe_init_bucket(gxio_mpipe_context_t * context, unsigned int bucket,
+		       gxio_mpipe_bucket_info_t bucket_info);
+
+/* Initializes a group and range of buckets and range of rings such
+ * that the load balancer runs a particular load balancing function.
+ *
+ * First, the group is initialized with the given rings.
+ *
+ * Second, each bucket is initialized with the mode and group, and a
+ * ring chosen round-robin from the given rings.
+ *
+ * Normally, the classifier picks a bucket, and then the load balancer
+ * picks a ring, based on the bucket's mode, group, and current ring,
+ * possibly updating the bucket's ring.
+ *
+ * @param context An initialized mPIPE context.
+ * @param group The group.
+ * @param ring The first ring.
+ * @param num_rings The number of rings.
+ * @param bucket The first bucket.
+ * @param num_buckets The number of buckets.
+ * @param mode The load balancing mode.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_BUCKET,
+ * ::GXIO_MPIPE_ERR_BAD_NOTIF_GROUP, or
+ * ::GXIO_MPIPE_ERR_BAD_NOTIF_RING on failure.
+ */
+extern int
+gxio_mpipe_init_notif_group_and_buckets(gxio_mpipe_context_t * context,
+					unsigned int group,
+					unsigned int ring,
+					unsigned int num_rings,
+					unsigned int bucket,
+					unsigned int num_buckets,
+					gxio_mpipe_bucket_mode_t mode);
+
+/* Return credits to a NotifRing and/or bucket.
+ *
+ * The ring and bucket are encoded into the offset of a single MMIO
+ * store relative to the context's "mmio_fast_base"; the stored value
+ * carries the credit count.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index, or -1 (any negative value) to leave
+ * the ring uncredited.
+ * @param bucket The bucket, or -1 (any negative value) to leave the
+ * bucket uncredited.
+ * @param count The number of credits to return.
+ */
+static inline void
+gxio_mpipe_credit(gxio_mpipe_context_t * context,
+		  int ring, int bucket, unsigned int count)
+{
+	/* NOTE: Fancy struct initialization would break "C89" header test. */
+
+	MPIPE_IDMA_RELEASE_REGION_ADDR_t offset = { {0} };
+	MPIPE_IDMA_RELEASE_REGION_VAL_t val = { {0} };
+
+	/*
+	 * The mmio_fast_base region starts at the IDMA region, so subtract
+	 * off that initial offset.  The subtraction below is of the IDMA
+	 * region value from itself, so the region field ends up as zero.
+	 */
+	offset.region =
+	    MPIPE_MMIO_ADDR__REGION_VAL_IDMA - MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.ring = ring;
+	offset.bucket = bucket;
+	offset.ring_enable = (ring >= 0);	/* negative ring: no ring credit */
+	offset.bucket_enable = (bucket >= 0);	/* negative bucket: no bucket credit */
+	val.count = count;
+
+	__gxio_mmio_write(context->mmio_fast_base + offset.word, val.word);
+}
+
+/*****************************************************************
+ *                         Egress Rings                           *
+ ******************************************************************/
+
+/* Allocate a set of eDMA rings.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of eDMA rings required.
+ * @param first Index of first eDMA ring if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated eDMA ring, or
+ * ::GXIO_MPIPE_ERR_NO_EDMA_RING if allocation failed.
+ */
+extern int
+gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t * context,
+			    unsigned int count, unsigned int first,
+			    unsigned int flags);
+
+/* Initialize an eDMA ring, using the given memory and size.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The eDMA ring index.
+ * @param channel The channel to use.  This must be one of the channels
+ * associated with the context's set of open links.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_edesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 512, 2048,
+ * 8192 or 65536, times 16 (i.e. sizeof(gxio_mpipe_edesc_t)).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_EDMA_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int
+gxio_mpipe_init_edma_ring(gxio_mpipe_context_t * context,
+			  unsigned int ring, unsigned int channel,
+			  void *mem, size_t mem_size, unsigned int mem_flags);
+
+/*****************************************************************
+ *                      Classifier Program                        *
+ ******************************************************************/
+
+/*
+ *
+ * Functions for loading or configuring the mPIPE classifier program.
+ *
+ * The mPIPE classification processors all run a special "classifier"
+ * program which, for each incoming packet, parses the packet headers,
+ * encodes some packet metadata in the "idesc", and either drops the
+ * packet, or picks a notif ring to handle the packet, and a buffer
+ * stack to contain the packet, usually based on the channel, VLAN,
+ * dMAC, flow hash, and packet size, under the guidance of the "rules"
+ * API described below.
+ *
+ * @section gxio_mpipe_classifier_default Default Classifier
+ *
+ * The MDE provides a simple "default" classifier program.  It is
+ * shipped as source in "$TILERA_ROOT/src/sys/mpipe/classifier.c",
+ * which serves as its official documentation.  It is shipped as a
+ * binary program in "$TILERA_ROOT/tile/boot/classifier", which is
+ * automatically included in bootroms created by "tile-monitor", and
+ * is automatically loaded by the hypervisor at boot time.
+ *
+ * The L2 analysis handles LLC packets, SNAP packets, and "VLAN
+ * wrappers" (keeping the outer VLAN).
+ *
+ * The L3 analysis handles IPv4 and IPv6, dropping packets with bad
+ * IPv4 header checksums, requesting computation of a TCP/UDP checksum
+ * if appropriate, and hashing the dest and src IP addresses, plus the
+ * ports for TCP/UDP packets, into the flow hash.  No special analysis
+ * is done for "fragmented" packets or "tunneling" protocols.  Thus,
+ * the first fragment of a fragmented TCP/UDP packet is hashed using
+ * src/dest IP address and ports and all subsequent fragments are only
+ * hashed according to src/dest IP address.
+ *
+ * The L3 analysis handles other packets too, hashing the dMAC
+ * and sMAC into a flow hash.
+ *
+ * The channel, VLAN, and dMAC are used to pick a "rule" (see the
+ * "rules" APIs below), which in turn is used to pick a buffer stack
+ * (based on the packet size) and a bucket (based on the flow hash).
+ *
+ * To receive traffic matching a particular channel/VLAN/dMAC
+ * pattern, an application should allocate its own buffer stacks and
+ * load balancer buckets, and map traffic to those stacks and buckets,
+ * as described by the "rules" API below.
+ *
+ * Various packet metadata is encoded in the idesc.  The flow hash is
+ * four bytes at 0x0C.  The VLAN is two bytes at 0x10.  The ethtype is
+ * two bytes at 0x12.  The l3 start is one byte at 0x14.  The l4 start
+ * is one byte at 0x15 for IPv4 and IPv6 packets, and otherwise zero.
+ * The protocol is one byte at 0x16 for IPv4 and IPv6 packets, and
+ * otherwise zero.
+ *
+ * @section gxio_mpipe_classifier_custom Custom Classifiers.
+ *
+ * A custom classifier may be created using "tile-mpipe-cc" with a
+ * customized version of the default classifier sources.
+ *
+ * The custom classifier may be included in bootroms using the
+ * "--classifier" option to "tile-monitor", or loaded dynamically
+ * using gxio_mpipe_classifier_load_from_file().
+ *
+ * Be aware that "extreme" customizations may break the assumptions of
+ * the "rules" APIs described below, but simple customizations, such
+ * as adding new packet metadata, should be fine.
+ */
+
+/* A set of classifier rules, plus a context.
+ *
+ * Operated on through the gxio_mpipe_rules_*() functions declared
+ * below; set up with gxio_mpipe_rules_init().
+ */
+typedef struct {
+
+	/* The mPIPE context (presumably the one given to
+	 * gxio_mpipe_rules_init() -- confirm against its implementation).
+	 */
+	gxio_mpipe_context_t *context;
+
+	/* The actual rules. */
+	gxio_mpipe_rules_list_t list;
+
+} gxio_mpipe_rules_t;
+
+/* Initialize a classifier program rules list.
+ *
+ * This function can be called on a previously initialized rules list
+ * to discard any previously added rules.
+ *
+ * @param rules Rules list to initialize.
+ * @param context An initialized mPIPE context.
+ */
+extern void
+gxio_mpipe_rules_init(gxio_mpipe_rules_t * rules,
+		      gxio_mpipe_context_t * context);
+
+/* Begin a new rule on the indicated rules list.
+ *
+ * Note that an empty rule matches all packets, but an empty rule list
+ * matches no packets.
+ *
+ * @param rules Rules list to which new rule is appended.
+ * @param bucket First load balancer bucket to which packets will be
+ * delivered.
+ * @param num_buckets Number of buckets (must be a power of two) across
+ * which packets will be distributed based on the "flow hash".
+ * @param stacks Either NULL, to assign each packet to the smallest
+ * initialized buffer stack which does not induce chaining (and to
+ * drop packets which exceed the largest initialized buffer stack
+ * buffer size), or an array, with each entry indicating which buffer
+ * stack should be used for packets up to that size (with 255
+ * indicating that those packets should be dropped).
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_begin(gxio_mpipe_rules_t * rules,
+		       unsigned int bucket, unsigned int num_buckets,
+		       gxio_mpipe_rules_stacks_t * stacks);
+
+/* Set the priority of the current rule.
+ *
+ * The default priority is zero.  A negative priority has "higher"
+ * priority, and a positive priority has "lower" priority.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param priority The priority.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_set_priority(gxio_mpipe_rules_t * rules, int priority);
+
+/* Set the headroom of the current rule.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param headroom The headroom.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_set_headroom(gxio_mpipe_rules_t * rules, uint8_t headroom);
+
+/* Set the tailroom of the current rule.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param tailroom The tailroom.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_set_tailroom(gxio_mpipe_rules_t * rules, uint8_t tailroom);
+
+/* Set the capacity of the current rule.
+ *
+ * If the packet size, plus the headroom and tailroom, exceeds the
+ * capacity, then the packet will be dropped.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param capacity The capacity.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_set_capacity(gxio_mpipe_rules_t * rules, uint16_t capacity);
+
+/* Indicate that packets from a particular channel can be delivered
+ * to the buckets and buffer stacks associated with the current rule.
+ *
+ * Channels added must be associated with links opened by the mPIPE context
+ * used in gxio_mpipe_rules_init().  A rule with no channels is equivalent
+ * to a rule naming all such associated channels.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param channel The channel to add.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_add_channel(gxio_mpipe_rules_t * rules, unsigned int channel);
+
+/* Indicate that packets targeting a particular destination MAC
+ * address can be delivered to the current rule's buckets and buffer
+ * stacks.
+ *
+ * A rule with NO dMACs is equivalent to a rule with ALL dMACs.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param dmac The destination MAC to add.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_add_dmac(gxio_mpipe_rules_t * rules,
+			  gxio_mpipe_rules_dmac_t dmac);
+
+/* Indicate that packets with a particular VLAN can be delivered to
+ * the current rule's buckets and buffer stacks.
+ *
+ * A rule with NO VLANs is equivalent to a rule with ALL VLANs.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param vlan The VLAN to add.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int
+gxio_mpipe_rules_add_vlan(gxio_mpipe_rules_t * rules,
+			  gxio_mpipe_rules_vlan_t vlan);
+
+/* Commit rules.
+ *
+ * The rules are sent to the hypervisor, where they are combined with
+ * the rules from other apps, and used to program the hardware classifier.
+ *
+ * Note that if this function returns an error, then the rules will NOT
+ * have been committed, even if the error is due to interactions with
+ * rules from another app.
+ *
+ * @param rules Rules list to commit.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_commit(gxio_mpipe_rules_t * rules);
+
+/*****************************************************************
+ *                     Ingress Queue Wrapper                      *
+ ******************************************************************/
+
+/*
+ *
+ * Convenience functions for receiving packets from a NotifRing and
+ * sending packets via an eDMA ring.
+ *
+ * The mpipe ingress and egress hardware uses shared memory packet
+ * descriptors to describe packets that have arrived on ingress or
+ * are destined for egress.  These descriptors are stored in shared
+ * memory ring buffers and written or read by hardware as necessary.
+ * The gxio library provides wrapper functions that manage the head and
+ * tail pointers for these rings, allowing the user to easily read or
+ * write packet descriptors.
+ *
+ * The initialization interface for ingress and egress rings is quite
+ * similar.  For example, to create an ingress queue, the user passes
+ * a ::gxio_mpipe_iqueue_t state object, a ring number from
+ * gxio_mpipe_alloc_notif_rings(), and the address of memory to hold a
+ * ring buffer to the gxio_mpipe_iqueue_init() function.  The function
+ * returns success when the state object has been initialized and the
+ * hardware configured to deliver packets to the specified ring
+ * buffer.  Similarly, gxio_mpipe_equeue_init() takes a
+ * ::gxio_mpipe_equeue_t state object, a ring number from
+ * gxio_mpipe_alloc_edma_rings(), and a shared memory buffer.
+ *
+ * @section gxio_mpipe_iqueue Working with Ingress Queues
+ *
+ * Once initialized, the gxio_mpipe_iqueue_t API provides two flows
+ * for getting the ::gxio_mpipe_idesc_t packet descriptor associated
+ * with incoming packets.  The simplest is to call
+ * gxio_mpipe_iqueue_get() or gxio_mpipe_iqueue_try_get().  These
+ * functions copy the oldest packet descriptor out of the NotifRing and
+ * into a descriptor provided by the caller.  They also immediately
+ * inform the hardware that a descriptor has been processed.
+ *
+ * For applications with stringent performance requirements, higher
+ * efficiency can be achieved by avoiding the packet descriptor copy
+ * and processing multiple descriptors at once.  The
+ * gxio_mpipe_iqueue_peek() and gxio_mpipe_iqueue_try_peek() functions
+ * allow such optimizations.  These functions provide a pointer to the
+ * next valid ingress descriptor in the NotifRing's shared memory ring
+ * buffer, and a count of how many contiguous descriptors are ready to
+ * be processed.  The application can then process any number of those
+ * descriptors in place, calling gxio_mpipe_iqueue_consume() to inform
+ * the hardware after each one has been processed.
+ *
+ * @section gxio_mpipe_equeue Working with Egress Queues
+ *
+ * Similarly, the egress queue API provides a high-performance
+ * interface plus a simple wrapper for use in posting
+ * ::gxio_mpipe_edesc_t egress packet descriptors.  The simple
+ * version, gxio_mpipe_equeue_put(), allows the programmer to wait for
+ * an eDMA ring slot to become available and write a single descriptor
+ * into the ring.
+ *
+ * Alternatively, the gxio_mpipe_equeue_reserve() and
+ * gxio_mpipe_equeue_put_at() APIs can be used to reserve multiple
+ * eDMA ring slots and then fill each slot with a
+ * ::gxio_mpipe_edesc_t.  This capability can be used to reduce
+ * per-operation overhead by posting multiple packets with a single
+ * gxio_mpipe_equeue_reserve() call.  It also allows gather operations
+ * to be performed by posting multiple descriptors, one for each
+ * fragment in the final egress packet.
+ *
+ * The 'slot' number returned by gxio_mpipe_equeue_reserve() is really a
+ * 63-bit sequence number, the low bits of which indicate the ring
+ * buffer index and the high bits the number of times the application
+ * has gone around the egress ring buffer.  The extra bits allow an
+ * application to check for egress completion by calling
+ * gxio_mpipe_equeue_is_complete() to see whether a particular 'slot'
+ * number has finished.  Given the maximum packet rates of the Gx
+ * processor, the 63-bit slot number will never wrap.
+ *
+ * In practice, most applications use the ::gxio_mpipe_edesc_t::hwb
+ * bit to indicate that the buffers containing egress packet data
+ * should be pushed onto a buffer stack when egress is complete.  In
+ * this case, the programmer generally does not need to know when an
+ * egress operation actually finishes, since there is no need to free
+ * a buffer post-egress.
+ *
+ * @section gxio_mpipe_equeue_ordered Ordered Packet Forwarding
+ *
+ * The gxio_mpipe_equeue_put_at() API call can also be used to perform
+ * in-order forwarding.  mPIPE ingress packets can be marked with
+ * sequence numbers stored in ::gxio_mpipe_idesc_t.  If ingress is
+ * configured to provide sequence numbers, an application can use
+ * gxio_mpipe_equeue_put_at() to put packets into the eDMA ring slot
+ * indicated by their ingress sequence number, in effect forcing the
+ * system to perform ordered packet forwarding.  When using this
+ * mechanism, applications should take care to obey the following
+ * rules:
+ *
+ * - Never call gxio_mpipe_equeue_reserve() on an eDMA ring that is
+ * used for ordered forwarding; that function knows nothing about the
+ * descriptors that have been posted given a sequence number.
+ *
+ * - Make sure that the number of ingress buffers is less than the
+ * number of slots in the eDMA ring.  This guarantees that a burst of
+ * ingress packets cannot overflow the slots available in the eDMA
+ * ring buffer.
+ *
+ * - gxio_mpipe_equeue_put_at() must be called once for each ingress
+ * packet.  Skipping a packet will cause the hardware to stall waiting
+ * for the next in-order packet descriptor.
+ *
+ * - If the application chooses to drop a packet rather than forward
+ * it, it can set the ::gxio_mpipe_edesc_t::ns (no send) bit on the
+ * descriptor passed to gxio_mpipe_equeue_put_at() to indicate that no
+ * data should be sent.  If indicated, the buffer will still be pushed
+ * onto the buffer stack when the egress descriptor is processed.
+ */
+
+/* A convenient interface to a NotifRing, for use by a single thread.
+ */
+typedef struct {
+
+	/* The mPIPE context (supplies the MMIO bases used by the helpers). */
+	gxio_mpipe_context_t *context;
+
+	/* The actual NotifRing memory, indexed by descriptor slot. */
+	gxio_mpipe_idesc_t *idescs;
+
+	/* The number of entries. */
+	unsigned long num_entries;
+
+	/* The number of entries minus one; used as the index wrap mask. */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries. */
+	unsigned long log2_num_entries;
+
+	/* Index of the next entry to be peeked at / advanced over. */
+	unsigned int head;
+
+	/* The NotifRing id. */
+	unsigned int ring;
+
+#ifdef __BIG_ENDIAN__
+	/* The number of entries, counted from "head", already byteswapped
+	 * in place by gxio_mpipe_iqueue_try_peek().
+	 */
+	unsigned int swapped;
+#endif
+
+} gxio_mpipe_iqueue_t;
+
+/* Initialize an "iqueue".
+ *
+ * Takes the iqueue plus the same args as gxio_mpipe_init_notif_ring().
+ */
+extern int
+gxio_mpipe_iqueue_init(gxio_mpipe_iqueue_t * iqueue,
+		       gxio_mpipe_context_t * context,
+		       unsigned int ring,
+		       void *mem, size_t mem_size, unsigned int mem_flags);
+
+/* Advance over some old entries in an iqueue.
+ *
+ * Please see the documentation for gxio_mpipe_iqueue_consume().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param count The number of entries to advance over.
+ */
+static inline void
+gxio_mpipe_iqueue_advance(gxio_mpipe_iqueue_t * iqueue, int count)
+{
+	/*
+	 * Split the un-wrapped position into an in-ring index and an
+	 * overflow count.  Adding the two means that stepping past the
+	 * last entry lands on index 1 rather than 0.
+	 * NOTE(review): presumably slot 0 is not a descriptor, since
+	 * gxio_mpipe_iqueue_try_peek() reads the tail from the start of
+	 * "idescs" -- confirm.
+	 */
+	int unwrapped = iqueue->head + count;
+	unsigned long index = unwrapped & iqueue->mask_num_entries;
+	int overflow = unwrapped >> iqueue->log2_num_entries;
+
+	iqueue->head = index + overflow;
+
+#ifdef __BIG_ENDIAN__
+	/* HACK: Keep the byteswapped-entry count relative to the new head. */
+	iqueue->swapped -= count;
+#endif
+}
+
+/* Release the ring and bucket for an old entry in an iqueue.
+ *
+ * Releasing the ring allows more packets to be delivered to the ring.
+ *
+ * Releasing the bucket allows flows using the bucket to be moved to a
+ * new ring when using GXIO_MPIPE_BUCKET_DYNAMIC_FLOW_AFFINITY.
+ *
+ * This function is shorthand for "gxio_mpipe_credit(iqueue->context,
+ * iqueue->ring, idesc->bucket_id, 1)", and it may be more convenient
+ * to make that underlying call, using those values, instead of
+ * tracking the entire "idesc".
+ *
+ * If packet processing is deferred, optimal performance requires that
+ * the releasing be deferred as well.
+ *
+ * Please see the documentation for gxio_mpipe_iqueue_consume().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc The descriptor which was processed.
+ */
+static inline void
+gxio_mpipe_iqueue_release(gxio_mpipe_iqueue_t * iqueue,
+			  gxio_mpipe_idesc_t * idesc)
+{
+	/* Hand back one credit for both the ring and the packet's bucket. */
+	int bucket = idesc->bucket_id;
+
+	gxio_mpipe_credit(iqueue->context, iqueue->ring, bucket, 1);
+}
+
+/* Consume a packet from an "iqueue".
+ *
+ * Advances the queue by one entry and then releases (credits) the
+ * ring and bucket of the given descriptor.
+ *
+ * After processing packets peeked at via gxio_mpipe_iqueue_peek()
+ * or gxio_mpipe_iqueue_try_peek(), you must call this function, or
+ * gxio_mpipe_iqueue_advance() plus gxio_mpipe_iqueue_release(), to
+ * advance over those entries, and release their rings and buckets.
+ *
+ * You may call this function as each packet is processed, or you can
+ * wait until several packets have been processed.
+ *
+ * Note that if you are using a single bucket, and you are handling
+ * batches of N packets, then you can replace several calls to this
+ * function with calls to "gxio_mpipe_iqueue_advance(iqueue, N)" and
+ * "gxio_mpipe_credit(iqueue->context, iqueue->ring, bucket, N)".
+ *
+ * Note that if your classifier sets "idesc->nr", then you should
+ * explicitly call "gxio_mpipe_iqueue_advance(iqueue, 1)" plus
+ * "gxio_mpipe_credit(iqueue->context, iqueue->ring, -1, 1)", to
+ * avoid incorrectly crediting the (unused) bucket.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc The descriptor which was processed.
+ */
+static inline void
+gxio_mpipe_iqueue_consume(gxio_mpipe_iqueue_t * iqueue,
+			  gxio_mpipe_idesc_t * idesc)
+{
+	gxio_mpipe_iqueue_advance(iqueue, 1);	/* step over this entry */
+	gxio_mpipe_iqueue_release(iqueue, idesc);	/* credit ring + bucket */
+}
+
+/* Peek at the next packet(s) in an "iqueue", without waiting.
+ *
+ * If no packets are available, fills idesc_ref with NULL, and then
+ * returns ::GXIO_MPIPE_ERR_IQUEUE_EMPTY.  Otherwise, fills idesc_ref
+ * with the address of the next valid packet descriptor, and returns
+ * the maximum number of valid descriptors which can be processed.
+ * You may process fewer descriptors if desired.
+ *
+ * Call gxio_mpipe_iqueue_consume() on each packet once it has been
+ * processed (or dropped), to allow more packets to be delivered.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc_ref A pointer to a packet descriptor pointer.
+ * @return The (positive) number of packets which can be processed,
+ * or ::GXIO_MPIPE_ERR_IQUEUE_EMPTY if no packets are available.
+ */
+static inline int
+gxio_mpipe_iqueue_try_peek(gxio_mpipe_iqueue_t * iqueue,
+			   gxio_mpipe_idesc_t ** idesc_ref)
+{
+	gxio_mpipe_idesc_t *next;
+
+	uint64_t head = iqueue->head;
+	uint64_t tail = __gxio_mmio_read(iqueue->idescs);
+
+	/*
+	 * Available entries.  The tail index was read above from the
+	 * start of the ring memory.  When the tail is at or ahead of the
+	 * head, everything in between is available; otherwise only the
+	 * run from the head to the end of the ring is reported, so the
+	 * result is always one contiguous span starting at idescs[head].
+	 */
+	uint64_t avail =
+	    (tail >= head) ? (tail - head) : (iqueue->num_entries - head);
+
+	if (avail == 0) {
+		*idesc_ref = NULL;
+		return GXIO_MPIPE_ERR_IQUEUE_EMPTY;
+	}
+
+	next = &iqueue->idescs[head];
+
+	/* ISSUE: Is this helpful? */
+	__insn_prefetch(next);
+
+#ifdef __BIG_ENDIAN__
+	/*
+	 * HACK: Swap new entries directly in memory.  Entries below
+	 * "swapped" (counted from "next") were already swapped by an
+	 * earlier call, so only [swapped, avail) is touched; all eight
+	 * 64-bit words of each descriptor are byte-reversed, and the new
+	 * count is remembered so no entry is swapped twice.
+	 */
+	{
+		int i, j;
+		for (i = iqueue->swapped; i < avail; i++) {
+			for (j = 0; j < 8; j++)
+				next[i].words[j] =
+				    __builtin_bswap64(next[i].words[j]);
+		}
+		iqueue->swapped = avail;
+	}
+#endif
+
+	*idesc_ref = next;
+
+	return avail;
+}
+
+/* Drop a packet by pushing its buffer (if appropriate).
+ *
+ * NOTE: The caller must still call gxio_mpipe_iqueue_consume() if idesc
+ * came from gxio_mpipe_iqueue_try_peek() or gxio_mpipe_iqueue_peek().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc A packet descriptor.
+ */
+static inline void
+gxio_mpipe_iqueue_drop(gxio_mpipe_iqueue_t * iqueue, gxio_mpipe_idesc_t * idesc)
+{
+	unsigned char *va;
+
+	/* FIXME: Handle "chaining" properly. */
+
+	/*
+	 * No buffer is pushed back when the descriptor's "be" flag is set
+	 * (presumably "buffer error" -- confirm against the idesc layout).
+	 */
+	if (idesc->be)
+		return;
+
+	/* Return the packet's buffer to the stack it was taken from. */
+	va = gxio_mpipe_idesc_get_va(idesc);
+	gxio_mpipe_push_buffer(iqueue->context, idesc->stack_idx, va);
+}
+
+/* Drop a packet if it is "bad".
+ *
+ * If gxio_mpipe_idesc_is_bad(idesc), then call gxio_mpipe_iqueue_drop()
+ * and return 1, else, return 0.
+ *
+ * NOTE: The caller must still call gxio_mpipe_iqueue_consume() if idesc
+ * came from gxio_mpipe_iqueue_try_peek() or gxio_mpipe_iqueue_peek().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc A packet descriptor.
+ * @return 1 if packet was dropped, else 0.
+ */
+static inline int
+gxio_mpipe_iqueue_drop_if_bad(gxio_mpipe_iqueue_t * iqueue,
+			      gxio_mpipe_idesc_t * idesc)
+{
+	/* Good packets are left untouched. */
+	if (!gxio_mpipe_idesc_is_bad(idesc))
+		return 0;
+
+	/* Bad packet: drop it and tell the caller. */
+	gxio_mpipe_iqueue_drop(iqueue, idesc);
+	return 1;
+}
+
+/* Return the iqueue fullness.
+ *
+ * The hardware tracks the fullness of each NotifRing in a 3-bit quantized
+ * value readable via user-space MMIO.  0 is empty, 7 is full.  The values
+ * in between are controlled by the MPIPE_LBL_QUANT_THRESH registers.  They
+ * will typically be configured to be exponential such that each incremental
+ * 3-bit fullness value represents an exponentially larger number of packets
+ * in the queue.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @return the 3 bit quantized fullness of the iqueue.
+ */
+static inline int gxio_mpipe_iqueue_get_fullness(gxio_mpipe_iqueue_t * iqueue)
+{
+	/*
+	 * Sixteen rings share each state word, with consecutive words
+	 * spaced 8 apart from MPIPE_LBL_NR_STATE__FIRST_WORD, and each
+	 * ring owns a 4-bit field within its word.
+	 */
+	unsigned int word_offset = (iqueue->ring >> 4) << 3;
+	unsigned int field_shift = 4 * (iqueue->ring & 0xf);
+
+	void *mmio_addr = iqueue->context->mmio_cfg_base +
+	    MPIPE_LBL_NR_STATE__FIRST_WORD + word_offset;
+	uint64_t quant_reg = __gxio_mmio_read(mmio_addr);
+
+	/* Only the low 3 bits of the ring's field carry the fullness. */
+	return (quant_reg >> field_shift) & 0x7;
+}
+
+/*****************************************************************
+ *                      Egress Queue Wrapper                      *
+ ******************************************************************/
+
+/* A convenient, thread-safe interface to an eDMA ring. */
+typedef struct {
+
+	/* State object for tracking head and tail pointers. */
+	__gxio_dma_queue_t dma_queue;
+
+	/* The ring entries. */
+	gxio_mpipe_edesc_t *edescs;
+
+	/* The number of entries minus one; masks a slot to a ring index. */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries; shifting a slot right by
+	 * this much gives the lap count whose low bit picks the "gen" bit.
+	 */
+	unsigned long log2_num_entries;
+
+} gxio_mpipe_equeue_t;
+
+/* Initialize an "equeue".
+ *
+ * Takes the equeue plus the same args as gxio_mpipe_init_edma_ring().
+ */
+extern int
+gxio_mpipe_equeue_init(gxio_mpipe_equeue_t * equeue,
+		       gxio_mpipe_context_t * context,
+		       unsigned int edma_ring_id,
+		       unsigned int channel,
+		       void *mem, unsigned int mem_size,
+		       unsigned int mem_flags);
+
+/* Reserve slots for eDMA commands.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ *
+ * This differs from gxio_mpipe_equeue_try_reserve() only in passing 1
+ * rather than 0 as the final argument of __gxio_dma_queue_reserve_aux().
+ * NOTE(review): presumably that flag makes the call wait for space
+ * instead of failing immediately -- confirm against the helper.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve.
+ * @return The first reserved slot, or a negative error code.
+ */
+static inline int64_t
+gxio_mpipe_equeue_reserve(gxio_mpipe_equeue_t * equeue, unsigned int num)
+{
+	return __gxio_dma_queue_reserve_aux(&equeue->dma_queue, num, 1);
+}
+
+/* Reserve slots for eDMA commands, if possible.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ *
+ * This differs from gxio_mpipe_equeue_reserve() only in passing 0
+ * rather than 1 as the final argument of __gxio_dma_queue_reserve_aux().
+ * NOTE(review): presumably that makes the call fail with an error code
+ * instead of waiting when the ring is full -- confirm against the helper.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve.
+ * @return The first reserved slot, or a negative error code.
+ */
+static inline int64_t
+gxio_mpipe_equeue_try_reserve(gxio_mpipe_equeue_t * equeue, unsigned int num)
+{
+	return __gxio_dma_queue_reserve_aux(&equeue->dma_queue, num, 0);
+}
+
+/*
+ * HACK: This helper function tricks gcc 4.6 into avoiding saving
+ * a copy of "edesc->words[0]" on the stack for no obvious reason.
+ *
+ * Writes the two descriptor words "ew" into ring entry
+ * "slot & mask_num_entries", after OR-ing the generation bit into
+ * ew[0].  Note that ew[0] is modified in the caller's array.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param ew The two words of the eDMA descriptor; ew[0] is updated.
+ * @param slot An egress slot (only the low bits are actually used).
+ */
+
+static inline void
+gxio_mpipe_equeue_put_at_aux(gxio_mpipe_equeue_t * equeue,
+			     uint_reg_t ew[2], unsigned long slot)
+{
+	unsigned long edma_slot = slot & equeue->mask_num_entries;
+	gxio_mpipe_edesc_t *edesc_p = &equeue->edescs[edma_slot];
+
+	/*
+	 * ISSUE: Could set eDMA ring to be on generation 1 at start, which
+	 * would avoid the negation here, perhaps allowing "__insn_bfins()".
+	 *
+	 * The low bit of ew[0] is set on even trips around the ring and
+	 * left alone on odd trips.  NOTE(review): because this is an OR,
+	 * the bit is never cleared here, so callers presumably pass a
+	 * descriptor whose low bit is zero -- confirm.
+	 */
+	ew[0] |= !((slot >> equeue->log2_num_entries) & 1);
+
+	/*
+	 * NOTE: We use "__gxio_mmio_write64()", plus the fact that the eDMA
+	 * queue alignment restrictions ensure that these two words are on
+	 * the same cacheline, to force proper ordering between the stores.
+	 * Word 1 is stored first and word 0, which carries the generation
+	 * bit, last.
+	 */
+	__gxio_mmio_write64(&edesc_p->words[1], ew[1]);
+	__gxio_mmio_write64(&edesc_p->words[0], ew[0]);
+}
+
+/* Post an eDMA command to an eDMA queue at a given egress slot.
+ *
+ * This function copies the supplied edesc into entry "slot mod N" in
+ * the underlying ring, setting the "gen" bit to the appropriate value
+ * based on "(slot mod N*2)", where "N" is the size of the ring.  Note
+ * that the higher bits of slot are unused.
+ *
+ * Normally this function is used to fill in slots reserved by, for
+ * example, gxio_mpipe_equeue_reserve().
+ *
+ * This function can also be used without "reserving" slots, if the
+ * application KNOWS that the ring can never overflow, for example,
+ * by pushing fewer buffers into the buffer stacks than there are
+ * total slots in the equeue.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param edesc eDMA command to be posted (by value; caller's copy is unchanged).
+ * @param slot An egress slot (only the low bits are actually used).
+ */
+static inline void
+gxio_mpipe_equeue_put_at(gxio_mpipe_equeue_t * equeue,
+			 gxio_mpipe_edesc_t edesc, unsigned long slot)
+{
+	gxio_mpipe_equeue_put_at_aux(equeue, edesc.words, slot);
+}
+
+/* Post a single eDMA command to an eDMA queue.
+ *
+ * This is a convenience wrapper around gxio_mpipe_equeue_reserve()
+ * and gxio_mpipe_equeue_put_at().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param edesc eDMA command to be posted.
+ * @return 0 on success, or the negative error code from the reservation.
+ */
+static inline int
+gxio_mpipe_equeue_put(gxio_mpipe_equeue_t * equeue, gxio_mpipe_edesc_t edesc)
+{
+	int64_t slot = gxio_mpipe_equeue_reserve(equeue, 1);
+	if (slot < 0)
+		return (int)slot;
+
+	gxio_mpipe_equeue_put_at(equeue, edesc, slot);
+
+	return 0;
+}
+
+/* Ask the mPIPE hardware to egress outstanding packets immediately.
+ *
+ * This call is not necessary, but may slightly reduce overall latency.
+ *
+ * Technically, you should flush all gxio_mpipe_equeue_put_at() writes
+ * to memory before calling this function, to ensure the descriptors
+ * are visible in memory before the mPIPE hardware actually looks for
+ * them.  But this should be very rare, and the only side effect would
+ * be increased latency, so it is up to the caller to decide whether
+ * or not to flush memory.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ */
+static inline void gxio_mpipe_equeue_flush(gxio_mpipe_equeue_t * equeue)
+{
+	/* Post an all-zero value ("ring_idx = 0", "count = 0") to "wake up" the eDMA ring. */
+	MPIPE_EDMA_POST_REGION_VAL_t val = { {0} };
+	__insn_flushwb();	/* Flush the write buffers before the MMIO post. */
+	__gxio_mmio_write(equeue->dma_queue.post_region_addr, val.word);
+}
+
+/* Determine if a given eDMA command has been completed.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param slot The slot used by the eDMA command (e.g. from a reserve call).
+ * @param update If true, and the command does not appear to have completed
+ * yet, then update any software cache of the hardware completion counter,
+ * and check again.  This should normally be true.
+ * @return Nonzero iff the given eDMA command has been completed.
+ *
+ * ISSUE: This should return "bool" and should take "bool update".
+ */
+static inline int
+gxio_mpipe_equeue_is_complete(gxio_mpipe_equeue_t * equeue, int64_t slot,
+			      int update)
+{
+	return __gxio_dma_queue_is_complete(&equeue->dma_queue, slot, update);
+}
+
+/*****************************************************************
+ *                        Link Management                         *
+ ******************************************************************/
+
+/*
+ *
+ * Functions for manipulating and sensing the state and configuration
+ * of physical network links.
+ *
+ * @section gxio_mpipe_link_perm Link Permissions
+ *
+ * Opening a link (with gxio_mpipe_link_open()) requests a set of link
+ * permissions, which control what may be done with the link, and potentially
+ * what permissions may be granted to other processes.
+ *
+ * Data permission allows the process to receive packets from the link by
+ * specifying the link's channel number in mPIPE packet distribution rules,
+ * and to send packets to the link by using the link's channel number as
+ * the target for an eDMA ring.
+ *
+ * Stats permission allows the process to retrieve link attributes (such as
+ * the speeds it is capable of running at, or whether it is currently up), and
+ * to read and write certain statistics-related registers in the link's MAC.
+ *
+ * Control permission allows the process to retrieve and modify link attributes
+ * (so that it may, for example, bring the link up and take it down), and
+ * read and write many registers in the link's MAC and PHY.
+ *
+ * Any permission may be requested as shared, which allows other processes
+ * to also request shared permission, or exclusive, which prevents other
+ * processes from requesting it.  In keeping with GXIO's typical usage in
+ * an embedded environment, the defaults for all permissions are shared.
+ *
+ * Permissions are granted on a first-come, first-served basis, so if two
+ * applications request an exclusive permission on the same link, the one
+ * to run first will win.  Note, however, that some system components, like
+ * the kernel Ethernet driver, may get an opportunity to open links before
+ * any applications run.
+ *
+ * @section gxio_mpipe_link_names Link Names
+ *
+ * Link names are of the form gbe<em>number</em> (for Gigabit Ethernet),
+ * xgbe<em>number</em> (for 10 Gigabit Ethernet), loop<em>number</em> (for
+ * internal mPIPE loopback), or ilk<em>number</em>/<em>channel</em>
+ * (for Interlaken links); for instance, gbe0, xgbe1, loop3, and
+ * ilk0/12 are all possible link names.  The correspondence between
+ * the link name and an mPIPE instance number or mPIPE channel number is
+ * system-dependent; not all links will exist on all systems, and the set
+ * of numbers used for a particular link type may not start at zero and may
+ * not be contiguous.  Use gxio_mpipe_link_enumerate() to retrieve the set of
+ * links which exist on a system, and always use gxio_mpipe_link_instance()
+ * to determine which mPIPE controls a particular link.
+ *
+ * Note that in some cases, links may share hardware, such as PHYs, or
+ * internal mPIPE buffers; in these cases, only one of the links may be
+ * opened at a time.  This is especially common with xgbe and gbe ports,
+ * since each xgbe port uses 4 SERDES lanes, each of which may also be
+ * configured as one gbe port.
+ *
+ * @section gxio_mpipe_link_states Link States
+ *
+ * The mPIPE link management model revolves around three different states,
+ * which are maintained for each link:
+ *
+ * 1. The <em>current</em> link state: is the link up now, and if so, at
+ *    what speed?
+ *
+ * 2. The <em>desired</em> link state: what do we want the link state to be?
+ *    The system is always working to make this state the current state;
+ *    thus, if the desired state is up, and the link is down, we'll be
+ *    constantly trying to bring it up, automatically.
+ *
+ * 3. The <em>possible</em> link state: what speeds are valid for this
+ *    particular link?  Or, in other words, what are the capabilities of
+ *    the link hardware?
+ *
+ * These link states are not, strictly speaking, related to application
+ * state; they may be manipulated at any time, whether or not the link
+ * is currently being used for data transfer.  However, for convenience,
+ * gxio_mpipe_link_open() and gxio_mpipe_link_close() (or application exit)
+ * can affect the link state.  These implicit link management operations
+ * may be modified or disabled by the use of link open flags.
+ *
+ * From an application, you can use gxio_mpipe_link_get_attr()
+ * and gxio_mpipe_link_set_attr() to manipulate the link states.
+ * gxio_mpipe_link_get_attr() with ::GXIO_MPIPE_LINK_POSSIBLE_STATE
+ * gets you the possible link state.  gxio_mpipe_link_get_attr() with
+ * ::GXIO_MPIPE_LINK_CURRENT_STATE gets you the current link state.
+ * Finally, gxio_mpipe_link_set_attr() and gxio_mpipe_link_get_attr()
+ * with ::GXIO_MPIPE_LINK_DESIRED_STATE allow you to modify or retrieve
+ * the desired link state.
+ *
+ * If you want to manage a link from a part of your application which isn't
+ * involved in packet processing, you can use the ::GXIO_MPIPE_LINK_NO_DATA
+ * flag on a gxio_mpipe_link_open() call.  This opens the link, but does
+ * not request data permission, so it does not conflict with any exclusive
+ * permissions which may be held by other processes.  You can then use
+ * gxio_mpipe_link_get_attr() and gxio_mpipe_link_set_attr() on this link
+ * object to bring up or take down the link.
+ *
+ * Some links support link state bits which support various loopback
+ * modes. ::GXIO_MPIPE_LINK_LOOP_MAC tests datapaths within the Tile
+ * Processor itself; ::GXIO_MPIPE_LINK_LOOP_PHY tests the datapath between
+ * the Tile Processor and the external physical layer interface chip; and
+ * ::GXIO_MPIPE_LINK_LOOP_EXT tests the entire network datapath with the
+ * aid of an external loopback connector.  In addition to enabling hardware
+ * testing, such configuration can be useful for software testing, as well.
+ *
+ * When LOOP_MAC or LOOP_PHY is enabled, packets transmitted on a channel
+ * will be received by that channel, instead of being emitted on the
+ * physical link, and packets received on the physical link will be ignored.
+ * Other than that, all standard GXIO operations work as you might expect.
+ * Note that loopback operation requires that the link be brought up using
+ * one or more of the GXIO_MPIPE_LINK_SPEED_xxx link state bits.
+ *
+ * Those familiar with previous versions of the MDE on TILEPro hardware
+ * will notice significant similarities between the NetIO link management
+ * model and the mPIPE link management model.  However, the NetIO model
+ * was developed in stages, and some of its features -- for instance,
+ * the default setting of certain flags -- were shaped by the need to be
+ * compatible with previous versions of NetIO.  Since the features provided
+ * by the mPIPE hardware and the mPIPE GXIO library are significantly
+ * different than those provided by NetIO, in some cases, we have made
+ * different choices in the mPIPE link management API.  Thus, please read
+ * this documentation carefully before assuming that mPIPE link management
+ * operations are exactly equivalent to their NetIO counterparts.
+ */
+
+/* Manages mPIPE link state and resources; initialized by gxio_mpipe_link_open(). */
+typedef struct {
+  /* The overall mPIPE context. */
+	gxio_mpipe_context_t *context;
+
+  /* The channel number used by this link; see gxio_mpipe_link_channel(). */
+	uint8_t channel;
+
+  /* The MAC index used by this link. */
+	uint8_t mac;
+} gxio_mpipe_link_t;
+
+/* Retrieve one of this system's legal link names, and its MAC address.
+ *
+ * @param index Link name index.  If a system supports N legal link names,
+ *  then indices between 0 and N - 1, inclusive, each correspond to one of
+ *  those names.  Thus, to retrieve all of a system's legal link names,
+ *  call this function in a loop, starting with an index of zero, and
+ *  incrementing it once per iteration until -1 is returned.
+ * @param link_name Pointer to the buffer which will receive the retrieved
+ *  link name.  The buffer should contain space for at least
+ *  ::GXIO_MPIPE_LINK_NAME_LEN bytes; the returned name, including the
+ *  terminating null byte, will be no longer than that.
+ * @param mac_addr Pointer to the buffer which will receive the retrieved
+ *  MAC address.  The buffer should contain space for at least 6 bytes.
+ * @return Zero if a link name was successfully retrieved; -1 if one was
+ *  not.
+ */
+extern int
+gxio_mpipe_link_enumerate_mac(int index, char *link_name, uint8_t * mac_addr);
+
+/* Open an mPIPE link.
+ *
+ *  A link must be opened before it may be used to send or receive packets,
+ *  and before its state may be examined or changed.  Depending upon the
+ *  link's intended use, one or more link permissions may be requested via
+ *  the flags parameter; see @ref gxio_mpipe_link_perm.  In addition, flags
+ *  may request that the link's state be modified at open time.  See @ref
+ *  gxio_mpipe_link_states and @ref gxio_mpipe_link_open_flags for more detail.
+ *
+ * @param link A link state object, which will be initialized if this
+ *  function completes successfully.
+ * @param context An initialized mPIPE context.
+ * @param link_name Name of the link.
+ * @param flags Zero or more @ref gxio_mpipe_link_open_flags, ORed together.
+ * @return 0 if the link was successfully opened, or a negative error code.
+ *
+ */
+extern int
+gxio_mpipe_link_open(gxio_mpipe_link_t * link, gxio_mpipe_context_t * context,
+		     const char *link_name, unsigned int flags);
+
+/* Close an mPIPE link.
+ *
+ *  Closing a link makes it available for use by other processes.  Once
+ *  a link has been closed, packets may no longer be sent on or received
+ *  from the link, and its state may not be examined or changed.
+ *
+ * @param link A link state object, which will no longer be initialized
+ *  if this function completes successfully.
+ * @return 0 if the link was successfully closed, or a negative error code.
+ *
+ */
+extern int gxio_mpipe_link_close(gxio_mpipe_link_t * link);
+
+/* Return a link's channel number.
+ *
+ * @param link A link state object initialized by gxio_mpipe_link_open().
+ * @return The link's uint8_t channel field, widened to int (so 0..255).
+ */
+static inline int gxio_mpipe_link_channel(gxio_mpipe_link_t * link)
+{
+	return link->channel;
+}
+
+#endif /* !_GXIO_MPIPE_H_ */
diff --git a/arch/tile/include/hv/drv_mpipe_intf.h b/arch/tile/include/hv/drv_mpipe_intf.h
new file mode 100644
index 0000000..6cdae3b
--- /dev/null
+++ b/arch/tile/include/hv/drv_mpipe_intf.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the mpipe driver.
+ */
+
+#ifndef _SYS_HV_DRV_MPIPE_INTF_H
+#define _SYS_HV_DRV_MPIPE_INTF_H
+
+#include <arch/mpipe.h>
+#include <arch/mpipe_constants.h>
+
+
+/** Number of buffer stacks (32). */
+#define HV_MPIPE_NUM_BUFFER_STACKS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH)
+
+/** Number of NotifRings (256). */
+#define HV_MPIPE_NUM_NOTIF_RINGS (MPIPE_NUM_NOTIF_RINGS)
+
+/** Number of NotifGroups (32). */
+#define HV_MPIPE_NUM_NOTIF_GROUPS (MPIPE_NUM_NOTIF_GROUPS)
+
+/** Number of buckets (4160). */
+#define HV_MPIPE_NUM_BUCKETS (MPIPE_NUM_BUCKETS)
+
+/** Number of "lo" buckets (4096). */
+#define HV_MPIPE_NUM_LO_BUCKETS 4096
+
+/** Number of "hi" buckets (64). */
+#define HV_MPIPE_NUM_HI_BUCKETS \
+  (HV_MPIPE_NUM_BUCKETS - HV_MPIPE_NUM_LO_BUCKETS)
+
+/** Number of edma rings (24). */
+#define HV_MPIPE_NUM_EDMA_RINGS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH)
+
+
+
+
+/** A flag bit indicating a fixed resource allocation. */
+#define HV_MPIPE_ALLOC_FIXED 0x01
+
+/** Offset for the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_CFG << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_SIZE (64 * 1024)
+
+/** Offset for the fast register MMIO region (IDMA, EDMA, buffer stack). */
+#define HV_MPIPE_FAST_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_IDMA << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the fast register MMIO region (IDMA, EDMA, buffer stack). */
+#define HV_MPIPE_FAST_MMIO_SIZE \
+  ((MPIPE_MMIO_ADDR__REGION_VAL_BSM + 1 - MPIPE_MMIO_ADDR__REGION_VAL_IDMA) \
+   << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+
+/*
+ * Each type of resource allocation comes in quantized chunks, where
+ * XXX_BITS is the number of chunks, and XXX_RES_PER_BIT is the number
+ * of resources in each chunk.
+ */
+
+/** Number of buffer stack chunks available (32). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH
+
+/** Granularity of buffer stack allocation (1). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_RES_PER_BIT \
+  (HV_MPIPE_NUM_BUFFER_STACKS / HV_MPIPE_ALLOC_BUFFER_STACKS_BITS)
+
+/** Number of NotifRing chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__NOTIF_RING_MASK_WIDTH
+
+/** Granularity of NotifRing allocation (8). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_RINGS / HV_MPIPE_ALLOC_NOTIF_RINGS_BITS)
+
+/** Number of NotifGroup chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS \
+  HV_MPIPE_NUM_NOTIF_GROUPS
+
+/** Granularity of NotifGroup allocation (1). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_GROUPS / HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS)
+
+/** Number of lo bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_LO_WIDTH
+
+/** Granularity of lo bucket allocation (256). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_LO_BUCKETS / HV_MPIPE_ALLOC_LO_BUCKETS_BITS)
+
+/** Number of hi bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_HI_WIDTH
+
+/** Granularity of hi bucket allocation (4). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_HI_BUCKETS / HV_MPIPE_ALLOC_HI_BUCKETS_BITS)
+
+/** Number of eDMA ring chunks available (24). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH
+
+/** Granularity of eDMA ring allocation (1). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_EDMA_RINGS / HV_MPIPE_ALLOC_EDMA_RINGS_BITS)
+
+
+
+
+/** Bit vector encoding which NotifRings are in a NotifGroup. */
+typedef struct
+{
+  /** The actual bits (4 x 64 = 256, matching the documented NotifRing count). */
+  uint64_t ring_mask[4];
+
+} gxio_mpipe_notif_group_bits_t;
+
+
+/** Another name for MPIPE_LBL_INIT_DAT_BSTS_TBL_t. */
+typedef MPIPE_LBL_INIT_DAT_BSTS_TBL_t gxio_mpipe_bucket_info_t;
+
+
+
+/** Eight buffer stack ids. */
+typedef struct
+{
+  /** The stacks. */
+  uint8_t stacks[8];
+
+} gxio_mpipe_rules_stacks_t;
+
+
+/** A destination mac address. */
+typedef struct
+{
+  /** The octets. */
+  uint8_t octets[6];
+
+} gxio_mpipe_rules_dmac_t;
+
+
+/** A vlan. */
+typedef uint16_t gxio_mpipe_rules_vlan_t;
+
+
+
+/** Maximum length of a link name, including the terminating null byte. */
+#define GXIO_MPIPE_LINK_NAME_LEN  32
+
+
+/** Structure holding a link name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_LINK_NAME_LEN];
+}
+_gxio_mpipe_link_name_t;
+
+/** Maximum number of characters in a symbol name. */
+#define GXIO_MPIPE_SYMBOL_NAME_LEN  128
+
+
+/** Structure holding a symbol name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_SYMBOL_NAME_LEN];
+}
+_gxio_mpipe_symbol_name_t;
+
+
+/** Structure holding a MAC address. */
+typedef struct
+{
+  /** The address. */
+  uint8_t mac[6];
+}
+_gxio_mpipe_link_mac_t;
+
+
+
+/** Request shared data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  Other processes may also
+ *  request shared data permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_DATA               0x00000001UL
+
+/** Do not request data permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_DATA            0x00000002UL
+
+/** Request exclusive data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  No other processes may
+ *  request data permission on this link, and if any process already has
+ *  data permission on it, this open will fail.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_DATA          0x00000004UL
+
+/** Request shared stats permission -- that is, the ability to read and write
+ *  registers which contain link statistics, and to get link attributes --
+ *  on the specified link.  Other processes may also request shared stats
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_STATS              0x00000008UL
+
+/** Do not request stats permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_STATS           0x00000010UL
+
+/** Request exclusive stats permission -- that is, the ability to read and
+ *  write registers which contain link statistics, and to get link
+ *  attributes -- on the specified link.  No other processes may request
+ *  stats permission on this link, and if any process already
+ *  has stats permission on it, this open will fail.
+ *
+ *  Requesting exclusive stats permission is normally a very bad idea, since
+ *  it prevents programs like mpipe-stat from providing information on this
+ *  link.  Applications should only do this if they use MAC statistics
+ *  registers, and cannot tolerate any of the clear-on-read registers being
+ *  reset by other statistics programs.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_STATS         0x00000020UL
+
+/** Request shared control permission -- that is, the ability to modify link
+ *  attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  Other processes may also request shared control
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_CTL                0x00000040UL
+
+/** Do not request control permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_CTL             0x00000080UL
+
+/** Request exclusive control permission -- that is, the ability to modify
+ *  link attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  No other processes may request control permission on
+ *  this link, and if any process already has control permission on it,
+ *  this open will fail.
+ *
+ *  Requesting exclusive control permission is not always a good idea, since
+ *  it prevents programs like mpipe-link from configuring the link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_CTL           0x00000100UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; do not
+ *  change the desired state of the link when it is closed or the process
+ *  exits.  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UP            0x00000200UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; when the
+ *  link is closed or this process exits, if no other process has the link
+ *  open, set the desired state of the link to down.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specified in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UPDOWN        0x00000400UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; when the link is closed or this process exits, if no other
+ *  process has the link open, set the desired state of the link to down.
+ *  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_DOWN          0x00000800UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; do not change the desired state of the link when it is
+ *  closed or the process exits.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specified in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_NONE          0x00001000UL
+
+/** Request that this open call not complete until the network link is up.
+ *  The process will wait as long as necessary for this to happen;
+ *  applications which wish to abandon waiting for the link after a
+ *  specific time period should not specify this flag when opening a link,
+ *  but should instead call gxio_mpipe_link_wait() afterward.  The link
+ *  must be opened with stats permission.  Note that this flag by itself
+ *  does not change the desired link state; if other open flags or previous
+ *  link state changes have not requested a desired state of up, the open
+ *  call will never complete.  This flag is not available to kernel
+ *  clients.
+ */
+#define GXIO_MPIPE_LINK_WAIT               0x00002000UL
+
+
+/*
+ * Note: link attributes must fit in 24 bits, since we use the top 8 bits
+ * of the IORPC offset word for the channel number.
+ */
+
+/** Determine whether jumbo frames may be received.  If this attribute's
+ *  value is nonzero, the MAC will accept frames of up to 10240 bytes.
+ *  If the value is zero, the MAC will only accept frames of up to 1544
+ *  bytes.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_JUMBO      0x010000
+
+/** Determine whether to send pause frames on this link if the mPIPE packet
+ *  FIFO is nearly full.  If the value is zero, pause frames are not sent.
+ *  If the value is nonzero, it is the delay value which will be sent in any
+ *  pause frames which are output, in units of 512 bit times.
+ *
+ *  Bear in mind that in almost all circumstances, the mPIPE packet FIFO
+ *  will never fill up, since mPIPE will empty it as fast as or faster than
+ *  the incoming data rate, by either delivering or dropping packets.  The
+ *  only situation in which this is not true is if the memory and cache
+ *  subsystem is extremely heavily loaded, and mPIPE cannot perform DMA of
+ *  packet data to memory in a timely fashion.  In particular, pause frames
+ *  will <em>not</em> be sent if packets cannot be delivered because
+ *  NotifRings are full, buckets are full, or buffers are not available in
+ *  a buffer stack. */
+#define GXIO_MPIPE_LINK_SEND_PAUSE         0x020000
+
+/** Determine whether to suspend output on the receipt of pause frames.
+ *  If the value is nonzero, mPIPE shim will suspend output on the link's
+ *  channel when a pause frame is received.  If the value is zero, pause
+ *  frames will be ignored.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_PAUSE      0x030000
+
+/** Interface MAC address.  The value is a 6-byte MAC address, in the least
+ *  significant 48 bits of the value; in other words, an address which would
+ *  be printed as '12:34:56:78:90:AB' in IEEE 802 canonical format would
+ *  be returned as 0x1234567890ab.
+ *
+ *  Depending upon the overall system design, a MAC address may or may not
+ *  be available for each interface.  Note that the interface's MAC address
+ *  does not limit the packets received on its channel, although the
+ *  classifier's rules could be configured to do that.  Similarly, the MAC
+ *  address is not used when transmitting packets, although applications
+ *  could certainly decide to use the assigned address as a source MAC
+ *  address when doing so.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified.
+ */
+#define GXIO_MPIPE_LINK_MAC                0x040000
+
+/** Determine whether to discard egress packets on link down. If this value
+ *  is nonzero, packets sent on this link while the link is down will be
+ *  discarded.  If this value is zero, no packets will be sent on this link
+ *  while it is down.  The default value is one. */
+#define GXIO_MPIPE_LINK_DISCARD_IF_DOWN    0x050000
+
+/** Possible link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate link modes which are actually supported by
+ *  the hardware.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified. */
+#define GXIO_MPIPE_LINK_POSSIBLE_STATE     0x060000
+
+/** Current link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate the current state of the hardware.  If the
+ *  link is down, the value ANDed with ::GXIO_MPIPE_LINK_SPEED_MASK will be
+ *  zero; if the link is up, the value ANDed with
+ *  ::GXIO_MPIPE_LINK_SPEED_MASK will result in exactly one of the speed
+ *  values, indicating the current speed.  This attribute may only be
+ *  retrieved with gxio_mpipe_link_get_attr(); it may not be modified. */
+#define GXIO_MPIPE_LINK_CURRENT_STATE      0x070000
+
+/** Desired link state. The value is a combination of flags, which specify
+ *  the desired state for the link.  With gxio_mpipe_link_set_attr(), this
+ *  will, in the background, attempt to bring up the link using whichever of
+ *  the requested flags are reasonable, or take down the link if the flags
+ *  are zero.  The actual link up or down operation may happen after this
+ *  call completes.  If the link state changes in the future, the system
+ *  will continue to try to get back to the desired link state; for
+ *  instance, if the link is brought up successfully, and then the network
+ *  cable is disconnected, the link will go down.  However, the desired
+ *  state of the link is still up, so if the cable is reconnected, the link
+ *  will be brought up again.
+ *
+ *  With gxio_mpipe_link_get_attr(), this will indicate the desired state
+ *  for the link, as set with a previous gxio_mpipe_link_set_attr() call,
+ *  or implicitly by a gxio_mpipe_link_open() or link close operation.
+ *  This may not reflect the current state of the link; to get that, use
+ *  ::GXIO_MPIPE_LINK_CURRENT_STATE.
+ */
+#define GXIO_MPIPE_LINK_DESIRED_STATE      0x080000
+
+
+
+/** Link can run, should run, or is running at 10 Mbps. */
+#define GXIO_MPIPE_LINK_10M        0x0000000000000001UL
+
+/** Link can run, should run, or is running at 100 Mbps. */
+#define GXIO_MPIPE_LINK_100M       0x0000000000000002UL
+
+/** Link can run, should run, or is running at 1 Gbps. */
+#define GXIO_MPIPE_LINK_1G         0x0000000000000004UL
+
+/** Link can run, should run, or is running at 10 Gbps. */
+#define GXIO_MPIPE_LINK_10G        0x0000000000000008UL
+
+/** Link can run, should run, or is running at 20 Gbps. */
+#define GXIO_MPIPE_LINK_20G        0x0000000000000010UL
+
+/** Link can run, should run, or is running at 25 Gbps. */
+#define GXIO_MPIPE_LINK_25G        0x0000000000000020UL
+
+/** Link can run, should run, or is running at 50 Gbps. */
+#define GXIO_MPIPE_LINK_50G        0x0000000000000040UL
+
+/** Link should run at the highest speed supported by the link and by
+ *  the device connected to the link.  Only usable as a value for
+ *  the link's desired state; never returned as a value for the current
+ *  or possible states. */
+#define GXIO_MPIPE_LINK_ANYSPEED   0x0000000000000800UL
+
+/** All legal link speeds.  This value is provided for use in extracting
+ *  the speed-related subset of the link state flags; it is not intended
+ *  to be set directly as a value for one of the GXIO_MPIPE_LINK_xxx_STATE
+ *  attributes.  A link is up or is requested to be up if its current or
+ *  desired state, respectively, ANDED with this value, is nonzero. */
+#define GXIO_MPIPE_LINK_SPEED_MASK 0x0000000000000FFFUL
+
+/** Link can run, should run, or is running in MAC loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the Tile
+ *  Processor. */
+#define GXIO_MPIPE_LINK_LOOP_MAC   0x0000000000001000UL
+
+/** Link can run, should run, or is running in PHY loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the external
+ *  PHY chip. */
+#define GXIO_MPIPE_LINK_LOOP_PHY   0x0000000000002000UL
+
+/** Link can run, should run, or is running in external loopback mode.
+ *  This requires that an external loopback plug be installed on the
+ *  Ethernet port.  Note that only some links require that this be
+ *  configured via the gxio_mpipe_link routines; other links can do
+ *  external loopback with the plug and no special configuration. */
+#define GXIO_MPIPE_LINK_LOOP_EXT   0x0000000000004000UL
+
+/** All legal loopback types. */
+#define GXIO_MPIPE_LINK_LOOP_MASK  0x000000000000F000UL
+
+/** Link can run, should run, or is running in full-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_FDX        0x0000000000010000UL
+
+/** Link can run, should run, or is running in half-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_HDX        0x0000000000020000UL
+
+
+/** An individual classifier rule, stored back to back inside the "rules"
+ *  byte array of a gxio_mpipe_rules_list_t. */
+typedef struct
+{
+  /** The total size of this rule.  gxio_mpipe_rules_begin() starts it at
+   *  sizeof(gxio_mpipe_rules_rule_t) and later adds any alignment padding
+   *  inserted before the following rule. */
+  uint16_t size;
+
+  /** The priority. */
+  int16_t priority;
+
+  /** The "headroom" in each buffer (gxio_mpipe_rules_begin() defaults
+   *  this to 2). */
+  uint8_t headroom;
+
+  /** The "tailroom" in each buffer (gxio_mpipe_rules_begin() defaults
+   *  this to 0). */
+  uint8_t tailroom;
+
+  /** The "capacity" of the largest buffer (gxio_mpipe_rules_begin()
+   *  defaults this to 16384). */
+  uint16_t capacity;
+
+  /** The mask for converting a flow hash into a bucket; set to
+   *  num_buckets - 1, where num_buckets must be a power of two. */
+  uint16_t bucket_mask;
+
+  /** The offset for converting a flow hash into a bucket, i.e. the first
+   *  bucket used by this rule. */
+  uint16_t bucket_first;
+
+  /** The buffer stack ids. */
+  gxio_mpipe_rules_stacks_t stacks;
+
+  /** The actual channels, one bit per channel: bit N is set by
+   *  gxio_mpipe_rules_add_channel(rules, N), for N below 32. */
+  uint32_t channel_bits;
+
+  /** The number of dmacs. */
+  uint16_t num_dmacs;
+
+  /** The number of vlans. */
+  uint16_t num_vlans;
+
+  /** The actual dmacs and vlans (flexible array member that trails the
+   *  fixed-size part of the rule). */
+  uint8_t dmacs_and_vlans[];
+
+} gxio_mpipe_rules_rule_t;
+
+
+/** A list of classifier rules.  "head" and "tail" are byte offsets into
+ *  "rules"; a list whose "tail" is zero has had no rule begun yet. */
+typedef struct
+{
+  /** The offset to the end of the current rule. */
+  uint16_t tail;
+
+  /** The offset to the start of the current rule. */
+  uint16_t head;
+
+  /** The actual rules, packed as gxio_mpipe_rules_rule_t records.  The
+   *  "- 4" presumably accounts for the two uint16_t offsets above so that
+   *  the whole list occupies 4096 bytes -- TODO confirm. */
+  uint8_t rules[4096 - 4];
+
+} gxio_mpipe_rules_list_t;
+
+
+
+
+/** mPIPE statistics structure. These counters include all relevant
+ *  events occurring on all links within the mPIPE shim. */
+typedef struct
+{
+  /** Number of ingress packets dropped for any reason. */
+  uint64_t ingress_drops;
+  /** Number of ingress packets dropped because a buffer stack was empty. */
+  uint64_t ingress_drops_no_buf;
+  /** Number of ingress packets dropped or truncated due to lack of space in
+   *  the iPkt buffer. */
+  uint64_t ingress_drops_ipkt;
+  /** Number of ingress packets dropped by the classifier or load
+   *  balancer. */
+  uint64_t ingress_drops_cls_lb;
+  /** Total number of ingress packets. */
+  uint64_t ingress_packets;
+  /** Total number of egress packets. */
+  uint64_t egress_packets;
+  /** Total number of ingress bytes. */
+  uint64_t ingress_bytes;
+  /** Total number of egress bytes. */
+  uint64_t egress_bytes;
+}
+gxio_mpipe_stats_t;
+
+
+#endif /* _SYS_HV_DRV_MPIPE_INTF_H */
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v2 4/6] arch/tile: provide kernel support for the tilegx mPIPE shim
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
                                   ` (2 preceding siblings ...)
  2012-04-06 17:52                 ` [PATCH v2 2/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
@ 2012-04-06 20:38                 ` Chris Metcalf
  2012-04-06 20:42                 ` [PATCH v2 6/6] tilegx network driver: initial support Chris Metcalf
  2012-04-28 19:41                 ` [PATCH v2 5/6] arch/tile: break out the "csum a long" function to <asm/checksum.h> Chris Metcalf
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 20:38 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel

The TILE-Gx chip includes a packet-processing network engine called
mPIPE ("Multicore Programmable Intelligent Packet Engine").  This
change adds support for using the mPIPE engine from within the
kernel.  The engine has more functionality than is exposed here,
but to keep the kernel code and binary simpler, this is a subset
of the full API designed to enable standard Linux networking only.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/gxio/Kconfig                    |    6 +
 arch/tile/gxio/Makefile                   |    1 +
 arch/tile/gxio/iorpc_mpipe.c              |  463 ++++++++
 arch/tile/gxio/iorpc_mpipe_info.c         |   85 ++
 arch/tile/gxio/mpipe.c                    |  500 +++++++++
 arch/tile/include/arch/mpipe.h            |  359 +++++++
 arch/tile/include/arch/mpipe_constants.h  |   42 +
 arch/tile/include/arch/mpipe_def.h        |   39 +
 arch/tile/include/arch/mpipe_shm.h        |  509 +++++++++
 arch/tile/include/arch/mpipe_shm_def.h    |   23 +
 arch/tile/include/gxio/iorpc_mpipe.h      |  124 +++
 arch/tile/include/gxio/iorpc_mpipe_info.h |   46 +
 arch/tile/include/gxio/mpipe.h            | 1653 +++++++++++++++++++++++++++++
 arch/tile/include/hv/drv_mpipe_intf.h     |  602 +++++++++++
 14 files changed, 4452 insertions(+), 0 deletions(-)
 create mode 100644 arch/tile/gxio/iorpc_mpipe.c
 create mode 100644 arch/tile/gxio/iorpc_mpipe_info.c
 create mode 100644 arch/tile/gxio/mpipe.c
 create mode 100644 arch/tile/include/arch/mpipe.h
 create mode 100644 arch/tile/include/arch/mpipe_constants.h
 create mode 100644 arch/tile/include/arch/mpipe_def.h
 create mode 100644 arch/tile/include/arch/mpipe_shm.h
 create mode 100644 arch/tile/include/arch/mpipe_shm_def.h
 create mode 100644 arch/tile/include/gxio/iorpc_mpipe.h
 create mode 100644 arch/tile/include/gxio/iorpc_mpipe_info.h
 create mode 100644 arch/tile/include/gxio/mpipe.h
 create mode 100644 arch/tile/include/hv/drv_mpipe_intf.h

diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
index ecd076c..8aeebb7 100644
--- a/arch/tile/gxio/Kconfig
+++ b/arch/tile/gxio/Kconfig
@@ -9,3 +9,9 @@ config TILE_GXIO
 config TILE_GXIO_DMA
 	bool
 	select TILE_GXIO
+
+# Support direct access to the TILE-Gx mPIPE hardware from kernel space.
+config TILE_GXIO_MPIPE
+	bool
+	select TILE_GXIO
+	select TILE_GXIO_DMA
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
index 97ab468..130eec4 100644
--- a/arch/tile/gxio/Makefile
+++ b/arch/tile/gxio/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
 obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
+obj-$(CONFIG_TILE_GXIO_MPIPE) += mpipe.o iorpc_mpipe.o iorpc_mpipe_info.o
diff --git a/arch/tile/gxio/iorpc_mpipe.c b/arch/tile/gxio/iorpc_mpipe.c
new file mode 100644
index 0000000..029a6d4
--- /dev/null
+++ b/arch/tile/gxio/iorpc_mpipe.c
@@ -0,0 +1,463 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_mpipe.h"
+
+/* Request record for GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS. */
+struct alloc_buffer_stacks_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+/*
+ * Pack (count, first, flags) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t * context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags)
+{
+	struct alloc_buffer_stacks_param req;
+
+	req.count = count;
+	req.first = first;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_buffer_stacks);
+
+/* Request record for GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX. */
+struct init_buffer_stack_aux_param {
+	union iorpc_mem_buffer buffer;
+	unsigned int stack;
+	unsigned int buffer_size_enum;
+};
+
+/*
+ * Translate mem_va with va_to_cpa_and_pte(), then pass the resulting memory
+ * descriptor plus (stack, buffer_size_enum) to hv_dev_pwrite().  Returns the
+ * translation result if it is nonzero, otherwise the hv_dev_pwrite() result.
+ */
+int gxio_mpipe_init_buffer_stack_aux(gxio_mpipe_context_t * context,
+				     void *mem_va, size_t mem_size,
+				     unsigned int mem_flags, unsigned int stack,
+				     unsigned int buffer_size_enum)
+{
+	struct init_buffer_stack_aux_param req;
+	unsigned long long cpa;
+	pte_t pte;
+	int rc;
+
+	rc = va_to_cpa_and_pte(mem_va, &cpa, &pte);
+	if (rc != 0)
+		return rc;
+
+	req.buffer.kernel.cpa = cpa;
+	req.buffer.kernel.size = mem_size;
+	req.buffer.kernel.pte = pte;
+	req.buffer.kernel.flags = mem_flags;
+	req.stack = stack;
+	req.buffer_size_enum = buffer_size_enum;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_buffer_stack_aux);
+
+
+/* Request record for GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS. */
+struct alloc_notif_rings_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+/*
+ * Pack (count, first, flags) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t * context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags)
+{
+	struct alloc_notif_rings_param req;
+
+	req.count = count;
+	req.first = first;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_notif_rings);
+
+/* Request record for GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX. */
+struct init_notif_ring_aux_param {
+	union iorpc_mem_buffer buffer;
+	unsigned int ring;
+};
+
+/*
+ * Translate mem_va with va_to_cpa_and_pte(), then pass the resulting memory
+ * descriptor plus the ring number to hv_dev_pwrite().  Returns the
+ * translation result if it is nonzero, otherwise the hv_dev_pwrite() result.
+ */
+int gxio_mpipe_init_notif_ring_aux(gxio_mpipe_context_t * context, void *mem_va,
+				   size_t mem_size, unsigned int mem_flags,
+				   unsigned int ring)
+{
+	struct init_notif_ring_aux_param req;
+	unsigned long long cpa;
+	pte_t pte;
+	int rc;
+
+	rc = va_to_cpa_and_pte(mem_va, &cpa, &pte);
+	if (rc != 0)
+		return rc;
+
+	req.buffer.kernel.cpa = cpa;
+	req.buffer.kernel.size = mem_size;
+	req.buffer.kernel.pte = pte;
+	req.buffer.kernel.flags = mem_flags;
+	req.ring = ring;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_notif_ring_aux);
+
+/* Request record for GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT. */
+struct request_notif_ring_interrupt_param {
+	union iorpc_interrupt interrupt;
+	unsigned int ring;
+};
+
+/*
+ * Fill in the kernel interrupt descriptor (x, y, ipi, event) and the ring
+ * number, then pass the record to hv_dev_pwrite(); its result is returned.
+ */
+int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t * context,
+					    int inter_x, int inter_y,
+					    int inter_ipi, int inter_event,
+					    unsigned int ring)
+{
+	struct request_notif_ring_interrupt_param req;
+
+	req.interrupt.kernel.x = inter_x;
+	req.interrupt.kernel.y = inter_y;
+	req.interrupt.kernel.ipi = inter_ipi;
+	req.interrupt.kernel.event = inter_event;
+	req.ring = ring;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_request_notif_ring_interrupt);
+
+/* Request record for GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT. */
+struct enable_notif_ring_interrupt_param {
+	unsigned int ring;
+};
+
+/*
+ * Pass the ring number to hv_dev_pwrite() on the context's fd; that call's
+ * result is returned.
+ */
+int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t * context,
+					   unsigned int ring)
+{
+	struct enable_notif_ring_interrupt_param req;
+
+	req.ring = ring;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_enable_notif_ring_interrupt);
+
+/* Request record for GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS. */
+struct alloc_notif_groups_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+/*
+ * Pack (count, first, flags) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t * context,
+				  unsigned int count, unsigned int first,
+				  unsigned int flags)
+{
+	struct alloc_notif_groups_param req;
+
+	req.count = count;
+	req.first = first;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_notif_groups);
+
+/* Request record for GXIO_MPIPE_OP_INIT_NOTIF_GROUP. */
+struct init_notif_group_param {
+	unsigned int group;
+	gxio_mpipe_notif_group_bits_t bits;
+};
+
+/*
+ * Pack the group number and its ring bitmask into a request record and
+ * pass it to hv_dev_pwrite(); that call's result is returned.
+ */
+int gxio_mpipe_init_notif_group(gxio_mpipe_context_t * context,
+				unsigned int group,
+				gxio_mpipe_notif_group_bits_t bits)
+{
+	struct init_notif_group_param req;
+
+	req.group = group;
+	req.bits = bits;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_INIT_NOTIF_GROUP);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_notif_group);
+
+/* Request record for GXIO_MPIPE_OP_ALLOC_BUCKETS. */
+struct alloc_buckets_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+/*
+ * Pack (count, first, flags) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t * context, unsigned int count,
+			     unsigned int first, unsigned int flags)
+{
+	struct alloc_buckets_param req;
+
+	req.count = count;
+	req.first = first;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ALLOC_BUCKETS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_buckets);
+
+/* Request record for GXIO_MPIPE_OP_INIT_BUCKET. */
+struct init_bucket_param {
+	unsigned int bucket;
+	MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info;
+};
+
+/*
+ * Pack the bucket number and its bucket_info word into a request record
+ * and pass it to hv_dev_pwrite(); that call's result is returned.
+ */
+int gxio_mpipe_init_bucket(gxio_mpipe_context_t * context, unsigned int bucket,
+			   MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info)
+{
+	struct init_bucket_param req;
+
+	req.bucket = bucket;
+	req.bucket_info = bucket_info;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_INIT_BUCKET);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_bucket);
+
+/* Request record for GXIO_MPIPE_OP_ALLOC_EDMA_RINGS. */
+struct alloc_edma_rings_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+/*
+ * Pack (count, first, flags) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t * context,
+				unsigned int count, unsigned int first,
+				unsigned int flags)
+{
+	struct alloc_edma_rings_param req;
+
+	req.count = count;
+	req.first = first;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ALLOC_EDMA_RINGS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_edma_rings);
+
+/* Request record for GXIO_MPIPE_OP_INIT_EDMA_RING_AUX. */
+struct init_edma_ring_aux_param {
+	union iorpc_mem_buffer buffer;
+	unsigned int ring;
+	unsigned int channel;
+};
+
+/*
+ * Translate mem_va with va_to_cpa_and_pte(), then pass the resulting memory
+ * descriptor plus (ring, channel) to hv_dev_pwrite().  Returns the
+ * translation result if it is nonzero, otherwise the hv_dev_pwrite() result.
+ */
+int gxio_mpipe_init_edma_ring_aux(gxio_mpipe_context_t * context, void *mem_va,
+				  size_t mem_size, unsigned int mem_flags,
+				  unsigned int ring, unsigned int channel)
+{
+	struct init_edma_ring_aux_param req;
+	unsigned long long cpa;
+	pte_t pte;
+	int rc;
+
+	rc = va_to_cpa_and_pte(mem_va, &cpa, &pte);
+	if (rc != 0)
+		return rc;
+
+	req.buffer.kernel.cpa = cpa;
+	req.buffer.kernel.size = mem_size;
+	req.buffer.kernel.pte = pte;
+	req.buffer.kernel.flags = mem_flags;
+	req.ring = ring;
+	req.channel = channel;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_INIT_EDMA_RING_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_edma_ring_aux);
+
+
+/*
+ * Pass the caller's blob (blob_size bytes) unmodified to hv_dev_pwrite()
+ * with GXIO_MPIPE_OP_COMMIT_RULES; that call's result is returned.
+ */
+int gxio_mpipe_commit_rules(gxio_mpipe_context_t * context, const void *blob,
+			    size_t blob_size)
+{
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) blob, blob_size,
+			     GXIO_MPIPE_OP_COMMIT_RULES);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_commit_rules);
+
+/* Request record for GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY. */
+struct register_client_memory_param {
+	unsigned int iotlb;
+	HV_PTE pte;
+	unsigned int flags;
+};
+
+/*
+ * Pack (iotlb, pte, flags) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_register_client_memory(gxio_mpipe_context_t * context,
+				      unsigned int iotlb, HV_PTE pte,
+				      unsigned int flags)
+{
+	struct register_client_memory_param req;
+
+	req.iotlb = iotlb;
+	req.pte = pte;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_register_client_memory);
+
+/* Request record for GXIO_MPIPE_OP_LINK_OPEN_AUX. */
+struct link_open_aux_param {
+	_gxio_mpipe_link_name_t name;
+	unsigned int flags;
+};
+
+/*
+ * Pack the link name and flags into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_link_open_aux(gxio_mpipe_context_t * context,
+			     _gxio_mpipe_link_name_t name, unsigned int flags)
+{
+	struct link_open_aux_param req;
+
+	req.name = name;
+	req.flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_LINK_OPEN_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_open_aux);
+
+/* Request record for GXIO_MPIPE_OP_LINK_CLOSE_AUX. */
+struct link_close_aux_param {
+	int mac;
+};
+
+/*
+ * Pass the mac number to hv_dev_pwrite() on the context's fd; that call's
+ * result is returned.
+ */
+int gxio_mpipe_link_close_aux(gxio_mpipe_context_t * context, int mac)
+{
+	struct link_close_aux_param req;
+
+	req.mac = mac;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_LINK_CLOSE_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_close_aux);
+
+
+/* Request record for GXIO_MPIPE_OP_ARM_POLLFD. */
+struct arm_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+/*
+ * Store pollfd_cookie in the kernel view of the pollfd record and pass it
+ * to hv_dev_pwrite(); that call's result is returned.
+ */
+int gxio_mpipe_arm_pollfd(gxio_mpipe_context_t * context, int pollfd_cookie)
+{
+	struct arm_pollfd_param req;
+
+	req.pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_ARM_POLLFD);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_arm_pollfd);
+
+/* Request record for GXIO_MPIPE_OP_CLOSE_POLLFD. */
+struct close_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+/*
+ * Store pollfd_cookie in the kernel view of the pollfd record and pass it
+ * to hv_dev_pwrite(); that call's result is returned.
+ */
+int gxio_mpipe_close_pollfd(gxio_mpipe_context_t * context, int pollfd_cookie)
+{
+	struct close_pollfd_param req;
+
+	req.pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_CLOSE_POLLFD);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_close_pollfd);
+
+/* Reply record for GXIO_MPIPE_OP_GET_MMIO_BASE. */
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+/*
+ * Read the MMIO base via hv_dev_pread() (GXIO_MPIPE_OP_GET_MMIO_BASE),
+ * store it in *base, and return the hv_dev_pread() result.
+ *
+ * The reply record starts out zeroed so that, if hv_dev_pread() does not
+ * fill it in, *base receives zeroes rather than indeterminate stack bytes.
+ * Callers should still check the return value before trusting *base.
+ */
+int gxio_mpipe_get_mmio_base(gxio_mpipe_context_t * context, HV_PTE *base)
+{
+	struct get_mmio_base_param reply = {};
+	int rc;
+
+	rc = hv_dev_pread(context->fd, 0, (HV_VirtAddr) &reply, sizeof(reply),
+			  GXIO_MPIPE_OP_GET_MMIO_BASE);
+	*base = reply.base;
+
+	return rc;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_get_mmio_base);
+
+/* Request record for GXIO_MPIPE_OP_CHECK_MMIO_OFFSET. */
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+/*
+ * Pack (offset, size) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_check_mmio_offset(gxio_mpipe_context_t * context,
+				 unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param req;
+
+	req.offset = offset;
+	req.size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_mpipe_info.c b/arch/tile/gxio/iorpc_mpipe_info.c
new file mode 100644
index 0000000..d0254aa
--- /dev/null
+++ b/arch/tile/gxio/iorpc_mpipe_info.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_mpipe_info.h"
+
+
+/* Reply record for GXIO_MPIPE_INFO_OP_ENUMERATE_AUX. */
+struct enumerate_aux_param {
+	_gxio_mpipe_link_name_t name;
+	_gxio_mpipe_link_mac_t mac;
+};
+
+/*
+ * Read the name and MAC record for entry idx via hv_dev_pread(); idx is
+ * carried in the upper 32 bits of the offset argument, above the opcode.
+ * The results are stored in *name and *mac and the hv_dev_pread() result
+ * is returned.
+ *
+ * The reply record starts out zeroed so that, if hv_dev_pread() does not
+ * fill it in, the outputs receive zeroes rather than indeterminate stack
+ * bytes.  Callers should still check the return value before using them.
+ */
+int gxio_mpipe_info_enumerate_aux(gxio_mpipe_info_context_t * context,
+				  unsigned int idx,
+				  _gxio_mpipe_link_name_t * name,
+				  _gxio_mpipe_link_mac_t * mac)
+{
+	struct enumerate_aux_param reply = {};
+	int rc;
+
+	rc = hv_dev_pread(context->fd, 0, (HV_VirtAddr) &reply, sizeof(reply),
+			  (((uint64_t) idx << 32) |
+			   GXIO_MPIPE_INFO_OP_ENUMERATE_AUX));
+	*name = reply.name;
+	*mac = reply.mac;
+
+	return rc;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_enumerate_aux);
+
+/* Reply record for GXIO_MPIPE_INFO_OP_GET_MMIO_BASE. */
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+/*
+ * Read the MMIO base via hv_dev_pread() (GXIO_MPIPE_INFO_OP_GET_MMIO_BASE),
+ * store it in *base, and return the hv_dev_pread() result.
+ *
+ * The reply record starts out zeroed so that, if hv_dev_pread() does not
+ * fill it in, *base receives zeroes rather than indeterminate stack bytes.
+ * Callers should still check the return value before trusting *base.
+ */
+int gxio_mpipe_info_get_mmio_base(gxio_mpipe_info_context_t * context,
+				  HV_PTE *base)
+{
+	struct get_mmio_base_param reply = {};
+	int rc;
+
+	rc = hv_dev_pread(context->fd, 0, (HV_VirtAddr) &reply, sizeof(reply),
+			  GXIO_MPIPE_INFO_OP_GET_MMIO_BASE);
+	*base = reply.base;
+
+	return rc;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_get_mmio_base);
+
+/* Request record for GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET. */
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+/*
+ * Pack (offset, size) into a request record and pass it to
+ * hv_dev_pwrite() on the context's fd; that call's result is returned.
+ */
+int gxio_mpipe_info_check_mmio_offset(gxio_mpipe_info_context_t * context,
+				      unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param req;
+
+	req.offset = offset;
+	req.size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) &req, sizeof(req),
+			     GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_check_mmio_offset);
diff --git a/arch/tile/gxio/mpipe.c b/arch/tile/gxio/mpipe.c
new file mode 100644
index 0000000..83a7783
--- /dev/null
+++ b/arch/tile/gxio/mpipe.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of mpipe gxio calls.
+ */
+
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_mpipe.h>
+#include <gxio/iorpc_mpipe_info.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+
+/* HACK: Avoid pointless "shadow" warnings. */
+#define link link_shadow
+
+/*
+ * Open the "mpipe/<index>/iorpc" device with hv_dev_open() and map its
+ * config and fast MMIO windows into @context.
+ *
+ * Returns 0 on success.  If hv_dev_open() fails, its result is returned
+ * when it lies in [GXIO_ERR_MIN, GXIO_ERR_MAX] and -ENODEV otherwise.  A
+ * failed MMIO mapping undoes the earlier steps and returns -ENODEV.
+ */
+int gxio_mpipe_init(gxio_mpipe_context_t *context, unsigned int mpipe_index)
+{
+	char file[32];
+
+	int fd;
+	int i;
+
+	/* mpipe_index is unsigned, so it is formatted with %u. */
+	snprintf(file, sizeof(file), "mpipe/%u/iorpc", mpipe_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		/* Pass GXIO error codes through; map anything else. */
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	/* Map in the MMIO space. */
+	context->mmio_cfg_base = (void __force *)
+		iorpc_ioremap(fd, HV_MPIPE_CONFIG_MMIO_OFFSET,
+			      HV_MPIPE_CONFIG_MMIO_SIZE);
+	if (context->mmio_cfg_base == NULL)
+		goto cfg_failed;
+
+	context->mmio_fast_base = (void __force *)
+		iorpc_ioremap(fd, HV_MPIPE_FAST_MMIO_OFFSET,
+			      HV_MPIPE_FAST_MMIO_SIZE);
+	if (context->mmio_fast_base == NULL)
+		goto fast_failed;
+
+	/*
+	 * Initialize the stacks: 255 marks a slot as having no buffer
+	 * stack (gxio_mpipe_rules_begin() tests for this value).
+	 */
+	for (i = 0; i < 8; i++)
+		context->__stacks.stacks[i] = 255;
+
+	return 0;
+
+fast_failed:
+	iounmap((void __force __iomem *)(context->mmio_cfg_base));
+cfg_failed:
+	hv_dev_close(context->fd);
+	return -ENODEV;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init);
+
+/*
+ * Supported buffer sizes, indexed by gxio_mpipe_buffer_size_enum_t value.
+ * The table is read-only, hence const.
+ */
+static const int16_t gxio_mpipe_buffer_sizes[8] =
+	{ 128, 256, 512, 1024, 1664, 4096, 10368, 16384 };
+
+/*
+ * Return the index of the first table entry that is >= size.  Any size
+ * larger than the first seven entries maps to the last index (7), even if
+ * it also exceeds the final entry.
+ */
+gxio_mpipe_buffer_size_enum_t gxio_mpipe_buffer_size_to_buffer_size_enum(size_t
+									 size)
+{
+	int i;
+
+	for (i = 0; i < 7; i++) {
+		if (size <= gxio_mpipe_buffer_sizes[i])
+			break;
+	}
+
+	return i;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_buffer_size_to_buffer_size_enum);
+
+/*
+ * Look up the buffer size for buffer_size_enum in
+ * gxio_mpipe_buffer_sizes[]; values above 7 are clamped to 7.
+ */
+size_t gxio_mpipe_buffer_size_enum_to_buffer_size(gxio_mpipe_buffer_size_enum_t
+						  buffer_size_enum)
+{
+	return gxio_mpipe_buffer_sizes[buffer_size_enum > 7 ?
+				       7 : buffer_size_enum];
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_buffer_size_enum_to_buffer_size);
+
+/*
+ * Return the number of bytes needed for a buffer stack holding "buffers"
+ * entries: the entry count is rounded up to whole cachelines of 12 entries
+ * each and multiplied by CHIP_L2_LINE_SIZE().
+ */
+size_t gxio_mpipe_calc_buffer_stack_bytes(unsigned long buffers)
+{
+	/* Number of buffer entries per cacheline. */
+	const int per_line = 12;
+
+	/* Round up to a whole number of cachelines, then convert to bytes. */
+	unsigned long nlines = (buffers + per_line - 1) / per_line;
+
+	return nlines * CHIP_L2_LINE_SIZE();
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_calc_buffer_stack_bytes);
+
+/*
+ * Zero the caller's memory, hand it to
+ * gxio_mpipe_init_buffer_stack_aux(), and on success remember the stack
+ * number in context->__stacks under buffer_size_enum.
+ *
+ * Returns 0 on success or the negative result of the aux call.
+ */
+int gxio_mpipe_init_buffer_stack(gxio_mpipe_context_t *context,
+				 unsigned int stack,
+				 gxio_mpipe_buffer_size_enum_t
+				 buffer_size_enum, void *mem, size_t mem_size,
+				 unsigned int mem_flags)
+{
+	int rc;
+
+	memset(mem, 0, mem_size);
+
+	rc = gxio_mpipe_init_buffer_stack_aux(context, mem, mem_size,
+					      mem_flags, stack,
+					      buffer_size_enum);
+	if (rc >= 0) {
+		/* Save the stack; non-negative results collapse to 0. */
+		context->__stacks.stacks[buffer_size_enum] = stack;
+		rc = 0;
+	}
+
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_buffer_stack);
+
+/*
+ * Thin wrapper: forwards to gxio_mpipe_init_notif_ring_aux() with the
+ * arguments reordered (memory description first, ring last) and returns
+ * its result.
+ */
+int gxio_mpipe_init_notif_ring(gxio_mpipe_context_t *context,
+			       unsigned int ring,
+			       void *mem, size_t mem_size,
+			       unsigned int mem_flags)
+{
+	int rc = gxio_mpipe_init_notif_ring_aux(context, mem, mem_size,
+						mem_flags, ring);
+
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_notif_ring);
+
+/*
+ * Initialize notification group @group with rings [ring, ring + num_rings)
+ * and then initialize buckets [bucket, bucket + num_buckets) in @mode,
+ * assigning rings to buckets round-robin.
+ *
+ * Returns 0 on success, -EINVAL if buckets are requested while num_rings
+ * is zero, or the first nonzero result from gxio_mpipe_init_notif_group()
+ * or gxio_mpipe_init_bucket().
+ */
+int gxio_mpipe_init_notif_group_and_buckets(gxio_mpipe_context_t *context,
+					    unsigned int group,
+					    unsigned int ring,
+					    unsigned int num_rings,
+					    unsigned int bucket,
+					    unsigned int num_buckets,
+					    gxio_mpipe_bucket_mode_t mode)
+{
+	unsigned int i;
+	int result;
+
+	gxio_mpipe_bucket_info_t bucket_info = { {
+						  .group = group,
+						  .mode = mode,
+						  }
+	};
+
+	gxio_mpipe_notif_group_bits_t bits = { {0} };
+
+	/* The round-robin "i % num_rings" below would divide by zero. */
+	if (num_rings == 0 && num_buckets != 0)
+		return -EINVAL;
+
+	for (i = 0; i < num_rings; i++)
+		gxio_mpipe_notif_group_add_ring(&bits, ring + i);
+
+	result = gxio_mpipe_init_notif_group(context, group, bits);
+	if (result != 0)
+		return result;
+
+	for (i = 0; i < num_buckets; i++) {
+		/* Spread the buckets across the group's rings. */
+		bucket_info.notifring = ring + (i % num_rings);
+
+		result = gxio_mpipe_init_bucket(context, bucket + i,
+						bucket_info);
+		if (result != 0)
+			return result;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_notif_group_and_buckets);
+
+/*
+ * Zero the caller's ring memory, then forward to
+ * gxio_mpipe_init_edma_ring_aux() and return its result.
+ */
+int gxio_mpipe_init_edma_ring(gxio_mpipe_context_t *context,
+			      unsigned int ring, unsigned int channel,
+			      void *mem, size_t mem_size,
+			      unsigned int mem_flags)
+{
+	int rc;
+
+	memset(mem, 0, mem_size);
+	rc = gxio_mpipe_init_edma_ring_aux(context, mem, mem_size, mem_flags,
+					   ring, channel);
+
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_edma_ring);
+
+/*
+ * Reset @rules to an empty rule list (all bytes of rules->list zeroed)
+ * bound to @context.
+ */
+void gxio_mpipe_rules_init(gxio_mpipe_rules_t *rules,
+			   gxio_mpipe_context_t *context)
+{
+	memset(&rules->list, 0, sizeof(rules->list));
+	rules->context = context;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_init);
+
+/*
+ * Begin a new rule at the end of rules->list.
+ *
+ * The new rule starts at the first suitably aligned offset at or after
+ * list->tail, uses buckets [bucket, bucket + num_buckets), and takes its
+ * buffer stacks from @stacks, or from the context's stacks when @stacks is
+ * NULL.
+ *
+ * Returns 0 on success, GXIO_MPIPE_ERR_RULES_FULL if the rule does not fit
+ * in the list, or GXIO_MPIPE_ERR_RULES_INVALID if num_buckets is not a
+ * power of two or no buffer stack is available.  Note that the "no stack"
+ * error is detected after list->head has already been advanced.
+ */
+int gxio_mpipe_rules_begin(gxio_mpipe_rules_t *rules,
+			   unsigned int bucket, unsigned int num_buckets,
+			   gxio_mpipe_rules_stacks_t *stacks)
+{
+	int i;
+	int stack = 255;
+
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	/* Current rule, i.e. the one preceding the rule being started. */
+	gxio_mpipe_rules_rule_t *rule =
+		(gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	unsigned int head = list->tail;
+
+	/*
+	 * Align next rule properly.
+	 * Note that "dmacs_and_vlans" will also be aligned.
+	 */
+	unsigned int pad = 0;
+	while (((head + pad) % __alignof__(gxio_mpipe_rules_rule_t)) != 0)
+		pad++;
+
+	/*
+	 * Verify room.
+	 * ISSUE: Mark rules as broken on error?
+	 */
+	if (head + pad + sizeof(*rule) >= sizeof(list->rules))
+		return GXIO_MPIPE_ERR_RULES_FULL;
+
+	/* Verify num_buckets is a power of 2. */
+	if (__builtin_popcount(num_buckets) != 1)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* Add padding to previous rule. */
+	rule->size += pad;
+
+	/* Start a new rule. */
+	list->head = head + pad;
+
+	rule = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	/* Default some values. */
+	rule->headroom = 2;
+	rule->tailroom = 0;
+	rule->capacity = 16384;
+
+	/* Save the bucket info. */
+	rule->bucket_mask = num_buckets - 1;
+	rule->bucket_first = bucket;
+
+	/*
+	 * Walk from the last slot down to the first, carrying the most
+	 * recently seen valid (non-255) stack downward, so each slot gets
+	 * the nearest valid stack at or above it (or 255 if there is none).
+	 */
+	for (i = 8 - 1; i >= 0; i--) {
+		int maybe =
+			stacks ? stacks->stacks[i] : rules->context->__stacks.
+			stacks[i];
+		if (maybe != 255)
+			stack = maybe;
+		rule->stacks.stacks[i] = stack;
+	}
+
+	/* No slot held a valid stack at all. */
+	if (stack == 255)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/*
+	 * NOTE: Only entries at the end of the array can be 255.
+	 * Each such entry is overwritten with "stack" and the capacity is
+	 * lowered; since i descends, the final capacity is the buffer size
+	 * of the highest slot that had a valid stack.
+	 * NOTE(review): "stack" here is the last value assigned above, i.e.
+	 * the valid stack with the lowest index -- confirm that is intended.
+	 */
+	for (i = 8 - 1; i > 0; i--) {
+		if (rule->stacks.stacks[i] == 255) {
+			rule->stacks.stacks[i] = stack;
+			rule->capacity =
+				gxio_mpipe_buffer_size_enum_to_buffer_size(i -
+									   1);
+		}
+	}
+
+	/* The rule has no dmacs/vlans yet, so it is just the fixed part. */
+	rule->size = sizeof(*rule);
+	list->tail = list->head + rule->size;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_begin);
+
+/*
+ * Add "channel" to the channel bits of the rule at the list head.
+ *
+ * Returns GXIO_MPIPE_ERR_RULES_INVALID if "channel" is 32 or more,
+ * GXIO_MPIPE_ERR_RULES_EMPTY if no rule has been begun, and 0 otherwise.
+ */
+int gxio_mpipe_rules_add_channel(gxio_mpipe_rules_t *rules,
+				 unsigned int channel)
+{
+	gxio_mpipe_rules_list_t *rule_list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* Only channels 0..31 are accepted. */
+	if (!(channel < 32))
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* A zero tail means that no rule has been begun yet. */
+	if (!rule_list->tail)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (rule_list->rules + rule_list->head);
+	cur->channel_bits = cur->channel_bits | (1UL << channel);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_add_channel);
+
+/*
+ * Set the headroom of the rule at the list head.
+ *
+ * Returns GXIO_MPIPE_ERR_RULES_EMPTY if no rule has been begun, else 0.
+ */
+int gxio_mpipe_rules_set_headroom(gxio_mpipe_rules_t *rules, uint8_t headroom)
+{
+	gxio_mpipe_rules_list_t *rule_list = &rules->list;
+	gxio_mpipe_rules_rule_t *cur;
+
+	/* A zero tail means that no rule has been begun yet. */
+	if (!rule_list->tail)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	cur = (gxio_mpipe_rules_rule_t *) (rule_list->rules + rule_list->head);
+	cur->headroom = headroom;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_set_headroom);
+
+/*
+ * Commit the accumulated rules through gxio_mpipe_commit_rules().  The
+ * size passed along is the offset of the "rules" array within the list
+ * plus the number of rule bytes in use ("tail").
+ */
+int gxio_mpipe_rules_commit(gxio_mpipe_rules_t *rules)
+{
+	gxio_mpipe_rules_list_t *rule_list = &rules->list;
+	unsigned int nbytes = offsetof(gxio_mpipe_rules_list_t, rules);
+
+	nbytes += rule_list->tail;
+
+	return gxio_mpipe_commit_rules(rules->context, rule_list, nbytes);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_commit);
+
+/*
+ * Initialize the ingress queue "iqueue" over caller-supplied memory and
+ * pass that memory to gxio_mpipe_init_notif_ring() for ring "ring".
+ *
+ * "mem_size" is not validated here; gxio_mpipe_init_notif_ring() is
+ * relied upon to reject an illegal size, and its result is returned.
+ * "iqueue" and the first word of "mem" are written before that call, so
+ * both are modified even when an error is returned.
+ */
+int gxio_mpipe_iqueue_init(gxio_mpipe_iqueue_t *iqueue,
+			   gxio_mpipe_context_t *context,
+			   unsigned int ring,
+			   void *mem, size_t mem_size, unsigned int mem_flags)
+{
+	/* The init call below will verify that "mem_size" is legal. */
+	unsigned int num_entries = mem_size / sizeof(gxio_mpipe_idesc_t);
+
+	iqueue->context = context;
+	iqueue->idescs = (gxio_mpipe_idesc_t *)mem;
+	iqueue->ring = ring;
+	iqueue->num_entries = num_entries;
+	iqueue->mask_num_entries = num_entries - 1;
+
+	/*
+	 * __builtin_ctz(0) is undefined, and "mem_size" has not been checked
+	 * yet, so a buffer smaller than one descriptor must not reach it.
+	 * The value stored for that case does not matter: the init call
+	 * below is expected to reject such a size.
+	 */
+	iqueue->log2_num_entries =
+		num_entries ? __builtin_ctz(num_entries) : 0;
+	iqueue->head = 1;
+#ifdef __BIG_ENDIAN__
+	iqueue->swapped = 0;
+#endif
+
+	/* Initialize the "tail". */
+	__gxio_mmio_write(mem, iqueue->head);
+
+	return gxio_mpipe_init_notif_ring(context, ring, mem, mem_size,
+					  mem_flags);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_iqueue_init);
+
+/*
+ * Initialize the egress queue "equeue" over caller-supplied memory.
+ *
+ * The eDMA ring is initialized first; a negative result from that step
+ * is returned as-is and "equeue" is left untouched.  Otherwise "equeue"
+ * is cleared, its DMA queue is initialized with an address derived from
+ * context->mmio_fast_base, and the descriptor bookkeeping is filled in.
+ * Returns 0 on success.
+ */
+int gxio_mpipe_equeue_init(gxio_mpipe_equeue_t *equeue,
+			   gxio_mpipe_context_t *context,
+			   unsigned int edma_ring_id,
+			   unsigned int channel,
+			   void *mem, unsigned int mem_size,
+			   unsigned int mem_flags)
+{
+	/* Offset used to read number of completed commands. */
+	MPIPE_EDMA_POST_REGION_ADDR_t post_addr;
+	unsigned int entries;
+	int err;
+
+	/* The ring init call verifies that "mem_size" is legal. */
+	err = gxio_mpipe_init_edma_ring(context, edma_ring_id, channel,
+					mem, mem_size, mem_flags);
+	if (err < 0)
+		return err;
+
+	entries = mem_size / sizeof(gxio_mpipe_edesc_t);
+
+	memset(equeue, 0, sizeof(*equeue));
+
+	/* The region is expressed relative to the iDMA region value. */
+	post_addr.word = 0;
+	post_addr.region = MPIPE_MMIO_ADDR__REGION_VAL_EDMA -
+			   MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	post_addr.ring = edma_ring_id;
+
+	__gxio_dma_queue_init(&equeue->dma_queue,
+			      context->mmio_fast_base + post_addr.word,
+			      entries);
+
+	equeue->edescs = mem;
+	equeue->mask_num_entries = entries - 1;
+	equeue->log2_num_entries = __builtin_ctz(entries);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_equeue_init);
+
+/*
+ * Return the internal context used for link name access, or NULL if no
+ * mPIPE info device could be opened.  This context is special in that it
+ * is not associated with an mPIPE service domain.
+ *
+ * The open is attempted only once, on the first call; later calls return
+ * the outcome of that first attempt.  A function-local mutex serializes
+ * the attempt.
+ */
+static gxio_mpipe_context_t *_gxio_get_link_context(void)
+{
+	static gxio_mpipe_context_t link_context;
+	static gxio_mpipe_context_t *link_contextp;
+	static int attempted;
+	static DEFINE_MUTEX(link_lock);
+	int shim;
+
+	mutex_lock(&link_lock);
+
+	if (attempted)
+		goto unlock;
+	attempted = 1;
+
+	/*
+	 * "4" here is the maximum possible number of mPIPE shims; it's
+	 * an exaggeration but we shouldn't ever go beyond 2 anyway.
+	 */
+	for (shim = 0; shim < 4; shim++) {
+		char path[80];
+
+		snprintf(path, sizeof(path), "mpipe/%d/iorpc_info", shim);
+		link_context.fd = hv_dev_open((HV_VirtAddr) path, 0);
+		if (link_context.fd >= 0) {
+			/* First shim that opens wins. */
+			link_contextp = &link_context;
+			break;
+		}
+	}
+
+unlock:
+	mutex_unlock(&link_lock);
+
+	return link_contextp;
+}
+
+/*
+ * Look up the name and MAC address of the link at index "idx".
+ *
+ * When gxio_mpipe_info_enumerate_aux() returns a non-negative value, the
+ * link name is copied to "link_name" and the MAC address to "link_mac".
+ * "link_name" must have room for sizeof(name.name) bytes and "link_mac"
+ * for sizeof(mac.mac) bytes.  The copied name is always NUL-terminated,
+ * which means a name filling the whole field loses its last character
+ * (gxio_mpipe_link_open() truncates names the same way).
+ *
+ * Returns GXIO_ERR_NO_DEVICE if no link context is available, otherwise
+ * the result of gxio_mpipe_info_enumerate_aux().
+ */
+int gxio_mpipe_link_enumerate_mac(int idx, char *link_name, uint8_t *link_mac)
+{
+	int rv;
+	_gxio_mpipe_link_name_t name;
+	_gxio_mpipe_link_mac_t mac;
+
+	gxio_mpipe_context_t *context = _gxio_get_link_context();
+	if (!context)
+		return GXIO_ERR_NO_DEVICE;
+
+	rv = gxio_mpipe_info_enumerate_aux(context, idx, &name, &mac);
+	if (rv >= 0) {
+		strncpy(link_name, name.name, sizeof(name.name));
+		/*
+		 * strncpy() leaves the destination unterminated when the
+		 * source fills the whole field; terminate explicitly so
+		 * callers can treat "link_name" as a C string.
+		 */
+		link_name[sizeof(name.name) - 1] = '\0';
+		memcpy(link_mac, mac.mac, sizeof(mac.mac));
+	}
+
+	return rv;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_enumerate_mac);
+
+/*
+ * Open the link called "link_name" and fill in "link".
+ *
+ * The name is copied into a fixed-size request structure and forcibly
+ * NUL-terminated, so over-long names are truncated.  A negative result
+ * from gxio_mpipe_link_open_aux() is returned unchanged and "link" is
+ * not touched.  Otherwise the result is unpacked (MAC in the low 8 bits,
+ * channel in the bits above) and 0 is returned.
+ */
+int gxio_mpipe_link_open(gxio_mpipe_link_t *link,
+			 gxio_mpipe_context_t *context, const char *link_name,
+			 unsigned int flags)
+{
+	_gxio_mpipe_link_name_t request;
+	int packed;
+
+	strncpy(request.name, link_name, sizeof(request.name));
+	request.name[GXIO_MPIPE_LINK_NAME_LEN - 1] = '\0';
+
+	packed = gxio_mpipe_link_open_aux(context, request, flags);
+	if (packed >= 0) {
+		link->context = context;
+		link->channel = packed >> 8;
+		link->mac = packed & 0xFF;
+		return 0;
+	}
+
+	return packed;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_open);
+
+/*
+ * Close "link" by passing its context and MAC to
+ * gxio_mpipe_link_close_aux(); that call's result is returned.
+ */
+int gxio_mpipe_link_close(gxio_mpipe_link_t *link)
+{
+	int rv = gxio_mpipe_link_close_aux(link->context, link->mac);
+
+	return rv;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_close);
diff --git a/arch/tile/include/arch/mpipe.h b/arch/tile/include/arch/mpipe.h
new file mode 100644
index 0000000..8a33912
--- /dev/null
+++ b/arch/tile/include/arch/mpipe.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_H__
+#define __ARCH_MPIPE_H__
+
+#include <arch/abi.h>
+#include <arch/mpipe_def.h>
+
+#ifndef __ASSEMBLER__
+
+/*
+ * MMIO Ingress DMA Release Region Address.
+ * This is a description of the physical addresses used to manipulate ingress
+ * credit counters.  Accesses to this address space should use an address of
+ * this form and a value like that specified in IDMA_RELEASE_REGION_VAL.
+ *
+ * The bit-field widths below add up to 64.  The two preprocessor branches
+ * declare the same fields in opposite order, and "word" overlays them so
+ * the whole address can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0  : 3;
+    /* NotifRing to be released */
+    uint_reg_t ring          : 8;
+    /* Bucket to be released */
+    uint_reg_t bucket        : 13;
+    /* Enable NotifRing release */
+    uint_reg_t ring_enable   : 1;
+    /* Enable Bucket release */
+    uint_reg_t bucket_enable : 1;
+    /*
+     * This field of the address selects the region (address space) to be
+     * accessed.  For the iDMA release region, this field must be 4.
+     */
+    uint_reg_t region        : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_1  : 6;
+    /* This field of the address indexes the 32 entry service domain table. */
+    uint_reg_t svc_dom       : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_2  : 24;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t __reserved_2  : 24;
+    uint_reg_t svc_dom       : 5;
+    uint_reg_t __reserved_1  : 6;
+    uint_reg_t region        : 3;
+    uint_reg_t bucket_enable : 1;
+    uint_reg_t ring_enable   : 1;
+    uint_reg_t bucket        : 13;
+    uint_reg_t ring          : 8;
+    uint_reg_t __reserved_0  : 3;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_IDMA_RELEASE_REGION_ADDR_t;
+
+/*
+ * MMIO Ingress DMA Release Region Value - Release NotifRing and/or Bucket.
+ * Provides release of the associated NotifRing.  The address of the MMIO
+ * operation is described in IDMA_RELEASE_REGION_ADDR.
+ *
+ * The bit-field widths below add up to 64; "word" overlays the fields so
+ * the whole value can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Number of packets being released.  The load balancer's count of
+     * inflight packets will be decremented by this amount for the associated
+     * Bucket and/or NotifRing
+     */
+    uint_reg_t count      : 16;
+    /* Reserved. */
+    uint_reg_t __reserved : 48;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t __reserved : 48;
+    uint_reg_t count      : 16;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_IDMA_RELEASE_REGION_VAL_t;
+
+/*
+ * MMIO Buffer Stack Manager Region Address.
+ * This MMIO region is used for posting or fetching buffers to/from the
+ * buffer stack manager.  On an MMIO load, this pops a buffer descriptor from
+ * the top of stack if one is available.  On an MMIO store, this pushes a
+ * buffer to the stack.  The value read or written is described in
+ * BSM_REGION_VAL.
+ *
+ * The bit-field widths below add up to 64.  The two preprocessor branches
+ * declare the same fields in opposite order, and "word" overlays them so
+ * the whole address can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 3;
+    /* BufferStack being accessed. */
+    uint_reg_t stack        : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 18;
+    /*
+     * This field of the address selects the region (address space) to be
+     * accessed.  For the buffer stack manager region, this field must be 6.
+     */
+    uint_reg_t region       : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 6;
+    /* This field of the address indexes the 32 entry service domain table. */
+    uint_reg_t svc_dom      : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_3 : 24;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t __reserved_3 : 24;
+    uint_reg_t svc_dom      : 5;
+    uint_reg_t __reserved_2 : 6;
+    uint_reg_t region       : 3;
+    uint_reg_t __reserved_1 : 18;
+    uint_reg_t stack        : 5;
+    uint_reg_t __reserved_0 : 3;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_BSM_REGION_ADDR_t;
+
+/*
+ * MMIO Buffer Stack Manager Region Value.
+ * This MMIO region is used for posting or fetching buffers to/from the
+ * buffer stack manager.  On an MMIO load, this pops a buffer descriptor from
+ * the top of stack if one is available. On an MMIO store, this pushes a
+ * buffer to the stack.  The address of the MMIO operation is described in
+ * BSM_REGION_ADDR.
+ *
+ * The bit-field widths below add up to 64.  The two preprocessor branches
+ * declare the same fields in opposite order, and "word" overlays them so
+ * the whole value can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 7;
+    /*
+     * Base virtual address of the buffer.  Must be sign extended by consumer.
+     * Declared as a signed bit-field, unlike the other fields here.
+     */
+    int_reg_t va           : 35;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 6;
+    /*
+     * Index of the buffer stack to which this buffer belongs.  Ignored on
+     * writes since the offset bits specify the stack being accessed.
+     */
+    uint_reg_t stack_idx    : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 5;
+    /*
+     * Reads as one to indicate that this is a hardware managed buffer.
+     * Ignored on writes since all buffers on a given stack are the same size.
+     */
+    uint_reg_t hwb          : 1;
+    /*
+     * Encoded size of buffer (ignored on writes):
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t size         : 3;
+    /*
+     * Valid indication for the buffer.  Ignored on writes.
+     * 0 : Valid buffer descriptor popped from stack.
+     * 3 : Could not pop a buffer from the stack.  Either the stack is empty,
+     * or the hardware's prefetch buffer is empty for this stack.
+     */
+    uint_reg_t c            : 2;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t __reserved_2 : 5;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_1 : 6;
+    int_reg_t va           : 35;
+    uint_reg_t __reserved_0 : 7;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_BSM_REGION_VAL_t;
+
+/*
+ * MMIO Egress DMA Post Region Address.
+ * Used to post descriptor locations to the eDMA descriptor engine.  The
+ * value to be written is described in EDMA_POST_REGION_VAL
+ *
+ * The bit-field widths below add up to 64.  The two preprocessor branches
+ * declare the same fields in opposite order, and "word" overlays them so
+ * the whole address can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 3;
+    /* eDMA ring being accessed */
+    uint_reg_t ring         : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 18;
+    /*
+     * This field of the address selects the region (address space) to be
+     * accessed.  For the egress DMA post region, this field must be 5.
+     */
+    uint_reg_t region       : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 6;
+    /* This field of the address indexes the 32 entry service domain table. */
+    uint_reg_t svc_dom      : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_3 : 24;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t __reserved_3 : 24;
+    uint_reg_t svc_dom      : 5;
+    uint_reg_t __reserved_2 : 6;
+    uint_reg_t region       : 3;
+    uint_reg_t __reserved_1 : 18;
+    uint_reg_t ring         : 5;
+    uint_reg_t __reserved_0 : 3;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_EDMA_POST_REGION_ADDR_t;
+
+/*
+ * MMIO Egress DMA Post Region Value.
+ * Used to post descriptor locations to the eDMA descriptor engine.  The
+ * address is described in EDMA_POST_REGION_ADDR.
+ *
+ * Each field has a different meaning on writes and on reads, as described
+ * per field.  The bit-field widths below add up to 64; "word" overlays
+ * the fields so the whole value can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * For writes, this specifies the current ring tail pointer prior to any
+     * post.  For example, to post 1 or more descriptors starting at location
+     * 23, this would contain 23 (not 24).  On writes, this index must be
+     * masked based on the ring size.  The new tail pointer after this post
+     * is COUNT+RING_IDX (masked by the ring size).
+     *
+     * For reads, this provides the hardware descriptor fetcher's head
+     * pointer.  The descriptors prior to the head pointer, however, may not
+     * yet have been processed so this indicator is only used to determine
+     * how full the ring is and if software may post more descriptors.
+     */
+    uint_reg_t ring_idx   : 16;
+    /*
+     * For writes, this specifies number of contiguous descriptors that are
+     * being posted.  Software may post up to RingSize descriptors with a
+     * single MMIO store.  A zero in this field on a write will "wake up" an
+     * eDMA ring and cause it to fetch descriptors regardless of the
+     * hardware's current view of the state of the tail pointer.
+     *
+     * For reads, this field provides a rolling count of the number of
+     * descriptors that have been completely processed.  This may be used by
+     * software to determine when buffers associated with a descriptor may be
+     * returned or reused.  When the ring's flush bit is cleared by software
+     * (after having been set by HW or SW), the COUNT will be cleared.
+     */
+    uint_reg_t count      : 16;
+    /*
+     * For writes, this specifies the generation number of the tail being
+     * posted. Note that if tail+cnt wraps to the beginning of the ring, the
+     * eDMA hardware assumes that the descriptors posted at the beginning of
+     * the ring are also valid so it is okay to post around the wrap point.
+     *
+     * For reads, this is the current generation number.  Valid descriptors
+     * will have the inverse of this generation number.
+     */
+    uint_reg_t gen        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved : 31;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t __reserved : 31;
+    uint_reg_t gen        : 1;
+    uint_reg_t count      : 16;
+    uint_reg_t ring_idx   : 16;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_EDMA_POST_REGION_VAL_t;
+
+/*
+ * Load Balancer Bucket Status Data.
+ * Read/Write data for load balancer Bucket-Status Table. 4160 entries
+ * indexed by LBL_INIT_CTL.IDX when LBL_INIT_CTL.STRUCT_SEL is BSTS_TBL
+ *
+ * The bit-field widths below add up to 64.  The two preprocessor branches
+ * declare the same fields in opposite order, and "word" overlays them so
+ * the whole entry can be read or written at once.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* NotifRing currently assigned to this bucket. */
+    uint_reg_t notifring  : 8;
+    /* Current reference count. */
+    uint_reg_t count      : 16;
+    /* Group associated with this bucket. */
+    uint_reg_t group      : 5;
+    /*
+     * Mode select for this bucket.  Presumably one of the
+     * MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_* values from
+     * <arch/mpipe_def.h>, going by their names.
+     */
+    uint_reg_t mode       : 3;
+    /* Reserved. */
+    uint_reg_t __reserved : 32;
+#else   /* __BIG_ENDIAN__ */
+    /* Same fields as above, declared in reverse order. */
+    uint_reg_t __reserved : 32;
+    uint_reg_t mode       : 3;
+    uint_reg_t group      : 5;
+    uint_reg_t count      : 16;
+    uint_reg_t notifring  : 8;
+#endif
+  };
+
+  /* Whole-register access to the fields above. */
+  uint_reg_t word;
+} MPIPE_LBL_INIT_DAT_BSTS_TBL_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_MPIPE_H__) */
diff --git a/arch/tile/include/arch/mpipe_constants.h b/arch/tile/include/arch/mpipe_constants.h
new file mode 100644
index 0000000..410a040
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_constants.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+
+#ifndef __ARCH_MPIPE_CONSTANTS_H__
+#define __ARCH_MPIPE_CONSTANTS_H__
+
+/*
+ * Numeric mPIPE constants.  Going by their names these are resource
+ * counts and sizes; the notes below only point out where a value lines
+ * up with a field width or comment in <arch/mpipe.h>.
+ */
+
+#define MPIPE_NUM_CLASSIFIERS 10
+#define MPIPE_CLS_MHZ 1200
+
+/* 32 values fit the 5-bit eDMA "ring" address field. */
+#define MPIPE_NUM_EDMA_RINGS 32
+
+#define MPIPE_NUM_SGMII_MACS 16
+#define MPIPE_NUM_XAUI_MACS 4
+#define MPIPE_NUM_LOOPBACK_CHANNELS 4
+#define MPIPE_NUM_NON_LB_CHANNELS 28
+
+#define MPIPE_NUM_IPKT_BLOCKS 1536
+
+/* Same count as the Bucket-Status Table entries noted in <arch/mpipe.h>. */
+#define MPIPE_NUM_BUCKETS 4160
+
+/* 256 values fit the 8-bit NotifRing fields. */
+#define MPIPE_NUM_NOTIF_RINGS 256
+
+/* 32 values fit the 5-bit bucket "group" field. */
+#define MPIPE_NUM_NOTIF_GROUPS 32
+
+/* An index of MPIPE_TLB_IDX_WIDTH (4) bits covers 16 values. */
+#define MPIPE_NUM_TLBS_PER_ASID 16
+#define MPIPE_TLB_IDX_WIDTH 4
+
+/* Same count as the "32 entry service domain table" in <arch/mpipe.h>. */
+#define MPIPE_MMIO_NUM_SVC_DOM 32
+
+#endif /* __ARCH_MPIPE_CONSTANTS_H__ */
diff --git a/arch/tile/include/arch/mpipe_def.h b/arch/tile/include/arch/mpipe_def.h
new file mode 100644
index 0000000..c3d3021
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_def.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_DEF_H__
+#define __ARCH_MPIPE_DEF_H__
+/*
+ * MMIO address "region" field: its shift and the per-region values.  The
+ * IDMA, EDMA and BSM values match the "this field must be 4/5/6" notes
+ * on the region fields in <arch/mpipe.h>.
+ */
+#define MPIPE_MMIO_ADDR__REGION_SHIFT 26
+#define MPIPE_MMIO_ADDR__REGION_VAL_CFG 0x0
+#define MPIPE_MMIO_ADDR__REGION_VAL_IDMA 0x4
+#define MPIPE_MMIO_ADDR__REGION_VAL_EDMA 0x5
+#define MPIPE_MMIO_ADDR__REGION_VAL_BSM 0x6
+/* Shift of the "va" field, which follows 7 reserved bits in BSM_REGION_VAL. */
+#define MPIPE_BSM_REGION_VAL__VA_SHIFT 7
+/*
+ * Buffer size encodings; each name carries the size in bytes.  The values
+ * agree with the "Encoded size of buffer" tables in the mPIPE headers.
+ */
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_128 0x0
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_256 0x1
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_512 0x2
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1024 0x3
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1664 0x4
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_4096 0x5
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_10368 0x6
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_16384 0x7
+/*
+ * Presumably the values for the 3-bit "mode" field of
+ * MPIPE_LBL_INIT_DAT_BSTS_TBL_t, going by the shared name prefix.
+ */
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_DFA 0x0
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_FIXED 0x1
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_ALWAYS_PICK 0x2
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY 0x3
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY_RAND 0x7
+#define MPIPE_LBL_NR_STATE__FIRST_WORD 0x2138
+#endif /* !defined(__ARCH_MPIPE_DEF_H__) */
diff --git a/arch/tile/include/arch/mpipe_shm.h b/arch/tile/include/arch/mpipe_shm.h
new file mode 100644
index 0000000..f2e9e12
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_shm.h
@@ -0,0 +1,509 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+
+#ifndef __ARCH_MPIPE_SHM_H__
+#define __ARCH_MPIPE_SHM_H__
+
+#include <arch/abi.h>
+#include <arch/mpipe_shm_def.h>
+
+#ifndef __ASSEMBLER__
+/**
+ * MPIPE eDMA Descriptor.
+ * The eDMA descriptor is written by software and consumed by hardware.  It
+ * is used to specify the location of egress packet data to be sent out of
+ * the chip via one of the packet interfaces.
+ *
+ * The descriptor is laid out as two groups of bit-fields ("Word 0" and
+ * "Word 1"), each adding up to 64 bits.  Within each group the two
+ * preprocessor branches declare the same fields in opposite order.
+ * "words" overlays both groups for whole-word access.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+    /* Word 0 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Generation number.  Used to indicate a valid descriptor in ring.  When
+     * a new descriptor is written into the ring, software must toggle this
+     * bit.  The net effect is that the GEN bit being written into new
+     * descriptors toggles each time the ring tail pointer wraps.
+     */
+    uint_reg_t gen        : 1;
+    /** Reserved.  Must be zero. */
+    uint_reg_t r0         : 7;
+    /** Checksum generation enabled for this transfer. */
+    uint_reg_t csum       : 1;
+    /**
+     * Nothing to be sent.  Used, for example, when software has dropped a
+     * packet but still wishes to return all of the associated buffers.
+     */
+    uint_reg_t ns         : 1;
+    /**
+     * Notification interrupt will be delivered when packet has been egressed.
+     */
+    uint_reg_t notif      : 1;
+    /**
+     * Boundary indicator.  When 1, this transfer includes the EOP for this
+     * command.  Must be clear on all but the last descriptor for an egress
+     * packet.
+     */
+    uint_reg_t bound      : 1;
+    /** Reserved.  Must be zero. */
+    uint_reg_t r1         : 4;
+    /**
+     * Number of bytes to be sent for this descriptor.  When zero, no data
+     * will be moved and the buffer descriptor will be ignored.  If the
+     * buffer descriptor indicates that it is chained, the low 7 bits of the
+     * VA indicate the offset within the first buffer (e.g. 127 bytes is the
+     * maximum offset into the first buffer).  If the size exceeds a single
+     * buffer, subsequent buffer descriptors will be fetched prior to
+     * processing the next eDMA descriptor in the ring.
+     */
+    uint_reg_t xfer_size  : 14;
+    /** Reserved.  Must be zero. */
+    uint_reg_t r2         : 2;
+    /**
+     * Destination of checksum relative to CSUM_START relative to the first
+     * byte moved by this descriptor.  Must be zero if CSUM=0 in this
+     * descriptor.  Must be less than XFER_SIZE (e.g. the first byte of the
+     * CSUM_DEST must be within the span of this descriptor).
+     */
+    uint_reg_t csum_dest  : 8;
+    /**
+     * Start byte of checksum relative to the first byte moved by this
+     * descriptor.  If this is not the first descriptor for the egress
+     * packet, CSUM_START is still relative to the first byte in this
+     * descriptor.  Must be zero if CSUM=0 in this descriptor.
+     */
+    uint_reg_t csum_start : 8;
+    /**
+     * Initial value for 16-bit 1's complement checksum if enabled via CSUM.
+     * Specified in network order.  That is, bits[7:0] will be added to the
+     * byte pointed to by CSUM_START and bits[15:8] will be added to the byte
+     * pointed to by CSUM_START+1 (with appropriate 1's complement carries).
+     * Must be zero if CSUM=0 in this descriptor.
+     */
+    uint_reg_t csum_seed  : 16;
+#else   /* __BIG_ENDIAN__ */
+    /* Same Word 0 fields as above, declared in reverse order. */
+    uint_reg_t csum_seed  : 16;
+    uint_reg_t csum_start : 8;
+    uint_reg_t csum_dest  : 8;
+    uint_reg_t r2         : 2;
+    uint_reg_t xfer_size  : 14;
+    uint_reg_t r1         : 4;
+    uint_reg_t bound      : 1;
+    uint_reg_t notif      : 1;
+    uint_reg_t ns         : 1;
+    uint_reg_t csum       : 1;
+    uint_reg_t r0         : 7;
+    uint_reg_t gen        : 1;
+#endif
+
+    /* Word 1 */
+
+#ifndef __BIG_ENDIAN__
+    /** Virtual address.  Must be sign extended by consumer. */
+    int_reg_t va           : 42;
+    /** Reserved. */
+    uint_reg_t __reserved_0 : 6;
+    /** Index of the buffer stack to which this buffer belongs. */
+    uint_reg_t stack_idx    : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_1 : 3;
+    /**
+     * Instance ID.  For devices that support more than one mPIPE instance,
+     * this field indicates the buffer owner.  If the INST field does not
+     * match the mPIPE's instance number when a packet is egressed, buffers
+     * with HWB set will be returned to the other mPIPE instance.
+     */
+    uint_reg_t inst         : 1;
+    /** Reserved. */
+    uint_reg_t __reserved_2 : 1;
+    /**
+     * Always set to one by hardware in iDMA packet descriptors.  For eDMA,
+     * indicates whether the buffer will be released to the buffer stack
+     * manager.  When 0, software is responsible for releasing the buffer.
+     */
+    uint_reg_t hwb          : 1;
+    /**
+     * Encoded size of buffer.  Set by the ingress hardware for iDMA packet
+     * descriptors.  For eDMA descriptors, indicates the buffer size if .c
+     * indicates a chained packet.  If an eDMA descriptor is not chained and
+     * the .hwb bit is not set, this field is ignored and the size is
+     * specified by the .xfer_size field.
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t size         : 3;
+    /**
+     * Chaining configuration for the buffer.  Indicates that an ingress
+     * packet or egress command is chained across multiple buffers, with each
+     * buffer's size indicated by the .size field.
+     */
+    uint_reg_t c            : 2;
+#else   /* __BIG_ENDIAN__ */
+    /* Same Word 1 fields as above, declared in reverse order. */
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t __reserved_2 : 1;
+    uint_reg_t inst         : 1;
+    uint_reg_t __reserved_1 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_0 : 6;
+    int_reg_t va           : 42;
+#endif
+
+  };
+
+  /** Word access */
+  uint_reg_t words[2];
+} MPIPE_EDMA_DESC_t;
+
+/**
+ * MPIPE Packet Descriptor.
+ * The packet descriptor is filled by the mPIPE's classification,
+ * load-balancing, and buffer management services.  Some fields are consumed
+ * by mPIPE hardware, and others are consumed by Tile software.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+    /* Word 0 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Notification ring into which this packet descriptor is written.
+     * Typically written by load balancer, but can be overridden by
+     * classification program if NR is asserted.
+     */
+    uint_reg_t notif_ring   : 8;
+    /** Source channel for this packet.  Written by mPIPE DMA hardware. */
+    uint_reg_t channel      : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /**
+     * MAC Error.
+     * Generated by the MAC interface.  Asserted if there was an overrun of
+     * the MAC's receive FIFO.  This condition generally only occurs if the
+     * mPIPE clock is running too slowly.
+     */
+    uint_reg_t me           : 1;
+    /**
+     * Truncation Error.
+     * Written by the iDMA hardware.  Asserted if packet was truncated due to
+     * insufficient space in iPkt buffer
+     */
+    uint_reg_t tr           : 1;
+    /**
+     * Written by the iDMA hardware.  Indicates the number of bytes written
+     * to Tile memory.  In general, this is the actual size of the packet as
+     * received from the MAC.  But if the packet is truncated due to running
+     * out of buffers or due to the iPkt buffer filling up, then the L2_SIZE
+     * will be reduced to reflect the actual number of valid bytes written to
+     * Tile memory.
+     */
+    uint_reg_t l2_size      : 14;
+    /**
+     * CRC Error.
+     * Generated by the MAC.  Asserted if MAC indicated an L2 CRC error or
+     * other L2 error (bad length etc.) on the packet.
+     */
+    uint_reg_t ce           : 1;
+    /**
+     * Cut Through.
+     * Written by the iDMA hardware.  Asserted if packet was not completely
+     * received before being sent to classifier.  L2_Size will indicate
+     * number of bytes received so far.
+     */
+    uint_reg_t ct           : 1;
+    /**
+     * Written by the classification program.  Used by the load balancer to
+     * select the ring into which this packet descriptor is written.
+     */
+    uint_reg_t bucket_id    : 13;
+    /** Reserved. */
+    uint_reg_t __reserved_1 : 3;
+    /**
+     * Checksum.
+     * Written by classification program.  When 1, the checksum engine will
+     * perform checksum based on the CSUM_SEED, CSUM_START, and CSUM_BYTES
+     * fields.  The result will be placed in CSUM_VAL.
+     */
+    uint_reg_t cs           : 1;
+    /**
+     * Notification Ring Select.
+     * Written by the classification program.  When 1, the NotifRingIDX is
+     * set by classification program rather than being set by load balancer.
+     */
+    uint_reg_t nr           : 1;
+    /**
+     * Written by classification program.  Indicates whether packet and
+     * descriptor should both be dropped, both be delivered, or only the
+     * descriptor should be delivered.
+     */
+    uint_reg_t dest         : 2;
+    /**
+     * General Purpose Sequence Number Enable.
+     * Written by the classification program.  When 1, the GP_SQN_SEL field
+     * contains the sequence number selector and the GP_SQN field will be
+     * replaced with the associated sequence number.  When clear, the GP_SQN
+     * field is left intact and be used as "Custom" bytes.
+     */
+    uint_reg_t sq           : 1;
+    /**
+     * TimeStamp Enable.
+     * Enable TimeStamp insertion.  When clear, timestamp field may be filled
+     * with custom data by classifier.  When set, hardware inserts the
+     * timestamp when the start of packet is received from the MAC.
+     */
+    uint_reg_t ts           : 1;
+    /**
+     * Packet Sequence Number Enable.
+     * Enable PacketSQN insertion.  When clear, PacketSQN field may be filled
+     * with custom data by classifier.  When set, hardware inserts the packet
+     * sequence number when the packet descriptor is written to a
+     * notification ring.
+     */
+    uint_reg_t ps           : 1;
+    /**
+     * Buffer Error.
+     * Written by the iDMA hardware.  Asserted if iDMA ran out of buffers
+     * while writing the packet. Software must still return any buffer
+     * descriptors whose C field indicates a valid descriptor was consumed.
+     */
+    uint_reg_t be           : 1;
+    /**
+     * Written by  the classification program.  The associated counter is
+     * incremented when the packet is sent.
+     */
+    uint_reg_t ctr0         : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_2 : 3;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 3;
+    uint_reg_t ctr0         : 5;
+    uint_reg_t be           : 1;
+    uint_reg_t ps           : 1;
+    uint_reg_t ts           : 1;
+    uint_reg_t sq           : 1;
+    uint_reg_t dest         : 2;
+    uint_reg_t nr           : 1;
+    uint_reg_t cs           : 1;
+    uint_reg_t __reserved_1 : 3;
+    uint_reg_t bucket_id    : 13;
+    uint_reg_t ct           : 1;
+    uint_reg_t ce           : 1;
+    uint_reg_t l2_size      : 14;
+    uint_reg_t tr           : 1;
+    uint_reg_t me           : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t channel      : 5;
+    uint_reg_t notif_ring   : 8;
+#endif
+
+    /* Word 1 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  The associated counter is
+     * incremented when the packet is sent.
+     */
+    uint_reg_t ctr1          : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_3  : 3;
+    /**
+     * Written by classification program.  Indicates the start byte for
+     * checksum.  Relative to 1st byte received from MAC.
+     */
+    uint_reg_t csum_start    : 8;
+    /**
+     * Checksum seed written by classification program.  Overwritten with
+     * resultant checksum if CS bit is asserted.  The endianness of the CSUM
+     * value bits when viewed by Tile software match the packet byte order.
+     * That is, bits[7:0] of the resulting checksum value correspond to
+     * earlier (more significant) bytes in the packet.  To avoid classifier
+     * software from having to byte swap the CSUM_SEED, the iDMA checksum
+     * engine byte swaps the classifier's result before seeding the checksum
+     * calculation.  Thus, the CSUM_START byte of packet data is added to
+     * bits[15:8] of the CSUM_SEED field generated by the classifier.  This
+     * byte swap will be visible to Tile software if the CS bit is clear.
+     */
+    uint_reg_t csum_seed_val : 16;
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom0       : 32;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom0       : 32;
+    uint_reg_t csum_seed_val : 16;
+    uint_reg_t csum_start    : 8;
+    uint_reg_t __reserved_3  : 3;
+    uint_reg_t ctr1          : 5;
+#endif
+
+    /* Word 2 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom1 : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom1 : 64;
+#endif
+
+    /* Word 3 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom2 : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom2 : 64;
+#endif
+
+    /* Word 4 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom3 : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom3 : 64;
+#endif
+
+    /* Word 5 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Sequence number applied when packet is distributed.   Classifier
+     * selects which sequence number is to be applied by writing the 13-bit
+     * SQN-selector into this field.
+     */
+    uint_reg_t gp_sqn     : 16;
+    /**
+     * Written by notification hardware.  The packet sequence number is
+     * incremented for each packet that wasn't dropped.
+     */
+    uint_reg_t packet_sqn : 48;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t packet_sqn : 48;
+    uint_reg_t gp_sqn     : 16;
+#endif
+
+    /* Word 6 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by hardware when the start-of-packet is received by the mPIPE
+     * from the MAC.  This is the nanoseconds part of the packet timestamp.
+     */
+    uint_reg_t time_stamp_ns  : 32;
+    /**
+     * Written by hardware when the start-of-packet is received by the mPIPE
+     * from the MAC.  This is the seconds part of the packet timestamp.
+     */
+    uint_reg_t time_stamp_sec : 32;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t time_stamp_sec : 32;
+    uint_reg_t time_stamp_ns  : 32;
+#endif
+
+    /* Word 7 */
+
+#ifndef __BIG_ENDIAN__
+    /** Virtual address.  Must be sign extended by consumer. */
+    int_reg_t va           : 42;
+    /** Reserved. */
+    uint_reg_t __reserved_4 : 6;
+    /** Index of the buffer stack to which this buffer belongs. */
+    uint_reg_t stack_idx    : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_5 : 3;
+    /**
+     * Instance ID.  For devices that support more than one mPIPE instance,
+     * this field indicates the buffer owner.  If the INST field does not
+     * match the mPIPE's instance number when a packet is egressed, buffers
+     * with HWB set will be returned to the other mPIPE instance.
+     */
+    uint_reg_t inst         : 1;
+    /** Reserved. */
+    uint_reg_t __reserved_6 : 1;
+    /**
+     * Always set to one by hardware in iDMA packet descriptors.  For eDMA,
+     * indicates whether the buffer will be released to the buffer stack
+     * manager.  When 0, software is responsible for releasing the buffer.
+     */
+    uint_reg_t hwb          : 1;
+    /**
+     * Encoded size of buffer.  Set by the ingress hardware for iDMA packet
+     * descriptors.  For eDMA descriptors, indicates the buffer size if .c
+     * indicates a chained packet.  If an eDMA descriptor is not chained and
+     * the .hwb bit is not set, this field is ignored and the size is
+     * specified by the .xfer_size field.
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t size         : 3;
+    /**
+     * Chaining configuration for the buffer.  Indicates that an ingress
+     * packet or egress command is chained across multiple buffers, with each
+     * buffer's size indicated by the .size field.
+     */
+    uint_reg_t c            : 2;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t __reserved_6 : 1;
+    uint_reg_t inst         : 1;
+    uint_reg_t __reserved_5 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_4 : 6;
+    int_reg_t va           : 42;
+#endif
+
+  };
+
+  /** Word access */
+  uint_reg_t words[8];
+} MPIPE_PDESC_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_MPIPE_SHM_H__) */
diff --git a/arch/tile/include/arch/mpipe_shm_def.h b/arch/tile/include/arch/mpipe_shm_def.h
new file mode 100644
index 0000000..6124d39
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_shm_def.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_SHM_DEF_H__
+#define __ARCH_MPIPE_SHM_DEF_H__
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_UNCHAINED 0x0 /* C field: unchained */
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_CHAINED 0x1 /* C field: chained */
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_NOT_RDY 0x2 /* C field: presumably "not ready" */
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_INVALID 0x3 /* C field: invalid */
+#endif /* !defined(__ARCH_MPIPE_SHM_DEF_H__) */
diff --git a/arch/tile/include/gxio/iorpc_mpipe.h b/arch/tile/include/gxio/iorpc_mpipe.h
new file mode 100644
index 0000000..f876da5
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_mpipe.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_MPIPE_LINUX_RPC_H__
+#define __GXIO_MPIPE_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+/* mPIPE IORPC opcodes; numbering has gaps (no 0x1202, 0x120d, 0x120e). */
+#define GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1200)
+#define GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x1201)
+
+#define GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1203)
+#define GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x1204)
+#define GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1205)
+#define GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1206)
+#define GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1207)
+#define GXIO_MPIPE_OP_INIT_NOTIF_GROUP IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1208)
+#define GXIO_MPIPE_OP_ALLOC_BUCKETS    IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1209)
+#define GXIO_MPIPE_OP_INIT_BUCKET      IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120a)
+#define GXIO_MPIPE_OP_ALLOC_EDMA_RINGS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120b)
+#define GXIO_MPIPE_OP_INIT_EDMA_RING_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x120c)
+
+#define GXIO_MPIPE_OP_COMMIT_RULES     IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120f)
+#define GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1210)
+#define GXIO_MPIPE_OP_LINK_OPEN_AUX    IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1211)
+#define GXIO_MPIPE_OP_LINK_CLOSE_AUX   IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1212)
+
+#define GXIO_MPIPE_OP_ARM_POLLFD       IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9000)
+#define GXIO_MPIPE_OP_CLOSE_POLLFD     IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9001)
+#define GXIO_MPIPE_OP_GET_MMIO_BASE    IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_MPIPE_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+/* One int-returning prototype per GXIO_MPIPE_OP_* opcode defined above. */
+int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t *context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags);
+
+int gxio_mpipe_init_buffer_stack_aux(gxio_mpipe_context_t *context,
+				     void *mem_va, size_t mem_size,
+				     unsigned int mem_flags, unsigned int stack,
+				     unsigned int buffer_size_enum);
+
+
+int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t *context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags);
+
+int gxio_mpipe_init_notif_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				   size_t mem_size, unsigned int mem_flags,
+				   unsigned int ring);
+
+int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					    int inter_x, int inter_y,
+					    int inter_ipi, int inter_event,
+					    unsigned int ring);
+
+int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					   unsigned int ring);
+
+int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t *context,
+				  unsigned int count, unsigned int first,
+				  unsigned int flags);
+
+int gxio_mpipe_init_notif_group(gxio_mpipe_context_t *context,
+				unsigned int group,
+				gxio_mpipe_notif_group_bits_t bits);
+
+int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t *context, unsigned int count,
+			     unsigned int first, unsigned int flags);
+
+int gxio_mpipe_init_bucket(gxio_mpipe_context_t *context, unsigned int bucket,
+			   MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info);
+
+int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags);
+
+int gxio_mpipe_init_edma_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				  size_t mem_size, unsigned int mem_flags,
+				  unsigned int ring, unsigned int channel);
+
+
+int gxio_mpipe_commit_rules(gxio_mpipe_context_t *context, const void *blob,
+			    size_t blob_size);
+
+int gxio_mpipe_register_client_memory(gxio_mpipe_context_t *context,
+				      unsigned int iotlb, HV_PTE pte,
+				      unsigned int flags);
+
+int gxio_mpipe_link_open_aux(gxio_mpipe_context_t *context,
+			     _gxio_mpipe_link_name_t name, unsigned int flags);
+
+int gxio_mpipe_link_close_aux(gxio_mpipe_context_t *context, int mac);
+
+
+int gxio_mpipe_arm_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie);
+
+int gxio_mpipe_close_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie);
+
+int gxio_mpipe_get_mmio_base(gxio_mpipe_context_t *context, HV_PTE *base);
+
+int gxio_mpipe_check_mmio_offset(gxio_mpipe_context_t *context,
+				 unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_MPIPE_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_mpipe_info.h b/arch/tile/include/gxio/iorpc_mpipe_info.h
new file mode 100644
index 0000000..0bcf3f7
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_mpipe_info.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_MPIPE_INFO_LINUX_RPC_H__
+#define __GXIO_MPIPE_INFO_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+/* mPIPE "info" IORPC opcodes, built as IORPC_OPCODE(format, number). */
+#define GXIO_MPIPE_INFO_OP_ENUMERATE_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1251)
+#define GXIO_MPIPE_INFO_OP_GET_MMIO_BASE IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+/* One int-returning prototype per GXIO_MPIPE_INFO_OP_* opcode above. */
+int gxio_mpipe_info_enumerate_aux(gxio_mpipe_info_context_t *context,
+				  unsigned int idx,
+				  _gxio_mpipe_link_name_t *name,
+				  _gxio_mpipe_link_mac_t *mac);
+
+int gxio_mpipe_info_get_mmio_base(gxio_mpipe_info_context_t *context,
+				  HV_PTE *base);
+
+int gxio_mpipe_info_check_mmio_offset(gxio_mpipe_info_context_t *context,
+				      unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_MPIPE_INFO_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/mpipe.h b/arch/tile/include/gxio/mpipe.h
new file mode 100644
index 0000000..8c25f8e
--- /dev/null
+++ b/arch/tile/include/gxio/mpipe.h
@@ -0,0 +1,1653 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_MPIPE_H_
+#define _GXIO_MPIPE_H_
+
+/*
+ *
+ * An API for allocating, configuring, and manipulating mPIPE hardware
+ * resources.
+ */
+
+#include "common.h"
+#include "dma_queue.h"
+
+#include <arch/mpipe_def.h>
+#include <arch/mpipe_shm.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * The TILE-Gx mPIPE(TM) shim provides Ethernet connectivity, packet
+ * classification, and packet load balancing services.  The
+ * gxio_mpipe_ API, declared in <gxio/mpipe.h>, allows applications to
+ * allocate mPIPE IO channels, configure packet distribution
+ * parameters, and send and receive Ethernet packets.  The API is
+ * designed to be a minimal wrapper around the mPIPE hardware, making
+ * system calls only where necessary to preserve inter-process
+ * protection guarantees.
+ *
+ * The APIs described below allow the programmer to allocate and
+ * configure mPIPE resources.  As described below, the mPIPE is a
+ * single shared hardware device that provides partitionable resources
+ * that are shared between all applications in the system.  The
+ * gxio_mpipe_ API allows userspace code to make resource request
+ * calls to the hypervisor, which in turn keeps track of the
+ * resources in use by all applications, maintains protection
+ * guarantees, and resets resources upon application shutdown.
+ *
+ * We strongly recommend reading the mPIPE section of the IO Device
+ * Guide (UG404) before working with this API.  Most functions in the
+ * gxio_mpipe_ API are directly analogous to hardware interfaces and
+ * the documentation assumes that the reader understands those
+ * hardware interfaces.
+ *
+ * @section mpipe__ingress mPIPE Ingress Hardware Resources
+ *
+ * The mPIPE ingress hardware provides extensive hardware offload for
+ * tasks like packet header parsing, load balancing, and memory
+ * management.  This section provides a brief introduction to the
+ * hardware components and the gxio_mpipe_ calls used to manage them;
+ * see the IO Device Guide for a much more detailed description of the
+ * mPIPE's capabilities.
+ *
+ * When a packet arrives at one of the mPIPE's Ethernet MACs, it is
+ * assigned a channel number indicating which MAC received it.  It
+ * then proceeds through the following hardware pipeline:
+ *
+ * @subsection mpipe__classification Classification
+ *
+ * A set of classification processors run header parsing code on each
+ * incoming packet, extracting information including the destination
+ * MAC address, VLAN, Ethernet type, and five-tuple hash.  Some of
+ * this information is then used to choose which buffer stack will be
+ * used to hold the packet, and which bucket will be used by the load
+ * balancer to determine which application will receive the packet.
+ *
+ * The rules by which the buffer stack and bucket are chosen can be
+ * configured via the @ref gxio_mpipe_classifier API.  A given app can
+ * specify multiple rules, each one specifying a bucket range, and a
+ * set of buffer stacks, to be used for packets matching the rule.
+ * Each rule can optionally specify a restricted set of channels,
+ * VLANs, and/or dMACs, in which it is interested.  By default, a
+ * given rule starts out matching all channels associated with the
+ * mPIPE context's set of open links; all VLANs; and all dMACs.
+ * Subsequent restrictions can then be added.
+ *
+ * @subsection mpipe__load_balancing Load Balancing
+ *
+ * The mPIPE load balancer is responsible for choosing the NotifRing
+ * to which the packet will be delivered.  This decision is based on
+ * the bucket number indicated by the classification program.  In
+ * general, the bucket number is based on some number of low bits of
+ * the packet's flow hash (applications that aren't interested in flow
+ * hashing use a single bucket).  Each load balancer bucket keeps a
+ * record of the NotifRing to which packets directed to that bucket
+ * are currently being delivered.  Based on the bucket's load
+ * balancing mode (@ref gxio_mpipe_bucket_mode_t), the load balancer
+ * either forwards the packet to the previously assigned NotifRing or
+ * decides to choose a new NotifRing.  If a new NotifRing is required,
+ * the load balancer chooses the least loaded ring in the NotifGroup
+ * associated with the bucket.
+ *
+ * The load balancer is a shared resource.  Each application needs to
+ * explicitly allocate NotifRings, NotifGroups, and buckets, using
+ * gxio_mpipe_alloc_notif_rings(), gxio_mpipe_alloc_notif_groups(),
+ * and gxio_mpipe_alloc_buckets().  Then the application needs to
+ * configure them using gxio_mpipe_init_notif_ring() and
+ * gxio_mpipe_init_notif_group_and_buckets().
+ *
+ * @subsection mpipe__buffers Buffer Selection and Packet Delivery
+ *
+ * Once the load balancer has chosen the destination NotifRing, the
+ * mPIPE DMA engine pops at least one buffer off of the 'buffer stack'
+ * chosen by the classification program and DMAs the packet data into
+ * that buffer.  Each buffer stack provides a hardware-accelerated
+ * stack of data buffers with the same size.  If the packet data is
+ * larger than the buffers provided by the chosen buffer stack, the
+ * mPIPE hardware pops off multiple buffers and chains the packet data
+ * through a multi-buffer linked list.  Once the packet data is
+ * delivered to the buffer(s), the mPIPE hardware writes the
+ * ::gxio_mpipe_idesc_t metadata object (calculated by the classifier)
+ * into the NotifRing and increments the number of packets delivered
+ * to that ring.
+ *
+ * Applications can push buffers onto a buffer stack by calling
+ * gxio_mpipe_push_buffer() or by egressing a packet with the
+ * ::gxio_mpipe_edesc_t::hwb bit set, indicating that the egressed
+ * buffers should be returned to the stack.
+ *
+ * Applications can allocate and initialize buffer stacks with the
+ * gxio_mpipe_alloc_buffer_stacks() and gxio_mpipe_init_buffer_stack()
+ * APIs.
+ *
+ * The application must also register the memory pages that will hold
+ * packets.  This requires calling gxio_mpipe_register_page() for each
+ * memory page that will hold packets allocated by the application for
+ * a given buffer stack.  Since each buffer stack is limited to 16
+ * registered pages, it may be necessary to use huge pages, or even
+ * extremely huge pages, to hold all the buffers.
+ *
+ * @subsection mpipe__iqueue NotifRings
+ *
+ * Each NotifRing is a region of shared memory, allocated by the
+ * application, to which the mPIPE delivers packet descriptors
+ * (::gxio_mpipe_idesc_t).  The application can allocate them via
+ * gxio_mpipe_alloc_notif_rings().  The application can then either
+ * explicitly initialize them with gxio_mpipe_init_notif_ring() and
+ * then read from them manually, or can make use of the convenience
+ * wrappers provided by @ref gxio_mpipe_wrappers.
+ *
+ * @section mpipe__egress mPIPE Egress Hardware
+ *
+ * Applications use eDMA rings to queue packets for egress.  The
+ * application can allocate them via gxio_mpipe_alloc_edma_rings().
+ * The application can then either explicitly initialize them with
+ * gxio_mpipe_init_edma_ring() and then write to them manually, or
+ * can make use of the convenience wrappers provided by
+ * @ref gxio_mpipe_wrappers.
+ *
+ * @section gxio__shortcomings Plans for Future API Revisions
+ *
+ * The API defined here is only an initial version of the mPIPE API.
+ * Future plans include:
+ *
+ * - Higher level wrapper functions to provide common initialization
+ * patterns.  This should help users start writing mPIPE programs
+ * without having to learn the details of the hardware.
+ *
+ * - Support for reset and deallocation of resources, including
+ * cleanup upon application shutdown.
+ *
+ * - Support for calling these APIs in the BME.
+ *
+ * - Support for IO interrupts.
+ *
+ * - Clearer definitions of thread safety guarantees.
+ *
+ * @section gxio__mpipe_examples Examples
+ *
+ * See the following mPIPE example programs for more information about
+ * allocating mPIPE resources and using them in real applications:
+ *
+ * - @ref mpipe/ingress/app.c : Receiving packets.
+ *
+ * - @ref mpipe/forward/app.c : Forwarding packets.
+ *
+ * Note that there are several more examples.
+ */
+
+/* Flag bits for the "flags" argument of the gxio_mpipe_alloc_*() calls. */
+enum gxio_mpipe_alloc_flags_e {
+	/* Start the allocation at the caller's "first" index (else ignored). */
+	GXIO_MPIPE_ALLOC_FIXED = HV_MPIPE_ALLOC_FIXED,
+};
+
+/* Flag bits for "mem_flags" arguments (e.g. gxio_mpipe_init_buffer_stack()). */
+enum gxio_mpipe_mem_flags_e {
+	/* Hint: do not fill L3 when writing; invalidate lines upon egress. */
+	GXIO_MPIPE_MEM_FLAG_NT_HINT = IORPC_MEM_BUFFER_FLAG_NT_HINT,
+
+	/* Restrict L3 cache fills to the IO cache ways. */
+	GXIO_MPIPE_MEM_FLAG_IO_PIN = IORPC_MEM_BUFFER_FLAG_IO_PIN,
+};
+
+/* Ingress packet descriptor (an alias of the hardware MPIPE_PDESC_t
+ * layout).  The mPIPE hardware generates one per arriving packet and
+ * writes it into a NotifRing.
+ */
+typedef MPIPE_PDESC_t gxio_mpipe_idesc_t;
+
+/* Egress packet descriptor (an alias of MPIPE_EDMA_DESC_t).
+ * Applications write these into eDMA rings, and the hardware carries
+ * out the egress command each one describes.
+ */
+typedef MPIPE_EDMA_DESC_t gxio_mpipe_edesc_t;
+
+/* Return the "va" field of an "idesc" as a byte pointer.
+ *
+ * This is the address at which the ingress hardware stored the first
+ * byte of the packet.  When the classifier detected a custom header,
+ * it is the address of that header, and the real L2 header is at
+ * gxio_mpipe_idesc_get_l2_start().
+ *
+ * Beware that the value may be misleading when "idesc->be" is set.
+ *
+ * @param idesc An ingress packet descriptor.
+ * @return The packet's starting virtual address.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_va(gxio_mpipe_idesc_t *idesc)
+{
+	long addr = idesc->va;
+	return (unsigned char *)addr;
+}
+
+/* Return the "xfer_size" of an "idesc", read from its "l2_size" field.
+ *
+ * This is the number of packet bytes the hardware actually transferred
+ * into memory; it may be misleading when "idesc->be" is set.
+ *
+ * @param idesc An ingress packet descriptor.
+ * @return The transfer size, in bytes.
+ *
+ * ISSUE: Is this the best name for this?
+ * FIXME: Add more docs about chaining, clipping, etc.
+ */
+static inline unsigned int gxio_mpipe_idesc_get_xfer_size(gxio_mpipe_idesc_t
+							  *idesc)
+{
+	unsigned int xfer_size = idesc->l2_size;
+	return xfer_size;
+}
+
+/* Return the "l2_offset" of an "idesc".
+ *
+ * This is the distance in bytes from the "va" to the L2 header, read
+ * from bits 39:32 of the "custom1" word.  Extremely customized
+ * classifiers might not support this function.
+ *
+ * The L2 header is made up of a destination mac address, a source
+ * mac address, and an initial ethertype.  Some initial ethertypes
+ * encode extra information in the L2 header, often including a vlan
+ * and/or a new ethertype.
+ *
+ * The "l2_offset" is non-zero if (and only if) the classifier
+ * processed a custom header for the packet.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint8_t gxio_mpipe_idesc_get_l2_offset(gxio_mpipe_idesc_t *idesc)
+{
+	return (uint8_t)(idesc->custom1 >> 32);
+}
+
+/* Return the "l2_start" of an "idesc": a pointer to the L2 header.
+ *
+ * This is gxio_mpipe_idesc_get_va() advanced by
+ * gxio_mpipe_idesc_get_l2_offset() bytes.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_l2_start(gxio_mpipe_idesc_t
+							   *idesc)
+{
+	return gxio_mpipe_idesc_get_va(idesc) +
+		gxio_mpipe_idesc_get_l2_offset(idesc);
+}
+
+/* Return the "l2_length" of an "idesc".
+ *
+ * This is gxio_mpipe_idesc_get_xfer_size() with the
+ * gxio_mpipe_idesc_get_l2_offset() bytes before the L2 header removed.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned int gxio_mpipe_idesc_get_l2_length(gxio_mpipe_idesc_t
+							  *idesc)
+{
+	return gxio_mpipe_idesc_get_xfer_size(idesc) -
+		gxio_mpipe_idesc_get_l2_offset(idesc);
+}
+
+/* Per-context state for mPIPE resources; set up by gxio_mpipe_init(). */
+typedef struct {
+
+	/* File descriptor used to call up to Linux (and thus the HV). */
+	int fd;
+
+	/* VA at which the mPIPE configuration registers are mapped. */
+	char *mmio_cfg_base;
+
+	/* VA of the IDMA/EDMA/buffer-manager mapping, which starts at IDMA. */
+	char *mmio_fast_base;
+
+	/* Record of the buffer stacks that have been "initialized". */
+	gxio_mpipe_rules_stacks_t __stacks;
+
+} gxio_mpipe_context_t;
+
+/* Alias used only internally; it is most easily made visible here. */
+typedef gxio_mpipe_context_t gxio_mpipe_info_context_t;
+
+/* Initialize an mPIPE context.
+ *
+ * This function allocates an mPIPE "service domain" and maps the MMIO
+ * registers into the caller's VA space.
+ *
+ * @param context Context object to be initialized.
+ * @param mpipe_instance Instance number of mPIPE shim to be controlled via
+ *  context.
+ */
+extern int gxio_mpipe_init(gxio_mpipe_context_t *context,
+			   unsigned int mpipe_instance);
+
+/*****************************************************************
+ *                         Buffer Stacks                          *
+ ******************************************************************/
+
+/* Allocate a set of buffer stacks.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of stacks required.
+ * @param first Index of first stack if ::GXIO_MPIPE_ALLOC_FIXED flag is set,
+ *   otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer stack, or
+ * ::GXIO_MPIPE_ERR_NO_BUFFER_STACK if allocation failed.
+ */
+extern int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t *context,
+					  unsigned int count,
+					  unsigned int first,
+					  unsigned int flags);
+
+/* Codes for the eight buffer sizes mPIPE supports (hardware SIZE values). */
+typedef enum {
+	/* 128-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_128 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_128,
+	/* 256-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_256 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_256,
+	/* 512-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_512 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_512,
+	/* 1024-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_1024 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1024,
+	/* 1664-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_1664 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1664,
+	/* 4096-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_4096 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_4096,
+	/* 10368-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_10368 =
+		MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_10368,
+	/* 16384-byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_16384 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_16384
+} gxio_mpipe_buffer_size_enum_t;
+
+/* Convert a buffer size in bytes into a buffer size enum. */
+extern gxio_mpipe_buffer_size_enum_t
+gxio_mpipe_buffer_size_to_buffer_size_enum(size_t size);
+
+/* Convert a buffer size enum into a buffer size in bytes. */
+extern size_t
+gxio_mpipe_buffer_size_enum_to_buffer_size(gxio_mpipe_buffer_size_enum_t
+					   buffer_size_enum);
+
+/* Calculate the number of bytes required to store a given number of
+ * buffers in the memory registered with a buffer stack via
+ * gxio_mpipe_init_buffer_stack().
+ */
+extern size_t gxio_mpipe_calc_buffer_stack_bytes(unsigned long buffers);
+
+/* Initialize a buffer stack.  This function binds a region of memory
+ * to be used by the hardware for storing buffer addresses pushed via
+ * gxio_mpipe_push_buffer() or as the result of sending a buffer out
+ * the egress with the 'push to stack when done' bit set.  Once this
+ * function returns, the memory region's contents may be arbitrarily
+ * modified by the hardware at any time and software should not access
+ * the memory region again.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @param buffer_size_enum The size of each buffer in the buffer stack,
+ * as an enum.
+ * @param mem The address of the buffer stack.  This memory must be
+ * physically contiguous and aligned to a 64kB boundary.
+ * @param mem_size The size of the buffer stack, in bytes.
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ * @return Zero on success, ::GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE if
+ * buffer_size_enum is invalid, ::GXIO_MPIPE_ERR_BAD_BUFFER_STACK if
+ * stack has not been allocated.
+ */
+extern int gxio_mpipe_init_buffer_stack(gxio_mpipe_context_t *context,
+					unsigned int stack,
+					gxio_mpipe_buffer_size_enum_t
+					buffer_size_enum, void *mem,
+					size_t mem_size,
+					unsigned int mem_flags);
+
+/* Push a buffer onto a previously initialized buffer stack.
+ *
+ * The size of the buffer being pushed must match the size that was
+ * registered with gxio_mpipe_init_buffer_stack().  All packet buffer
+ * addresses are 128-byte aligned; the low 7 bits of the specified
+ * buffer address will be ignored.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @param buffer The buffer (the low seven bits are ignored).
+ */
+static inline void gxio_mpipe_push_buffer(gxio_mpipe_context_t *context,
+					  unsigned int stack, void *buffer)
+{
+	MPIPE_BSM_REGION_VAL_t entry = { {0} };
+	MPIPE_BSM_REGION_ADDR_t addr = { {0} };
+
+	/* Encode the buffer address; the shift discards its low bits. */
+#if __SIZEOF_POINTER__ == 4
+	entry.va = ((unsigned long)buffer) >> MPIPE_BSM_REGION_VAL__VA_SHIFT;
+#else
+	entry.va = ((long)buffer) >> MPIPE_BSM_REGION_VAL__VA_SHIFT;
+#endif
+
+	/*
+	 * Select the BSM region for this stack.  mmio_fast_base begins at
+	 * the IDMA region, so the region index is taken relative to it.
+	 */
+	addr.stack = stack;
+	addr.region = MPIPE_MMIO_ADDR__REGION_VAL_BSM -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+
+	__gxio_mmio_write(context->mmio_fast_base + addr.word, entry.word);
+}
+
+/* Pop a buffer off of a previously initialized buffer stack.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @return The buffer, or NULL if the stack is empty.
+ */
+static inline void *gxio_mpipe_pop_buffer(gxio_mpipe_context_t *context,
+					  unsigned int stack)
+{
+	MPIPE_BSM_REGION_ADDR_t addr = { {0} };
+	MPIPE_BSM_REGION_VAL_t entry;
+
+	/*
+	 * Select the BSM region for this stack.  mmio_fast_base begins at
+	 * the IDMA region, so the region index is taken relative to it.
+	 */
+	addr.stack = stack;
+	addr.region =
+		MPIPE_MMIO_ADDR__REGION_VAL_BSM -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+
+	/*
+	 * Each read reports one of three states:
+	 *   c == ..._UNCHAINED: va is non-zero.
+	 *   c == ..._INVALID:   va is zero.
+	 *   c == ..._NOT_RDY:   va is zero.
+	 *
+	 * NOT_RDY means the prefetch buffer is still refilling, so keep
+	 * re-reading until one of the other two states is seen.
+	 */
+	do {
+		entry.word =
+			__gxio_mmio_read(context->mmio_fast_base +
+					 addr.word);
+	} while (entry.c == MPIPE_EDMA_DESC_WORD1__C_VAL_NOT_RDY);
+
+	/* Rebuild the pointer; a zero va yields NULL. */
+	return (void *)((unsigned long)entry.va <<
+			MPIPE_BSM_REGION_VAL__VA_SHIFT);
+}
+
+/*****************************************************************
+ *                          NotifRings                            *
+ ******************************************************************/
+
+/* Allocate a set of NotifRings.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * Note that NotifRings are allocated in chunks, so allocating one at
+ * a time is much less efficient than allocating several at once.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of NotifRings required.
+ * @param first Index of first NotifRing if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated NotifRing, or
+ * ::GXIO_MPIPE_ERR_NO_NOTIF_RING if allocation failed.
+ */
+extern int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t *context,
+					unsigned int count, unsigned int first,
+					unsigned int flags);
+
+/* Initialize a NotifRing, using the given memory and size.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_idesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 128, 512,
+ * 2048, or 65536 * sizeof(gxio_mpipe_idesc_t).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_NOTIF_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int gxio_mpipe_init_notif_ring(gxio_mpipe_context_t *context,
+				      unsigned int ring,
+				      void *mem, size_t mem_size,
+				      unsigned int mem_flags);
+
+/* Configure an interrupt to be sent to a tile on incoming NotifRing
+ *  traffic.  Once an interrupt is sent for a particular ring, no more
+ *  will be sent until gxio_mpipe_enable_notif_ring_interrupt() is called.
+ *
+ * @param context An initialized mPIPE context.
+ * @param x X coordinate of interrupt target tile.
+ * @param y Y coordinate of interrupt target tile.
+ * @param i Index of the IPI register which will receive the interrupt.
+ * @param e Specific event which will be set in the target IPI register when
+ * the interrupt occurs.
+ * @param ring The NotifRing index.
+ * @return Zero on success, GXIO_ERR_INVAL if params are out of range.
+ */
+extern int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t
+						   *context, int x, int y,
+						   int i, int e,
+						   unsigned int ring);
+
+/* Enable an interrupt on incoming NotifRing traffic.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index.
+ * @return Zero on success, GXIO_ERR_INVAL if params are out of range.
+ */
+extern int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t
+						  *context, unsigned int ring);
+
+/* Map all of a client's memory via the given IOTLB.
+ * @param context An initialized mPIPE context.
+ * @param iotlb IOTLB index.
+ * @param pte Page table entry.
+ * @param flags Flags.
+ * @return Zero on success, or a negative error code.
+ */
+extern int gxio_mpipe_register_client_memory(gxio_mpipe_context_t *context,
+					     unsigned int iotlb, HV_PTE pte,
+					     unsigned int flags);
+
+/*****************************************************************
+ *                        Notif Groups                            *
+ ******************************************************************/
+
+/* Allocate a set of NotifGroups.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of NotifGroups required.
+ * @param first Index of first NotifGroup if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated NotifGroup, or
+ * ::GXIO_MPIPE_ERR_NO_NOTIF_GROUP if allocation failed.
+ */
+extern int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t *context,
+					 unsigned int count,
+					 unsigned int first,
+					 unsigned int flags);
+
+/* Add a NotifRing to a NotifGroup.  Only the 'ring' bit of the caller's
+ * 'bits' object is set; the hardware NotifGroup can then be initialized by
+ * gxio_mpipe_init_notif_group() or gxio_mpipe_init_notif_group_and_buckets().
+ */
+static inline void
+gxio_mpipe_notif_group_add_ring(gxio_mpipe_notif_group_bits_t *bits, int ring)
+{
+	int word = ring / 64;
+	bits->ring_mask[word] |= (1ull << (ring % 64));
+}
+
+/* Set a particular NotifGroup bitmask.  Since the load balancer
+ * makes decisions based on both bucket and NotifGroup state, most
+ * applications should use gxio_mpipe_init_notif_group_and_buckets()
+ * rather than using this function to configure just a NotifGroup.
+ */
+extern int gxio_mpipe_init_notif_group(gxio_mpipe_context_t *context,
+				       unsigned int group,
+				       gxio_mpipe_notif_group_bits_t bits);
+
+/*****************************************************************
+ *                         Load Balancer                          *
+ ******************************************************************/
+
+/* Allocate a set of load balancer buckets.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * Note that buckets are allocated in chunks, so allocating one at
+ * a time is much less efficient than allocating several at once.
+ *
+ * Note that the buckets are actually divided into two sub-ranges, of
+ * different sizes, and different chunk sizes, and the range you get
+ * by default is determined by the size of the request.  Allocations
+ * cannot span the two sub-ranges.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of buckets required.
+ * @param first Index of first bucket if ::GXIO_MPIPE_ALLOC_FIXED flag is set,
+ *   otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated bucket, or
+ * ::GXIO_MPIPE_ERR_NO_BUCKET if allocation failed.
+ */
+extern int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t *context,
+				    unsigned int count, unsigned int first,
+				    unsigned int flags);
+
+/* The legal modes for gxio_mpipe_bucket_info_t and
+ * gxio_mpipe_init_notif_group_and_buckets().
+ *
+ * All modes except ::GXIO_MPIPE_BUCKET_ROUND_ROBIN expect that the user
+ * will allocate a power-of-two number of buckets and initialize them
+ * to the same mode.  The classifier program then uses the appropriate
+ * number of low bits from the incoming packet's flow hash to choose a
+ * load balancer bucket.  Based on that bucket's load balancing mode,
+ * reference count, and currently active NotifRing, the load balancer
+ * chooses the NotifRing to which the packet will be delivered.
+ */
+typedef enum {
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, in which case packets will be dropped.  If
+	 * the bucket reference count ever reaches zero, a new NotifRing may
+	 * be chosen.
+	 */
+	GXIO_MPIPE_BUCKET_DYNAMIC_FLOW_AFFINITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_DFA,
+
+	/* All packets for a bucket always go to the same NotifRing.
+	 */
+	GXIO_MPIPE_BUCKET_STATIC_FLOW_AFFINITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_FIXED,
+
+	/* All packets for a bucket go to the least full NotifRing in the
+	 * group, providing load balancing round robin behavior.
+	 */
+	GXIO_MPIPE_BUCKET_ROUND_ROBIN =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_ALWAYS_PICK,
+
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, at which point the bucket starts using the
+	 * least full NotifRing in the group.  If all NotifRings in the
+	 * group are full, packets will be dropped.
+	 */
+	GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY,
+
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, or a random timer fires, at which point the
+	 * bucket starts using the least full NotifRing in the group.  If
+	 * all NotifRings in the group are full, packets will be dropped.
+	 * WARNING: This mode is BROKEN on chips with fewer than 64 tiles.
+	 */
+	GXIO_MPIPE_BUCKET_PREFER_FLOW_LOCALITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY_RAND
+
+} gxio_mpipe_bucket_mode_t;
+
+/* Copy a set of bucket initialization values into the mPIPE
+ * hardware.  Since the load balancer makes decisions based on both
+ * bucket and NotifGroup state, most applications should use
+ * gxio_mpipe_init_notif_group_and_buckets() rather than using this
+ * function to configure a single bucket.
+ *
+ * @param context An initialized mPIPE context.
+ * @param bucket Bucket index to be initialized.
+ * @param bucket_info Initial reference count, NotifRing index, and mode.
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_BUCKET on failure.
+ */
+extern int gxio_mpipe_init_bucket(gxio_mpipe_context_t *context,
+				  unsigned int bucket,
+				  gxio_mpipe_bucket_info_t bucket_info);
+
+/* Initializes a group and range of buckets and range of rings such
+ * that the load balancer runs a particular load balancing function.
+ *
+ * First, the group is initialized with the given rings.
+ *
+ * Second, each bucket is initialized with the mode and group, and a
+ * ring chosen round-robin from the given rings.
+ *
+ * Normally, the classifier picks a bucket, and then the load balancer
+ * picks a ring, based on the bucket's mode, group, and current ring,
+ * possibly updating the bucket's ring.
+ *
+ * @param context An initialized mPIPE context.
+ * @param group The group.
+ * @param ring The first ring.
+ * @param num_rings The number of rings.
+ * @param bucket The first bucket.
+ * @param num_buckets The number of buckets.
+ * @param mode The load balancing mode.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_BUCKET,
+ * ::GXIO_MPIPE_ERR_BAD_NOTIF_GROUP, or
+ * ::GXIO_MPIPE_ERR_BAD_NOTIF_RING on failure.
+ */
+extern int gxio_mpipe_init_notif_group_and_buckets(gxio_mpipe_context_t
+						   *context,
+						   unsigned int group,
+						   unsigned int ring,
+						   unsigned int num_rings,
+						   unsigned int bucket,
+						   unsigned int num_buckets,
+						   gxio_mpipe_bucket_mode_t
+						   mode);
+
+/* Return credits to a NotifRing and/or bucket.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index, or -1.
+ * @param bucket The bucket, or -1.
+ * @param count The number of credits to return.
+ */
+static inline void gxio_mpipe_credit(gxio_mpipe_context_t *context,
+				     int ring, int bucket, unsigned int count)
+{
+	/* NOTE: Fancy struct initialization would break "C89" header test. */
+
+	MPIPE_IDMA_RELEASE_REGION_VAL_t credits = { {0} };
+	MPIPE_IDMA_RELEASE_REGION_ADDR_t addr = { {0} };
+
+	/* A negative index (i.e. -1) leaves that target's enable bit clear. */
+	addr.ring_enable = (ring >= 0);
+	addr.ring = ring;
+	addr.bucket_enable = (bucket >= 0);
+	addr.bucket = bucket;
+	credits.count = count;
+
+	/*
+	 * mmio_fast_base starts at the IDMA region, so subtract that base.
+	 */
+	addr.region = MPIPE_MMIO_ADDR__REGION_VAL_IDMA -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+
+	__gxio_mmio_write(context->mmio_fast_base + addr.word, credits.word);
+}
+
+/*****************************************************************
+ *                         Egress Rings                           *
+ ******************************************************************/
+
+/* Allocate a set of eDMA rings.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of eDMA rings required.
+ * @param first Index of first eDMA ring if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated eDMA ring, or
+ * ::GXIO_MPIPE_ERR_NO_EDMA_RING if allocation failed.
+ */
+extern int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t *context,
+				       unsigned int count, unsigned int first,
+				       unsigned int flags);
+
+/* Initialize an eDMA ring, using the given memory and size.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The eDMA ring index.
+ * @param channel The channel to use.  This must be one of the channels
+ * associated with the context's set of open links.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_edesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 512, 2048,
+ * 8192 or 65536, times 16 (i.e. sizeof(gxio_mpipe_edesc_t)).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_EDMA_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int gxio_mpipe_init_edma_ring(gxio_mpipe_context_t *context,
+				     unsigned int ring, unsigned int channel,
+				     void *mem, size_t mem_size,
+				     unsigned int mem_flags);
+
+/*****************************************************************
+ *                      Classifier Program                        *
+ ******************************************************************/
+
+/*
+ *
+ * Functions for loading or configuring the mPIPE classifier program.
+ *
+ * The mPIPE classification processors all run a special "classifier"
+ * program which, for each incoming packet, parses the packet headers,
+ * encodes some packet metadata in the "idesc", and either drops the
+ * packet, or picks a notif ring to handle the packet, and a buffer
+ * stack to contain the packet, usually based on the channel, VLAN,
+ * dMAC, flow hash, and packet size, under the guidance of the "rules"
+ * API described below.
+ *
+ * @section gxio_mpipe_classifier_default Default Classifier
+ *
+ * The MDE provides a simple "default" classifier program.  It is
+ * shipped as source in "$TILERA_ROOT/src/sys/mpipe/classifier.c",
+ * which serves as its official documentation.  It is shipped as a
+ * binary program in "$TILERA_ROOT/tile/boot/classifier", which is
+ * automatically included in bootroms created by "tile-monitor", and
+ * is automatically loaded by the hypervisor at boot time.
+ *
+ * The L2 analysis handles LLC packets, SNAP packets, and "VLAN
+ * wrappers" (keeping the outer VLAN).
+ *
+ * The L3 analysis handles IPv4 and IPv6, dropping packets with bad
+ * IPv4 header checksums, requesting computation of a TCP/UDP checksum
+ * if appropriate, and hashing the dest and src IP addresses, plus the
+ * ports for TCP/UDP packets, into the flow hash.  No special analysis
+ * is done for "fragmented" packets or "tunneling" protocols.  Thus,
+ * the first fragment of a fragmented TCP/UDP packet is hashed using
+ * src/dest IP address and ports and all subsequent fragments are only
+ * hashed according to src/dest IP address.
+ *
+ * The L3 analysis handles other packets too, hashing the dMAC and
+ * sMAC into a flow hash.
+ *
+ * The channel, VLAN, and dMAC are used to pick a "rule" (see the
+ * "rules" APIs below), which in turn is used to pick a buffer stack
+ * (based on the packet size) and a bucket (based on the flow hash).
+ *
+ * To receive traffic matching a particular channel/VLAN/dMAC
+ * pattern, an application should allocate its own buffer stacks and
+ * load balancer buckets, and map traffic to those stacks and buckets,
+ * as described by the "rules" API below.
+ *
+ * Various packet metadata is encoded in the idesc.  The flow hash is
+ * four bytes at 0x0C.  The VLAN is two bytes at 0x10.  The ethtype is
+ * two bytes at 0x12.  The l3 start is one byte at 0x14.  The l4 start
+ * is one byte at 0x15 for IPv4 and IPv6 packets, and otherwise zero.
+ * The protocol is one byte at 0x16 for IPv4 and IPv6 packets, and
+ * otherwise zero.
+ *
+ * @section gxio_mpipe_classifier_custom Custom Classifiers.
+ *
+ * A custom classifier may be created using "tile-mpipe-cc" with a
+ * customized version of the default classifier sources.
+ *
+ * The custom classifier may be included in bootroms using the
+ * "--classifier" option to "tile-monitor", or loaded dynamically
+ * using gxio_mpipe_classifier_load_from_file().
+ *
+ * Be aware that "extreme" customizations may break the assumptions of
+ * the "rules" APIs described below, but simple customizations, such
+ * as adding new packet metadata, should be fine.
+ */
+
+/* A set of classifier rules, plus the mPIPE context they belong to. */
+typedef struct {
+
+	/* The mPIPE context (see gxio_mpipe_rules_init()). */
+	gxio_mpipe_context_t *context;
+
+	/* The actual rules list. */
+	gxio_mpipe_rules_list_t list;
+
+} gxio_mpipe_rules_t;
+
+/* Initialize a classifier program rules list.
+ *
+ * This function can be called on a previously initialized rules list
+ * to discard any previously added rules.
+ *
+ * @param rules Rules list to initialize.
+ * @param context An initialized mPIPE context.
+ */
+extern void gxio_mpipe_rules_init(gxio_mpipe_rules_t *rules,
+				  gxio_mpipe_context_t *context);
+
+/* Begin a new rule on the indicated rules list.
+ *
+ * Note that an empty rule matches all packets, but an empty rule list
+ * matches no packets.
+ *
+ * @param rules Rules list to which new rule is appended.
+ * @param bucket First load balancer bucket to which packets will be
+ * delivered.
+ * @param num_buckets Number of buckets (must be a power of two) across
+ * which packets will be distributed based on the "flow hash".
+ * @param stacks Either NULL, to assign each packet to the smallest
+ * initialized buffer stack which does not induce chaining (and to
+ * drop packets which exceed the largest initialized buffer stack
+ * buffer size), or an array, with each entry indicating which buffer
+ * stack should be used for packets up to that size (with 255
+ * indicating that those packets should be dropped).
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_begin(gxio_mpipe_rules_t *rules,
+				  unsigned int bucket,
+				  unsigned int num_buckets,
+				  gxio_mpipe_rules_stacks_t *stacks);
+
+/* Set the headroom of the current rule.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param headroom The headroom.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_set_headroom(gxio_mpipe_rules_t *rules,
+					 uint8_t headroom);
+
+/* Indicate that packets from a particular channel can be delivered
+ * to the buckets and buffer stacks associated with the current rule.
+ *
+ * Channels added must be associated with links opened by the mPIPE context
+ * used in gxio_mpipe_rules_init().  A rule with no channels is equivalent
+ * to a rule naming all such associated channels.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param channel The channel to add.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_add_channel(gxio_mpipe_rules_t *rules,
+					unsigned int channel);
+
+/* Commit rules.
+ *
+ * The rules are sent to the hypervisor, where they are combined with
+ * the rules from other apps, and used to program the hardware classifier.
+ *
+ * Note that if this function returns an error, then the rules will NOT
+ * have been committed, even if the error is due to interactions with
+ * rules from another app.
+ *
+ * @param rules Rules list to commit.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_commit(gxio_mpipe_rules_t *rules);
+
+/*****************************************************************
+ *                     Ingress Queue Wrapper                      *
+ ******************************************************************/
+
+/*
+ *
+ * Convenience functions for receiving packets from a NotifRing and
+ * sending packets via an eDMA ring.
+ *
+ * The mpipe ingress and egress hardware uses shared memory packet
+ * descriptors to describe packets that have arrived on ingress or
+ * are destined for egress.  These descriptors are stored in shared
+ * memory ring buffers and written or read by hardware as necessary.
+ * The gxio library provides wrapper functions that manage the head and
+ * tail pointers for these rings, allowing the user to easily read or
+ * write packet descriptors.
+ *
+ * The initialization interface for ingress and egress rings is quite
+ * similar.  For example, to create an ingress queue, the user passes
+ * a ::gxio_mpipe_iqueue_t state object, a ring number from
+ * gxio_mpipe_alloc_notif_rings(), and the address of memory to hold a
+ * ring buffer to the gxio_mpipe_iqueue_init() function.  The function
+ * returns success when the state object has been initialized and the
+ * hardware configured to deliver packets to the specified ring
+ * buffer.  Similarly, gxio_mpipe_equeue_init() takes a
+ * ::gxio_mpipe_equeue_t state object, a ring number from
+ * gxio_mpipe_alloc_edma_rings(), and a shared memory buffer.
+ *
+ * @section gxio_mpipe_iqueue Working with Ingress Queues
+ *
+ * Once initialized, the gxio_mpipe_iqueue_t API provides two flows
+ * for getting the ::gxio_mpipe_idesc_t packet descriptor associated
+ * with incoming packets.  The simplest is to call
+ * gxio_mpipe_iqueue_get() or gxio_mpipe_iqueue_try_get().  These
+ * functions copy the oldest packet descriptor out of the NotifRing and
+ * into a descriptor provided by the caller.  They also immediately
+ * inform the hardware that a descriptor has been processed.
+ *
+ * For applications with stringent performance requirements, higher
+ * efficiency can be achieved by avoiding the packet descriptor copy
+ * and processing multiple descriptors at once.  The
+ * gxio_mpipe_iqueue_peek() and gxio_mpipe_iqueue_try_peek() functions
+ * allow such optimizations.  These functions provide a pointer to the
+ * next valid ingress descriptor in the NotifRing's shared memory ring
+ * buffer, and a count of how many contiguous descriptors are ready to
+ * be processed.  The application can then process any number of those
+ * descriptors in place, calling gxio_mpipe_iqueue_consume() to inform
+ * the hardware after each one has been processed.
+ *
+ * @section gxio_mpipe_equeue Working with Egress Queues
+ *
+ * Similarly, the egress queue API provides a high-performance
+ * interface plus a simple wrapper for use in posting
+ * ::gxio_mpipe_edesc_t egress packet descriptors.  The simple
+ * version, gxio_mpipe_equeue_put(), allows the programmer to wait for
+ * an eDMA ring slot to become available and write a single descriptor
+ * into the ring.
+ *
+ * Alternatively, the gxio_mpipe_equeue_reserve() and
+ * gxio_mpipe_equeue_put_at() APIs can be used to reserve multiple
+ * eDMA ring slots and then fill each slot with a
+ * ::gxio_mpipe_edesc_t.  This capability can be used to reduce
+ * per-operation overhead by posting multiple packets with a single
+ * gxio_mpipe_equeue_reserve() call.  It also allows gather operations
+ * to be performed by posting multiple descriptors, one for each
+ * fragment in the final egress packet.
+ *
+ * The 'slot' number returned by gxio_mpipe_equeue_reserve() is really a
+ * 63-bit sequence number, the low bits of which indicate the ring
+ * buffer index and the high bits the number of times the application
+ * has gone around the egress ring buffer.  The extra bits allow an
+ * application to check for egress completion by calling
+ * gxio_mpipe_equeue_is_complete() to see whether a particular 'slot'
+ * number has finished.  Given the maximum packet rates of the Gx
+ * processor, the 63-bit slot number will never wrap.
+ *
+ * In practice, most applications use the ::gxio_mpipe_edesc_t::hwb
+ * bit to indicate that the buffers containing egress packet data
+ * should be pushed onto a buffer stack when egress is complete.  In
+ * this case, the programmer generally does not need to know when an
+ * egress operation actually finishes, since there is no need to free
+ * a buffer post-egress.
+ *
+ * @section gxio_mpipe_equeue_ordered Ordered Packet Forwarding
+ *
+ * The gxio_mpipe_equeue_put_at() API can also be used to perform
+ * in-order forwarding.  mPIPE ingress packets can be marked with
+ * sequence numbers stored in ::gxio_mpipe_idesc_t.  If ingress is
+ * configured to provide sequence numbers, an application can use
+ * gxio_mpipe_equeue_put_at() to put packets into the eDMA ring slot
+ * indicated by their ingress sequence number, in effect forcing the
+ * system to perform ordered packet forwarding.  When using this
+ * mechanism, applications should take care to obey the following
+ * rules:
+ *
+ * - Never call gxio_mpipe_equeue_reserve() on an eDMA ring that is
+ * used for ordered forwarding; that function knows nothing about the
+ * descriptors that have been posted given a sequence number.
+ *
+ * - Make sure that the number of ingress buffers is less than the
+ * number of slots in the eDMA ring.  This guarantees that a burst of
+ * ingress packets cannot overflow the slots available in the eDMA
+ * ring buffer.
+ *
+ * - gxio_mpipe_equeue_put_at() must be called once for each ingress
+ * packet.  Skipping a packet will cause the hardware to stall waiting
+ * for the next in-order packet descriptor.
+ *
+ * - If the application chooses to drop a packet rather than forward
+ * it, it can set the ::gxio_mpipe_edesc_t::ns (no send) bit on the
+ * descriptor passed to gxio_mpipe_equeue_put_at() to indicate that no
+ * data should be sent.  If indicated, the buffer will still be pushed
+ * onto the buffer stack when the egress descriptor is processed.
+ */
+
+/* A convenient interface to a NotifRing, for use by a single thread.
+ */
+typedef struct {
+
+	/* The context. */
+	gxio_mpipe_context_t *context;
+
+	/* The NotifRing memory, as an array of ingress descriptors. */
+	gxio_mpipe_idesc_t *idescs;
+
+	/* The number of entries. */
+	unsigned long num_entries;
+
+	/* The number of entries minus one (used as an index mask). */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries. */
+	unsigned long log2_num_entries;
+
+	/* Index of the next entry to consume. */
+	unsigned int head;
+
+	/* The NotifRing id. */
+	unsigned int ring;
+
+#ifdef __BIG_ENDIAN__
+	/* The number of byteswapped entries. */
+	unsigned int swapped;
+#endif
+
+} gxio_mpipe_iqueue_t;
+
+/* Initialize an "iqueue".
+ *
+ * Takes the iqueue plus the same args as gxio_mpipe_init_notif_ring().
+ */
+extern int gxio_mpipe_iqueue_init(gxio_mpipe_iqueue_t *iqueue,
+				  gxio_mpipe_context_t *context,
+				  unsigned int ring,
+				  void *mem, size_t mem_size,
+				  unsigned int mem_flags);
+
+/* Advance over some old entries in an iqueue.
+ *
+ * Please see the documentation for gxio_mpipe_iqueue_consume().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param count The number of entries to advance over.
+ */
+static inline void gxio_mpipe_iqueue_advance(gxio_mpipe_iqueue_t *iqueue,
+					     int count)
+{
+	/* Wrap: low bits give the index, high bits are added back in. */
+	int next = iqueue->head + count;
+	unsigned long index = next & iqueue->mask_num_entries;
+
+	iqueue->head = index + (next >> iqueue->log2_num_entries);
+
+#ifdef __BIG_ENDIAN__
+	/* HACK: Keep the byteswapped-entry count in step. */
+	iqueue->swapped -= count;
+#endif
+}
+
+/* Return one credit to the ring and bucket of an old iqueue entry.
+ *
+ * Releasing the ring allows more packets to be delivered to the ring.
+ *
+ * Releasing the bucket allows flows using the bucket to be moved to a
+ * new ring when using GXIO_MPIPE_BUCKET_DYNAMIC_FLOW_AFFINITY.
+ *
+ * This function is shorthand for "gxio_mpipe_credit(iqueue->context,
+ * iqueue->ring, idesc->bucket_id, 1)", and it may be more convenient
+ * to make that underlying call, using those values, instead of
+ * tracking the entire "idesc".
+ *
+ * If packet processing is deferred, optimal performance requires that
+ * the releasing be deferred as well.
+ *
+ * Please see the documentation for gxio_mpipe_iqueue_consume().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc The descriptor which was processed.
+ */
+static inline void gxio_mpipe_iqueue_release(gxio_mpipe_iqueue_t *iqueue,
+					     gxio_mpipe_idesc_t *idesc)
+{
+	gxio_mpipe_credit(iqueue->context, iqueue->ring, idesc->bucket_id, 1);
+}
+
+/* Consume a packet from an "iqueue".
+ *
+ * After processing packets peeked at via gxio_mpipe_iqueue_peek()
+ * or gxio_mpipe_iqueue_try_peek(), you must call this function, or
+ * gxio_mpipe_iqueue_advance() plus gxio_mpipe_iqueue_release(), to
+ * advance over those entries, and release their rings and buckets.
+ *
+ * You may call this function as each packet is processed, or you can
+ * wait until several packets have been processed.
+ *
+ * Note that if you are using a single bucket, and you are handling
+ * batches of N packets, then you can replace several calls to this
+ * function with calls to "gxio_mpipe_iqueue_advance(iqueue, N)" and
+ * "gxio_mpipe_credit(iqueue->context, iqueue->ring, bucket, N)".
+ *
+ * Note that if your classifier sets "idesc->nr", then you should
+ * explicitly call "gxio_mpipe_iqueue_advance(iqueue, 1)" plus
+ * "gxio_mpipe_credit(iqueue->context, iqueue->ring, -1, 1)", to
+ * avoid incorrectly crediting the (unused) bucket.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc The descriptor which was processed.
+ */
+static inline void gxio_mpipe_iqueue_consume(gxio_mpipe_iqueue_t *iqueue,
+					     gxio_mpipe_idesc_t *idesc)
+{
+	gxio_mpipe_iqueue_advance(iqueue, 1);
+	gxio_mpipe_iqueue_release(iqueue, idesc);
+}
+
+/* Peek at the next packet(s) in an "iqueue", without waiting.
+ *
+ * If no packets are available, fills idesc_ref with NULL, and then
+ * returns ::GXIO_MPIPE_ERR_IQUEUE_EMPTY.  Otherwise, fills idesc_ref
+ * with the address of the next valid packet descriptor, and returns
+ * the maximum number of valid descriptors which can be processed.
+ * You may process fewer descriptors if desired.
+ *
+ * Call gxio_mpipe_iqueue_consume() on each packet once it has been
+ * processed (or dropped), to allow more packets to be delivered.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc_ref A pointer to a packet descriptor pointer.
+ * @return The (positive) number of packets which can be processed,
+ * or ::GXIO_MPIPE_ERR_IQUEUE_EMPTY if no packets are available.
+ */
+static inline int gxio_mpipe_iqueue_try_peek(gxio_mpipe_iqueue_t *iqueue,
+					     gxio_mpipe_idesc_t **idesc_ref)
+{
+	gxio_mpipe_idesc_t *next;
+
+	uint64_t head = iqueue->head;
+	uint64_t tail = __gxio_mmio_read(iqueue->idescs);
+
+	/* Entries up to tail, or up to the ring's end if tail < head. */
+	uint64_t avail =
+		(tail >= head) ? (tail - head) : (iqueue->num_entries - head);
+
+	if (avail == 0) {
+		*idesc_ref = NULL;
+		return GXIO_MPIPE_ERR_IQUEUE_EMPTY;
+	}
+
+	next = &iqueue->idescs[head];
+
+	/* ISSUE: Is this helpful? */
+	__insn_prefetch(next);
+
+#ifdef __BIG_ENDIAN__
+	/* HACK: Byte-swap not-yet-swapped entries directly in memory. */
+	{
+		int i, j;
+		for (i = iqueue->swapped; i < avail; i++) {
+			for (j = 0; j < 8; j++)
+				next[i].words[j] =
+					__builtin_bswap64(next[i].words[j]);
+		}
+		iqueue->swapped = avail;
+	}
+#endif
+
+	*idesc_ref = next;
+
+	return avail;
+}
+
+/* Drop a packet: push its buffer to its stack unless idesc->be is set.
+ *
+ * NOTE: The caller must still call gxio_mpipe_iqueue_consume() if idesc
+ * came from gxio_mpipe_iqueue_try_peek() or gxio_mpipe_iqueue_peek().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc A packet descriptor.
+ */
+static inline void gxio_mpipe_iqueue_drop(gxio_mpipe_iqueue_t *iqueue,
+					  gxio_mpipe_idesc_t *idesc)
+{
+	/* FIXME: Handle "chaining" properly. */
+	if (idesc->be)
+		return;
+
+	gxio_mpipe_push_buffer(iqueue->context, idesc->stack_idx,
+			       gxio_mpipe_idesc_get_va(idesc));
+}
+
+/*****************************************************************
+ *                      Egress Queue Wrapper                      *
+ ******************************************************************/
+
+/* A convenient, thread-safe interface to an eDMA ring. */
+typedef struct {
+
+	/* State object for tracking head and tail pointers. */
+	__gxio_dma_queue_t dma_queue;
+
+	/* The ring entries. */
+	gxio_mpipe_edesc_t *edescs;
+
+	/* The number of entries minus one (masks a slot to a ring index). */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries (shift for the "gen" bit). */
+	unsigned long log2_num_entries;
+
+} gxio_mpipe_equeue_t;
+
+/* Initialize an "equeue".
+ *
+ * Takes the equeue plus the same args as gxio_mpipe_init_edma_ring().
+ */
+extern int gxio_mpipe_equeue_init(gxio_mpipe_equeue_t *equeue,
+				  gxio_mpipe_context_t *context,
+				  unsigned int edma_ring_id,
+				  unsigned int channel,
+				  void *mem, unsigned int mem_size,
+				  unsigned int mem_flags);
+
+/* Reserve slots for eDMA commands.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ * NOTE(review): passes 1 as the last reserve_aux argument; meaning TBC.
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve.
+ * @return The first reserved slot, or a negative error code.
+ */
+static inline int64_t gxio_mpipe_equeue_reserve(gxio_mpipe_equeue_t *equeue,
+						unsigned int num)
+{
+	return __gxio_dma_queue_reserve_aux(&equeue->dma_queue, num, 1);
+}
+
+/* Reserve slots for eDMA commands, if possible.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ * NOTE(review): passes 0 as the last reserve_aux argument; meaning TBC.
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve.
+ * @return The first reserved slot, or a negative error code.
+ */
+static inline int64_t gxio_mpipe_equeue_try_reserve(gxio_mpipe_equeue_t
+						    *equeue, unsigned int num)
+{
+	return __gxio_dma_queue_reserve_aux(&equeue->dma_queue, num, 0);
+}
+
+/*
+ * HACK: This helper function tricks gcc 4.6 into avoiding saving
+ * a copy of "edesc->words[0]" on the stack for no obvious reason.
+ */
+
+static inline void gxio_mpipe_equeue_put_at_aux(gxio_mpipe_equeue_t *equeue,
+						uint_reg_t ew[2],
+						unsigned long slot)
+{
+	unsigned long edma_slot = slot & equeue->mask_num_entries;
+	gxio_mpipe_edesc_t *edesc_p = &equeue->edescs[edma_slot];
+
+	/*
+	 * ISSUE: Could set eDMA ring to be on generation 1 at start, which
+	 * would avoid the negation here, perhaps allowing "__insn_bfins()".
+	 */
+	ew[0] |= !((slot >> equeue->log2_num_entries) & 1);
+
+	/*
+	 * NOTE: We use "__gxio_mmio_write64()", plus the fact that the eDMA
+	 * queue alignment restrictions ensure that these two words are on
+	 * the same cacheline, to force proper ordering between the stores.
+	 */
+	__gxio_mmio_write64(&edesc_p->words[1], ew[1]);
+	__gxio_mmio_write64(&edesc_p->words[0], ew[0]);
+}
+
+/* Post an eDMA command to an eDMA queue at a given egress slot.
+ *
+ * This function copies the supplied edesc into entry "slot mod N" in
+ * the underlying ring, setting the "gen" bit to the appropriate value
+ * based on "(slot mod N*2)", where "N" is the size of the ring.  Note
+ * that the higher bits of slot are unused.
+ *
+ * Normally this function is used to fill in slots reserved by, for
+ * example, gxio_mpipe_equeue_reserve().
+ *
+ * This function can also be used without "reserving" slots, if the
+ * application KNOWS that the ring can never overflow, for example,
+ * by pushing fewer buffers into the buffer stacks than there are
+ * total slots in the equeue.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param edesc eDMA command to be posted (passed by value, so a copy).
+ * @param slot An egress slot (only the low bits are actually used).
+ */
+static inline void gxio_mpipe_equeue_put_at(gxio_mpipe_equeue_t *equeue,
+					    gxio_mpipe_edesc_t edesc,
+					    unsigned long slot)
+{
+	gxio_mpipe_equeue_put_at_aux(equeue, edesc.words, slot);
+}
+
+/* Reserve one slot and post a single eDMA command into it.
+ *
+ * Convenience wrapper: calls gxio_mpipe_equeue_reserve() for one slot,
+ * then gxio_mpipe_equeue_put_at() on the slot that was returned.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param edesc eDMA command to be posted.
+ * @return 0 on success, or the negative code from the failed reservation.
+ */
+static inline int gxio_mpipe_equeue_put(gxio_mpipe_equeue_t *equeue,
+					gxio_mpipe_edesc_t edesc)
+{
+	const int64_t reserved = gxio_mpipe_equeue_reserve(equeue, 1);
+
+	if (reserved >= 0) {
+		gxio_mpipe_equeue_put_at(equeue, edesc, reserved);
+		return 0;
+	}
+	return (int)reserved;
+}
+
+/* Ask the mPIPE hardware to egress outstanding packets immediately.
+ *
+ * This call is not necessary, but may slightly reduce overall latency.
+ *
+ * Technically, you should flush all gxio_mpipe_equeue_put_at() writes
+ * to memory before calling this function, to ensure the descriptors
+ * are visible in memory before the mPIPE hardware actually looks for
+ * them.  But this should be very rare, and the only side effect would
+ * be increased latency, so it is up to the caller to decide whether
+ * or not to flush memory.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ */
+static inline void gxio_mpipe_equeue_flush(gxio_mpipe_equeue_t *equeue)
+{
+	/* Use "ring_idx = 0" and "count = 0" to "wake up" the eDMA ring. */
+	MPIPE_EDMA_POST_REGION_VAL_t val = { {0} };
+	__insn_flushwb();	/* Flush write buffers before the post. */
+	__gxio_mmio_write(equeue->dma_queue.post_region_addr, val.word);
+}
+
+/* Determine if a given eDMA command has been completed.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param slot The slot used by the eDMA command (e.g. from a reserve call).
+ * @param update If true, and the command does not appear to have completed
+ * yet, then update any software cache of the hardware completion counter,
+ * and check again.  This should normally be true.
+ * @return True iff the given eDMA command has been completed.
+ *
+ * ISSUE: This should return "bool" and should take "bool update".
+ */
+static inline int gxio_mpipe_equeue_is_complete(gxio_mpipe_equeue_t *equeue,
+						int64_t slot, int update)
+{
+	return __gxio_dma_queue_is_complete(&equeue->dma_queue, slot, update);
+}
+
+/*****************************************************************
+ *                        Link Management                         *
+ ******************************************************************/
+
+/*
+ *
+ * Functions for manipulating and sensing the state and configuration
+ * of physical network links.
+ *
+ * @section gxio_mpipe_link_perm Link Permissions
+ *
+ * Opening a link (with gxio_mpipe_link_open()) requests a set of link
+ * permissions, which control what may be done with the link, and potentially
+ * what permissions may be granted to other processes.
+ *
+ * Data permission allows the process to receive packets from the link by
+ * specifying the link's channel number in mPIPE packet distribution rules,
+ * and to send packets to the link by using the link's channel number as
+ * the target for an eDMA ring.
+ *
+ * Stats permission allows the process to retrieve link attributes (such as
+ * the speeds it is capable of running at, or whether it is currently up), and
+ * to read and write certain statistics-related registers in the link's MAC.
+ *
+ * Control permission allows the process to retrieve and modify link attributes
+ * (so that it may, for example, bring the link up and take it down), and
+ * read and write many registers in the link's MAC and PHY.
+ *
+ * Any permission may be requested as shared, which allows other processes
+ * to also request shared permission, or exclusive, which prevents other
+ * processes from requesting it.  In keeping with GXIO's typical usage in
+ * an embedded environment, the defaults for all permissions are shared.
+ *
+ * Permissions are granted on a first-come, first-served basis, so if two
+ * applications request an exclusive permission on the same link, the one
+ * to run first will win.  Note, however, that some system components, like
+ * the kernel Ethernet driver, may get an opportunity to open links before
+ * any applications run.
+ *
+ * @section gxio_mpipe_link_names Link Names
+ *
+ * Link names are of the form gbe<em>number</em> (for Gigabit Ethernet),
+ * xgbe<em>number</em> (for 10 Gigabit Ethernet), loop<em>number</em> (for
+ * internal mPIPE loopback), or ilk<em>number</em>/<em>channel</em>
+ * (for Interlaken links); for instance, gbe0, xgbe1, loop3, and
+ * ilk0/12 are all possible link names.  The correspondence between
+ * the link name and an mPIPE instance number or mPIPE channel number is
+ * system-dependent; not all links will exist on all systems, and the set
+ * of numbers used for a particular link type may not start at zero and may
+ * not be contiguous.  Use gxio_mpipe_link_enumerate() to retrieve the set of
+ * links which exist on a system, and always use gxio_mpipe_link_instance()
+ * to determine which mPIPE controls a particular link.
+ *
+ * Note that in some cases, links may share hardware, such as PHYs, or
+ * internal mPIPE buffers; in these cases, only one of the links may be
+ * opened at a time.  This is especially common with xgbe and gbe ports,
+ * since each xgbe port uses 4 SERDES lanes, each of which may also be
+ * configured as one gbe port.
+ *
+ * @section gxio_mpipe_link_states Link States
+ *
+ * The mPIPE link management model revolves around three different states,
+ * which are maintained for each link:
+ *
+ * 1. The <em>current</em> link state: is the link up now, and if so, at
+ *    what speed?
+ *
+ * 2. The <em>desired</em> link state: what do we want the link state to be?
+ *    The system is always working to make this state the current state;
+ *    thus, if the desired state is up, and the link is down, we'll be
+ *    constantly trying to bring it up, automatically.
+ *
+ * 3. The <em>possible</em> link state: what speeds are valid for this
+ *    particular link?  Or, in other words, what are the capabilities of
+ *    the link hardware?
+ *
+ * These link states are not, strictly speaking, related to application
+ * state; they may be manipulated at any time, whether or not the link
+ * is currently being used for data transfer.  However, for convenience,
+ * gxio_mpipe_link_open() and gxio_mpipe_link_close() (or application exit)
+ * can affect the link state.  These implicit link management operations
+ * may be modified or disabled by the use of link open flags.
+ *
+ * From an application, you can use gxio_mpipe_link_get_attr()
+ * and gxio_mpipe_link_set_attr() to manipulate the link states.
+ * gxio_mpipe_link_get_attr() with ::GXIO_MPIPE_LINK_POSSIBLE_STATE
+ * gets you the possible link state.  gxio_mpipe_link_get_attr() with
+ * ::GXIO_MPIPE_LINK_CURRENT_STATE gets you the current link state.
+ * Finally, gxio_mpipe_link_set_attr() and gxio_mpipe_link_get_attr()
+ * with ::GXIO_MPIPE_LINK_DESIRED_STATE allow you to modify or retrieve
+ * the desired link state.
+ *
+ * If you want to manage a link from a part of your application which isn't
+ * involved in packet processing, you can use the ::GXIO_MPIPE_LINK_NO_DATA
+ * flag on a gxio_mpipe_link_open() call.  This opens the link, but does
+ * not request data permission, so it does not conflict with any exclusive
+ * permissions which may be held by other processes.  You can then use
+ * gxio_mpipe_link_get_attr() and gxio_mpipe_link_set_attr() on this link
+ * object to bring up or take down the link.
+ *
+ * Some links support link state bits which support various loopback
+ * modes. ::GXIO_MPIPE_LINK_LOOP_MAC tests datapaths within the Tile
+ * Processor itself; ::GXIO_MPIPE_LINK_LOOP_PHY tests the datapath between
+ * the Tile Processor and the external physical layer interface chip; and
+ * ::GXIO_MPIPE_LINK_LOOP_EXT tests the entire network datapath with the
+ * aid of an external loopback connector.  In addition to enabling hardware
+ * testing, such configuration can be useful for software testing, as well.
+ *
+ * When LOOP_MAC or LOOP_PHY is enabled, packets transmitted on a channel
+ * will be received by that channel, instead of being emitted on the
+ * physical link, and packets received on the physical link will be ignored.
+ * Other than that, all standard GXIO operations work as you might expect.
+ * Note that loopback operation requires that the link be brought up using
+ * one or more of the GXIO_MPIPE_LINK_SPEED_xxx link state bits.
+ *
+ * Those familiar with previous versions of the MDE on TILEPro hardware
+ * will notice significant similarities between the NetIO link management
+ * model and the mPIPE link management model.  However, the NetIO model
+ * was developed in stages, and some of its features -- for instance,
+ * the default setting of certain flags -- were shaped by the need to be
+ * compatible with previous versions of NetIO.  Since the features provided
+ * by the mPIPE hardware and the mPIPE GXIO library are significantly
+ * different than those provided by NetIO, in some cases, we have made
+ * different choices in the mPIPE link management API.  Thus, please read
+ * this documentation carefully before assuming that mPIPE link management
+ * operations are exactly equivalent to their NetIO counterparts.
+ */
+
+/* An object used to manage mPIPE link state and resources. */
+typedef struct {
+	/* The overall mPIPE context. */
+	gxio_mpipe_context_t *context;
+
+	/* The channel number; returned by gxio_mpipe_link_channel(). */
+	uint8_t channel;
+
+	/* The MAC index used by this link. */
+	uint8_t mac;
+} gxio_mpipe_link_t;
+
+/* Retrieve one of this system's legal link names, and its MAC address.
+ *
+ * @param index Link name index.  If a system supports N legal link names,
+ *  then indices between 0 and N - 1, inclusive, each correspond to one of
+ *  those names.  Thus, to retrieve all of a system's legal link names,
+ *  call this function in a loop, starting with an index of zero, and
+ *  incrementing it once per iteration until -1 is returned.
+ * @param link_name Pointer to the buffer which will receive the retrieved
+ *  link name.  The buffer should contain space for at least
+ *  ::GXIO_MPIPE_LINK_NAME_LEN bytes; the returned name, including the
+ *  terminating null byte, will be no longer than that.
+ * @param mac_addr Pointer to the buffer which will receive the retrieved
+ *  MAC address.  The buffer should contain space for at least 6 bytes.
+ * @return Zero if a link name was successfully retrieved; -1 if one was
+ *  not.
+ */
+extern int gxio_mpipe_link_enumerate_mac(int index, char *link_name,
+					 uint8_t *mac_addr);
+
+/* Open an mPIPE link.
+ *
+ *  A link must be opened before it may be used to send or receive packets,
+ *  and before its state may be examined or changed.  Depending upon the
+ *  link's intended use, one or more link permissions may be requested via
+ *  the flags parameter; see @ref gxio_mpipe_link_perm.  In addition, flags
+ *  may request that the link's state be modified at open time.  See @ref
+ *  gxio_mpipe_link_states and @ref gxio_mpipe_link_open_flags for more detail.
+ *
+ * @param link A link state object, which will be initialized if this
+ *  function completes successfully.
+ * @param context An initialized mPIPE context.
+ * @param link_name Name of the link.
+ * @param flags Zero or more @ref gxio_mpipe_link_open_flags, ORed together.
+ * @return 0 if the link was successfully opened, or a negative error code.
+ *
+ */
+extern int gxio_mpipe_link_open(gxio_mpipe_link_t *link,
+				gxio_mpipe_context_t *context,
+				const char *link_name, unsigned int flags);
+
+/* Close an mPIPE link.
+ *
+ *  Closing a link makes it available for use by other processes.  Once
+ *  a link has been closed, packets may no longer be sent on or received
+ *  from the link, and its state may not be examined or changed.
+ *
+ * @param link A link state object, which will no longer be initialized
+ *  if this function completes successfully.
+ * @return 0 if the link was successfully closed, or a negative error code.
+ *
+ */
+extern int gxio_mpipe_link_close(gxio_mpipe_link_t *link);
+
+/* Return a link's channel number.
+ *
+ * @param link A properly initialized link state object.
+ * @return The channel number for the link (a uint8_t widened to int).
+ */
+static inline int gxio_mpipe_link_channel(gxio_mpipe_link_t *link)
+{
+	return link->channel;
+}
+
+#endif /* !_GXIO_MPIPE_H_ */
diff --git a/arch/tile/include/hv/drv_mpipe_intf.h b/arch/tile/include/hv/drv_mpipe_intf.h
new file mode 100644
index 0000000..6cdae3b
--- /dev/null
+++ b/arch/tile/include/hv/drv_mpipe_intf.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the mpipe driver.
+ */
+
+#ifndef _SYS_HV_DRV_MPIPE_INTF_H
+#define _SYS_HV_DRV_MPIPE_INTF_H
+
+#include <arch/mpipe.h>
+#include <arch/mpipe_constants.h>
+
+
+/** Number of buffer stacks (32). */
+#define HV_MPIPE_NUM_BUFFER_STACKS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH)
+
+/** Number of NotifRings (256). */
+#define HV_MPIPE_NUM_NOTIF_RINGS (MPIPE_NUM_NOTIF_RINGS)
+
+/** Number of NotifGroups (32). */
+#define HV_MPIPE_NUM_NOTIF_GROUPS (MPIPE_NUM_NOTIF_GROUPS)
+
+/** Number of buckets (4160). */
+#define HV_MPIPE_NUM_BUCKETS (MPIPE_NUM_BUCKETS)
+
+/** Number of "lo" buckets (4096). */
+#define HV_MPIPE_NUM_LO_BUCKETS 4096
+
+/** Number of "hi" buckets (64). */
+#define HV_MPIPE_NUM_HI_BUCKETS \
+  (HV_MPIPE_NUM_BUCKETS - HV_MPIPE_NUM_LO_BUCKETS)
+
+/** Number of edma rings (24). */
+#define HV_MPIPE_NUM_EDMA_RINGS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH)
+
+
+
+
+/** A flag bit indicating a fixed resource allocation. */
+#define HV_MPIPE_ALLOC_FIXED 0x01
+
+/** Offset for the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_CFG << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_SIZE (64 * 1024)
+
+/** Offset for the fast register MMIO region. */
+#define HV_MPIPE_FAST_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_IDMA << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the fast register MMIO region (IDMA, EDMA, buffer stack). */
+#define HV_MPIPE_FAST_MMIO_SIZE \
+  ((MPIPE_MMIO_ADDR__REGION_VAL_BSM + 1 - MPIPE_MMIO_ADDR__REGION_VAL_IDMA) \
+   << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+
+/*
+ * Each type of resource allocation comes in quantized chunks, where
+ * XXX_BITS is the number of chunks, and XXX_RES_PER_BIT is the number
+ * of resources in each chunk.
+ */
+
+/** Number of buffer stack chunks available (32). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH
+
+/** Granularity of buffer stack allocation (1). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_RES_PER_BIT \
+  (HV_MPIPE_NUM_BUFFER_STACKS / HV_MPIPE_ALLOC_BUFFER_STACKS_BITS)
+
+/** Number of NotifRing chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__NOTIF_RING_MASK_WIDTH
+
+/** Granularity of NotifRing allocation (8). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_RINGS / HV_MPIPE_ALLOC_NOTIF_RINGS_BITS)
+
+/** Number of NotifGroup chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS \
+  HV_MPIPE_NUM_NOTIF_GROUPS
+
+/** Granularity of NotifGroup allocation (1). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_GROUPS / HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS)
+
+/** Number of lo bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_LO_WIDTH
+
+/** Granularity of lo bucket allocation (256). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_LO_BUCKETS / HV_MPIPE_ALLOC_LO_BUCKETS_BITS)
+
+/** Number of hi bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_HI_WIDTH
+
+/** Granularity of hi bucket allocation (4). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_HI_BUCKETS / HV_MPIPE_ALLOC_HI_BUCKETS_BITS)
+
+/** Number of eDMA ring chunks available (24). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH
+
+/** Granularity of eDMA ring allocation (1). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_EDMA_RINGS / HV_MPIPE_ALLOC_EDMA_RINGS_BITS)
+
+
+
+
+/** Bit vector encoding which NotifRings are in a NotifGroup. */
+typedef struct
+{
+  /** The actual bits (4 x 64 = 256 bits in total). */
+  uint64_t ring_mask[4];
+
+} gxio_mpipe_notif_group_bits_t;
+
+
+/** Another name for MPIPE_LBL_INIT_DAT_BSTS_TBL_t. */
+typedef MPIPE_LBL_INIT_DAT_BSTS_TBL_t gxio_mpipe_bucket_info_t;
+
+
+
+/** Eight buffer stack ids. */
+typedef struct
+{
+  /** The stacks. */
+  uint8_t stacks[8];
+
+} gxio_mpipe_rules_stacks_t;
+
+
+/** A destination mac address. */
+typedef struct
+{
+  /** The octets. */
+  uint8_t octets[6];
+
+} gxio_mpipe_rules_dmac_t;
+
+
+/** A vlan. */
+typedef uint16_t gxio_mpipe_rules_vlan_t;
+
+
+
+/** Maximum size of a link name, including the terminating null byte. */
+#define GXIO_MPIPE_LINK_NAME_LEN  32
+
+
+/** Structure holding a link name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_LINK_NAME_LEN];
+}
+_gxio_mpipe_link_name_t;
+
+/** Maximum number of characters in a symbol name. */
+#define GXIO_MPIPE_SYMBOL_NAME_LEN  128
+
+
+/** Structure holding a symbol name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_SYMBOL_NAME_LEN];
+}
+_gxio_mpipe_symbol_name_t;
+
+
+/** Structure holding a MAC address. */
+typedef struct
+{
+  /** The address. */
+  uint8_t mac[6];
+}
+_gxio_mpipe_link_mac_t;
+
+
+
+/** Request shared data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  Other processes may also
+ *  request shared data permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_DATA               0x00000001UL
+
+/** Do not request data permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_DATA            0x00000002UL
+
+/** Request exclusive data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  No other processes may
+ *  request data permission on this link, and if any process already has
+ *  data permission on it, this open will fail.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_DATA          0x00000004UL
+
+/** Request shared stats permission -- that is, the ability to read and write
+ *  registers which contain link statistics, and to get link attributes --
+ *  on the specified link.  Other processes may also request shared stats
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_STATS              0x00000008UL
+
+/** Do not request stats permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_STATS           0x00000010UL
+
+/** Request exclusive stats permission -- that is, the ability to read and
+ *  write registers which contain link statistics, and to get link
+ *  attributes -- on the specified link.  No other processes may request
+ *  stats permission on this link, and if any process already
+ *  has stats permission on it, this open will fail.
+ *
+ *  Requesting exclusive stats permission is normally a very bad idea, since
+ *  it prevents programs like mpipe-stat from providing information on this
+ *  link.  Applications should only do this if they use MAC statistics
+ *  registers, and cannot tolerate any of the clear-on-read registers being
+ *  reset by other statistics programs.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_STATS         0x00000020UL
+
+/** Request shared control permission -- that is, the ability to modify link
+ *  attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  Other processes may also request shared control
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_CTL                0x00000040UL
+
+/** Do not request control permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_CTL             0x00000080UL
+
+/** Request exclusive control permission -- that is, the ability to modify
+ *  link attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  No other processes may request control permission on
+ *  this link, and if any process already has control permission on it,
+ *  this open will fail.
+ *
+ *  Requesting exclusive control permission is not always a good idea, since
+ *  it prevents programs like mpipe-link from configuring the link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_CTL           0x00000100UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; do not
+ *  change the desired state of the link when it is closed or the process
+ *  exits.  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UP            0x00000200UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; when the
+ *  link is closed or this process exits, if no other process has the link
+ *  open, set the desired state of the link to down.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specified in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UPDOWN        0x00000400UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; when the link is closed or this process exits, if no other
+ *  process has the link open, set the desired state of the link to down.
+ *  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specified in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_DOWN          0x00000800UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; do not change the desired state of the link when it is
+ *  closed or the process exits.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specified in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_NONE          0x00001000UL
+
+/** Request that this open call not complete until the network link is up.
+ *  The process will wait as long as necessary for this to happen;
+ *  applications which wish to abandon waiting for the link after a
+ *  specific time period should not specify this flag when opening a link,
+ *  but should instead call gxio_mpipe_link_wait() afterward.  The link
+ *  must be opened with stats permission.  Note that this flag by itself
+ *  does not change the desired link state; if other open flags or previous
+ *  link state changes have not requested a desired state of up, the open
+ *  call will never complete.  This flag is not available to kernel
+ *  clients.
+ */
+#define GXIO_MPIPE_LINK_WAIT               0x00002000UL
+
+
+/*
+ * Note: link attributes must fit in 24 bits, since we use the top 8 bits
+ * of the IORPC offset word for the channel number.
+ */
+
+/** Determine whether jumbo frames may be received.  If this attribute's
+ *  value is nonzero, the MAC will accept frames of up to 10240 bytes.
+ *  If the value is zero, the MAC will only accept frames of up to 1544
+ *  bytes.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_JUMBO      0x010000
+
+/** Determine whether to send pause frames on this link if the mPIPE packet
+ *  FIFO is nearly full.  If the value is zero, pause frames are not sent.
+ *  If the value is nonzero, it is the delay value which will be sent in any
+ *  pause frames which are output, in units of 512 bit times.
+ *
+ *  Bear in mind that in almost all circumstances, the mPIPE packet FIFO
+ *  will never fill up, since mPIPE will empty it as fast as or faster than
+ *  the incoming data rate, by either delivering or dropping packets.  The
+ *  only situation in which this is not true is if the memory and cache
+ *  subsystem is extremely heavily loaded, and mPIPE cannot perform DMA of
+ *  packet data to memory in a timely fashion.  In particular, pause frames
+ *  will <em>not</em> be sent if packets cannot be delivered because
+ *  NotifRings are full, buckets are full, or buffers are not available in
+ *  a buffer stack. */
+#define GXIO_MPIPE_LINK_SEND_PAUSE         0x020000
+
+/** Determine whether to suspend output on the receipt of pause frames.
+ *  If the value is nonzero, mPIPE shim will suspend output on the link's
+ *  channel when a pause frame is received.  If the value is zero, pause
+ *  frames will be ignored.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_PAUSE      0x030000
+
+/** Interface MAC address.  The value is a 6-byte MAC address, in the least
+ *  significant 48 bits of the value; in other words, an address which would
+ *  be printed as '12:34:56:78:90:AB' in IEEE 802 canonical format would
+ *  be returned as 0x1234567890AB.
+ *
+ *  Depending upon the overall system design, a MAC address may or may not
+ *  be available for each interface.  Note that the interface's MAC address
+ *  does not limit the packets received on its channel, although the
+ *  classifier's rules could be configured to do that.  Similarly, the MAC
+ *  address is not used when transmitting packets, although applications
+ *  could certainly decide to use the assigned address as a source MAC
+ *  address when doing so.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified.
+ */
+#define GXIO_MPIPE_LINK_MAC                0x040000
+
+/** Determine whether to discard egress packets on link down. If this value
+ *  is nonzero, packets sent on this link while the link is down will be
+ *  discarded.  If this value is zero, no packets will be sent on this link
+ *  while it is down.  The default value is one. */
+#define GXIO_MPIPE_LINK_DISCARD_IF_DOWN    0x050000
+
+/** Possible link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate link modes which are actually supported by
+ *  the hardware.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified. */
+#define GXIO_MPIPE_LINK_POSSIBLE_STATE     0x060000
+
+/** Current link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate the current state of the hardware.  If the
+ *  link is down, the value ANDed with ::GXIO_MPIPE_LINK_SPEED_MASK is zero;
+ *  if the link is up, the value ANDed with ::GXIO_MPIPE_LINK_SPEED_MASK
+ *  yields exactly one of the speed values, indicating the current speed.
+ *  This attribute may only be retrieved with gxio_mpipe_link_get_attr(); it
+ *  may not be modified. */
+#define GXIO_MPIPE_LINK_CURRENT_STATE      0x070000
+
+/** Desired link state. The value is a combination of flags, which specify
+ *  the desired state for the link.  With gxio_mpipe_link_set_attr(), this
+ *  will, in the background, attempt to bring up the link using whichever of
+ *  the requested flags are reasonable, or take down the link if the flags
+ *  are zero.  The actual link up or down operation may happen after this
+ *  call completes.  If the link state changes in the future, the system
+ *  will continue to try to get back to the desired link state; for
+ *  instance, if the link is brought up successfully, and then the network
+ *  cable is disconnected, the link will go down.  However, the desired
+ *  state of the link is still up, so if the cable is reconnected, the link
+ *  will be brought up again.
+ *
+ *  With gxio_mpipe_link_get_attr(), this will indicate the desired state
+ *  for the link, as set with a previous gxio_mpipe_link_set_attr() call,
+ *  or implicitly by a gxio_mpipe_link_open() or link close operation.
+ *  This may not reflect the current state of the link; to get that, use
+ *  ::GXIO_MPIPE_LINK_CURRENT_STATE.
+ */
+#define GXIO_MPIPE_LINK_DESIRED_STATE      0x080000
+
+
+
+/** Link can run, should run, or is running at 10 Mbps. */
+#define GXIO_MPIPE_LINK_10M        0x0000000000000001UL
+
+/** Link can run, should run, or is running at 100 Mbps. */
+#define GXIO_MPIPE_LINK_100M       0x0000000000000002UL
+
+/** Link can run, should run, or is running at 1 Gbps. */
+#define GXIO_MPIPE_LINK_1G         0x0000000000000004UL
+
+/** Link can run, should run, or is running at 10 Gbps. */
+#define GXIO_MPIPE_LINK_10G        0x0000000000000008UL
+
+/** Link can run, should run, or is running at 20 Gbps. */
+#define GXIO_MPIPE_LINK_20G        0x0000000000000010UL
+
+/** Link can run, should run, or is running at 25 Gbps. */
+#define GXIO_MPIPE_LINK_25G        0x0000000000000020UL
+
+/** Link can run, should run, or is running at 50 Gbps. */
+#define GXIO_MPIPE_LINK_50G        0x0000000000000040UL
+
+/** Link should run at the highest speed supported by the link and by
+ *  the device connected to the link.  Only usable as a value for
+ *  the link's desired state; never returned as a value for the current
+ *  or possible states. */
+#define GXIO_MPIPE_LINK_ANYSPEED   0x0000000000000800UL
+
+/** All legal link speeds.  This value is provided for use in extracting
+ *  the speed-related subset of the link state flags; it is not intended
+ *  to be set directly as a value for one of the GXIO_MPIPE_LINK_xxx_STATE
+ *  attributes.  A link is up or is requested to be up if its current or
+ *  desired state, respectively, ANDED with this value, is nonzero. */
+#define GXIO_MPIPE_LINK_SPEED_MASK 0x0000000000000FFFUL
+
+/** Link can run, should run, or is running in MAC loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the Tile
+ *  Processor. */
+#define GXIO_MPIPE_LINK_LOOP_MAC   0x0000000000001000UL
+
+/** Link can run, should run, or is running in PHY loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the external
+ *  PHY chip. */
+#define GXIO_MPIPE_LINK_LOOP_PHY   0x0000000000002000UL
+
+/** Link can run, should run, or is running in external loopback mode.
+ *  This requires that an external loopback plug be installed on the
+ *  Ethernet port.  Note that only some links require that this be
+ *  configured via the gxio_mpipe_link routines; other links can do
+ *  external loopback with the plug and no special configuration. */
+#define GXIO_MPIPE_LINK_LOOP_EXT   0x0000000000004000UL
+
+/** All legal loopback types. */
+#define GXIO_MPIPE_LINK_LOOP_MASK  0x000000000000F000UL
+
+/** Link can run, should run, or is running in full-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_FDX        0x0000000000010000UL
+
+/** Link can run, should run, or is running in half-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_HDX        0x0000000000020000UL
+
+
+/** An individual rule. */
+typedef struct
+{
+  /** The total size. */
+  uint16_t size;
+
+  /** The priority. */
+  int16_t priority;
+
+  /** The "headroom" in each buffer. */
+  uint8_t headroom;
+
+  /** The "tailroom" in each buffer. */
+  uint8_t tailroom;
+
+  /** The "capacity" of the largest buffer. */
+  uint16_t capacity;
+
+  /** The mask for converting a flow hash into a bucket. */
+  uint16_t bucket_mask;
+
+  /** The offset for converting a flow hash into a bucket. */
+  uint16_t bucket_first;
+
+  /** The buffer stack ids. */
+  gxio_mpipe_rules_stacks_t stacks;
+
+  /** The actual channels. */
+  uint32_t channel_bits;
+
+  /** The number of dmacs. */
+  uint16_t num_dmacs;
+
+  /** The number of vlans. */
+  uint16_t num_vlans;
+
+  /** The actual dmacs and vlans. */
+  uint8_t dmacs_and_vlans[];
+
+} gxio_mpipe_rules_rule_t;
+
+
+/** A list of classifier rules. */
+typedef struct
+{
+  /** The offset to the end of the current rule. */
+  uint16_t tail;
+
+  /** The offset to the start of the current rule. */
+  uint16_t head;
+
+  /** The actual rules. */
+  uint8_t rules[4096 - 4];
+
+} gxio_mpipe_rules_list_t;
+
+
+
+
+/** mPIPE statistics structure. These counters include all relevant
+ *  events occurring on all links within the mPIPE shim. */
+typedef struct
+{
+  /** Number of ingress packets dropped for any reason. */
+  uint64_t ingress_drops;
+  /** Number of ingress packets dropped because a buffer stack was empty. */
+  uint64_t ingress_drops_no_buf;
+  /** Number of ingress packets dropped or truncated due to lack of space in
+   *  the iPkt buffer. */
+  uint64_t ingress_drops_ipkt;
+  /** Number of ingress packets dropped by the classifier or load balancer */
+  uint64_t ingress_drops_cls_lb;
+  /** Total number of ingress packets. */
+  uint64_t ingress_packets;
+  /** Total number of egress packets. */
+  uint64_t egress_packets;
+  /** Total number of ingress bytes. */
+  uint64_t ingress_bytes;
+  /** Total number of egress bytes. */
+  uint64_t egress_bytes;
+}
+gxio_mpipe_stats_t;
+
+
+#endif /* _SYS_HV_DRV_MPIPE_INTF_H */
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH 6/6] tilegx network driver: initial support
  2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
                   ` (4 preceding siblings ...)
  2012-04-06 20:38 ` [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
@ 2012-04-06 20:42 ` Chris Metcalf
  2012-04-09 13:49   ` Arnd Bergmann
  5 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 20:42 UTC (permalink / raw)
  To: linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 2045 ++++++++++++++++++++++++++++++++++++
 3 files changed, 2048 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..cca63e8
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,2045 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+
+#include <gxio/mpipe.h>
+
+/* For TSO */
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+
+#include <arch/sim.h>
+
+
+/* #define USE_SIM_PRINTF */
+
+#ifdef USE_SIM_PRINTF
+/* printf()-style helper that emits its output through SPR_SIM_CONTROL. */
+static __attribute__((unused, format (printf, 1, 2))) void
+sim_printf(const char *format, ...)
+{
+	char *str;
+	char buf[1024];
+	/* Format into "buf"; output that does not fit is truncated. */
+	va_list args;
+	va_start(args, format);
+	(void)vsnprintf(buf, sizeof(buf), format, args);
+	va_end(args);
+
+	/* Emit one PUTC per character.  NOTE: Copied from "sim_print()". */
+	for (str = buf; *str != '\0'; str++) {
+		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+			     (*str << _SIM_CONTROL_OPERATOR_BITS));
+	}
+	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+		     (SIM_PUTC_FLUSH_BINARY << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+
+/* HACK: Allow use of "sim_printf()" instead of "printk()". */
+#define printk sim_printf
+
+#endif
+
+
+/* First, "tile_net_init_module()" initializes each network cpu to
+ * handle incoming packets, and initializes all the network devices.
+ *
+ * Then, "ifconfig DEVICE up" calls "tile_net_open()", which will
+ * turn on packet processing, if needed.
+ *
+ * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to
+ * stop egress, and possibly turn off packet processing.
+ *
+ * We start out with the ingress IRQ enabled on each CPU.  When it
+ * fires, it is automatically disabled, and we call "napi_schedule()".
+ * This will cause "tile_net_poll()" to be called, which will pull
+ * packets from the netio queue, filtering them out, or passing them
+ * to "netif_receive_skb()".  If our budget is exhausted, we will
+ * return, knowing we will be called again later.  Otherwise, we
+ * reenable the ingress IRQ, and call "napi_complete()".
+ *
+ *
+ * NOTE: Failing to free completions for an arbitrarily long time
+ * (which is defined to be illegal) does in fact cause bizarre problems.
+ *
+ * NOTE: The egress code can be interrupted by the interrupt handler.
+ */
+
+
+/* HACK: Define to support GSO.
+ * ISSUE: This may actually hurt performance of the TCP blaster.
+ */
+#undef TILE_NET_GSO
+
+/* HACK: Define to support TSO. */
+#define TILE_NET_TSO
+
+/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
+#define TILE_NET_TX_QUEUE_LEN 0
+
+/* Define to dump packets (prints out the whole packet on tx and rx). */
+#undef TILE_NET_DUMP_PACKETS
+
+/* Define to use "round robin" distribution. */
+#undef TILE_NET_ROUND_ROBIN
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The number of distinct channels. */
+#define TILE_NET_CHANNELS (MPIPE_NUM_SGMII_MACS + MPIPE_NUM_LOOPBACK_CHANNELS)
+
+/* The max number of distinct devices ("xgbe" shares the "gbe" channels). */
+#define TILE_NET_DEVS (TILE_NET_CHANNELS + MPIPE_NUM_XAUI_MACS)
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?
+ */
+#define TILE_NET_MAX_COMPS 64
+
+
+#define ROUND_UP(n, align) (((n) + (align) - 1) & -(align))
+
+
+#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)
+
+
+MODULE_AUTHOR("Tilera");
+MODULE_LICENSE("GPL");
+
+
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+
+/* Statistics counters for a specific cpu and device. */
+struct tile_net_stats_t {
+	u32 rx_packets;
+	u32 rx_bytes;
+	u32 tx_packets;
+	u32 tx_bytes;
+};
+
+
+/* A single completion. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+
+/* The completions for a given cpu and device. */
+struct tile_net_comps {
+	/* The completions; slot = counter % TILE_NET_MAX_COMPS. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed (some pending if < comp_next). */
+	unsigned long comp_last;
+};
+
+
+/* Info for a specific cpu.
+ *
+ * ISSUE: Should "comps" be per channel instead of per dev?
+ */
+struct tile_net_info_t {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each device. */
+	struct tile_net_comps *comps_for_dev[TILE_NET_DEVS];
+	/* Stats for each device. */
+	struct tile_net_stats_t stats_for_dev[TILE_NET_DEVS];
+};
+
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* Our "devno". */
+	int devno;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* Total stats. */
+	struct net_device_stats stats;
+	/* The (lazy) "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The (lazy) headers for TSO. */
+	unsigned char *headers;
+};
+
+
+/* The actual devices. */
+static struct net_device *tile_net_devs[TILE_NET_DEVS];
+
+/* The device for a given channel.  HACK: We use "32", not
+ * TILE_NET_CHANNELS, because it is fairly subtle that the 5 bit
+ * "idesc.channel" field never exceeds TILE_NET_CHANNELS.
+ */
+static struct net_device *tile_net_devs_for_channel[32];
+
+/* A mutex for "tile_net_devs_for_channel" (statically initialized). */
+static DEFINE_MUTEX(tile_net_devs_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info_t, per_cpu_info);
+
+/* Access to "per_cpu_info". */
+static struct tile_net_info_t *infos[NR_CPUS];
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+
+/* True if "network_cpus" was specified. */
+static bool network_cpus_used;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char loopify_link_name[16];
+
+
+
+#ifdef TILE_NET_DUMP_PACKETS
+/* Dump a packet as hex, 16 bytes per line; "s" labels the dump. */
+static void dump_packet(unsigned char *data, unsigned long length, char *s)
+{
+	unsigned long i;
+	static unsigned int count;
+	char buf[128];
+	int n = 0;	/* Current write offset into "buf". */
+
+	pr_info("Dumping %s packet of 0x%lx bytes at %p [%u]\n",
+	       s, length, data, count++);
+
+	pr_info("\n");
+	for (i = 0; i < length; i++) {
+		if ((i & 0xf) == 0)
+			n = snprintf(buf, sizeof(buf), "%8.8lx:", i);
+		n += snprintf(buf + n, sizeof(buf) - n, " %02x", data[i]);
+		if ((i & 0xf) == 0xf || i == length - 1)
+			pr_info("%s\n", buf);
+	}
+
+	pr_info("\n");
+}
+#endif
+
+
+/* Convert a "buffer ptr" (kernel VA) into a "buffer cpa" using "__pa()". */
+static inline void *buf_to_cpa(void *buf)
+{
+	return (void *)__pa(buf);
+}
+
+
+/* Convert a "buffer cpa" back into a "buffer ptr" using "__va()". */
+static inline void *cpa_to_buf(void *cpa)
+{
+	return (void *)__va(cpa);
+}
+
+/* Allocate an skb-backed buffer and push it onto the small or large
+ * buffer stack, per "small".  Returns false if the skb allocation fails.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+
+	/* Buffers must be aligned; "len" below budgets for the padding. */
+	const unsigned long align = 128;
+
+	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
+	 * and also "reserves" that many bytes.
+	 */
+	int len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
+
+	/* Allocate (or fail). */
+	struct sk_buff *skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb'. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+
+	/* Advance "data" up to the next "align" boundary, if needed. */
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Save a back-pointer to 'skb' just below the buffer start. */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	__insn_mf();
+	/* Hand the buffer to mPIPE by its "cpa". */
+	gxio_mpipe_push_buffer(&context, stack, buf_to_cpa(skb->data));
+
+	return true;
+}
+
+/* Provide linux buffers to mPIPE until none are owed or allocation fails. */
+static void tile_net_provide_needed_buffers(struct tile_net_info_t *info)
+{
+	bool ok = true;
+
+	/* Small buffers first, counting down only on success. */
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+
+	/* Then large buffers, unless an allocation already failed. */
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+
+/* Handle a packet.  Return true if "processed", false if "filtered". */
+static bool tile_net_handle_packet(struct tile_net_info_t *info,
+				    gxio_mpipe_idesc_t *idesc)
+{
+	/* NOTE: This can be NULL during shutdown. */
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+
+	void *va;
+	/* Bytes between the buffer address in "idesc" and the L2 header. */
+	uint8_t l2_offset = gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	void *buf;
+	unsigned long len;
+	/* Nonzero means the packet is dropped rather than delivered. */
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the raw buffer VA. */
+	va = cpa_to_buf(gxio_mpipe_idesc_get_va(idesc));
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = gxio_mpipe_idesc_get_l2_length(idesc);
+
+	/* Point "va" at the raw buffer (i.e. back over the headroom). */
+	va -= NET_IP_ALIGN;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	dump_packet(buf, len, "rx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 * FIXME: The classifier will soon detect "multicast".
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address. */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+		struct tile_net_stats_t *stats =
+			&info->stats_for_dev[priv->devno];
+
+		/* Acquire the associated "skb". */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia: the back-pointer must name the skb owning "va". */
+		if (skb->data != va)
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p\n",
+			      buf, skb, skb->data);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update this cpu's stats for the device. */
+		stats->rx_packets++;
+		stats->rx_bytes += len;
+
+		/* Need a new buffer of the size just consumed. */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+	/* Both the dropped and the delivered cases consume the idesc. */
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+
+/* Handle some packets for the current CPU (the NAPI "poll" callback).
+ *
+ * Stops early once "budget" packets have been processed, or once
+ * TILE_NET_BATCH idescs from a single peek have been handled.  Returns
+ * the number of packets processed; filtered packets are not counted.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ * ISSUE: The "rotting packet" race: a packet may arrive after the queue
+ * appears empty, and before the hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info_t *info = &__get_cpu_var(per_cpu_info);
+	/* NOTE(review): "napi" is unused; presumably it is "&info->napi". */
+	unsigned int work = 0;
+
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	/* Reached on every path: replace the buffers consumed above. */
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+
+/* Ingress interrupt handler: schedule NAPI polling on the current cpu. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	/* Use the napi embedded in this cpu's "per_cpu_info". */
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+	return IRQ_HANDLED;
+}
+
+
+/* Free some completions.  This must be called with interrupts blocked. */
+static void tile_net_free_comps(struct net_device *dev,
+				 struct tile_net_comps *comps,
+				 int limit, bool force_update)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	/* Completion is tested against the device's egress queue. */
+	gxio_mpipe_equeue_t *equeue = priv->equeue;
+	/* Stops after "limit" frees; a "limit" below 1 never matches. */
+	int n = 0;
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[cid];
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
+						   force_update || n == 0))
+			return;
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		if (++n == limit)
+			return;
+	}
+}
+
+
+/* Arm the egress timer unless it is already pending.
+ *
+ * "Schedule only if not scheduled" is used on purpose: unconditionally
+ * rescheduling would be the obvious approach, but it is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info_t *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+	mod_timer_pinned(&info->egress_timer, jiffies + 1);
+	info->egress_timer_scheduled = true;
+}
+
+
+/* The "function" for "info->egress_timer".
+ *
+ * "arg" is the "struct tile_net_info_t *" stored in "egress_timer.data".
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.  Slots in "comps_for_dev[]" that
+ * hold no completion queue are skipped rather than dereferenced.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info_t *info = (struct tile_net_info_t *)arg;
+	unsigned long irqflags;
+	bool pending = false;
+	unsigned int k;
+
+	/* "tile_net_free_comps()" must run with interrupts blocked. */
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (k = 0; k < TILE_NET_DEVS; k++) {
+		struct tile_net_comps *comps = info->comps_for_dev[k];
+		if (comps == NULL || comps->comp_last >= comps->comp_next)
+			continue;
+		tile_net_free_comps(tile_net_devs[k], comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+
+/* Prepare the calling cpu's "per_cpu_info" and publish it in "infos[]". */
+static void tile_net_prepare_cpu(void *unused)
+{
+	struct tile_net_info_t *info = &__get_cpu_var(per_cpu_info);
+	int my_cpu = smp_processor_id();
+
+	info->has_iqueue = false;
+	info->my_cpu = my_cpu;
+
+	/* Initialize the egress timer; the handler gets "info" back as an
+	 * "unsigned long", so cast to exactly that type.
+	 */
+	init_timer(&info->egress_timer);
+	info->egress_timer.data = (unsigned long)info;
+	info->egress_timer.function = tile_net_handle_egress_timer;
+
+	infos[my_cpu] = info;
+}
+
+
+/* Per-cpu worker for "tile_net_update()".
+ *
+ * "count_ptr" points at a "long" holding the number of channels that
+ * currently have a device.  On a cpu that owns an iqueue, a non-zero
+ * count turns NAPI and the ingress irq on; a zero count turns them off.
+ * Cpus without an iqueue are left untouched.
+ */
+static void tile_net_update_cpu(void *count_ptr)
+{
+	struct tile_net_info_t *info = &__get_cpu_var(per_cpu_info);
+	long active = *(long *)count_ptr;
+
+	if (!info->has_iqueue)
+		return;
+
+	if (active == 0) {
+		/* No channel is in use: quiesce ingress on this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	if (!info->napi_added) {
+		/* FIXME: HACK: We use one of the devices.
+		 * ISSUE: We never call "netif_napi_del()".
+		 */
+		netif_napi_add(tile_net_devs[0], &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+
+/* Rebuild the classifier rules from "tile_net_devs_for_channel[]" and
+ * push the resulting state to every online cpu.
+ *
+ * Shared by tile_net_open() and tile_net_stop().
+ *
+ * Returns 0 on success, or -EIO if the rules cannot be committed.
+ */
+static int tile_net_update(void)
+{
+	/* HACK: Static because this is too big for the linux stack. */
+	static gxio_mpipe_rules_t rules;
+
+	long count = 0;
+	int channel;
+	int cpu;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	/* TODO: Add support for "dmac" splitting? */
+	for (channel = 0; channel < TILE_NET_DEVS; channel++) {
+
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+
+		/* The first channel in use opens the (single) rule. */
+		if (count == 0) {
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		count++;
+
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can happen if there is no classifier.
+	 * ISSUE: Can anything else cause it to happen?
+	 */
+	if (gxio_mpipe_rules_commit(&rules) != 0) {
+		pr_warning("Failed to update classifier rules!\n");
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu, &count, 1);
+
+	/* HACK: Allow packets to flow. */
+	if (count)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+
+/* Helper function for "tile_net_init_cpus()".
+ *
+ * Allocates the small and large mPIPE buffer stacks, backs each with
+ * its own memory, registers the client memory for both stacks, and
+ * then provides "num_buffers" initial buffers of each kind, where
+ * "num_buffers" scales with "network_cpus_count".
+ *
+ * There is no error return: every failure in here calls panic().
+ */
+static void tile_net_init_stacks(int network_cpus_count)
+{
+	int err;
+	int i;
+
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+
+	int num_buffers;
+
+	size_t stack_bytes;
+
+	pte_t pte = { 0 };
+
+	void *mem;
+
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes, honoring the 64KB minimum alignment. */
+	stack_bytes = ROUND_UP(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			       64 * 1024);
+	if (stack_bytes > HPAGE_SIZE)
+		panic("Cannot allocate %d physically contiguous buffers.",
+		      num_buffers);
+
+#if 0
+	sim_printf("Using %d buffers for %d network cpus.\n",
+		   num_buffers, network_cpus_count);
+#endif
+
+	/* Allocate two consecutive buffer stacks (small, then large). */
+	small_buffer_stack = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (small_buffer_stack < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buffer_stacks()'");
+	large_buffer_stack = small_buffer_stack + 1;
+
+	/* Allocate memory for, and initialize, the small buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					   small_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Allocate memory for, and initialize, the large buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					   large_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Pin all the client memory.  The panic messages name the
+	 * function that actually failed, so the report can be matched
+	 * against the code.
+	 */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	err = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+	err = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+
+	/* Provide initial buffers, "num_buffers" of each kind. */
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true))
+			panic("Cannot provide initial buffers!");
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false))
+			panic("Cannot provide initial buffers!");
+	}
+}
+
+
+/* Actually initialize the mPIPE state.
+ *
+ * In order, this: initializes the mPIPE "context"; settles the set of
+ * network cpus; sets up the buffer stacks; allocates one NotifRing per
+ * network cpu; gives every online cpu a "comps" area and every network
+ * cpu an iqueue; allocates a NotifGroup and buckets and ties them to
+ * the rings; and finally creates the ingress irq and requests a ring
+ * interrupt for each cpu that owns an iqueue.
+ *
+ * Returns 0 on success, or -EIO for the few failures that are reported
+ * (no "hash_default", mPIPE init failure, NotifRing allocation
+ * failure).  Every other failure calls panic() or BUG_ON().  Nothing
+ * that was set up earlier is undone on the -EIO paths.
+ */
+static int tile_net_init_cpus(void)
+{
+	int network_cpus_count;
+
+	int ring;
+	int group;
+
+	int next_ring;
+
+	int cpu;
+
+	int i;
+
+#ifdef TILE_NET_ROUND_ROBIN
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_ROUND_ROBIN;
+#else
+	/* Use random rebalancing. */
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY;
+#endif
+
+	if (!hash_default) {
+		pr_warning("Networking requires hash_default!\n");
+		goto fail;
+	}
+
+	if (gxio_mpipe_init(&context, 0) != 0) {
+		pr_warning("Failed to initialize mPIPE!\n");
+		goto fail;
+	}
+
+	/* Without an accepted "network_cpus=" argument, use every
+	 * online cpu.
+	 */
+	if (!network_cpus_used)
+		network_cpus_map = cpu_online_map;
+
+#ifdef CONFIG_DATAPLANE
+	/* Remove dataplane cpus. */
+	cpus_andnot(network_cpus_map, network_cpus_map, dataplane_map);
+#endif
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	/* ISSUE: Handle failures more gracefully. */
+	tile_net_init_stacks(network_cpus_count);
+
+	/* Allocate one NotifRing for each network cpu. */
+	ring = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					    0, 0);
+	if (ring < 0) {
+		pr_warning("Failed to allocate notif rings.\n");
+		goto fail;
+	}
+
+	/* ISSUE: Handle failures below more cleanly. */
+
+	/* Init NotifRings.  "ring" stays at the first allocated ring
+	 * (it is passed to the group init below); "next_ring" walks
+	 * forward, one ring per network cpu.
+	 */
+	next_ring = ring;
+
+	for_each_online_cpu(cpu) {
+
+		size_t notif_ring_size =
+			IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info_t *info = infos[cpu];
+
+		size_t comps_size =
+			TILE_NET_DEVS * sizeof(struct tile_net_comps);
+
+		/* Allocate the "comps": one "struct tile_net_comps" per
+		 * device, carved out of a single allocation.  This is
+		 * done for every online cpu, network cpu or not.
+		 */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate comps memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		/* ISSUE: Is this needed? */
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_DEVS; i++)
+			info->comps_for_dev[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate iqueue memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+
+		if (gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					   addr, notif_ring_size, 0) != 0)
+			panic("Failure in 'gxio_mpipe_iqueue_init()'.");
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0)
+		panic("Failure in 'gxio_mpipe_alloc_notif_groups()'.");
+
+#ifndef TILE_NET_ROUND_ROBIN
+	/* Use more buckets when there are more network cpus; otherwise
+	 * "num_buckets" keeps whatever value it already had.
+	 */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+#endif
+
+	/* Allocate some buckets. */
+	first_bucket = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (first_bucket < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buckets()'.");
+
+	/* Init group and buckets. */
+	if (gxio_mpipe_init_notif_group_and_buckets(&context, group, ring,
+						    network_cpus_count,
+						    first_bucket, num_buckets,
+						    mode) != 0)
+		panic("Fail in 'gxio_mpipe_init_notif_group_and_buckets().");
+
+
+	/* Create an irq and register it.  Once "ingress_irq" is
+	 * non-negative, tile_net_open() no longer calls this function.
+	 */
+	ingress_irq = create_irq();
+	if (ingress_irq < 0)
+		panic("Failed to create irq for ingress.");
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	BUG_ON(request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			   0, NULL, NULL) != 0);
+
+	for_each_online_cpu(cpu) {
+
+		struct tile_net_info_t *info = infos[cpu];
+
+		/* NOTE(review): this "ring" shadows the function-level
+		 * "ring", and it is read before "has_iqueue" is checked;
+		 * the value is only used when the cpu has an iqueue.
+		 */
+		int ring = info->iqueue.ring;
+
+		if (!info->has_iqueue)
+			continue;
+
+		gxio_mpipe_request_notif_ring_interrupt(&context,
+							cpu_x(cpu), cpu_y(cpu),
+							1, ingress_irq, ring);
+	}
+
+	return 0;
+
+fail:
+	return -EIO;
+}
+
+
+/* Build the persistent egress state for "priv": the TSO header area,
+ * the eDMA descriptor ring, and the equeue bookkeeping structure.
+ *
+ * On success the results are stored in "priv->equeue" and
+ * "priv->headers".  Note that this state may be shared between, say,
+ * "gbe0" and "xgbe0".
+ *
+ * Returns 0 on success, or -EIO after freeing whatever pages had been
+ * allocated so far.
+ */
+static int tile_net_init_egress(struct tile_net_priv *priv)
+{
+	size_t hdr_order;
+	struct page *hdr_page;
+	unsigned char *hdr_mem;
+
+	size_t ring_bytes;
+	int ring_order;
+	struct page *ring_page;
+	gxio_mpipe_edesc_t *ring_mem;
+
+	int eq_order;
+	struct page *eq_page;
+	gxio_mpipe_equeue_t *eq;
+
+	int edma;
+	int channel;
+
+	/* Egress uses the loopify channel whenever one has been set. */
+	if (priv->loopify_channel >= 0)
+		channel = priv->loopify_channel;
+	else
+		channel = priv->channel;
+
+	/* Memory for the "headers".
+	 * ISSUE: Defer this until TSO is actually needed?
+	 */
+	hdr_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	hdr_page = alloc_pages(GFP_KERNEL, hdr_order);
+	if (!hdr_page) {
+		pr_warning("Could not allocate memory for TSO headers.\n");
+		goto out;
+	}
+	hdr_mem = pfn_to_kaddr(page_to_pfn(hdr_page));
+
+	/* Memory for the "edescs" (the eDMA ring itself). */
+	ring_bytes = EQUEUE_ENTRIES * sizeof(*ring_mem);
+	ring_order = get_order(ring_bytes);
+	ring_page = alloc_pages(GFP_KERNEL, ring_order);
+	if (!ring_page) {
+		pr_warning("Could not allocate memory for eDMA ring.\n");
+		goto out_free_headers;
+	}
+	ring_mem = pfn_to_kaddr(page_to_pfn(ring_page));
+
+	/* Memory for the "equeue" structure. */
+	eq_order = get_order(sizeof(*eq));
+	eq_page = alloc_pages(GFP_KERNEL, eq_order);
+	if (!eq_page) {
+		pr_warning("Could not allocate memory for equeue info.\n");
+		goto out_free_edescs;
+	}
+	eq = pfn_to_kaddr(page_to_pfn(eq_page));
+
+	/* One edma ring. */
+	edma = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (edma < 0) {
+		pr_warning("Could not allocate edma ring.\n");
+		goto out_free_equeue;
+	}
+
+	/* Initialize the equeue.  This should not fail. */
+	if (gxio_mpipe_equeue_init(eq, &context, edma, channel,
+				   ring_mem, ring_bytes, 0) != 0)
+		panic("Failure in 'gxio_mpipe_equeue_init()'.");
+
+	priv->equeue = eq;
+	priv->headers = hdr_mem;
+	return 0;
+
+	/* Unwind in reverse order of allocation. */
+out_free_equeue:
+	__free_pages(eq_page, eq_order);
+
+out_free_edescs:
+	__free_pages(ring_page, ring_order);
+
+out_free_headers:
+	__free_pages(hdr_page, hdr_order);
+
+out:
+	return -EIO;
+}
+
+
+/* Help the kernel activate the given network interface.
+ *
+ * Under "tile_net_devs_mutex" this: performs the one-time mPIPE setup
+ * (while "ingress_irq" is still negative); opens the mPIPE link for the
+ * device (and "loop1" as well for the "loopify" device); creates the
+ * egress state if "priv->equeue" is still NULL; records the device in
+ * "tile_net_devs_for_channel[]"; and calls tile_net_update().
+ *
+ * Returns 0 on success.  On failure, returns the error from the failing
+ * step (or -EIO for link-open failures) after closing whichever links
+ * are open.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Determine if this is the "loopify" device. */
+	bool loopify = !strcmp(dev->name, loopify_link_name);
+
+	int result;
+
+	mutex_lock(&tile_net_devs_mutex);
+
+	/* One-time global initialization. */
+	if (ingress_irq < 0) {
+		result = tile_net_init_cpus();
+		if (result != 0)
+			goto fail;
+	}
+
+	/* Open the ingress link; the "loopify" device uses "loop0"
+	 * in place of its own name.
+	 */
+	if (priv->channel < 0) {
+		const char* ln = loopify ? "loop0" : dev->name;
+		if (gxio_mpipe_link_open(&priv->link, &context, ln, 0) < 0) {
+			netdev_err(dev, "Failed to open '%s'.\n", ln);
+			result = -EIO;
+			goto fail;
+		}
+		priv->channel = gxio_mpipe_link_channel(&priv->link);
+		BUG_ON(priv->channel < 0 || priv->channel >= 32);
+	}
+
+	/* The "loopify" device additionally opens "loop1"; its channel
+	 * is the one tile_net_init_egress() uses for egress.
+	 */
+	if (loopify && priv->loopify_channel < 0) {
+		if (gxio_mpipe_link_open(&priv->loopify_link,
+					 &context, "loop1", 0) < 0) {
+			netdev_err(dev, "Failed to open 'loop1'.\n");
+			result = -EIO;
+			goto fail;
+		}
+		priv->loopify_channel =
+			gxio_mpipe_link_channel(&priv->loopify_link);
+		BUG_ON(priv->loopify_channel < 0);
+	}
+
+	/* Initialize egress info (if needed). */
+	if (priv->equeue == NULL) {
+		result = tile_net_init_egress(priv);
+		if (result != 0)
+			goto fail;
+	}
+
+	/* Record the device so tile_net_update() adds its channel. */
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	result = tile_net_update();
+	if (result != 0)
+		goto fail_channel;
+
+	mutex_unlock(&tile_net_devs_mutex);
+
+	/* Start our transmit queue. */
+	netif_start_queue(dev);
+
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail_channel:
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+fail:
+	/* Close whichever links are open.  A channel is only reset to
+	 * -1 when its close succeeds.
+	 */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			pr_warning("Failed to close loopify link!\n");
+		else
+			priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			pr_warning("Failed to close link!\n");
+		else
+			priv->channel = -1;
+	}
+
+	mutex_unlock(&tile_net_devs_mutex);
+	return result;
+}
+
+
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, removes the device from
+ * "tile_net_devs_for_channel[]", refreshes the classifier/cpu state,
+ * and closes whichever mPIPE links are open.  Unlike the failure path
+ * of tile_net_open(), the channels are reset to -1 even when a close
+ * fails.
+ *
+ * Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int err;
+
+	/* Stop our transmit queue. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_mutex);
+
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+	/* The result is deliberately ignored; we are shutting down. */
+	(void)tile_net_update();
+
+	if (priv->loopify_channel >= 0) {
+		err = gxio_mpipe_link_close(&priv->loopify_link);
+		if (err != 0)
+			pr_warning("Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	if (priv->channel >= 0) {
+		err = gxio_mpipe_link_close(&priv->link);
+		if (err != 0)
+			pr_warning("Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	mutex_unlock(&tile_net_devs_mutex);
+
+	return 0;
+}
+
+
+/* Return the kernel virtual address of the data in fragment "f": the
+ * address of the fragment's page plus the fragment's offset within it.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	void *page_va = pfn_to_kaddr(page_to_pfn(skb_frag_page(f)));
+
+	return page_va + f->page_offset;
+}
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * NOTE(review): the BUG_ON() checks below reject that case (they
+ * require a non-empty "frags" holding the entire payload).
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * The work is done in three passes over the segments, each walking the
+ * fragments in exactly the same way: (1) count the edescs needed,
+ * (2) build a per-segment copy of the headers in "priv->headers",
+ * (3) write the header and payload edescs into the reserved slots.
+ *
+ * Returns NETDEV_TX_OK once the skb has been queued (the skb is then
+ * recorded in the completions array), or NETDEV_TX_BUSY if the egress
+ * slots could not be reserved.
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	gxio_mpipe_equeue_t *equeue = priv->equeue;
+
+	struct tile_net_info_t *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_stats_t *stats;
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh" and "ih" refer to the same address. */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check": the segment-invariant part of the
+	 * per-segment ip checksum computation.
+	 */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check": the segment-invariant part of the
+	 * per-segment tcp pseudo-header checksum computation.
+	 */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	struct tile_net_comps *comps;
+
+	int cid;
+
+	/* Empty packets (etc) would cause trouble below. */
+	BUG_ON(skb->data_len == 0);
+	BUG_ON(sh->nr_frags == 0);
+	BUG_ON(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	BUG_ON(skb_headlen(skb) != sh_len);
+	BUG_ON(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	BUG_ON(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	BUG_ON(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	BUG_ON(skb->protocol != htons(ETH_P_IP));
+	BUG_ON(ih->protocol != IPPROTO_TCP);
+	BUG_ON(skb->ip_summed != CHECKSUM_PARTIAL);
+	BUG_ON(csum_start != eh_len + ih_len);
+
+	/* NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+	edesc_head.stack_idx = large_buffer_stack;
+
+	/* Prepare to egress the body. */
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 1: Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed.  With the "-1" reset values
+			 * this also selects the first fragment.
+			 */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	BUG_ON(f_id + 1 != sh->nr_frags);
+	BUG_ON(f_used != f_size);
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		if (net_ratelimit())
+			pr_info("Egress blocked on '%s'!\n", dev->name);
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 2: Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment: the header
+		 * area is indexed by the slot of the header edesc.
+		 */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			priv->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum, usum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum".
+		 * NOTE(review): the two "__insn_v2sadu(x, 0)" steps
+		 * presumably fold the sum down to 16 bits -- confirm
+		 * against the instruction reference.
+		 */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jsum = __insn_v2sadu(jsum, 0);
+		jsum = __insn_v2sadu(jsum, 0);
+		jsum = (0xFFFF ^ jsum);
+		jh->check = jsum;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags: only the final segment keeps the
+		 * template's "fin" and "psh" bits.
+		 */
+		if (!final)
+			uh->fin = uh->psh = 0;
+
+		/* Compute the tcp pseudo-header checksum. */
+		usum = tsum_hack + htons(s_len);
+		usum = __insn_v2sadu(usum, 0);
+		usum = __insn_v2sadu(usum, 0);
+		uh->check = usum;
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot" back to the first reserved slot. */
+	slot -= num_edescs;
+
+	/* Flush the headers. */
+	__insn_mf();
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 3: Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			priv->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = (ulong)buf_to_cpa(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Address of the next unused byte of the fragment. */
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload; ".bound" is set
+			 * on the last piece of each segment.
+			 */
+			edesc_body.va = (ulong)buf_to_cpa(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	comps = info->comps_for_dev[priv->devno];
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(dev, comps, 32, false);
+
+	/* Update the completions array.  "slot" is now one past the
+	 * last edesc written for this skb.
+	 */
+	comps->comp_queue[cid].when = slot;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats. */
+	stats = &info->stats_for_dev[priv->devno];
+	stats->tx_packets += tx_packets;
+	stats->tx_bytes += tx_bytes;
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Flatten a transmit request into the "frags" array.
+ *
+ * The linear part ("b_data"/"b_len") comes first, but only when it is
+ * non-empty; it is followed by one entry per page fragment of "skb".
+ *
+ * Returns the number of entries written to "frags".
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int i;
+
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	for (i = 0; i < sh->nr_frags; i++) {
+		skb_frag_t *f = &sh->frags[i];
+
+		out->buf = tile_net_frag_buf(f);
+		out->length = skb_frag_size(f);
+		out++;
+	}
+
+	return out - frags;
+}
+
+
+/* Help the kernel transmit a packet.
+ *
+ * Packets with a non-zero "gso_size" are handed to tile_net_tx_tso().
+ * Otherwise one edesc is built per piece of the packet (the linear
+ * part plus each page fragment), egress slots are reserved with irqs
+ * disabled, the edescs are written, and the skb is recorded in this
+ * cpu's completions array so that it can be freed later.
+ *
+ * Returns NETDEV_TX_OK once the packet has been queued, or
+ * NETDEV_TX_BUSY if the egress slots could not be reserved.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	gxio_mpipe_equeue_t *equeue = priv->equeue;
+
+	struct tile_net_info_t *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_stats_t *stats;
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	unsigned int num_frags;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+
+	struct tile_net_comps *comps;
+
+	unsigned int i;
+
+	int cid;
+
+	s64 slot;
+
+	unsigned long irqflags;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	/* ISSUE: Does not dump the "frags". */
+	dump_packet(data, skb_headlen(skb), "tx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	if (sh->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes.
+	 * NOTE(review): nothing here checks the count against
+	 * MAX_FRAGS -- confirm that MAX_FRAGS covers the worst case.
+	 */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_frags; i++) {
+
+		/* NOTE: ".hwb = 0", so ".size" is unused.
+		 * NOTE: ".stack_idx" determines the TLB.
+		 */
+
+		gxio_mpipe_edesc_t edesc = { { 0 } };
+
+		/* Prepare the basic command; ".bound" is set only on
+		 * the last edesc of the packet.
+		 */
+		edesc.bound = (i == num_frags - 1);
+		edesc.xfer_size = frags[i].length;
+		edesc.va = (ulong)buf_to_cpa(frags[i].buf);
+		edesc.stack_idx = large_buffer_stack;
+
+		edescs[i] = edesc;
+	}
+
+	/* Add checksum info if needed; it goes on the first edesc. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb->csum_start - skb_headroom(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		if (net_ratelimit())
+			pr_info("Egress blocked on '%s'!\n", dev->name);
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	comps = info->comps_for_dev[priv->devno];
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(dev, comps, 32, false);
+
+	/* Update the completions array.  "when" is one past the last
+	 * slot used by this packet.
+	 */
+	comps->comp_queue[cid].when = slot + num_frags;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
+	stats = &info->stats_for_dev[priv->devno];
+	stats->tx_packets++;
+	stats->tx_bytes += ((len >= ETH_ZLEN) ? len : ETH_ZLEN);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Deal with a transmit timeout (installed as ".ndo_tx_timeout").
+ *
+ * The only action taken is to wake the device's transmit queue.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+
+/* Ioctl commands (installed as ".ndo_do_ioctl").
+ *
+ * No device-specific ioctl is implemented: every "cmd" is rejected
+ * with -EOPNOTSUPP, and "dev" and "rq" are not examined.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+
+/* Get System Network Statistics.
+ *
+ * Sums the per-cpu rx/tx packet and byte counters kept for this device
+ * over all online cpus, stores the totals in "priv->stats", and
+ * returns the address of that structure.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	int devno = priv->devno;
+
+	/* Accumulate in "unsigned long", the type of the counters in
+	 * "struct net_device_stats", so that the totals are not
+	 * truncated to 32 bits before they are stored.
+	 */
+	unsigned long rx_packets = 0;
+	unsigned long tx_packets = 0;
+	unsigned long rx_bytes = 0;
+	unsigned long tx_bytes = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_stats_t *stats =
+			&infos[cpu]->stats_for_dev[devno];
+
+		rx_packets += stats->rx_packets;
+		rx_bytes += stats->rx_bytes;
+		tx_packets += stats->tx_packets;
+		tx_bytes += stats->tx_bytes;
+	}
+
+	priv->stats.rx_packets = rx_packets;
+	priv->stats.rx_bytes = rx_bytes;
+	priv->stats.tx_packets = tx_packets;
+	priv->stats.tx_bytes = tx_bytes;
+
+	return &priv->stats;
+}
+
+
+/* Change the "mtu".
+ *
+ * Values from 68 to 1500 (inclusive) are accepted and stored in
+ * "dev->mtu"; anything else is rejected.
+ *
+ * Returns 0 on success, or -EINVAL if "new_mtu" is out of range.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/* Change the Ethernet Address of the NIC.
+ *
+ * "p" points at a "struct sockaddr" whose "sa_data" holds the new
+ * address.  The hypervisor driver does not support changing the MAC
+ * address, but the hardware does nothing with it either, so the
+ * address used on outgoing packets and accepted on incoming packets is
+ * completely up to us: it is simply copied into "dev->dev_addr".
+ *
+ * Returns 0 on success, or -EINVAL if the address is not a valid
+ * ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (is_valid_ether_addr(addr->sa_data)) {
+		memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress irq is disabled around a direct call to the ingress irq
+ * handler, and re-enabled afterwards.  "dev" is not used.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+
+/* The callbacks through which the network stack drives this driver.
+ * tile_net_setup() installs this table as "dev->netdev_ops".
+ */
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open = tile_net_open,
+	.ndo_stop = tile_net_stop,
+	.ndo_start_xmit = tile_net_tx,
+	.ndo_do_ioctl = tile_net_ioctl,
+	.ndo_get_stats = tile_net_get_stats,
+	.ndo_change_mtu = tile_net_change_mtu,
+	.ndo_tx_timeout = tile_net_tx_timeout,
+	.ndo_set_mac_address = tile_net_set_mac_address,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	/* Only built when netpoll support is configured. */
+	.ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* The setup function, passed to alloc_netdev().
+ *
+ * ether_setup() assigns the generic ethernet fields of "dev"
+ * (including IFF_BROADCAST and IFF_MULTICAST); the driver-specific
+ * fields are filled in afterwards.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = TILE_NET_TX_QUEUE_LEN;
+	dev->mtu = 1500;
+
+	/* We want lockless xmit, and we support hardware tx checksums
+	 * and scatter/gather.
+	 */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG;
+
+#ifdef TILE_NET_GSO
+	/* We support GSO. */
+	dev->features |= NETIF_F_GSO;
+#endif
+
+#ifdef TILE_NET_TSO
+	/* We support TSO. */
+	dev->features |= NETIF_F_TSO;
+#endif
+}
+
+
+/* Allocate the device structure, set its MAC address, and register
+ * the device.
+ *
+ * "name" is the mPIPE link name, used verbatim as the interface name;
+ * "mac" points at the six bytes of the link's MAC address.  Links
+ * whose name starts with "loop" are ignored.  Failures are logged and
+ * the link is skipped; nothing is returned to the caller.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	int ret;
+	int devno = 0;
+	int i;
+	int nz_addr = 0;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Find the next available devno, without running off the end
+	 * of "tile_net_devs[]".
+	 */
+	while (devno < TILE_NET_DEVS && tile_net_devs[devno] != NULL)
+		devno++;
+	if (devno >= TILE_NET_DEVS) {
+		pr_err("Too many network devices; ignoring '%s'\n", name);
+		return;
+	}
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	priv = netdev_priv(dev);
+
+	/* Initialize "priv". */
+
+	memset(priv, 0, sizeof(*priv));
+
+	priv->dev = dev;
+	priv->devno = devno;
+
+	priv->channel = priv->loopify_channel = -1;
+
+	/* Set the MAC address in the device struct.  This must be done
+	 * before the device can be opened, i.e. before it is registered.
+	 * If the MAC is all zeroes, we use a random address, since we're
+	 * probably on the simulator.
+	 */
+	for (i = 0; i < ETH_ALEN; i++)
+		nz_addr |= mac[i];
+
+	if (nz_addr) {
+		memcpy(dev->dev_addr, mac, ETH_ALEN);
+		dev->addr_len = ETH_ALEN;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+
+	/* Save the device. */
+	tile_net_devs[devno] = dev;
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		/* Drop the table entry before the memory goes away. */
+		tile_net_devs[devno] = NULL;
+		free_netdev(dev);
+		return;
+	}
+}
+
+
+/* Module cleanup: unregister and free every device that is recorded
+ * in "tile_net_devs[]".
+ */
+static void __exit tile_net_cleanup(void)
+{
+	int devno;
+
+	for (devno = 0; devno < TILE_NET_DEVS; devno++) {
+		struct net_device *dev = tile_net_devs[devno];
+
+		/* Unused slot. */
+		if (dev == NULL)
+			continue;
+
+		unregister_netdev(dev);
+		free_netdev(dev);
+	}
+}
+
+
+/* Module initialization.
+ *
+ * Prepares the per-cpu state on every cpu, then creates one network
+ * device for each mPIPE link that can be enumerated.
+ *
+ * Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int link = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Walk the links until enumeration reports an error. */
+	while (gxio_mpipe_link_enumerate_mac(link, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		link++;
+	}
+
+	return 0;
+}
+
+
+#ifndef MODULE
+/* The "network_cpus" boot argument specifies the cpus that are
+ * dedicated to handle ingress packets.
+ *
+ * The parameter should be in the form "network_cpus=m-n[,x-y]", where
+ * m, n, x, y are integer numbers that represent the cpus that can be
+ * neither a dedicated cpu nor a dataplane cpu.
+ *
+ * A malformed list, or a list that ends up empty, is reported and
+ * otherwise ignored; "network_cpus_used" is only set when at least
+ * one cpu remains.  Always returns 0.
+ */
+static int __init network_cpus_setup(char *str)
+{
+	char buf[1024];
+
+	if (cpulist_parse_crop(str, &network_cpus_map) != 0) {
+		pr_warning("network_cpus=%s: malformed cpu list\n",
+		       str);
+		return 0;
+	}
+
+	/* Remove dedicated cpus. */
+	cpumask_and(&network_cpus_map, &network_cpus_map,
+		    cpu_possible_mask);
+
+#ifdef CONFIG_DATAPLANE
+	/* Remove dataplane cpus. */
+	cpumask_andnot(&network_cpus_map, &network_cpus_map,
+		       &dataplane_map);
+#endif
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warning("Ignoring network_cpus='%s'.\n", str);
+		return 0;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+	network_cpus_used = true;
+
+	return 0;
+}
+__setup("network_cpus=", network_cpus_setup);
+
+
+/* The "loopify=LINK" boot argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ *
+ * The name is stored in "loopify_link_name", truncated if it does not
+ * fit.  Always returns 0.
+ */
+static int __init loopify_setup(char *str)
+{
+	/* strlcpy() always NUL-terminates, so the result does not
+	 * depend on the prior contents of the destination buffer.
+	 */
+	strlcpy(loopify_link_name, str, sizeof(loopify_link_name));
+	return 0;
+}
+__setup("loopify=", loopify_setup);
+
+#endif
+
+
+/* Register the module's init and exit entry points. */
+module_init(tile_net_init_module);
+module_exit(tile_net_cleanup);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v2 6/6] tilegx network driver: initial support
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
                                   ` (3 preceding siblings ...)
  2012-04-06 20:38                 ` [PATCH v2 4/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
@ 2012-04-06 20:42                 ` Chris Metcalf
  2012-04-30 14:35                   ` Arnd Bergmann
  2012-04-28 19:41                 ` [PATCH v2 5/6] arch/tile: break out the "csum a long" function to <asm/checksum.h> Chris Metcalf
  5 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 20:42 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1952 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1955 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..169b16c
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1952 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+
+#include <gxio/mpipe.h>
+
+/* For TSO */
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+
+#include <arch/sim.h>
+
+
+/* #define USE_SIM_PRINTF */
+
+#ifdef USE_SIM_PRINTF
+
+static __attribute__((unused, format (printf, 1, 2))) void
+sim_printf(const char *format, ...)
+{
+	char *str;
+	char buf[1024];	/* Longer output is truncated by vsnprintf(). */
+
+	va_list args;
+	va_start(args, format);
+	(void)vsnprintf(buf, sizeof(buf), format, args);
+	va_end(args);
+
+	/* NOTE: Copied from "sim_print()".
+	 * Write the formatted text one character at a time through
+	 * SPR_SIM_CONTROL, then send SIM_PUTC_FLUSH_BINARY (presumably
+	 * a flush request -- confirm against <arch/sim.h>).
+	 */
+	for (str = buf; *str != '\0'; str++) {
+		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+			     (*str << _SIM_CONTROL_OPERATOR_BITS));
+	}
+	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+		     (SIM_PUTC_FLUSH_BINARY << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+
+/* HACK: Allow use of "sim_printf()" instead of "printk()". */
+#define printk sim_printf
+
+#endif
+
+
+/* First, "tile_net_init_module()" initializes each network cpu to
+ * handle incoming packets, and initializes all the network devices.
+ *
+ * Then, "ifconfig DEVICE up" calls "tile_net_open()", which will
+ * turn on packet processing, if needed.
+ *
+ * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to
+ * stop egress, and possibly turn off packet processing.
+ *
+ * We start out with the ingress IRQ enabled on each CPU.  When it
+ * fires, it is automatically disabled, and we call "napi_schedule()".
+ * This will cause "tile_net_poll()" to be called, which will pull
+ * packets from the netio queue, filtering them out, or passing them
+ * to "netif_receive_skb()".  If our budget is exhausted, we will
+ * return, knowing we will be called again later.  Otherwise, we
+ * reenable the ingress IRQ, and call "napi_complete()".
+ *
+ *
+ * NOTE: Failing to free completions for an arbitrarily long time
+ * (which is defined to be illegal) does in fact cause bizarre problems.
+ *
+ * NOTE: The egress code can be interrupted by the interrupt handler.
+ */
+
+
+/* HACK: Define to support GSO.
+ * ISSUE: This may actually hurt performance of the TCP blaster.
+ */
+#undef TILE_NET_GSO
+
+/* HACK: Define to support TSO. */
+#define TILE_NET_TSO
+
+/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
+#define TILE_NET_TX_QUEUE_LEN 0
+
+/* Define to dump packets (prints out the whole packet on tx and rx). */
+#undef TILE_NET_DUMP_PACKETS
+
+/* Define to use "round robin" distribution. */
+#undef TILE_NET_ROUND_ROBIN
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?
+ */
+#define TILE_NET_MAX_COMPS 64
+
+
+#define ROUND_UP(n, align) (((n) + (align) - 1) & -(align))
+
+
+#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)
+
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+
+
+/* A "packet fragment" (a chunk of memory).
+ * NOTE(review): no user of this struct is visible in this part of the
+ * file; "length" is presumably a byte count -- confirm at the use sites.
+ */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+
+/* A single completion.
+ *
+ * tile_net_free_comps() passes "when" to gxio_mpipe_equeue_is_complete()
+ * and, once that reports completion, releases "skb" with
+ * dev_kfree_skb_irq().
+ */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+
+/* The completions for a given cpu and device.
+ *
+ * "comp_queue" is used as a ring: tile_net_free_comps() indexes it with
+ * "comp_last % TILE_NET_MAX_COMPS", and completions are outstanding
+ * while "comp_last < comp_next".
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+
+/* Info for a specific cpu.
+ *
+ * One instance lives in the per-cpu variable "per_cpu_info";
+ * tile_net_prepare_cpu() fills in "my_cpu" and the egress timer and
+ * records the instance in "infos[]".
+ */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags: set by tile_net_update_cpu() once netif_napi_add()
+	 * resp. napi_enable() has been called for "napi".
+	 */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+
+/* Info for egress on a particular egress channel.
+ *
+ * Both fields are filled in, once per channel, by
+ * tile_net_init_egress(); a NULL "equeue" means "not yet initialized".
+ */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO (room for at least
+	 * EQUEUE_ENTRIES * HEADER_BYTES bytes).
+	 */
+	unsigned char *headers;
+};
+
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel).
+	 * tile_net_open() picks "loopify_channel" whenever it is >= 0.
+	 */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static struct mutex tile_net_devs_for_channel_mutex;
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* Access to "per_cpu_info". */
+static struct tile_net_info *infos[NR_CPUS];
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+
+/* True if "network_cpus" was specified. */
+static bool network_cpus_used;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char loopify_link_name[16];
+
+
+/* The "network_cpus" boot argument specifies the cpus that are dedicated
+ * to handle ingress packets.
+ *
+ * The parameter should be in the form "network_cpus=m-n[,x-y]", where
+ * m, n, x, y are cpu numbers.  Cpus outside "cpu_possible_mask" are
+ * dropped from the list; if nothing is left, or the list cannot be
+ * parsed, a warning is logged and the argument is ignored.
+ *
+ * On success "network_cpus_map" holds the resulting cpus and
+ * "network_cpus_used" is set.
+ */
+static int __init network_cpus_setup(char *str)
+{
+	char buf[1024];
+
+	if (cpulist_parse_crop(str, &network_cpus_map) != 0) {
+		pr_warning("network_cpus=%s: malformed cpu list\n",
+		       str);
+		goto out;
+	}
+
+	/* Remove dedicated cpus. */
+	cpumask_and(&network_cpus_map, &network_cpus_map,
+		    cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warning("Ignoring network_cpus='%s'.\n", str);
+		goto out;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+	network_cpus_used = true;
+
+out:
+	/* A __setup() handler returns 1 once it has consumed its option.
+	 * Returning 0 makes the kernel treat "network_cpus=..." as an
+	 * unknown parameter and hand it on to init.
+	 */
+	return 1;
+}
+__setup("network_cpus=", network_cpus_setup);
+
+
+/* The "loopify=LINK" boot argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ *
+ * The link name is saved (truncated if necessary) in
+ * "loopify_link_name" for tile_net_open() to compare against.
+ */
+static int __init loopify_setup(char *str)
+{
+	/* Copy at most sizeof - 1 bytes: "loopify_link_name" is a static
+	 * (zero-filled) array, so its last byte stays NUL and the name
+	 * is always terminated.
+	 */
+	strncpy(loopify_link_name, str, sizeof(loopify_link_name) - 1);
+
+	/* Return 1 to tell the kernel the option was consumed; 0 would
+	 * cause "loopify=..." to be passed on to init as unknown.
+	 */
+	return 1;
+}
+__setup("loopify=", loopify_setup);
+
+
+#ifdef TILE_NET_DUMP_PACKETS
+/* Hex-dump "length" bytes starting at "data" to the kernel log, sixteen
+ * bytes per row, each row prefixed by the offset of its first byte.
+ * "s" is a label (e.g. "rx") that is printed in the banner line.
+ */
+static void dump_packet(unsigned char *data, unsigned long length, char *s)
+{
+	static unsigned int count;
+	unsigned long off;
+	char line[128];
+	int used = 0;
+
+	pr_info("Dumping %s packet of 0x%lx bytes at %p [%d]\n",
+	       s, length, data, count++);
+
+	pr_info("\n");
+
+	for (off = 0; off < length; off++) {
+		bool row_start = (off % 16) == 0;
+		bool row_end = (off % 16) == 15 || off + 1 == length;
+
+		/* Start a fresh row with its offset label. */
+		if (row_start)
+			used = sprintf(line, "%8.8lx:", off);
+
+		used += sprintf(line + used, " %02x", data[off]);
+
+		/* Flush on a full row or on the final byte. */
+		if (row_end)
+			pr_info("%s\n", line);
+	}
+
+	pr_info("\n");
+}
+#endif
+
+
+/* Allocate and push a buffer.
+ *
+ * Allocates one sk_buff sized for a small (128 byte) or large (1664
+ * byte) buffer plus alignment slack, stores a back-pointer to the
+ * sk_buff in the bytes just before the aligned data area, and pushes the
+ * data address onto "small_buffer_stack" or "large_buffer_stack".
+ *
+ * Returns false if the sk_buff could not be allocated, else true.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+
+	/* Buffers must be aligned. */
+	const unsigned long align = 128;
+
+	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
+	 * and also "reserves" that many bytes.
+	 */
+	int len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
+
+	/* Allocate (or fail). */
+	struct sk_buff *skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb'. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+
+	/* Make sure we are aligned: advance "skb->data" to the next
+	 * multiple of "align" (a no-op if it is already aligned).
+	 */
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Save a back-pointer to 'skb'; tile_net_handle_packet() reads
+	 * it back from this same location to recover the sk_buff.
+	 */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+
+/* Hand mPIPE the replacement buffers this cpu still owes it.
+ *
+ * Small buffers are replenished first, then large ones.  The first
+ * allocation failure stops the refill (the outstanding counts are left
+ * for a later call) and logs a notice naming the cpu.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	bool ok = true;
+
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+
+/* Handle a packet.  Return true if "processed", false if "filtered".
+ *
+ * "Filtered" covers: descriptors flagged "be" (no buffer was available),
+ * packets on a channel with no device or with a device that is not
+ * IFF_UP, and, for non-promiscuous devices, non-multicast packets whose
+ * leading address bytes do not match dev->dev_addr.  Every path
+ * consumes the idesc.
+ *
+ * A processed packet is handed to netif_receive_skb() and the per-cpu
+ * count of buffers to replace is bumped; the replacement itself is left
+ * to tile_net_provide_needed_buffers().
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				    gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+
+	void *va;
+
+	uint8_t l2_offset = gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	void *buf;
+	unsigned long len;
+
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the raw buffer VA. */
+	va = tile_io_addr_to_va((unsigned long)gxio_mpipe_idesc_get_va(idesc));
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = gxio_mpipe_idesc_get_l2_length(idesc);
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	dump_packet(buf, len, "rx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 * FIXME: The classifier will soon detect "multicast".
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address. */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+
+		/* Acquire the associated "skb" via the back-pointer that
+		 * tile_net_provide_buffer() stored just before the buffer.
+		 */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia. */
+		if (skb->data != va)
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p\n",
+			      buf, skb, skb->data);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update stats.
+		 * NOTE(review): the stats fields are cast to atomic_t
+		 * here; confirm that matches their declared width and
+		 * layout on this architecture.
+		 */
+		atomic_add(1, (atomic_t *)&priv->stats.rx_packets);
+		atomic_add(len, (atomic_t *)&priv->stats.rx_bytes);
+
+		/* Need a new buffer. */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+
+/* Handle some packets for the current CPU.
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ *
+ * Returns the number of packets that tile_net_handle_packet() reported
+ * as processed; filtered packets are not counted against "budget".
+ * NAPI is completed and the ring interrupt re-enabled only when the
+ * iqueue was found empty; the "done" exits (batch or budget exhausted)
+ * skip that and only replenish buffers.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	unsigned int work = 0;
+
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem: re-check the queue
+	 * after re-enabling the interrupt and reschedule ourselves if
+	 * something arrived in between.
+	 */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+
+/* Ingress interrupt handler: schedule NAPI polling on the cpu that took
+ * the interrupt.  Both arguments are unused; always reports the
+ * interrupt as handled.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+
+/* Release the sk_buffs of finished completions, oldest first.
+ * This must be called with interrupts blocked.
+ *
+ * Stops at the first completion the equeue does not report as complete,
+ * or after "limit" completions have been freed (a negative "limit"
+ * never matches, i.e. no limit).  The equeue is asked to refresh its
+ * state on the first check, and on every check if "force_update".
+ */
+static void tile_net_free_comps(gxio_mpipe_equeue_t* equeue,
+				 struct tile_net_comps *comps,
+				 int limit, bool force_update)
+{
+	int freed;
+
+	for (freed = 0; comps->comp_last < comps->comp_next; ) {
+		unsigned int slot = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *oldest = &comps->comp_queue[slot];
+		bool update = force_update || freed == 0;
+
+		if (!gxio_mpipe_equeue_is_complete(equeue, oldest->when,
+						   update))
+			break;
+
+		dev_kfree_skb_irq(oldest->skb);
+		comps->comp_last++;
+
+		if (++freed == limit)
+			break;
+	}
+}
+
+
+/* Arm this cpu's egress timer for the next jiffy, unless it is already
+ * marked as scheduled.
+ *
+ * Tracking "egress_timer_scheduled" ourselves, rather than simply
+ * re-arming every time, avoids the fairly expensive "reschedule" path.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+
+	mod_timer_pinned(&info->egress_timer, jiffies + 1);
+	info->egress_timer_scheduled = true;
+}
+
+
+/* The "function" for "info->egress_timer".
+ *
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.
+ *
+ * "arg" is the "struct tile_net_info *" that tile_net_prepare_cpu()
+ * stored in the timer's "data" field.  The whole body runs with local
+ * interrupts disabled, as tile_net_free_comps() requires.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+
+	unsigned int i;
+
+	bool pending = false;
+
+	unsigned long irqflags;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile; channels with nothing
+	 * outstanding are skipped without touching their equeue.
+	 */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_egress *egress = &egress_for_echannel[i];
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+		tile_net_free_comps(egress->equeue, comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+
+/* Per-cpu setup, run on the cpu being prepared.
+ *
+ * Records the cpu number, marks the iqueue as not yet valid, sets up
+ * (but does not arm) the egress timer, and publishes this cpu's info
+ * in "infos[]".  The argument is unused.
+ */
+static void tile_net_prepare_cpu(void *unused)
+{
+	int cpu = smp_processor_id();
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct timer_list *timer = &info->egress_timer;
+
+	info->my_cpu = cpu;
+	info->has_iqueue = false;
+
+	/* The timer callback gets "info" back through "data". */
+	init_timer(timer);
+	timer->data = (long)info;
+	timer->function = tile_net_handle_egress_timer;
+
+	infos[cpu] = info;
+}
+
+
+/* Helper function for "tile_net_update()", run on each cpu in turn.
+ *
+ * "arg" is a "struct net_device *" or NULL.  Cpus without an iqueue do
+ * nothing.  With a device, NAPI is added/enabled as needed and the
+ * ingress irq is enabled; with NULL, the ingress irq is disabled and
+ * NAPI is disabled if it was enabled.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	if (!info->napi_added) {
+		/* FIXME: HACK: We use one of the devices.
+		 * ISSUE: We never call "netif_napi_del()".
+		 */
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ *
+ * Rebuilds the classifier rules from "tile_net_devs_for_channel" and
+ * then runs tile_net_update_cpu() on every online cpu, passing the
+ * first registered device (or NULL when no channel has a device).
+ *
+ * Returns 0 on success, or -EIO if the rules cannot be committed.
+ */
+static int tile_net_update(void)
+{
+	struct net_device *dev = NULL;
+	int channel;
+	long count = 0;
+	int cpu;
+
+	/* HACK: This is too big for the linux stack. */
+	static gxio_mpipe_rules_t rules;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	/* TODO: Add support for "dmac" splitting? */
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (dev == NULL) {
+			/* First active channel: open the (single) rule. */
+			dev = tile_net_devs_for_channel[channel];
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+
+		/* Count active channels; previously "count" was never
+		 * updated, so the link-enable step below could not run.
+		 */
+		count++;
+	}
+
+	/* NOTE: This can happen if there is no classifier.
+	 * ISSUE: Can anything else cause it to happen?
+	 */
+	if (gxio_mpipe_rules_commit(&rules) != 0) {
+		pr_warning("Failed to update classifier rules!\n");
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu, dev, 1);
+
+	/* HACK: Allow packets to flow. */
+	if (count != 0)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+
+/* Helper function for "tile_net_init_cpus()".
+ *
+ * Sets up the two mPIPE buffer stacks (small = 128 byte buffers, large =
+ * 1664 byte buffers), registers their client memory, and fills each
+ * with "network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH)"
+ * buffers.
+ *
+ * There is no error return: every failure panics.
+ */
+static void tile_net_init_stacks(int network_cpus_count)
+{
+	int err;
+	int i;
+
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+
+	int num_buffers;
+
+	size_t stack_bytes;
+
+	pte_t pte = { 0 };
+
+	void *mem;
+
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes, honoring the 64KB minimum alignment. */
+	stack_bytes = ROUND_UP(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			       64 * 1024);
+	if (stack_bytes > HPAGE_SIZE)
+		panic("Cannot allocate %d physically contiguous buffers.",
+		      num_buffers);
+
+	/* Allocate two buffer stacks (consecutive ids: small, large). */
+	small_buffer_stack = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (small_buffer_stack < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buffer_stacks()'");
+	large_buffer_stack = small_buffer_stack + 1;
+
+	/* Allocate the small buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					   small_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Allocate the large buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					   large_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Pin all the client memory.  The panic messages name the call
+	 * that actually failed, gxio_mpipe_register_client_memory().
+	 */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	err = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+	err = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+
+	/* Provide initial buffers. */
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true))
+			panic("Cannot provide initial buffers!");
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false))
+			panic("Cannot provide initial buffers!");
+	}
+}
+
+
+/* Actually initialize the mPIPE state.
+ *
+ * Called from tile_net_open() while "ingress_irq" is still negative.
+ * Sets up the mPIPE context, the buffer stacks, the per-cpu completion
+ * arrays, one NotifRing/iqueue per network cpu, a NotifGroup with its
+ * buckets, and the per-cpu ingress irq.
+ *
+ * Returns 0 on success.  Returns -EIO if "hash_default" is not set, if
+ * gxio_mpipe_init() fails, or if the NotifRings cannot be allocated;
+ * every later failure panics.
+ */
+static int tile_net_init_cpus(void)
+{
+	int network_cpus_count;
+
+	int ring;
+	int group;
+
+	int next_ring;
+
+	int cpu;
+
+	int i;
+
+#ifdef TILE_NET_ROUND_ROBIN
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_ROUND_ROBIN;
+#else
+	/* Use random rebalancing. */
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY;
+#endif
+
+	if (!hash_default) {
+		pr_warning("Networking requires hash_default!\n");
+		goto fail;
+	}
+
+	if (gxio_mpipe_init(&context, 0) != 0) {
+		pr_warning("Failed to initialize mPIPE!\n");
+		goto fail;
+	}
+
+	if (!network_cpus_used)
+		network_cpus_map = *cpu_online_mask;
+
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	/* ISSUE: Handle failures more gracefully. */
+	tile_net_init_stacks(network_cpus_count);
+
+	/* Allocate one NotifRing for each network cpu. */
+	ring = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					    0, 0);
+	if (ring < 0) {
+		pr_warning("Failed to allocate notif rings.\n");
+		goto fail;
+	}
+
+	/* ISSUE: Handle failures below more cleanly. */
+
+	/* Init NotifRings.
+	 * NOTE(review): rings are handed out only to online cpus that
+	 * are in "network_cpus_map", while "network_cpus_count" counts
+	 * the whole map; confirm the map cannot hold offline cpus.
+	 */
+	next_ring = ring;
+
+	for_each_online_cpu(cpu) {
+
+		size_t notif_ring_size =
+			IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info *info = infos[cpu];
+
+		/* ISSUE: This is overkill. */
+		size_t comps_size =
+			TILE_NET_CHANNELS * sizeof(struct tile_net_comps);
+
+		/* Allocate the "comps" (done for every online cpu, not
+		 * just the network cpus).
+		 */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate comps memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		/* ISSUE: Is this needed? */
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate iqueue memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+
+		if (gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					   addr, notif_ring_size, 0) != 0)
+			panic("Failure in 'gxio_mpipe_iqueue_init()'.");
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0)
+		panic("Failure in 'gxio_mpipe_alloc_notif_groups()'.");
+
+#ifndef TILE_NET_ROUND_ROBIN
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+#endif
+
+	/* Allocate some buckets. */
+	first_bucket = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (first_bucket < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buckets()'.");
+
+	/* Init group and buckets. */
+	if (gxio_mpipe_init_notif_group_and_buckets(&context, group, ring,
+						    network_cpus_count,
+						    first_bucket, num_buckets,
+						    mode) != 0)
+		panic("Fail in 'gxio_mpipe_init_notif_group_and_buckets().");
+
+
+	/* Create an irq and register it. */
+	ingress_irq = create_irq();
+	if (ingress_irq < 0)
+		panic("Failed to create irq for ingress.");
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	BUG_ON(request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			   0, NULL, NULL) != 0);
+
+	for_each_online_cpu(cpu) {
+
+		struct tile_net_info *info = infos[cpu];
+
+		int ring = info->iqueue.ring;	/* Shadows the outer "ring". */
+
+		if (!info->has_iqueue)
+			continue;
+
+		gxio_mpipe_request_notif_ring_interrupt(&context,
+							cpu_x(cpu), cpu_y(cpu),
+							1, ingress_irq, ring);
+	}
+
+	return 0;
+
+fail:
+	return -EIO;
+}
+
+
+/* Create persistent egress info for a given egress channel.
+ *
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ *
+ * Returns 0 if the channel is already set up or was set up here, and
+ * -EIO (after freeing any pages already allocated) if a page allocation
+ * or the eDMA ring allocation fails.  A gxio_mpipe_equeue_init() failure
+ * panics.
+ */
+static int tile_net_init_egress(int echannel)
+{
+	size_t headers_order;
+	struct page *headers_page;
+	unsigned char* headers;
+
+	size_t edescs_size;
+	int edescs_order;
+	struct page *edescs_page;
+	gxio_mpipe_edesc_t* edescs;
+
+	int equeue_order;
+	struct page *equeue_page;
+	gxio_mpipe_equeue_t* equeue;
+	int edma;
+
+	/* Only initialize once. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		pr_warning("Could not allocate memory for TSO headers.\n");
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		pr_warning("Could not allocate memory for eDMA ring.\n");
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		pr_warning("Could not allocate memory for equeue info.\n");
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring. */
+	edma = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (edma < 0) {
+		pr_warning("Could not allocate edma ring.\n");
+		goto fail_equeue;
+	}
+
+	/* Initialize the equeue.  This should not fail. */
+	if (gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				   edescs, edescs_size, 0) != 0)
+		panic("Failure in 'gxio_mpipe_equeue_init()'.");
+
+	/* Done.  Publishing "equeue" is what marks the channel as
+	 * initialized for the check at the top of this function.
+	 */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return -EIO;
+}
+
+
+/* Help the kernel activate the given network interface.
+ *
+ * Everything up to and including tile_net_update() runs under
+ * "tile_net_devs_for_channel_mutex".  Returns 0 on success.  On failure,
+ * returns a non-zero error and attempts to close any link that is open
+ * for this device.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Determine if this is the "loopify" device.  The "loopify"
+	 * parameter is optional; guard against "loopify_link_name" being
+	 * NULL when it was not given, since strcmp() must not be handed
+	 * a NULL pointer.
+	 */
+	bool loopify = (loopify_link_name != NULL &&
+			!strcmp(dev->name, loopify_link_name));
+
+	int result;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Setup that is only needed while "ingress_irq" is not yet valid. */
+	if (ingress_irq < 0) {
+		result = tile_net_init_cpus();
+		if (result != 0)
+			goto fail;
+	}
+
+	/* Open the primary link ("loop0" stands in for a loopify device). */
+	if (priv->channel < 0) {
+		const char *ln = loopify ? "loop0" : dev->name;
+		if (gxio_mpipe_link_open(&priv->link, &context, ln, 0) < 0) {
+			netdev_err(dev, "Failed to open '%s'.\n", ln);
+			result = -EIO;
+			goto fail;
+		}
+		priv->channel = gxio_mpipe_link_channel(&priv->link);
+		BUG_ON(priv->channel < 0 ||
+		       priv->channel >= TILE_NET_CHANNELS);
+	}
+
+	/* A loopify device egresses via a separate "loop1" link. */
+	if (loopify && priv->loopify_channel < 0) {
+		if (gxio_mpipe_link_open(&priv->loopify_link,
+					 &context, "loop1", 0) < 0) {
+			netdev_err(dev, "Failed to open 'loop1'.\n");
+			result = -EIO;
+			goto fail;
+		}
+		priv->loopify_channel =
+			gxio_mpipe_link_channel(&priv->loopify_link);
+		BUG_ON(priv->loopify_channel < 0 ||
+			priv->loopify_channel >= TILE_NET_CHANNELS);
+	}
+
+	/* Egress uses the loopify channel when there is one. */
+	priv->echannel =
+		((priv->loopify_channel >= 0) ?
+		 priv->loopify_channel : priv->channel);
+
+	/* Initialize egress info (if needed). */
+	result = tile_net_init_egress(priv->echannel);
+	if (result != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	result = tile_net_update();
+	if (result != 0)
+		goto fail_channel;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Start our transmit queue. */
+	netif_start_queue(dev);
+
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail_channel:
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+fail:
+	/* A channel is only marked closed if the close actually worked. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			pr_warning("Failed to close loopify link!\n");
+		else
+			priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			pr_warning("Failed to close link!\n");
+		else
+			priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+	return result;
+}
+
+
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Always returns 0; a link that fails to close is only reported.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* No more transmits from the stack. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Unpublish the device, then call tile_net_update(), ignoring
+	 * its result.
+	 */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update();
+
+	/* Close the loopify egress link (if any), then the primary link.
+	 * Either way the channel is marked as closed.
+	 */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link))
+			pr_warning("Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link))
+			pr_warning("Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+
+/* Return the kernel virtual address at which fragment "f" starts. */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	struct page *page = skb_frag_page(f);
+
+	return pfn_to_kaddr(page_to_pfn(page)) + f->page_offset;
+}
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * (NOTE: the BUG_ON() checks below reject that case.)
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * The work is done in three passes over the segments, each of which
+ * walks the fragments in exactly the same way:
+ *   1. count the edescs needed, so the slots can be reserved at once;
+ *   2. build the per-segment headers in "egress->headers";
+ *   3. write the header and payload edescs into the reserved slots.
+ *
+ * Returns NETDEV_TX_OK once all segments are queued, or NETDEV_TX_BUSY
+ * if the equeue cannot reserve enough slots.
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh == ih", by definition. */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check" (the per-segment ip checksum): the
+	 * template's checksum, total length, and id are backed out here,
+	 * and the per-segment length and id are added back in below.
+	 */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check" (the per-segment tcp checksum seed):
+	 * the whole-skb length is backed out here, and the per-segment
+	 * length is added back in below.
+	 */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	int cid;
+
+	/* Empty packets (etc) would cause trouble below. */
+	BUG_ON(skb->data_len == 0);
+	BUG_ON(sh->nr_frags == 0);
+	BUG_ON(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	BUG_ON(skb_headlen(skb) != sh_len);
+	BUG_ON(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	BUG_ON(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	BUG_ON(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	BUG_ON(skb->protocol != htons(ETH_P_IP));
+	BUG_ON(ih->protocol != IPPROTO_TCP);
+	BUG_ON(skb->ip_summed != CHECKSUM_PARTIAL);
+	BUG_ON(csum_start != eh_len + ih_len);
+
+	/* NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+	edesc_head.stack_idx = large_buffer_stack;
+
+	/* Prepare to egress the body. */
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset.  With all three at -1, "f_used >= f_size" holds, so the
+	 * first "advance" below moves on to fragment 0.
+	 */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 1: Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	BUG_ON(f_id + 1 != sh->nr_frags);
+	BUG_ON(f_used != f_size);
+
+	/* Interrupts stay disabled on this cpu until the completion has
+	 * been recorded and the stats updated below.
+	 */
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 2: Prepare all the headers.  Each segment's header lives
+	 * in the "egress->headers" entry matching its header edesc slot.
+	 */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum". */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jh->check = csum_long(jsum) ^ 0xffff;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags: only the final segment may keep the
+		 * template's "fin" and "psh".
+		 */
+		if (!final)
+			uh->fin = uh->psh = 0;
+
+		/* Compute the tcp pseudo-header checksum. */
+		uh->check = csum_long(tsum_hack + htons(s_len));
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot" back to the first reserved slot. */
+	slot -= num_edescs;
+
+	/* Flush the headers. */
+	wmb();
+
+	/* Reset. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 3: Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload; ".bound" is set
+			 * only on the last piece of this segment.
+			 */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	/* Wait for a free completion entry.
+	 * NOTE: This loop spins with local interrupts still disabled.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array.  At this point "slot" is one past
+	 * the last edesc written above, and "skb" is stored for later
+	 * release.
+	 */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats. */
+	atomic_add(tx_packets, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(tx_bytes, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Gather the linear body (if any) and every page fragment of "skb"
+ * into "frags", returning the number of entries filled in.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int count = 0;
+	unsigned int idx;
+
+	/* The linear part, when non-empty, always comes first. */
+	if (b_len != 0) {
+		frags[count].buf = b_data;
+		frags[count].length = b_len;
+		count++;
+	}
+
+	/* Then one entry per page fragment, in order. */
+	for (idx = 0; idx < sh->nr_frags; idx++, count++) {
+		skb_frag_t *f = &sh->frags[idx];
+
+		frags[count].buf = tile_net_frag_buf(f);
+		frags[count].length = skb_frag_size(f);
+	}
+
+	return count;
+}
+
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO packets ("gso_size != 0") are handed to tile_net_tx_tso().  Any
+ * other packet is described by one edesc per chunk (the linear body
+ * plus each page fragment), which are written into freshly reserved
+ * equeue slots.  The skb is then recorded in this cpu's completions
+ * array for this egress channel, for later release.
+ *
+ * Returns NETDEV_TX_OK once the packet is queued, or NETDEV_TX_BUSY if
+ * the equeue cannot reserve enough slots.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	unsigned int num_frags;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+
+	unsigned int i;
+
+	int cid;
+
+	s64 slot;
+
+	unsigned long irqflags;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	/* ISSUE: Does not dump the "frags". */
+	dump_packet(data, skb_headlen(skb), "tx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	if (sh->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_frags; i++) {
+
+		/* NOTE: ".hwb = 0", so ".size" is unused.
+		 * NOTE: ".stack_idx" determines the TLB.
+		 */
+
+		gxio_mpipe_edesc_t edesc = { { 0 } };
+
+		/* Prepare the basic command; ".bound" is set only on
+		 * the last chunk of the packet.
+		 */
+		edesc.bound = (i == num_frags - 1);
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edesc.stack_idx = large_buffer_stack;
+
+		edescs[i] = edesc;
+	}
+
+	/* Add checksum info if needed.  Use the same helper as the TSO
+	 * path to get the checksum start as an offset from "skb->data".
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	/* Interrupts stay disabled on this cpu until the completion has
+	 * been recorded and the stats updated below.
+	 */
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Wait for a free completion entry.
+	 * NOTE: This loop spins with local interrupts still disabled.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array; "when" is one past the last
+	 * slot used by this packet.
+	 */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot + num_frags;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
+	atomic_add(1, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add((len >= ETH_ZLEN) ? len : ETH_ZLEN,
+		   (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Deal with a transmit timeout.
+ *
+ * No recovery is attempted; the transmit queue is simply woken again.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+
+/* Ioctl commands.
+ *
+ * No device-private ioctls are implemented; every request is rejected
+ * with -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+
+/* Get System Network Statistics.
+ *
+ * Hands back a pointer to the statistics kept in the device's private
+ * data; nothing is copied.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *tile_priv = netdev_priv(dev);
+	struct net_device_stats *stats = &tile_priv->stats;
+
+	return stats;
+}
+
+
+/* Change the "mtu".
+ *
+ * Returns 0 if "new_mtu" lies in the range [68, 1500] and was stored
+ * in the device, otherwise -EINVAL (leaving the device untouched).
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* Accept the value only if it is within range. */
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/* Change the Ethernet Address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points at a "struct sockaddr" carrying the new address.
+ *
+ * Returns 0 on success, or -EINVAL if the address is not a valid
+ * ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (is_valid_ether_addr(addr->sa_data)) {
+		memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress irq is disabled on this cpu while the ingress handler
+ * is invoked by hand, and re-enabled afterwards.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+
+/* The net_device_ops callbacks implemented by this driver. */
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+	.ndo_do_ioctl		= tile_net_ioctl,
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_get_stats		= tile_net_get_stats,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* The setup function.
+ *
+ * Starts from ether_setup(), which assigns various fields in dev
+ * (including setting IFF_BROADCAST and IFF_MULTICAST), and then fills
+ * in the driver-specific fields and feature flags.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = TILE_NET_TX_QUEUE_LEN;
+	dev->mtu = 1500;
+
+	/* We want lockless xmit, and we support hardware tx checksums
+	 * and scatter/gather.
+	 */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG;
+
+#ifdef TILE_NET_GSO
+	/* GSO support is a compile-time option. */
+	dev->features |= NETIF_F_GSO;
+#endif
+
+#ifdef TILE_NET_TSO
+	/* TSO support is a compile-time option. */
+	dev->features |= NETIF_F_TSO;
+#endif
+}
+
+
+/* Allocate the device structure, set its MAC address, and register the
+ * device.
+ *
+ * "name" is the link name, which is also used as the interface name,
+ * and "mac" points at the 6-byte MAC address reported for that link.
+ * Links whose name starts with "loop" are ignored.  Failures are
+ * logged, and the device is then simply not created.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	int ret;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	priv = netdev_priv(dev);
+
+	/* Initialize "priv". */
+
+	memset(priv, 0, sizeof(*priv));
+
+	priv->dev = dev;
+
+	priv->channel = priv->loopify_channel = priv->echannel = -1;
+
+	/* Set the MAC address in the device struct.  This must be done
+	 * before the device can be opened, and the device may be opened
+	 * as soon as register_netdev() has made it visible, so do it
+	 * first.  If the MAC is all zeroes, we use a random address,
+	 * since we're probably on the simulator.
+	 */
+	if (is_zero_ether_addr(mac)) {
+		random_ether_addr(dev->dev_addr);
+	} else {
+		memcpy(dev->dev_addr, mac, ETH_ALEN);
+		dev->addr_len = ETH_ALEN;
+	}
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+		return;
+	}
+}
+
+
+/* Module initialization.
+ *
+ * Prepares every cpu, then creates one network device for each link
+ * that can be enumerated.  Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int idx = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Run the per-cpu preparation on each cpu, and wait for it. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Walk the links until enumeration fails, creating a device
+	 * for each one.
+	 */
+	while (gxio_mpipe_link_enumerate_mac(idx, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		idx++;
+	}
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v10] tilegx network driver: initial support
  2012-06-06 18:54                                                 ` David Miller
  2001-09-17  4:00                                                   ` [PATCH v10] " Chris Metcalf
@ 2012-04-06 20:42                                                   ` Chris Metcalf
  2012-06-07 20:39                                                     ` David Miller
  2012-06-07 20:52                                                     ` Joe Perches
  2012-06-07 20:45                                                   ` Chris Metcalf
  2 siblings, 2 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-06 20:42 UTC (permalink / raw)
  To: David Miller, eric.dumazet, bhutchings, arnd, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version makes the driver multi-queued and adds support for a non-zero
tx_queue_len.  I also turned a couple of magic numbers into #defines.
I skimmed the tg3.c driver, but didn't see any other obvious
changes that would be appropriate.

 drivers/net/ethernet/tile/Kconfig  |    2 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1898 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1902 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..098b1c4 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,8 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
+	select HIGH_RES_TIMERS if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..ee7556a
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1898 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
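+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes of (untagged) Ethernet L2
+ * header, plus the IP header and the TCP header of a TSO segment.
+ * NOTE: With maximal options (up to 60 bytes each for IP and TCP) the
+ * headers would need 136 bytes, so users of this buffer must check that
+ * the headers actually fit.  We round up to align to cache lines.
+ */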
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+/* Timeout to wake the per-device TX timer after we stop the queue.
+ * We don't want the timeout too short (adds overhead, and might end
+ * up causing stop/wake/stop/wake cycles) or too long (affects performance).
+ * For the 10 Gb NIC, 30 usec means roughly 30+ 1500-byte packets.
+ */
+#define TX_TIMER_DELAY_USEC 30
+
+/* Timeout to wake the per-cpu egress timer to free completions. */
+#define EGRESS_TIMER_DELAY_USEC 1000
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	/* Start of the chunk. */
+	void *buf;
+	/* Size of the chunk, in bytes. */
+	size_t length;
+};
+
+/* A single completion: an skb waiting for its egress to finish. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete.
+	 * NOTE(review): presumably the equeue slot just past the last
+	 * edesc of the packet -- confirm against the tx path.
+	 */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and echannel.
+ *
+ * NOTE(review): "comp_next" and "comp_last" look like free-running
+ * counters that index "comp_queue" modulo TILE_NET_MAX_COMPS, so that
+ * their difference is the number of outstanding completions -- confirm
+ * against the users of this struct.
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+/* The transmit wake timer for a given cpu and echannel. */
+struct tile_net_tx_wake {
+	/* The high-resolution timer itself. */
+	struct hrtimer timer;
+	/* The device associated with this timer.
+	 * NOTE(review): presumably the device whose transmit queue is
+	 * woken when the timer fires -- confirm.
+	 */
+	struct net_device *dev;
+};
+
+/* Info for a specific cpu (there is one instance per cpu, declared
+ * as the per-cpu variable "per_cpu_info").
+ */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct hrtimer egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel (indexed by echannel). */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+	/* Transmit wake timer for each egress channel (indexed by echannel). */
+	struct tile_net_tx_wake tx_wake[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel. */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO.
+	 * NOTE(review): presumably HEADER_BYTES of space for each of
+	 * the EQUEUE_ENTRIES slots -- confirm against the allocation.
+	 */
+	unsigned char *headers;
+};
+
+/* Info for a specific device (the private data of its net_device). */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed
+ * by tile_net_init_egress()).
+ */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel"; tile_net_open() and
+ * tile_net_stop() hold it while they modify the array.
+ */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The mPIPE "context" shared by all devices. */
+static gxio_mpipe_context_t context;
+
+/* Buffer sizes and mpipe enum codes for buffer stacks.
+ * Each *_ENUM code must describe the same size as its byte count.
+ * See arch/tile/include/gxio/mpipe.h for the set of possible values.
+ */
+#define BUFFER_SIZE_SMALL_ENUM GXIO_MPIPE_BUFFER_SIZE_128
+#define BUFFER_SIZE_SMALL 128
+#define BUFFER_SIZE_LARGE_ENUM GXIO_MPIPE_BUFFER_SIZE_1664
+#define BUFFER_SIZE_LARGE 1664
+
+/* The small/large "buffer stacks" (-1 until init_buffer_stacks()
+ * has allocated them).
+ */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets: "first_bucket" is -1 until the buckets are allocated,
+ * and "num_buckets" is raised by init_notif_group_and_buckets() when
+ * more than one network cpu is in use.
+ */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq (-1 until tile_net_setup_interrupts() succeeds;
+ * tile_net_open() uses this to decide whether mPIPE still needs its
+ * one-time initialization).
+ */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "tile_net.loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* Parse the "tile_net.cpus" argument, which specifies the cpus that
+ * are dedicated to handle ingress packets.
+ *
+ * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where
+ * m, n, x and y are cpu numbers; the named cpus should be neither
+ * dedicated cpus nor dataplane cpus.  The parsed list is intersected
+ * with the possible cpus and stored in "network_cpus_map".
+ *
+ * Returns true if a non-empty cpu set was configured, or false if the
+ * parameter was absent, malformed, or left no cpus after masking.
+ */
+static bool network_cpus_init(void)
+{
+	char cpulist[1024];
+
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map) != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Keep only the cpus that are present in cpu_possible_mask. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Report the final set in cpulist form. */
+	cpulist_scnprintf(cpulist, sizeof(cpulist), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", cpulist);
+	return true;
+}
+
+/* The "tile_net.cpus" argument is stored, unparsed, in
+ * "network_cpus_string"; network_cpus_init() interprets it.
+ */
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".  Only the
+ * presence of the argument matters; its text is not examined.
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically add "value" to the statistics counter at "field".
+ *
+ * The counter is declared as a plain "unsigned long" but is updated
+ * through the atomic_long_t interface, so the two types are checked
+ * at build time to have the same size.
+ *
+ * Note that on TILE-Gx, this operation is fire-and-forget on the
+ * issuing core (single-cycle dispatch) and takes only a few cycles
+ * longer than a regular store when the request reaches the home cache.
+ * No expensive bus management overhead is required.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	BUILD_BUG_ON(sizeof(*counter) != sizeof(*field));
+	atomic_long_add(value, counter);
+}
+
+/* Allocate an skb and push its data area onto an mPIPE buffer stack.
+ *
+ * "small" selects the small buffer stack and size; otherwise the large
+ * ones are used.  The data area is aligned to 128 bytes, and a pointer
+ * back to the skb is stored immediately below it so that
+ * mpipe_buf_to_skb() can recover the skb later.
+ *
+ * Returns false if the skb could not be allocated.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	const unsigned long align = 128;
+	const int stack = small ? small_buffer_stack : large_buffer_stack;
+	const int payload = small ? BUFFER_SIZE_SMALL : BUFFER_SIZE_LARGE;
+	struct sk_buff **backptr;
+	struct sk_buff *skb;
+	int len = sizeof(*backptr) + align + payload;
+
+	skb = dev_alloc_skb(len);
+	if (!skb)
+		return false;
+
+	/* Leave room for the back-pointer, then pad up to the alignment. */
+	skb_reserve(skb, sizeof(*backptr));
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Record the owning skb just below the aligned data. */
+	backptr = (struct sk_buff **)skb->data - 1;
+	*backptr = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+	return true;
+}
+
+/* Map a raw mpipe buffer address back to the skb that owns it.
+ *
+ * The skb pointer is read from the slot immediately below "va", where
+ * tile_net_provide_buffer() stored it.  If the skb found there does
+ * not have "va" as its data pointer, the system panics.
+ */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	struct sk_buff **slot = (struct sk_buff **)va - 1;
+	struct sk_buff *skb = *slot;
+
+	/* Paranoia: panic here since there's a reasonable chance
+	 * that corrupt buffers means generic memory corruption,
+	 * with unpredictable system effects.
+	 */
+	if (skb->data != va)
+		panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
+		      va, skb, skb->data);
+
+	return skb;
+}
+
+/* Pop every buffer from the given mPIPE buffer stack and free the
+ * skb that owns each one.  Stops when a pop returns address zero.
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	tile_io_addr_t addr;
+
+	while ((addr = (tile_io_addr_t)
+		gxio_mpipe_pop_buffer(&context, stack)) != 0) {
+		void *va = tile_io_addr_to_va(addr);
+
+		dev_kfree_skb_irq(mpipe_buf_to_skb(va));
+	}
+}
+
+/* Provide linux buffers to mPIPE.
+ *
+ * Replaces the small and then the large buffers that the current cpu
+ * has recorded as needed.  On the first allocation failure the
+ * remaining counts are left as they are and a notice is logged.
+ */
+static void tile_net_provide_needed_buffers(void)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	bool ok = true;
+
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Decide whether an ingress packet should be discarded.
+ *
+ * "buf" points at the start of the ethernet header.  Returns true
+ * (filter) if there is no device, the device is not up, or the
+ * destination address is neither multicast nor our own while the
+ * device is not promiscuous.  Returns false otherwise.
+ */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	/* Filter packets received before we're up. */
+	if (dev == NULL)
+		return true;
+	if (!(dev->flags & IFF_UP))
+		return true;
+
+	/* Promiscuous devices and multicast destinations always pass. */
+	if (dev->flags & IFF_PROMISC)
+		return false;
+	if (is_multicast_ether_addr(buf))
+		return false;
+
+	/* Otherwise, filter out packets that aren't for us. */
+	return compare_ether_addr(dev->dev_addr, buf) != 0;
+}
+
+/* Hand a received packet to the network stack.
+ *
+ * Sets the skb length to "len", fills in the protocol, marks the
+ * checksum as verified when the idesc reports a good hardware
+ * checksum, passes the skb up, updates the device rx counters, and
+ * records that this cpu needs a replacement buffer of the same size
+ * class as the one just consumed.
+ */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	bool hw_csum_ok = idesc->cs && idesc->csum_seed_val == 0xFFFF;
+
+	/* Encode the actual packet length. */
+	skb_put(skb, len);
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Acknowledge "good" hardware checksums. */
+	if (hw_csum_ok)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	netif_receive_skb(skb);
+
+	/* Update stats. */
+	tile_net_stats_add(1, &priv->stats.rx_packets);
+	tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+	/* Need a new buffer of the size class that was used. */
+	if (idesc->size != BUFFER_SIZE_SMALL_ENUM)
+		info->num_needed_large_buffers++;
+	else
+		info->num_needed_small_buffers++;
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered".
+ *
+ * The owning device is looked up by the idesc's ingress channel.  The
+ * table entry can be NULL (for instance after "ifconfig down"), so
+ * "dev" is checked before it is dereferenced; filter_packet() makes
+ * the same check for the normal path.
+ *
+ * In every case the idesc is consumed from this cpu's iqueue before
+ * returning.
+ */
+static bool tile_net_handle_packet(gxio_mpipe_idesc_t *idesc)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		/* Only charge the drop to a device if one is bound;
+		 * the channel's entry may already have been cleared.
+		 */
+		if (dev != NULL) {
+			struct tile_net_priv *priv = netdev_priv(dev);
+			tile_net_stats_add(1, &priv->stats.rx_dropped);
+		}
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		/* Not for us: give the buffer straight back to mPIPE. */
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* NAPI poll routine: handle some packets for the current CPU.
+ *
+ * At most TILE_NET_BATCH idescs are handled from any one peek, and
+ * polling stops early once "budget" packets have been processed.
+ * When the iqueue is found empty, NAPI is completed and the notif
+ * ring interrupt is re-enabled.  Returns the number of packets that
+ * were processed (filtered packets are not counted).
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	gxio_mpipe_idesc_t *idesc;
+	unsigned int work = 0;
+	int avail, i;
+
+	/* Process packets until the iqueue is drained. */
+	for (;;) {
+		avail = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc);
+		if (avail <= 0)
+			break;
+
+		for (i = 0; i < avail; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(&idesc[i]) &&
+			    ++work >= budget)
+				goto done;
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	/* Replace the buffers consumed by the packets handled above. */
+	tile_net_provide_needed_buffers();
+
+	return work;
+}
+
+/* Ingress interrupt handler: schedule NAPI polling on the current
+ * cpu.  Both arguments are unused.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+/* Free some completions.  This must be called with interrupts blocked.
+ *
+ * Walks the completion queue from "comp_last" towards "comp_next",
+ * freeing the skb of each entry that the equeue reports as complete,
+ * and stops at the first incomplete entry or after "limit" entries.
+ * A negative "limit" is never reached, so it means "no limit".
+ * The completion state is refreshed on the first check, and on every
+ * check if "force_update" is set.  Returns the number freed.
+ */
+static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed;
+
+	for (freed = 0; comps->comp_last < comps->comp_next; ) {
+		unsigned int idx = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *entry = &comps->comp_queue[idx];
+		bool update = force_update || freed == 0;
+
+		if (!gxio_mpipe_equeue_is_complete(equeue, entry->when,
+						   update))
+			break;
+
+		dev_kfree_skb_irq(entry->skb);
+		comps->comp_last++;
+
+		if (++freed == limit)
+			break;
+	}
+
+	return freed;
+}
+
+/* Add a completion.  This must be called with interrupts blocked.
+ * tile_net_equeue_try_reserve() will have ensured a free completion entry.
+ *
+ * Records "skb" and its completion time "when" in the next slot of
+ * the circular completion queue.  "equeue" is not used here.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	int slot = comps->comp_next % TILE_NET_MAX_COMPS;
+	struct tile_net_comp *entry = &comps->comp_queue[slot];
+
+	entry->when = when;
+	entry->skb = skb;
+	comps->comp_next++;
+}
+
+/* Start this cpu's tx_wake timer for the device's egress channel, to
+ * fire TX_TIMER_DELAY_USEC microseconds from now, pinned to this cpu.
+ */
+static void tile_net_schedule_tx_wake_timer(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_tx_wake *wake = &info->tx_wake[priv->echannel];
+
+	hrtimer_start(&wake->timer,
+		      ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+/* The "function" for a tx_wake timer: wake the transmit subqueue of
+ * the timer's device that corresponds to the current cpu.  The timer
+ * is not restarted.
+ */
+static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
+{
+	struct tile_net_tx_wake *wake;
+
+	wake = container_of(t, struct tile_net_tx_wake, timer);
+	netif_wake_subqueue(wake->dev, smp_processor_id());
+
+	return HRTIMER_NORESTART;
+}
+
+/* Make sure the egress timer is scheduled on the current cpu.
+ * Does nothing if it is already marked as scheduled; otherwise starts
+ * it EGRESS_TIMER_DELAY_USEC microseconds out, pinned to this cpu.
+ */
+static void tile_net_schedule_egress_timer(void)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (info->egress_timer_scheduled)
+		return;
+
+	hrtimer_start(&info->egress_timer,
+		      ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL_PINNED);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * Runs with local interrupts disabled.  Frees every completion that
+ * is ready on each egress channel for this tile, and re-arms the
+ * timer (via tile_net_schedule_egress_timer()) as long as any
+ * completions are still pending.
+ */
+static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned long flags;
+	bool more_pending = false;
+	int echannel;
+
+	local_irq_save(flags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (echannel = 0; echannel < TILE_NET_CHANNELS; echannel++) {
+		struct tile_net_egress *egress =
+			&egress_for_echannel[echannel];
+		struct tile_net_comps *comps =
+			info->comps_for_echannel[echannel];
+
+		/* Skip channels with nothing outstanding. */
+		if (comps->comp_last < comps->comp_next) {
+			tile_net_free_comps(egress->equeue, comps, -1, true);
+			if (comps->comp_last < comps->comp_next)
+				more_pending = true;
+		}
+	}
+
+	/* Reschedule timer if needed. */
+	if (more_pending)
+		tile_net_schedule_egress_timer();
+
+	local_irq_restore(flags);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Helper function for "tile_net_update()", run on each cpu in turn.
+ * "dev" (i.e. arg) is the device being brought up or down,
+ * or NULL if all devices are now down.
+ *
+ * Cpus without an iqueue are left alone.  With a device, NAPI is
+ * added (once) and enabled, and the ingress irq is enabled; without
+ * one, the ingress irq and NAPI are disabled.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		/* Everything is down: quiesce ingress on this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ *
+ * Rebuilds the classifier rules from the channels that currently
+ * have a device, commits them, and then runs tile_net_update_cpu()
+ * on every online cpu.  Returns 0, or -EIO if the commit fails.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	static gxio_mpipe_rules_t rules;  /* too big to fit on the stack */
+	struct net_device *cpu_arg;
+	bool any_channel = false;
+	int ch, cpu, rc;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	for (ch = 0; ch < TILE_NET_CHANNELS; ch++) {
+		if (!tile_net_devs_for_channel[ch])
+			continue;
+
+		/* Open the (single) rule on the first active channel. */
+		if (!any_channel) {
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+			any_channel = true;
+		}
+		gxio_mpipe_rules_add_channel(&rules, ch);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	rc = gxio_mpipe_rules_commit(&rules);
+	if (rc != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	cpu_arg = any_channel ? dev : NULL;
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 cpu_arg, 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (any_channel)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * On success the globals small_buffer_stack, large_buffer_stack,
+ * buffer_stack_size, small_buffer_stack_va and large_buffer_stack_va
+ * are all set, and 0 is returned.  On failure a negative value is
+ * returned and whatever was set up so far is left in those globals
+ * for tile_net_init_mpipe_fail() to undo.
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact() so we get the required 64KB alignment
+	 * as well.
+	 */
+	buffer_stack_size =
+		ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+		      64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small buffer stack. */
+	small_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (small_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  BUFFER_SIZE_SMALL_ENUM,
+					  small_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	/* Allocate the large buffer stack. */
+	large_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (large_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  BUFFER_SIZE_LARGE_ENUM,
+					  large_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs).
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Every cpu gets a zeroed completion area, carved into one
+ * "struct tile_net_comps" per egress channel.  A cpu that is in
+ * "network_cpus_map" also gets an iqueue bound to NotifRing "ring".
+ *
+ * Returns the next unused ring number (i.e. "ring", or "ring + 1" if
+ * an iqueue was created), or a negative value on failure.
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+					int cpu, int ring)
+{
+	struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+	struct page *page;
+	void *mem;
+	int i, rc;
+
+	/* Allocate the "comps". */
+	page = homecache_alloc_pages(GFP_KERNEL, get_order(COMPS_SIZE), cpu);
+	if (!page) {
+		netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+			   COMPS_SIZE);
+		return -ENOMEM;
+	}
+	mem = pfn_to_kaddr(page_to_pfn(page));
+	memset(mem, 0, COMPS_SIZE);
+	for (i = 0; i < TILE_NET_CHANNELS; i++)
+		info->comps_for_echannel[i] =
+			mem + i * sizeof(struct tile_net_comps);
+
+	/* Only network cpus need an iqueue. */
+	if (!cpu_isset(cpu, network_cpus_map))
+		return ring;
+
+	page = homecache_alloc_pages(GFP_KERNEL,
+				     get_order(NOTIF_RING_SIZE), cpu);
+	if (!page) {
+		netdev_err(dev,
+			   "Failed to alloc %zd bytes iqueue memory\n",
+			   NOTIF_RING_SIZE);
+		return -ENOMEM;
+	}
+	mem = pfn_to_kaddr(page_to_pfn(page));
+	rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring,
+				    mem, NOTIF_RING_SIZE, 0);
+	if (rc < 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+		return rc;
+	}
+	info->has_iqueue = true;
+
+	/* This cpu used up one ring. */
+	return ring + 1;
+}
+
+/* Initialize NotifGroup and buckets.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Allocates one NotifGroup, sizes the global "num_buckets" from the
+ * number of network cpus (256 for more than four, 16 for more than
+ * one, otherwise left unchanged), allocates that many buckets into
+ * the global "first_bucket", and binds group, rings and buckets
+ * together.  Returns 0 on success or a negative value on failure.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group, rc;
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   group);
+		return group;
+	}
+
+	/* Initialize global num_buckets value. */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets, and set global first_bucket value. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		return rc;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (rc == 0)
+		return 0;
+
+	netdev_err(dev,
+		   "gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+		   rc);
+	return rc;
+}
+
+/* Create an irq and register it, then activate the irq and request
+ * interrupts on all cores.  Note that "ingress_irq" being initialized
+ * is how we know not to call tile_net_init_mpipe() again.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Returns 0 on success or a negative value on failure; if
+ * request_irq() fails the irq is destroyed and "ingress_irq" is
+ * reset to -1.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu, rc;
+	int irq = create_irq();
+
+	if (irq < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", irq);
+		return irq;
+	}
+	ingress_irq = irq;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return rc;
+	}
+
+	/* Route each iqueue's notif ring interrupt to its own cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		if (!info->has_iqueue)
+			continue;
+		gxio_mpipe_request_notif_ring_interrupt(
+			&context, cpu_x(cpu), cpu_y(cpu),
+			1, ingress_irq, info->iqueue.ring);
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to tile_net_init_mpipe.
+ *
+ * Buffers are popped while the mpipe context still exists; the
+ * context is then destroyed, and only after that is the per-cpu and
+ * buffer stack memory freed and the related globals reset.
+ */
+static void tile_net_init_mpipe_fail(void)
+{
+	int cpu;
+
+	/* Do cleanups that require the mpipe context first. */
+	if (small_buffer_stack >= 0)
+		tile_net_pop_all_buffers(small_buffer_stack);
+	if (large_buffer_stack >= 0)
+		tile_net_pop_all_buffers(large_buffer_stack);
+
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	/* Release the per-cpu completion and iqueue memory. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		unsigned long comps = (unsigned long)
+			(info->comps_for_echannel[0]);
+		unsigned long idescs = (unsigned long)(info->iqueue.idescs);
+
+		free_pages(comps, get_order(COMPS_SIZE));
+		info->comps_for_echannel[0] = NULL;
+		free_pages(idescs, get_order(NOTIF_RING_SIZE));
+		info->iqueue.idescs = NULL;
+	}
+
+	/* Release the buffer stack memory. */
+	if (small_buffer_stack_va)
+		free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+	if (large_buffer_stack_va)
+		free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+
+	/* Return the globals to their "not initialized" values. */
+	small_buffer_stack_va = NULL;
+	large_buffer_stack_va = NULL;
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ *
+ * Returns 0 on success.  On failure a nonzero error value is returned;
+ * every failure after gxio_mpipe_init() has succeeded goes through
+ * tile_net_init_mpipe_fail() first.
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int i, num_buffers, rc;
+	int cpu;
+	int first_ring, ring;
+	int network_cpus_count = cpus_weight(network_cpus_map);
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Set up the buffer stacks.  Each network cpu must be able to
+	 * fill its iqueue and one poll batch before any buffers are
+	 * replaced (see the ISSUE note on tile_net_poll()).
+	 */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers: "num_buffers" small ones, then the
+	 * same number of large ones.  "rc" is preset so that either
+	 * loop can simply jump to "fail" on an allocation failure.
+	 */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init NotifRings per-cpu.  alloc_percpu_mpipe_resources()
+	 * returns the next unused ring number, which is carried
+	 * forward to the next cpu.
+	 */
+	first_ring = rc;
+	ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
+		if (rc < 0)
+			goto fail;
+		ring = rc;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts.  This is done last because a
+	 * valid "ingress_irq" marks the whole initialization as done.
+	 */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ *
+ * Does nothing (and returns 0) if the channel already has an equeue.
+ * Otherwise allocates the TSO header area, the edesc ring and the
+ * equeue structure, allocates an eDMA ring, and initializes the
+ * equeue.  Returns 0 on success; on failure the memory allocated
+ * here is freed and a negative value is returned.
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct page *headers_page, *edescs_page, *equeue_page;
+	gxio_mpipe_edesc_t *edescs;
+	gxio_mpipe_equeue_t *equeue;
+	unsigned char *headers;
+	int headers_order, edescs_order, equeue_order;
+	size_t edescs_size;
+	int edma;
+	int rc = -ENOMEM;  /* result for any of the allocation failures */
+
+	/* Only initialize once. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue.  If this fails, the edma ring
+	 * allocated above is not released by the cleanup below.
+	 */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done.  Publishing "equeue" is what marks the channel as
+	 * initialized for later calls.
+	 */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+	/* Cleanup labels fall through, freeing in reverse order of
+	 * allocation.
+	 */
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Open the named mPIPE link and return its channel number.
+ *
+ * Returns the channel (0 .. TILE_NET_CHANNELS - 1) on success.  If the
+ * link cannot be opened, the error from gxio_mpipe_link_open() is
+ * returned; if the channel is out of range, the link is closed again
+ * and -EINVAL is returned.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc;
+
+	rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Performs the one-time mPIPE initialization if needed, opens the
+ * link(s) for the device, sets up egress for its egress channel,
+ * publishes the device in "tile_net_devs_for_channel", updates the
+ * classifier rules and per-cpu state, and finally starts the
+ * transmit subqueues.
+ *
+ * Returns 0 on success.  On failure any links opened here are closed,
+ * the channel table entry is cleared, and a negative errno is
+ * returned (raw gxio error codes are reported as -EIO).
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int cpu, rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize the transmit wake timer for this device for each cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		struct tile_net_tx_wake *tx_wake =
+			&info->tx_wake[priv->echannel];
+
+		hrtimer_init(&tx_wake->timer, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL);
+		tx_wake->timer.function = tile_net_handle_tx_wake_timer;
+		tx_wake->dev = dev;
+	}
+
+	for_each_online_cpu(cpu)
+		netif_start_subqueue(dev, cpu);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Forget the device while "priv->channel" is still a
+		 * valid index; once it is -1 it must not be used to
+		 * index the table.  Only clear an entry that is ours.
+		 */
+		if (tile_net_devs_for_channel[priv->channel] == dev)
+			tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Cancels the per-cpu transmit wake timers and stops the transmit
+ * subqueues, then (under the channel mutex) removes the device from
+ * the channel table, updates the classifier rules and per-cpu state,
+ * and closes the device's links.  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int cpu;
+
+	/* Quiesce transmit on every cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		hrtimer_cancel(&info->tx_wake[priv->echannel].timer);
+		netif_stop_subqueue(dev, cpu);
+	}
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Unpublish the device, then rebuild the rules without it. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Determine the VA for a fragment: the kernel address of the
+ * fragment's page plus the fragment's offset within that page.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	struct page *page = skb_frag_page(f);
+	void *page_va = pfn_to_kaddr(page_to_pfn(page));
+
+	return page_va + f->page_offset;
+}
+
+/* Acquire a completion entry and an egress slot, or if we can't,
+ * stop the queue and schedule the tx_wake timer.
+ *
+ * Returns the reserved egress slot (>= 0) on success.  On failure
+ * the current cpu's transmit subqueue is stopped, the tx_wake timer
+ * is scheduled, and -1 is returned.
+ */
+static s64 tile_net_equeue_try_reserve(struct net_device *dev,
+				       struct tile_net_comps *comps,
+				       gxio_mpipe_equeue_t *equeue,
+				       int num_edescs)
+{
+	/* A completion entry is available if the queue has room, or
+	 * if room can be made by freeing some finished completions.
+	 */
+	bool have_comp =
+		comps->comp_next - comps->comp_last < TILE_NET_MAX_COMPS - 1 ||
+		tile_net_free_comps(equeue, comps, 32, false) != 0;
+
+	if (have_comp) {
+		s64 slot;
+
+		/* Try to acquire an egress slot. */
+		slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+		if (slot >= 0)
+			return slot;
+
+		/* Freeing some completions gives the equeue time to drain. */
+		tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
+
+		/* One more attempt after the drain. */
+		slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+		if (slot >= 0)
+			return slot;
+	}
+
+	/* Still nothing; give up and stop the queue for a short while. */
+	netif_stop_subqueue(dev, smp_processor_id());
+	tile_net_schedule_tx_wake_timer(dev);
+	return -1;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care.
+ *
+ * The count is one edesc per segment for the headers, plus one edesc
+ * for every piece of payload, where a piece never crosses a fragment
+ * boundary.
+ *
+ * NOTE(review): the payload walk below only reads "sh->frags"; with
+ * empty frags and a nonzero gso_size it would still index frags[0].
+ * Confirm how callers handle the "frags being empty" case mentioned
+ * above.
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int num_edescs = 0;
+	int segment;
+
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Payload bytes accounted for in this segment so far. */
+		unsigned int p_used = 0;
+
+		/* One edesc for header and for each piece of the payload. */
+		for (num_edescs++; p_used < p_len; num_edescs++) {
+
+			/* Advance as needed.  The initial -1 values make
+			 * the first pass step to fragment 0.
+			 */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment: whatever
+			 * the segment still needs, capped by what the
+			 * fragment has left.
+			 */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+		}
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	return num_edescs;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ * FIXME: add support for IPv6.
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+				s64 slot)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct iphdr *ih;
+	struct tcphdr *th;
+	unsigned int data_len = skb->data_len;
+	unsigned char *data = skb->data;
+	unsigned int ih_off, th_off, sh_len, p_len;
+	unsigned int isum_seed, tsum_seed, id, seq;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int segment;
+
+	/* Locate original headers and compute various lengths. */
+	ih = ip_hdr(skb);
+	th = tcp_hdr(skb);
+	ih_off = skb_network_offset(skb);
+	th_off = skb_transport_offset(skb);
+	sh_len = th_off + tcp_hdrlen(skb);
+	p_len = sh->gso_size;
+
+	/* Set up seed values for IP and TCP csum and initialize id and seq. */
+	isum_seed = ((0xFFFF - ih->check) +
+		     (0xFFFF - ih->tot_len) +
+		     (0xFFFF - ih->id));
+	tsum_seed = th->check + (0xFFFF ^ htons(skb->len));
+	id = ntohs(ih->id);
+	seq = ntohl(th->seq);
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Copy to the header memory for this segment. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		memcpy(buf, data, sh_len);
+
+		/* Update copied ip header. */
+		ih = (struct iphdr *)(buf + ih_off);
+		ih->tot_len = htons(sh_len + p_len - ih_off);
+		ih->id = htons(id);
+		ih->check = csum_long(isum_seed + ih->tot_len +
+				      ih->id) ^ 0xffff;
+
+		/* Update copied tcp header. */
+		th = (struct tcphdr *)(buf + th_off);
+		th->seq = htonl(seq);
+		th->check = csum_long(tsum_seed + htons(sh_len + p_len));
+		if (segment != sh->gso_segs - 1) {
+			th->fin = 0;
+			th->psh = 0;
+		}
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Flush the headers so they are ready for hardware DMA. */
+	wmb();
+}
+
+/* Pass all the data to mpipe for egress. */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+		       struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	unsigned int csum_start, sh_len;
+	int segment;
+
+	/* Prepare to egress the headers: set up header edesc. */
+	csum_start = skb_checksum_start_offset(skb);
+	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		void *va;
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Egress the header. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload. */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += sh_len + p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Do "TSO" handling for egress.
+ *
+ * Normally drivers set NETIF_F_TSO only to support hardware TSO;
+ * otherwise the stack uses scatter-gather to implement GSO in software.
+ * In our testing, enabling GSO support (via NETIF_F_SG) drops network
+ * performance down to around 7.5 Gbps on the 10G interfaces, although
+ * also dropping cpu utilization way down, to under 8%.  But
+ * implementing "TSO" in the driver brings performance back up to line
+ * rate, while dropping cpu usage even further, to less than 4%.  In
+ * practice, profiling of GSO shows that skb_segment() is what causes
+ * the performance overheads; we benefit in the driver from using
+ * preallocated memory to duplicate the TCP/IP headers.
+ *
+ * Returns NETDEV_TX_BUSY if no completion entry or egress slots could
+ * be reserved, else NETDEV_TX_OK.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int echannel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct tile_net_comps *comps = info->comps_for_echannel[echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	/* Number of mpipe edesc's this skb will need. */
+	int num_edescs = tso_count_edescs(skb);
+	unsigned long flags;
+	s64 first_slot;
+
+	local_irq_save(flags);
+
+	/* Try to acquire a completion entry and the egress slots. */
+	first_slot = tile_net_equeue_try_reserve(dev, comps, equeue,
+						 num_edescs);
+	if (first_slot < 0) {
+		local_irq_restore(flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Build the per-segment header copies, then hand the headers
+	 * and payload to the network hardware.
+	 */
+	tso_headers_prepare(skb, egress->headers, first_slot);
+	tso_egress(dev, equeue, skb, egress->headers, first_slot);
+
+	/* Record a completion against the last slot used. */
+	add_comp(equeue, comps, first_slot + num_edescs - 1, skb);
+
+	local_irq_restore(flags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer();
+
+	return NETDEV_TX_OK;
+}
+
+/* Fill "frags" with one (buf, length) entry for the linear body, if it
+ * is non-empty, followed by one entry per skb fragment.  Returns the
+ * number of entries written.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int i;
+
+	/* The linear body comes first. */
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	/* Then each page fragment, in order. */
+	for (i = 0; i < sh->nr_frags; i++, out++) {
+		skb_frag_t *f = &sh->frags[i];
+
+		out->buf = tile_net_frag_buf(f);
+		out->length = skb_frag_size(f);
+	}
+
+	return out - frags;
+}
+
+/* Transmit a packet on behalf of the kernel.
+ *
+ * GSO packets are handed to tile_net_tx_tso().  Otherwise one edesc is
+ * built for each piece of the packet (the linear body and each
+ * fragment), a completion entry and egress slots are reserved, and the
+ * edescs are posted to the egress queue.  Returns NETDEV_TX_BUSY if
+ * nothing could be reserved, else NETDEV_TX_OK.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int echannel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct tile_net_comps *comps = info->comps_for_echannel[echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned int len = skb->len;
+	gxio_mpipe_edesc_t edesc = { { 0 } };
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	struct frag frags[MAX_FRAGS];
+	unsigned int num_edescs, i;
+	unsigned long flags;
+	s64 slot;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	num_edescs = tile_net_tx_frags(frags, skb, skb->data,
+				       skb_headlen(skb));
+
+	/* This is only used to specify the TLB. */
+	edesc.stack_idx = large_buffer_stack;
+
+	/* Build each edesc from the template plus its piece of data. */
+	for (i = 0; i < num_edescs; i++) {
+		edescs[i] = edesc;
+		edescs[i].xfer_size = frags[i].length;
+		edescs[i].va = va_to_tile_io_addr(frags[i].buf);
+	}
+
+	/* Mark the final edesc. */
+	edescs[num_edescs - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		gxio_mpipe_edesc_t *first = &edescs[0];
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+
+		first->csum = 1;
+		first->csum_start = csum_start;
+		first->csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(flags);
+
+	/* Try to acquire a completion entry and an egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Post the edescs to consecutive slots. */
+	for (i = 0; i < num_edescs; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Add a completion record against the last slot used. */
+	add_comp(equeue, comps, slot + num_edescs - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(flags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer();
+
+	return NETDEV_TX_OK;
+}
+
+/* Pick the transmit subqueue for a packet: there is one subqueue per
+ * core, so the id is simply the id of the core we are running on.
+ */
+static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	u16 queue = smp_processor_id();
+
+	return queue;
+}
+
+/* Handle a transmit timeout by waking the subqueue of every online cpu. */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	int queue;
+
+	for_each_online_cpu(queue) {
+		netif_wake_subqueue(dev, queue);
+	}
+}
+
+/* Ioctl commands.
+ *
+ * No device-specific ioctls are implemented; every request is rejected
+ * with -EOPNOTSUPP regardless of "cmd".
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Return the statistics block kept in this device's private data. */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv;
+
+	priv = netdev_priv(dev);
+
+	return &priv->stats;
+}
+
+/* Change the MTU.
+ *
+ * Values from 68 to 1500 inclusive are accepted; anything else is
+ * rejected with -EINVAL and the current MTU is left untouched.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points to a struct sockaddr holding the new address.  Returns 0
+ * on success, or -EINVAL if the address is not a valid Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *new_addr = p;
+
+	if (is_valid_ether_addr(new_addr->sa_data)) {
+		memcpy(dev->dev_addr, new_addr->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The per-cpu ingress interrupt is disabled while the ingress handler
+ * is invoked directly, and re-enabled afterwards.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	/* Run the ingress interrupt handler by hand. */
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* Network device operations for this driver; installed on each device
+ * by tile_net_setup().
+ */
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open = tile_net_open,
+	.ndo_stop = tile_net_stop,
+	.ndo_start_xmit = tile_net_tx,
+	.ndo_select_queue = tile_net_select_queue,
+	.ndo_do_ioctl = tile_net_ioctl,
+	.ndo_get_stats = tile_net_get_stats,
+	.ndo_change_mtu = tile_net_change_mtu,
+	.ndo_tx_timeout = tile_net_tx_timeout,
+	.ndo_set_mac_address = tile_net_set_mac_address,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	/* Only present when netpoll support is configured. */
+	.ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* The setup function, passed to alloc_netdev_mqs().
+ *
+ * This uses ether_setup() to assign various fields in dev, including
+ * setting IFF_BROADCAST and IFF_MULTICAST, then installs our ops, the
+ * watchdog timeout, the feature flags we support, and the default MTU.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->mtu = 1500;
+
+	/* Lockless tx, hardware checksum, scatter-gather and "TSO". */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM |
+			 NETIF_F_SG | NETIF_F_TSO;
+}
+
+/* Allocate the device structure for link "name", set its MAC address
+ * from "mac" (or a random one if "mac" is all zeroes), and register
+ * the device.  Failures are logged and the device is simply skipped.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+	int mac_bits = 0;
+	int err;
+	int i;
+
+	/* HACK: Ignore "loop" links. */
+	if (!strncmp(name, "loop", 4))
+		return;
+
+	/* Allocate the device structure.  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev_mqs(sizeof(*priv), name, tile_net_setup,
+			       NR_CPUS, 1);
+	if (dev == NULL) {
+		pr_err("alloc_netdev_mqs(%s) failed\n", name);
+		return;
+	}
+
+	/* Initialize "priv"; no channels are open yet. */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* Set the MAC address in the device struct; this must be done
+	 * before the device is opened.  If the MAC is all zeroes, we
+	 * use a random address, since we're probably on the simulator.
+	 */
+	for (i = 0; i < 6; i++)
+		mac_bits |= mac[i];
+
+	if (mac_bits == 0) {
+		random_ether_addr(dev->dev_addr);
+	} else {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	}
+
+	/* Register the network device. */
+	err = register_netdev(dev);
+	if (err != 0) {
+		netdev_err(dev, "register_netdev failed %d\n", err);
+		free_netdev(dev);
+	}
+}
+
+/* Per-cpu module initialization; run on each cpu via on_each_cpu().
+ * Records the cpu id, marks the cpu as having no iqueue yet, and sets
+ * up (but does not start) its egress timer.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct hrtimer *timer = &info->egress_timer;
+
+	info->my_cpu = smp_processor_id();
+	info->has_iqueue = false;
+
+	/* Initialize the egress timer. */
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer->function = tile_net_handle_egress_timer;
+}
+
+/* Module initialization: set up per-cpu state, create a net device for
+ * every link that mpipe enumerates, and choose the network cpus.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int idx = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them; the
+	 * enumeration ends at the first index that reports an error.
+	 */
+	while (gxio_mpipe_link_enumerate_mac(idx, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		idx++;
+	}
+
+	/* Fall back to all online cpus if network_cpus_init() returns 0. */
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc.
  2012-04-06 17:52 ` [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
@ 2012-04-09 13:24   ` Arnd Bergmann
  2012-04-09 20:53     ` Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-09 13:24 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel

On Friday 06 April 2012, Chris Metcalf wrote:
> Add support for MMIO read/write on tilegx to support GXIO IORPC access.
> Similar to the asm-generic version, but we include memory fences on
> the writes to be conservative.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

It's usually better to use inline assembly here, to guarantee that
the compiler does not split an access into multiple byte sized
accesses as it might sometimes do if a register data structure
is marked "packed". The "volatile" guarantees that the access
does not go beyond a single word, but it does not guarantee that
it's atomic.

I don't think you need the fences after the write because PCI MMIO
writes are posted anyway (only PIO is non-posted), but you might need
some kind of barrier on the read to prevent a scenario where an MMIO 
read tells you that a DMA has completed, but the CPU (or the compiler)
has scheduled the read of that data ahead of the MMIO read.

	Arnd

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim
  2012-04-06 20:38 ` [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
@ 2012-04-09 13:34   ` Arnd Bergmann
  2012-04-09 21:04     ` Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-09 13:34 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel

On Friday 06 April 2012, Chris Metcalf wrote:
> The TILE-Gx chip includes a packet-processing network engine called
> mPIPE ("Multicore Programmable Intelligent Packet Engine").  This
> change adds support for using the mPIPE engine from within the
> kernel.  The engine has more functionality than is exposed here,
> but to keep the kernel code and binary simpler, this is a subset
> of the full API designed to enable standard Linux networking only.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

Hi Chris,

I don't have anything to say about the driver itself, but a few general
comments on coding style.

> +config TILE_GXIO_MPIPE
> +	bool "Tilera Gx mPIPE I/O support"
> +	select TILE_GXIO
> +	select TILE_GXIO_DMA
> +	---help---
> +	  This option supports direct access to the TILE-Gx mPIPE hardware
> +	  from kernel space.  It is not required in order to use the gxio
> +	  library to access mPIPE from user space.

Since this is all library code and does not provide any functionality itself,
you can make the option invisible and just select it from the drivers that
need it.

> +EXPORT_SYMBOL(gxio_mpipe_alloc_buffer_stacks);

Since these are all pretty specific low-level functions, I think it would be
more appropriate to mark them all EXPORT_SYMBOL_GPL.

> +
> +typedef struct {
> +	iorpc_mem_buffer_t buffer;
> +	unsigned int stack;
> +	unsigned int buffer_size_enum;
> +} init_buffer_stack_aux_param_t;

In kernel coding style, we don't use typedef for structures like this.
Just call this a 'struct init_buffer_stack_aux_param' so that a reader
can see that it is a complex data structure and not just a scalar.

> +int gxio_mpipe_link_close(gxio_mpipe_link_t * link)
> +{
> +	return gxio_mpipe_link_close_aux(link->context, link->mac);
> +}
> +
> +EXPORT_SYMBOL(gxio_mpipe_init);
> +EXPORT_SYMBOL(gxio_mpipe_buffer_size_to_buffer_size_enum);
> +EXPORT_SYMBOL(gxio_mpipe_buffer_size_enum_to_buffer_size);
> +EXPORT_SYMBOL(gxio_mpipe_calc_buffer_stack_bytes);
> +EXPORT_SYMBOL(gxio_mpipe_init_buffer_stack);
> +EXPORT_SYMBOL(gxio_mpipe_init_notif_ring);
> +EXPORT_SYMBOL(gxio_mpipe_init_notif_group_and_buckets);
> +EXPORT_SYMBOL(gxio_mpipe_rules_init);
> +EXPORT_SYMBOL(gxio_mpipe_rules_begin);

Move the EXPORT_SYMBOL (_GPL) right after the function, not at the end of the file.

> +// MMIO Ingress DMA Release Region Address.
> +// This is a description of the physical addresses used to manipulate ingress
> +// credit counters.  Accesses to this address space should use an address of
> +// this form and a value like that specified in IDMA_RELEASE_REGION_VAL.
> +

Comment style: You should use

 /*
  * Multi-line
  * comment
  */

or /* single-line comment */

> +__extension__
> +typedef union
> +{
> +  struct
> +  {
> +#ifndef __BIG_ENDIAN__
> +    // Reserved.
> +    uint_reg_t __reserved_0  : 3;
> +    // NotifRing to be released
> +    uint_reg_t ring          : 8;
> +    // Bucket to be released
> +    uint_reg_t bucket        : 13;
> +    // Enable NotifRing release
> +    uint_reg_t ring_enable   : 1;
> +    // Enable Bucket release
> +    uint_reg_t bucket_enable : 1;
> +    // This field of the address selects the region (address space) to be
> +    // accessed.  For the iDMA release region, this field must be 4.
> +    uint_reg_t region        : 3;
> +    // Reserved.
> +    uint_reg_t __reserved_1  : 6;
> +    // This field of the address indexes the 32 entry service domain table.
> +    uint_reg_t svc_dom       : 5;
> +    // Reserved.
> +    uint_reg_t __reserved_2  : 24;
> +#else   // __BIG_ENDIAN__
> +    uint_reg_t __reserved_2  : 24;
> +    uint_reg_t svc_dom       : 5;
> +    uint_reg_t __reserved_1  : 6;
> +    uint_reg_t region        : 3;
> +    uint_reg_t bucket_enable : 1;
> +    uint_reg_t ring_enable   : 1;
> +    uint_reg_t bucket        : 13;
> +    uint_reg_t ring          : 8;
> +    uint_reg_t __reserved_0  : 3;
> +#endif
> +  };
> +  uint_reg_t word;
> +} MPIPE_IDMA_RELEASE_REGION_ADDR_t;

Best try to avoid all bitfields for interfaces like this. Make it an le32
or be32 variable instead and use masks for accessing the individual
fields.

Do not use capital letters for types.

	Arnd

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-06 20:42 ` [PATCH 6/6] tilegx network driver: initial support Chris Metcalf
@ 2012-04-09 13:49   ` Arnd Bergmann
  2012-04-09 21:30     ` Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-09 13:49 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel, netdev

On Friday 06 April 2012, Chris Metcalf wrote:
> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
> ---
>  drivers/net/ethernet/tile/Kconfig  |    1 +
>  drivers/net/ethernet/tile/Makefile |    4 +-
>  drivers/net/ethernet/tile/tilegx.c | 2045 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 2048 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/net/ethernet/tile/tilegx.c

I think the directory name should be the company, not the architecture here, so make
it drivers/net/ethernet/tilera/tilegx.c instead.

> +
> +MODULE_AUTHOR("Tilera");
> +MODULE_LICENSE("GPL");
> +

MODULE_AUTHOR is normally a real person with an email address.

> +/* Statistics counters for a specific cpu and device. */
> +struct tile_net_stats_t {
> +	u32 rx_packets;
> +	u32 rx_bytes;
> +	u32 tx_packets;
> +	u32 tx_bytes;
> +};

I think you need to drop the _t postfix here, which presumably comes
from converting it from a typedef.

> +
> +/* The actual devices. */
> +static struct net_device *tile_net_devs[TILE_NET_DEVS];
> +
> +/* The device for a given channel.  HACK: We use "32", not
> + * TILE_NET_CHANNELS, because it is fairly subtle that the 5 bit
> + * "idesc.channel" field never exceeds TILE_NET_CHANNELS.
> + */
> +static struct net_device *tile_net_devs_for_channel[32];

When you need to keep a list or array of device structures in a driver, you're
usually doing something very wrong. The convention is to just pass the pointer
around to where you need it.

> +
> +/* Convert a "buffer ptr" into a "buffer cpa". */
> +static inline void *buf_to_cpa(void *buf)
> +{
> +	return (void *)__pa(buf);
> +}
> +
> +
> +/* Convert a "buffer cpa" into a "buffer ptr". */
> +static inline void *cpa_to_buf(void *cpa)
> +{
> +	return (void *)__va(cpa);
> +}

This is almost certainly wrong: The type returned by __pa is a phys_addr_t,
which cannot be dereferenced like a pointer. On normal drivers, you would
use dma_map_single()/dma_unmap_single() to get a token that can get
passed into a dma engine. From what I can tell, this device is directly mapped,
while your PCI uses an IOMMU, so that would require two different
implementations of dma mapping operations.

> +/* Allocate and push a buffer. */
> +static bool tile_net_provide_buffer(bool small)
> +{
> +	int stack = small ? small_buffer_stack : large_buffer_stack;
> +
> +	/* Buffers must be aligned. */
> +	const unsigned long align = 128;
> +
> +	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
> +	 * and also "reserves" that many bytes.
> +	 */
> +	int len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
> +
> +	/* Allocate (or fail). */
> +	struct sk_buff *skb = dev_alloc_skb(len);
> +	if (skb == NULL)
> +		return false;
> +
> +	/* Make room for a back-pointer to 'skb'. */
> +	skb_reserve(skb, sizeof(struct sk_buff **));
> +
> +	/* Make sure we are aligned. */
> +	skb_reserve(skb, -(long)skb->data & (align - 1));
> +
> +	/* Save a back-pointer to 'skb'. */
> +	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;

This looks very wrong: why would you put the pointer to the skb into the
skb itself?

> +	/* Make sure "skb" and the back-pointer have been flushed. */
> +	__insn_mf();

Try to use architecture-independent names for flush operations like this
to make it more readable. I assume this should be smp_wmb()?

> +
> +		/* Compute the "ip checksum". */
> +		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
> +		jsum = __insn_v2sadu(jsum, 0);
> +		jsum = __insn_v2sadu(jsum, 0);
> +		jsum = (0xFFFF ^ jsum);
> +		jh->check = jsum;
> +
> +		/* Update the tcp "seq". */
> +		uh->seq = htonl(seq);
> +
> +		/* Update some flags. */
> +		if (!final)
> +			uh->fin = uh->psh = 0;
> +
> +		/* Compute the tcp pseudo-header checksum. */
> +		usum = tsum_hack + htons(s_len);
> +		usum = __insn_v2sadu(usum, 0);
> +		usum = __insn_v2sadu(usum, 0);
> +		uh->check = usum;

Why do you open-code the ip checksum functions here? Normally the stack takes
care of this by calling the functions you already provide in
arch/tile/lib/checksum.c

	Arnd

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc.
  2012-04-09 13:24   ` Arnd Bergmann
@ 2012-04-09 20:53     ` Chris Metcalf
  0 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-09 20:53 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel

On 4/9/2012 9:24 AM, Arnd Bergmann wrote:
> On Friday 06 April 2012, Chris Metcalf wrote:
>> Add support for MMIO read/write on tilegx to support GXIO IORPC access.
>> Similar to the asm-generic version, but we include memory fences on
>> the writes to be conservative.
>>
>> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
> It's usually better to use inline assembly here, to guarantee that
> the compiler does not split an access into multiple byte sized
> accesses as it might sometimes do if a register data structure
> is marged "packed". The "volatile" guarantees that the access
> does not go beyond a single word, but it does not guarantee that
> it's atomic.

Good point.  Since tile doesn't support unaligned reads in kernel space
anyway, it hadn't occurred to me, but you're right that the compiler might
be too conservative in some circumstances.  Fixed.

> I don't think you need the fences after the write because PCI MMIO
> writes are posted anyway (only PIO is non-posted), but you might need
> some kind of barrier on the read to prevent a scenario where an MMIO 
> read tells you that a DMA has completed, but the CPU (or the compiler)
> has scheduled the read of that data ahead of the MMIO read.

I'm not sure why posted vs non-posted would matter, but I think you're
right that we shouldn't need them, and some simple testing seemed to show
that the system was stable without them.  The initial developer said they
were originally there just for reasons of paranoia during development.

Thanks!

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim
  2012-04-09 13:34   ` Arnd Bergmann
@ 2012-04-09 21:04     ` Chris Metcalf
  0 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-09 21:04 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel

On 4/9/2012 9:34 AM, Arnd Bergmann wrote:
> On Friday 06 April 2012, Chris Metcalf wrote:
>> The TILE-Gx chip includes a packet-processing network engine called
>> mPIPE ("Multicore Programmable Intelligent Packet Engine").  This
>> change adds support for using the mPIPE engine from within the
>> kernel.  The engine has more functionality than is exposed here,
>> but to keep the kernel code and binary simpler, this is a subset
>> of the full API designed to enable standard Linux networking only.
>>
>> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
>>
>>
>> +config TILE_GXIO_MPIPE
>> +	bool "Tilera Gx mPIPE I/O support"
>> [...]
> Since this is all library code and does not provide any functionality itself,
> you can make the option invisible and just select it from the drivers that
> need it.

Good point.  Before last week they were actually full-scale user options
that you had to select to be able to enable networking support, which I
realized was pretty broken when I looked at it.  I had reversed the
"depends" to be "selects", but I hadn't realized I should just make them
purely internal.  Done.

>> +EXPORT_SYMBOL(gxio_mpipe_alloc_buffer_stacks);
> Since these are all pretty specific low-level functions, I think it would be
> more appropriate to mark them all EXPORT_SYMBOL_GPL.

Done.

>> +
>> +typedef struct {
>> +	iorpc_mem_buffer_t buffer;
>> +	unsigned int stack;
>> +	unsigned int buffer_size_enum;
>> +} init_buffer_stack_aux_param_t;
> In kernel coding style, we don't use typedef for structures like this.
> Just call this a 'struct init_buffer_stack_aux_param' so that a reader
> can see that it is a complex data structure and not just a scalar.

This is machine-generated code from "upstream" (the Tilera hypervisor).  I
will look into how straightforward it is to use plain structs here.

>> +int gxio_mpipe_link_close(gxio_mpipe_link_t * link)
>> +{
>> +	return gxio_mpipe_link_close_aux(link->context, link->mac);
>> +}
>> +
>> +EXPORT_SYMBOL(gxio_mpipe_init);
>> +EXPORT_SYMBOL(gxio_mpipe_buffer_size_to_buffer_size_enum);
>> +EXPORT_SYMBOL(gxio_mpipe_buffer_size_enum_to_buffer_size);
>> +EXPORT_SYMBOL(gxio_mpipe_calc_buffer_stack_bytes);
>> +EXPORT_SYMBOL(gxio_mpipe_init_buffer_stack);
>> +EXPORT_SYMBOL(gxio_mpipe_init_notif_ring);
>> +EXPORT_SYMBOL(gxio_mpipe_init_notif_group_and_buckets);
>> +EXPORT_SYMBOL(gxio_mpipe_rules_init);
>> +EXPORT_SYMBOL(gxio_mpipe_rules_begin);
> Move the EXPORT_SYMBOL (_GPL) right after the function, not at the end of the file.

Done.  There were some shared-code issues that made this initially ugly
(this code is shared by a userspace library and by the kernel), but some
more aggressive use of "unifdef" and "sed" left things cleaner when the
dust settled. :-)

>> +// MMIO Ingress DMA Release Region Address.
>> +// This is a description of the physical addresses used to manipulate ingress
>> +// credit counters.  Accesses to this address space should use an address of
>> +// this form and a value like that specified in IDMA_RELEASE_REGION_VAL.
>> +
> Comment style

Yes, more of the machine-generated code here.  I will look into this one as
well.

> [...]
> +    uint_reg_t __reserved_0  : 3;
> +#endif
> +  };
> +  uint_reg_t word;
> +} MPIPE_IDMA_RELEASE_REGION_ADDR_t;
> Best try to avoid all bitfields for interfaces like this. Make it an le32
> or be32 variable instead and use masks for the accessing the individual
> fields.

We've had good experiences with bitfields, in fact.  In practice, in our
experience, bitfields result in code that's easier to get right, especially
when doing read-modify-write of something that's more than a bit wide, vs.
using shifts/masks.  (And especially when the overall item is 64 bits wide;
you have to remember to make your masks properly "UL" or you get mysterious
truncation, etc.)  Our compiler has well-understood bitfield management
properties.  This was less true with older compilers on other
architectures, I think.

> Do not use capital letters for types.

We normally don't, but these types are in fact exactly the representation
of the structure of the MMIO word whose structure is given by #defines of
the form MPIPE_IDMA_RELEASE_REGION_ADDR__* in the lower-level headers. 
(Note that I've pruned the headers so as not to spam the kernel with
multiple 200KB+ headers that we only use a few dozen lines in, so this is
less obvious than it otherwise would be.)

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-09 13:49   ` Arnd Bergmann
@ 2012-04-09 21:30     ` Chris Metcalf
  2012-04-10 10:42       ` Arnd Bergmann
  0 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-04-09 21:30 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel, netdev

On 4/9/2012 9:49 AM, Arnd Bergmann wrote:
> On Friday 06 April 2012, Chris Metcalf wrote:
>> This change adds support for the tilegx network driver based on the
>> GXIO IORPC support in the tilegx software stack, using the on-chip
>> mPIPE packet processing engine.
>>
>> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
>> ---
>>  drivers/net/ethernet/tile/Kconfig  |    1 +
>>  drivers/net/ethernet/tile/Makefile |    4 +-
>>  drivers/net/ethernet/tile/tilegx.c | 2045 ++++++++++++++++++++++++++++++++++++
>>  3 files changed, 2048 insertions(+), 2 deletions(-)
>>  create mode 100644 drivers/net/ethernet/tile/tilegx.c
> I think the directory name should be the company, not the architecture here, so make
> it drivers/net/ethernet/tilera/tilegx.c instead.

This path was picked back when Jeff Kirsher did the initial move into
drivers/net/ethernet/ for the tilepro driver.  I don't have too strong an
opinion on this; at this point I'm mostly just concerned that it seems like
potentially not worth the churn to move the files for 3.2, then again for
3.5.  But if folks agree we should do it, it's fine with me.

We can put that in a separate change so it sweeps up the tilepro ethernet
support as well, which is otherwise not involved in this change series.

>> +MODULE_AUTHOR("Tilera");
>> +MODULE_LICENSE("GPL");
>> +
> MODULE_AUTHOR is normally a real person with an email address.

The actual author would rather not publish his name (I just double-checked
with him).  I didn't write this module, so it doesn't seem right to use my
name.  I did change it to "Tilera Corporation" just because that seems a
bit better.  I did a sweep and turned up a fair number of other similar
uses in our internal code and for now made them all "Tilera Corporation",
but I've encouraged our OS developers to consider using their names on
driver code they are writing, so some drivers coming from Tilera may carry
full names in the future.

>> +/* Statistics counters for a specific cpu and device. */
>> +struct tile_net_stats_t {
>> +	u32 rx_packets;
>> +	u32 rx_bytes;
>> +	u32 tx_packets;
>> +	u32 tx_bytes;
>> +};
> I think you need to drop the _t postfix here, which presumably comes
> from converting it from a typedef.

Fixed.

>> +/* The actual devices. */
>> +static struct net_device *tile_net_devs[TILE_NET_DEVS];
>> +
>> +/* The device for a given channel.  HACK: We use "32", not
>> + * TILE_NET_CHANNELS, because it is fairly subtle that the 5 bit
>> + * "idesc.channel" field never exceeds TILE_NET_CHANNELS.
>> + */
>> +static struct net_device *tile_net_devs_for_channel[32];
> When you need to keep a list or array of device structures in a driver, you're
> usually doing something very wrong. The convention is to just pass the pointer
> around to where you need it.

We need "tile_net_devs_for_channel" because we share a single hardware
queue for all devices, and each packet's metadata contains a "channel"
value which indicates the device.

>> +
>> +/* Convert a "buffer ptr" into a "buffer cpa". */
>> +static inline void *buf_to_cpa(void *buf)
>> +{
>> +	return (void *)__pa(buf);
>> +}
>> +
>> +
>> +/* Convert a "buffer cpa" into a "buffer ptr". */
>> +static inline void *cpa_to_buf(void *cpa)
>> +{
>> +	return (void *)__va(cpa);
>> +}
> This is almost certainly wrong: The type returned by __pa is a phys_addr_t,
> which cannot be dereferenced like a pointer. On normal drivers, you would
> use dma_map_single()/dma_unmap_single() to get a token that can get
> passed into a dma engine. From what I can tell, this device is directly mapped,
> while your PCI uses an IOMMU, so that would require two different
> implementations of dma mapping operations.

Well, it's right, but ridiculously confusing.  What I've done today is
eliminate these two functions, and add the following code in <asm/io.h>:

/*
 * The on-chip I/O hardware on tilegx is configured with VA=PA for the
 * kernel's PA range.  The low-level APIs and field names use "va" and
 * "void *" nomenclature, to be consistent with the general notion
 * that the addresses in question are virtualizable, but in the kernel
 * context we are actually manipulating PA values.  To allow readers
 * of the code to understand what's happening, we direct their
 * attention to this comment by using the following two no-op functions.
 */
static inline unsigned long pa_to_tile_io_addr(phys_addr_t pa)
{
        BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(unsigned long));
        return pa;
}
static inline phys_addr_t tile_io_addr_to_pa(unsigned long tile_io_addr)
{
        return tile_io_addr;
}

Then the individual uses in the network driver are just things like
"edesc_head.va = pa_to_tile_io_addr(__pa(va))" or "va =
__va(tile_io_addr_to_pa((unsigned long)gxio_mpipe_idesc_get_va(idesc)))"
which I think is a little clearer.

>> +/* Allocate and push a buffer. */
>> +static bool tile_net_provide_buffer(bool small)
>> +{
>> [...]
>> +
>> +	/* Save a back-pointer to 'skb'. */
>> +	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
> This looks very wrong: why would you put the pointer to the skb into the
> skb itself?

Because we create skbuffs, and then feed the raw underlying buffer storage
to our hardware, and later, we get back this raw pointer from hardware,
from which we need to be able to extract the actual skbuff.

>> +	/* Make sure "skb" and the back-pointer have been flushed. */
>> +	__insn_mf();
> Try to use archicture independent names for flush operations like this
> to make it more readable. I assume this should be smp_wmb()?

Done, though it's just wmb() here, since we're fencing against the I/O
hardware, not against other cores.

>> +
>> +		/* Compute the "ip checksum". */
>> +		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
>> +		jsum = __insn_v2sadu(jsum, 0);
>> +		jsum = __insn_v2sadu(jsum, 0);
>> +		jsum = (0xFFFF ^ jsum);
>> +		jh->check = jsum;
>> +
>> +		/* Update the tcp "seq". */
>> +		uh->seq = htonl(seq);
>> +
>> +		/* Update some flags. */
>> +		if (!final)
>> +			uh->fin = uh->psh = 0;
>> +
>> +		/* Compute the tcp pseudo-header checksum. */
>> +		usum = tsum_hack + htons(s_len);
>> +		usum = __insn_v2sadu(usum, 0);
>> +		usum = __insn_v2sadu(usum, 0);
>> +		uh->check = usum;
> Why to you open-code the ip checksum functions here? Normally the stack takes
> care of this by calling the functions you already provide in
> arch/tile/lib/checksum.c

If there is a way to do TSO without this, we'd be happy to hear it, but
it's not clear how it would be possible.  We are only computing a PARTIAL
checksum here, and letting the hardware compute the "full" checksum.

Thanks!

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-09 21:30     ` Chris Metcalf
@ 2012-04-10 10:42       ` Arnd Bergmann
  2012-04-12 23:23         ` Chris Metcalf
  2012-04-15 23:06         ` Chris Metcalf
  0 siblings, 2 replies; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-10 10:42 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel, netdev

On Monday 09 April 2012, Chris Metcalf wrote:
> On 4/9/2012 9:49 AM, Arnd Bergmann wrote:
> > On Friday 06 April 2012, Chris Metcalf wrote:
> >> This change adds support for the tilegx network driver based on the
> >> GXIO IORPC support in the tilegx software stack, using the on-chip
> >> mPIPE packet processing engine.
> >>
> >> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
> >> ---
> >>  drivers/net/ethernet/tile/Kconfig  |    1 +
> >>  drivers/net/ethernet/tile/Makefile |    4 +-
> >>  drivers/net/ethernet/tile/tilegx.c | 2045 ++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 2048 insertions(+), 2 deletions(-)
> >>  create mode 100644 drivers/net/ethernet/tile/tilegx.c
> > I think the directory name should be the company, not the architecture here, so make
> > it drivers/net/ethernet/tilera/tilegx.c instead.
> 
> This path was picked back when Jeff Kirsher did the initial move into
> drivers/net/ethernet/ for the tilepro driver.  I don't have too strong an
> opinion on this; at this point I'm mostly just concerned that it seems like
> potentially not worth the churn to move the files for 3.2, then again for
> 3.5.  But if folks agree we should do it, it's fine with me.

Ah, I didn't realize that the directory already exists. It's probably better
not to move it then.

> The actual author would rather not publish his name (I just double-checked
> with him). 

Hmm, it doesn't look all that bad actually, the comments I had are just for
small details.

> >> +/* The actual devices. */
> >> +static struct net_device *tile_net_devs[TILE_NET_DEVS];
> >> +
> >> +/* The device for a given channel.  HACK: We use "32", not
> >> + * TILE_NET_CHANNELS, because it is fairly subtle that the 5 bit
> >> + * "idesc.channel" field never exceeds TILE_NET_CHANNELS.
> >> + */
> >> +static struct net_device *tile_net_devs_for_channel[32];
> > When you need to keep a list or array of device structures in a driver, you're
> > usually doing something very wrong. The convention is to just pass the pointer
> > around to where you need it.
> 
> We need "tile_net_devs_for_channel" because we share a single hardware
> queue for all devices, and each packet's metadata contains a "channel"
> value which indicates the device.
 
Ok, but please remove tile_net_devs then.

I think a better abstraction for tile_net_devs_for_channel would be
some interface that lets you add private data to a channel so when
you get data from a channel, you can extract that pointer from the driver
using the channel.

Don't you already have a per-channel data structure?

> 
> /*
>  * The on-chip I/O hardware on tilegx is configured with VA=PA for the
>  * kernel's PA range.  The low-level APIs and field names use "va" and
>  * "void *" nomenclature, to be consistent with the general notion
>  * that the addresses in question are virtualizable, but in the kernel
>  * context we are actually manipulating PA values.  To allow readers
>  * of the code to understand what's happening, we direct their
>  * attention to this comment by using the following two no-op functions.
>  */
> static inline unsigned long pa_to_tile_io_addr(phys_addr_t pa)
> {
>         BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(unsigned long));
>         return pa;
> }
> static inline phys_addr_t tile_io_addr_to_pa(unsigned long tile_io_addr)
> {
>         return tile_io_addr;
> }
> 
> Then the individual uses in the network driver are just things like
> "edesc_head.va = pa_to_tile_io_addr(__pa(va))" or "va =
> __va(tile_io_addr_to_pa((unsigned long)gxio_mpipe_idesc_get_va(idesc)))"
> which I think is a little clearer.

Yes, although I would probably add a typedef for tile_io_addr and pass
the virtual address in and out these helper functions.

For added clarity, you could make the interface look like dma_map_single(),
which requires adding an empty unmap() function as well -- that would
make it obvious where that data is actually used. Why do you require
the reverse map anyway? Normally you only need to pass a bus address to
the device but don't need to translate that back into a virtual address
because you already had that in the beginning.

> >> +/* Allocate and push a buffer. */
> >> +static bool tile_net_provide_buffer(bool small)
> >> +{
> >> [...]
> >> +
> >> +	/* Save a back-pointer to 'skb'. */
> >> +	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
> > This looks very wrong: why would you put the pointer to the skb into the
> > skb itself?
> 
> Because we create skbuffs, and then feed the raw underlying buffer storage
> to our hardware, and later, we get back this raw pointer from hardware,
> from which we need to be able to extract the actual skbuff.

Hmm, this sounds very unusual, but I don't really have a better suggestion
here.

> >> +		/* Compute the "ip checksum". */
> >> +		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
> >> +		jsum = __insn_v2sadu(jsum, 0);
> >> +		jsum = __insn_v2sadu(jsum, 0);
> >> +		jsum = (0xFFFF ^ jsum);
> >> +		jh->check = jsum;
> >> +
> >> +		/* Update the tcp "seq". */
> >> +		uh->seq = htonl(seq);
> >> +
> >> +		/* Update some flags. */
> >> +		if (!final)
> >> +			uh->fin = uh->psh = 0;
> >> +
> >> +		/* Compute the tcp pseudo-header checksum. */
> >> +		usum = tsum_hack + htons(s_len);
> >> +		usum = __insn_v2sadu(usum, 0);
> >> +		usum = __insn_v2sadu(usum, 0);
> >> +		uh->check = usum;
> > Why to you open-code the ip checksum functions here? Normally the stack takes
> > care of this by calling the functions you already provide in
> > arch/tile/lib/checksum.c
> 
> If there is a way to do TSO without this, we'd be happy to hear it, but
> it's not clear how it would be possible.  We are only computing a PARTIAL
> checksum here, and letting the hardware compute the "full" checksum.

Sounds like you're looking for csum_partial() ;-)

	Arnd

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-10 10:42       ` Arnd Bergmann
@ 2012-04-12 23:23         ` Chris Metcalf
  2012-04-13 10:34           ` Arnd Bergmann
  2012-04-15 23:06         ` Chris Metcalf
  1 sibling, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-04-12 23:23 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel, netdev

On 4/10/2012 6:42 AM, Arnd Bergmann wrote:
> On Monday 09 April 2012, Chris Metcalf wrote:
>> On 4/9/2012 9:49 AM, Arnd Bergmann wrote:
>>> On Friday 06 April 2012, Chris Metcalf wrote:
>>>> This change adds support for the tilegx network driver based on the
>>>> GXIO IORPC support in the tilegx software stack, using the on-chip
>>>> mPIPE packet processing engine.
>>>>
>>>> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
>>>> ---
>>>>  drivers/net/ethernet/tile/Kconfig  |    1 +
>>>>  drivers/net/ethernet/tile/Makefile |    4 +-
>>>>  drivers/net/ethernet/tile/tilegx.c | 2045 ++++++++++++++++++++++++++++++++++++
>>>>  3 files changed, 2048 insertions(+), 2 deletions(-)
>>>>  create mode 100644 drivers/net/ethernet/tile/tilegx.c
>>>
>> The actual author would rather not publish his name (I just double-checked
>> with him). 
> Hmm, it doesn't look all that bad actually, the comments I had are just for
> small details.

FWIW, the author's preference has nothing to do with the code quality; it
is for his own reasons.

>>>> +/* The actual devices. */
>>>> +static struct net_device *tile_net_devs[TILE_NET_DEVS];
>>>> +
>>>> +/* The device for a given channel.  HACK: We use "32", not
>>>> + * TILE_NET_CHANNELS, because it is fairly subtle that the 5 bit
>>>> + * "idesc.channel" field never exceeds TILE_NET_CHANNELS.
>>>> + */
>>>> +static struct net_device *tile_net_devs_for_channel[32];
>>> When you need to keep a list or array of device structures in a driver, you're
>>> usually doing something very wrong. The convention is to just pass the pointer
>>> around to where you need it.
>> We need "tile_net_devs_for_channel" because we share a single hardware
>> queue for all devices, and each packet's metadata contains a "channel"
>> value which indicates the device.
>  
> Ok, but please remove tile_net_devs then.
> I think a better abstraction for tile_net_devs_for_channel would be
> some interface that lets you add private data to a channel so when
> you get data from a channel, you can extract that pointer from the driver
> using the channel.


I think what would be clearer is to document how and why we are using this
additional data structure.  We do access via both arrays where it is
efficient to do so, so getting rid of either of them doesn't seem right. 
Let's keep the "normal" tile_net_devs[] as is, indexed by devno, and make
the tile_net_devs_for_channel[] more abstracted by using the following code:

/*
 * Provide support for efficiently mapping a channel to the device
 * that is using that channel, or NULL if none.  The pointers in this
 * array are only non-NULL when pointing to active tilegx net_device
 * structures, and they are cleared before the structure itself is
 * released.
 *
 * HACK: We use "32", not TILE_NET_CHANNELS, because it is fairly
 * subtle that the 5 bit "idesc.channel" field never exceeds
 * TILE_NET_CHANNELS.
 */
static struct net_device *channel_to_dev[32];

static void bychannel_add(struct net_device *dev)
{
	struct tile_net_priv *priv = netdev_priv(dev);
	BUG_ON(channel_to_dev[priv->channel] != NULL);
	channel_to_dev[priv->channel] = dev;
}

static void bychannel_delete(struct net_device *dev)
{
	struct tile_net_priv *priv = netdev_priv(dev);
	channel_to_dev[priv->channel] = NULL;
}

static inline struct net_device *bychannel_lookup(int channel)
{
	return channel_to_dev[channel];
}


We then call bychannel_add() in the open method, and bychannel_delete() in
the close method, so it's clear that the pointers have appropriate lifetimes.

> Don't you already have a per-channel data structure?

Nope.

>> /*
>>  * The on-chip I/O hardware on tilegx is configured with VA=PA for the
>>  * kernel's PA range.  The low-level APIs and field names use "va" and
>>  * "void *" nomenclature, to be consistent with the general notion
>>  * that the addresses in question are virtualizable, but in the kernel
>>  * context we are actually manipulating PA values.  To allow readers
>>  * of the code to understand what's happening, we direct their
>>  * attention to this comment by using the following two no-op functions.
>>  */
>> static inline unsigned long pa_to_tile_io_addr(phys_addr_t pa)
>> {
>>         BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(unsigned long));
>>         return pa;
>> }
>> static inline phys_addr_t tile_io_addr_to_pa(unsigned long tile_io_addr)
>> {
>>         return tile_io_addr;
>> }
>>
>> Then the individual uses in the network driver are just things like
>> "edesc_head.va = pa_to_tile_io_addr(__pa(va))" or "va =
>> __va(tile_io_addr_to_pa((unsigned long)gxio_mpipe_idesc_get_va(idesc)))"
>> which I think is a little clearer.
> Yes, although I would probably add a typedef for tile_io_addr and pass
> the virtual address in and out these helper functions.

Good ideas, done.

> For added clarity, you could make the interface look like dma_map_single(),
> which requires adding an empty unmap() function as well -- that would
> make it obvious where that data is actually used. Why do you require
> the reverse map anyway? Normally you only need to pass a bus address to
> the device but don't need to translate that back into a virtual address
> because you already had that in the beginning.

We need the reverse map since the hardware hands us an "idesc" which has
just the tile_io_addr value in it, so we need to map it back to a va to
deal with it.

I don't think the map/unmap abstraction is too helpful here, since we're
assuming that memory is all fully-mapped, as it always is on GX.

>>>> +		/* Compute the "ip checksum". */
>>>> +		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
>>>> +		jsum = __insn_v2sadu(jsum, 0);
>>>> +		jsum = __insn_v2sadu(jsum, 0);
>>>> +		jsum = (0xFFFF ^ jsum);
>>>> +		jh->check = jsum;
>>>> +
>>>> +		/* Update the tcp "seq". */
>>>> +		uh->seq = htonl(seq);
>>>> +
>>>> +		/* Update some flags. */
>>>> +		if (!final)
>>>> +			uh->fin = uh->psh = 0;
>>>> +
>>>> +		/* Compute the tcp pseudo-header checksum. */
>>>> +		usum = tsum_hack + htons(s_len);
>>>> +		usum = __insn_v2sadu(usum, 0);
>>>> +		usum = __insn_v2sadu(usum, 0);
>>>> +		uh->check = usum;
>>> Why to you open-code the ip checksum functions here? Normally the stack takes
>>> care of this by calling the functions you already provide in
>>> arch/tile/lib/checksum.c
>> If there is a way to do TSO without this, we'd be happy to hear it, but
>> it's not clear how it would be possible.  We are only computing a PARTIAL
>> checksum here, and letting the hardware compute the "full" checksum.
> Sounds like you're looking for csum_partial() ;-)

Well, that's a pretty heavy-weight operation on memory.  Here we're just
updating from a few values held in registers, more or less.  csum_partial()
didn't seem like a good fit.  What I've done is move the longto16() routine
from checksum.c to <asm/checksum.h> and rename it csum_long(), then use it
like this, e.g. for the "usum" block above:

		/* Compute the tcp pseudo-header checksum. */
		uh->check = csum_long(tsum_hack + htons(s_len));


-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-12 23:23         ` Chris Metcalf
@ 2012-04-13 10:34           ` Arnd Bergmann
  2012-04-28 22:07             ` Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-13 10:34 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel, netdev

On Thursday 12 April 2012, Chris Metcalf wrote:
> On 4/10/2012 6:42 AM, Arnd Bergmann wrote:
> > Ok, but please remove tile_net_devs then.
> > I think a better abstraction for tile_net_devs_for_channel would be
> > some interface that lets you add private data to a channel so when
> > you get data from a channel, you can extract that pointer from the driver
> > using the channel.

> I think what would be clearer is to document how and why we are using this
> additional data structure.  We do access via both arrays where it is
> efficient to do so, so getting rid of either of them doesn't seem right. 
> Let's keep the "normal" tile_net_devs[] as is, indexed by devno, and make
> the tile_net_devs_for_channel[] more abstracted by using the following code:

The tile_net_devs still feels dirty. You basically only
use it in tile_net_handle_egress_timer(), but there you don't
actually take the mutex that protects addition and removal from
the array, so it's racy in case of hotplug.

A more conservative way to do this is to have the timer per
device (or by channel, if you like), so it does not have to
iterate the array.

> /*
>  * Provide support for efficiently mapping a channel to the device
>  * that is using that channel, or NULL if none.  The pointers in this
>  * array are only non-NULL when pointing to active tilegx net_device
>  * structures, and they are cleared before the struture itself is
>  * released.
>  *
>  * HACK: We use "32", not TILE_NET_CHANNELS, because it is fairly
>  * subtle that the 5 bit "idesc.channel" field never exceeds
>  * TILE_NET_CHANNELS.
>  */
> static struct net_device *channel_to_dev[32];
> 
> static void bychannel_add(struct net_device *dev)
> {
>         struct tile_net_priv *priv = netdev_priv(dev);
>         BUG_ON(channel_to_dev[priv->channel] != NULL);
>         channel_to_dev[priv->channel] = dev;
> }
> 
> static void bychannel_delete(struct net_device *dev)
> {
>         struct tile_net_priv *priv = netdev_priv(dev);
>         channel_to_dev[priv->channel] = NULL;
> }
> 
> static inline struct net_device *bychannel_lookup(int channel)
> {
>         return channel_to_dev[channel];
> }
> 
> 
> We then call bychannel_add() in the open method, and bychannel_delete() in
> the close method, so it's clear that the pointers have appropriate lifetimes.

Ok.

	Arnd

^ permalink raw reply	[flat|nested] 61+ messages in thread

* RE: [PATCH 6/6] tilegx network driver: initial support
  2012-04-10 10:42       ` Arnd Bergmann
  2012-04-12 23:23         ` Chris Metcalf
@ 2012-04-15 23:06         ` Chris Metcalf
  1 sibling, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-15 23:06 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel, netdev

Arnd Bergmann wrote:
> > We need "tile_net_devs_for_channel" because we share a single hardware
> > queue for all devices, and each packet's metadata contains a "channel"
> > value which indicates the device.
> 
> Ok, but please remove tile_net_devs then.

I spent some time on Friday with the driver author and we made some good changes, switching over to using more of a per-channel model for everything. We also removed the per-cpu per-driver stats structures and just use atomics to update the per-driver stats directly.  The upshot is a much cleaner set of data structures, but unfortunately we didn't have time to work all the way through the ramifications before the weekend, and I'm out next week for spring school vacation.

Thanks again for the feedback!

Chris Metcalf

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v2 5/6] arch/tile: break out the "csum a long" function to <asm/checksum.h>
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
                                   ` (4 preceding siblings ...)
  2012-04-06 20:42                 ` [PATCH v2 6/6] tilegx network driver: initial support Chris Metcalf
@ 2012-04-28 19:41                 ` Chris Metcalf
  5 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-28 19:41 UTC (permalink / raw)
  To: Arnd Bergmann, linux-kernel

This makes it available to the tilegx network driver.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 arch/tile/include/asm/checksum.h |   18 ++++++++++++++++++
 arch/tile/lib/checksum.c         |   15 +--------------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/tile/include/asm/checksum.h b/arch/tile/include/asm/checksum.h
index a120766..b21a2fd 100644
--- a/arch/tile/include/asm/checksum.h
+++ b/arch/tile/include/asm/checksum.h
@@ -21,4 +21,22 @@
 __wsum do_csum(const unsigned char *buff, int len);
 #define do_csum do_csum
 
+/*
+ * Return the sum of all the 16-bit subwords in a long.
+ * This sums two subwords on a 32-bit machine, and four on 64 bits.
+ * The implementation does two vector adds to capture any overflow.
+ */
+static inline unsigned int csum_long(unsigned long x)
+{
+	unsigned long ret;
+#ifdef __tilegx__
+	ret = __insn_v2sadu(x, 0);
+	ret = __insn_v2sadu(ret, 0);
+#else
+	ret = __insn_sadh_u(x, 0);
+	ret = __insn_sadh_u(ret, 0);
+#endif
+	return ret;
+}
+
 #endif /* _ASM_TILE_CHECKSUM_H */
diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c
index e4bab5b..c3ca3e6 100644
--- a/arch/tile/lib/checksum.c
+++ b/arch/tile/lib/checksum.c
@@ -16,19 +16,6 @@
 #include <net/checksum.h>
 #include <linux/module.h>
 
-static inline unsigned int longto16(unsigned long x)
-{
-	unsigned long ret;
-#ifdef __tilegx__
-	ret = __insn_v2sadu(x, 0);
-	ret = __insn_v2sadu(ret, 0);
-#else
-	ret = __insn_sadh_u(x, 0);
-	ret = __insn_sadh_u(ret, 0);
-#endif
-	return ret;
-}
-
 __wsum do_csum(const unsigned char *buff, int len)
 {
 	int odd, count;
@@ -94,7 +81,7 @@ __wsum do_csum(const unsigned char *buff, int len)
 	}
 	if (len & 1)
 		result += *buff;
-	result = longto16(result);
+	result = csum_long(result);
 	if (odd)
 		result = swab16(result);
 out:
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-13 10:34           ` Arnd Bergmann
@ 2012-04-28 22:07             ` Chris Metcalf
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
  2012-04-29 11:15               ` [PATCH 6/6] tilegx network driver: initial support Arnd Bergmann
  0 siblings, 2 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-04-28 22:07 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-kernel, netdev

On 4/13/2012 6:34 AM, Arnd Bergmann wrote:
> On Thursday 12 April 2012, Chris Metcalf wrote:
>> On 4/10/2012 6:42 AM, Arnd Bergmann wrote:
>>> Ok, but please remove tile_net_devs then.
>>> I think a better abstraction for tile_net_devs_for_channel would be
>>> some interface that lets you add private data to a channel so when
>>> you get data from a channel, you can extract that pointer from the driver
>>> using the channel.
>> I think what would be clearer is to document how and why we are using this
>> additional data structure.  We do access via both arrays where it is
>> efficient to do so, so getting rid of either of them doesn't seem right. 

In the latest round of changes (to be mailed shortly), we eliminated one of
the arrays entirely.  We now just have an array of net_device pointers
indexed by channel, which we need since we get packets from the hardware
and are only given the channel.  To get the device, we have to look it up
in the array.

Since this is now the only array of net_device pointers, I eliminated the
bychannel*() API I discussed in the previous email, since its use didn't
seem as compelling any more.

>> Let's keep the "normal" tile_net_devs[] as is, indexed by devno, and make
>> the tile_net_devs_for_channel[] more abstracted by using the following code:
> The tile_net_devs still feels dirty. You basically only
> use it in tile_net_handle_egress_timer(), but there you don't
> actually take the mutex that protects addition and removal from
> the array, so it's racy in case of hotplug.

We don't free the net_device structures themselves, so it's safe to do a
lookup in the array and then dereference the net_device pointer even if we
are doing an "ifconfig down" in another thread.  The only way you could
imagine the net_device structures getting freed was via module
unload, but it turns out that was pretty broken anyway, so I've just
removed it altogether in the latest version of the patch.  So once you have
a net_device pointer, it remains valid.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH 6/6] tilegx network driver: initial support
  2012-04-28 22:07             ` Chris Metcalf
  2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
@ 2012-04-29 11:15               ` Arnd Bergmann
  1 sibling, 0 replies; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-29 11:15 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel, netdev

On Saturday 28 April 2012, Chris Metcalf wrote:
> On 4/13/2012 6:34 AM, Arnd Bergmann wrote:
> > On Thursday 12 April 2012, Chris Metcalf wrote:
> >> On 4/10/2012 6:42 AM, Arnd Bergmann wrote:
> >>> Ok, but please remove tile_net_devs then.
> >>> I think a better abstraction for tile_net_devs_for_channel would be
> >>> some interface that lets you add private data to a channel so when
> >>> you get data from a channel, you can extract that pointer from the driver
> >>> using the channel.
> >> I think what would be clearer is to document how and why we are using this
> >> additional data structure.  We do access via both arrays where it is
> >> efficient to do so, so getting rid of either of them doesn't seem right. 
> 
> In the latest round of changes (to be mailed shortly), we eliminated one of
> the arrays entirely.  We now just have an array of net_device pointers
> indexed by channel, which we need since we get packets from the hardware
> and are only given the channel.  To get the device, we have to look it up
> in the array.
> 
> Since this is now the only array of net_device pointers, I eliminated the
> bychannel*() API I discussed in the previous email, since its use didn't
> seem as compelling any more.
> 
> >> Let's keep the "normal" tile_net_devs[] as is, indexed by devno, and make
> >> the tile_net_devs_for_channel[] more abstracted by using the following code:
> > The tile_net_devs still feels dirty. You basically only
> > use it in tile_net_handle_egress_timer(), but there you don't
> > actually take the mutex that protects addition and removal from
> > the array, so it's racy in case of hotplug.
> 
> We don't free the net_device structures themselves, so it's safe to do a
> lookup in the array and then dereference the net_device pointer even if we
> are doing an "ifconfig down" in another thread.  The only way you could
> imagine the net_device structures getting freed was via module
> unload, but it turns out that was pretty broken anyway, so I've just
> removed it altogether in the latest version of the patch.  So once you have
> a net_device pointer, it remains valid.

Ok, sounds all good then.

	Arnd

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v2 6/6] tilegx network driver: initial support
  2012-04-06 20:42                 ` [PATCH v2 6/6] tilegx network driver: initial support Chris Metcalf
@ 2012-04-30 14:35                   ` Arnd Bergmann
  2001-09-17  4:00                     ` [PATCH v3] " Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: Arnd Bergmann @ 2012-04-30 14:35 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: linux-kernel, netdev

On Friday 06 April 2012, Chris Metcalf wrote:
> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

All my previous comments have been addressed. A few more details
that I noticed only now:


> +/* A mutex for "tile_net_devs_for_channel". */
> +static struct mutex tile_net_devs_for_channel_mutex;

static DEFINE_MUTEX()

> +/* The per-cpu info. */
> +static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
> +
> +/* Access to "per_cpu_info". */
> +static struct tile_net_info *infos[NR_CPUS];

The arrays should not be needed. Using per_cpu() on the variable
in front of it does the same.

> +static int __init network_cpus_setup(char *str)
> +{
> +	int rc = cpulist_parse_crop(str, &network_cpus_map);
> +	if (rc != 0) {
> +		pr_warning("network_cpus=%s: malformed cpu list\n",
> +		       str);
> +	} else {
> +
> +		/* Remove dedicated cpus. */
> +		cpumask_and(&network_cpus_map, &network_cpus_map,
> +			    cpu_possible_mask);
> +
> +
> +		if (cpumask_empty(&network_cpus_map)) {
> +			pr_warning("Ignoring network_cpus='%s'.\n", str);
> +		} else {
> +			char buf[1024];
> +			cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
> +			pr_info("Linux network CPUs: %s\n", buf);
> +			network_cpus_used = true;
> +		}
> +	}
> +
> +	return 0;
> +}
> +__setup("network_cpus=", network_cpus_setup);

In device drivers, use module_param() instead of __setup() so that you can
set the arguments on the kernel command line and using modprobe with the
same syntax.

> +/* This function takes "skb", consisting of a header template and a
> + * (presumably) huge payload, and egresses it as one or more segments
> + * (aka packets), each consisting of a (possibly modified) copy of the
> + * header plus a piece of the payload, via "tcp segmentation offload".
> + *
> + * Usually, "data" will contain the header template, of size "sh_len",
> + * and "sh->frags" will contain "skb->data_len" bytes of payload, and
> + * there will be "sh->gso_segs" segments.
> + *
> + * Sometimes, if "sendfile()" requires copying, we will be called with
> + * "data" containing the header and payload, with "frags" being empty.
> + *
> + * Sometimes, for example when using NFS over TCP, a single segment can
> + * span 3 fragments.  This requires special care below.
> + *
> + * See "emulate_large_send_offload()" for some reference code, which
> + * does not handle checksumming.
> + */
> +static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
> +{

This function seems too long to be readable. I would suggest splitting
out some of the loop bodies in it into separate functions.

	Arnd


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v3] tilegx network driver: initial support
  2001-09-17  4:00                     ` [PATCH v3] " Chris Metcalf
@ 2012-05-03  5:41                       ` David Miller
  2012-05-03 15:45                         ` Chris Metcalf
  2012-05-03 16:41                         ` [PATCH v4] " Chris Metcalf
  0 siblings, 2 replies; 61+ messages in thread
From: David Miller @ 2012-05-03  5:41 UTC (permalink / raw)
  To: cmetcalf; +Cc: arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Mon, 17 Sep 2001 00:00:00 -0400

> +/* #define USE_SIM_PRINTF */
> +
> +#ifdef USE_SIM_PRINTF
> +
> +static __attribute__((unused, format (printf, 1, 2))) void
> +sim_printf(const char *format, ...)
 ...
> +/* HACK: Allow use of "sim_printf()" instead of "printk()". */
> +#define printk sim_printf
> +
> +#endif

This doesn't belong in a driver.

You want a debugging console driver that uses that special SIM output
facility instead.

Therefore, please remove this sim_printf stuff completely.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v3] tilegx network driver: initial support
  2012-05-03  5:41                       ` David Miller
@ 2012-05-03 15:45                         ` Chris Metcalf
  2012-05-03 17:07                           ` David Miller
  2012-05-03 16:41                         ` [PATCH v4] " Chris Metcalf
  1 sibling, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-05-03 15:45 UTC (permalink / raw)
  To: David Miller; +Cc: arnd, linux-kernel, netdev

On 5/3/2012 1:41 AM, David Miller wrote:
> From: Chris Metcalf <cmetcalf@tilera.com>
> Date: Mon, 17 Sep 2001 00:00:00 -0400
>
>> +/* #define USE_SIM_PRINTF */
>> +
>> +#ifdef USE_SIM_PRINTF
>> +
>> +static __attribute__((unused, format (printf, 1, 2))) void
>> +sim_printf(const char *format, ...)
>  ...
>> +/* HACK: Allow use of "sim_printf()" instead of "printk()". */
>> +#define printk sim_printf
>> +
>> +#endif
> This doesn't belong in a driver.
>
> You want a debugging console driver that uses that special SIM output
> facility instead.
>
> Therefore, please remove this sim_printf stuff completely.

Thanks, I've removed it from my branch.  (Since it's a trivial update, I
won't repost the change on LKML unless I get any more feedback that needs
addressing.)

I've checked in support for a "sim_console" boot flag that modifies the
behavior of the tile-specific console driver to use the simulator output
facility instead.  I'll plan to push that to LKML with the next batch of
changes I post.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v4] tilegx network driver: initial support
  2012-05-03  5:41                       ` David Miller
  2012-05-03 15:45                         ` Chris Metcalf
@ 2012-05-03 16:41                         ` Chris Metcalf
  2012-05-04  6:42                           ` David Miller
  1 sibling, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-05-03 16:41 UTC (permalink / raw)
  To: David Miller, arnd, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version removes the USE_SIM_PRINTF hack from the driver.

 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1919 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1922 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..297c074
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1919 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+
+#include <gxio/mpipe.h>
+
+/* For TSO */
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+
+#include <arch/sim.h>
+
+
+
+/* First, "tile_net_init_module()" initializes each network cpu to
+ * handle incoming packets, and initializes all the network devices.
+ *
+ * Then, "ifconfig DEVICE up" calls "tile_net_open()", which will
+ * turn on packet processing, if needed.
+ *
+ * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to
+ * stop egress, and possibly turn off packet processing.
+ *
+ * We start out with the ingress IRQ enabled on each CPU.  When it
+ * fires, it is automatically disabled, and we call "napi_schedule()".
+ * This will cause "tile_net_poll()" to be called, which will pull
+ * packets from the netio queue, filtering them out, or passing them
+ * to "netif_receive_skb()".  If our budget is exhausted, we will
+ * return, knowing we will be called again later.  Otherwise, we
+ * reenable the ingress IRQ, and call "napi_complete()".
+ *
+ *
+ * NOTE: Failing to free completions for an arbitrarily long time
+ * (which is defined to be illegal) does in fact cause bizarre problems.
+ *
+ * NOTE: The egress code can be interrupted by the interrupt handler.
+ */
+
+
+/* HACK: Define to support GSO.
+ * ISSUE: This may actually hurt performance of the TCP blaster.
+ */
+#undef TILE_NET_GSO
+
+/* HACK: Define to support TSO. */
+#define TILE_NET_TSO
+
+/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
+#define TILE_NET_TX_QUEUE_LEN 0
+
+/* Define to dump packets (prints out the whole packet on tx and rx). */
+#undef TILE_NET_DUMP_PACKETS
+
+/* Define to use "round robin" distribution. */
+#undef TILE_NET_ROUND_ROBIN
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* NAPI weight handed to "netif_napi_add()" (packets per "poll"). */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?
+ */
+#define TILE_NET_MAX_COMPS 64
+
+/* Round "n" up to a multiple of "align", which must be a power of two. */
+#define ROUND_UP(n, align) (((n) + (align) - 1) & -(align))
+
+/* NOTE(review): presumably the pages in a 64KB payload plus slack. */
+#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)
+
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+
+
+/* A "packet fragment": one chunk of memory, described by the pair below. */
+struct frag {
+	void *buf;	/* NOTE(review): presumably the chunk's start address */
+	size_t length;	/* NOTE(review): presumably its size in bytes */
+};
+
+
+/* A single completion: an skb to free once its egress has completed. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+
+/* The completions for a given cpu and device. */
+struct tile_net_comps {
+	/* The completions, indexed modulo TILE_NET_MAX_COMPS. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used (a running total). */
+	unsigned long comp_next;
+	/* The number of completions freed (some pending while < comp_next). */
+	unsigned long comp_last;
+};
+
+
+/* Info for a specific cpu (one instance per cpu, see "per_cpu_info"). */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags, maintained by "tile_net_update_cpu()". */
+	bool napi_added;	/* "netif_napi_add()" was called for "napi" */
+	bool napi_enabled;	/* "napi" is currently enabled */
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+
+/* Info for egress on a particular egress channel. */
+struct tile_net_egress {
+	/* The "equeue" (checked for completions by the egress timer). */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+
+/* Info for a specific device (the "netdev_priv()" area of the device). */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats (rx counters are bumped per received packet). */
+	struct net_device_stats stats;
+};
+
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info (one "struct tile_net_info" per cpu). */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks" (-1 until they are allocated). */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets, as passed to "gxio_mpipe_rules_begin()". */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The cpus parsed from "network_cpus_string" by "network_cpus_init()". */
+static struct cpumask network_cpus_map;
+
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+
+/* Parse the "tile_net.cpus" module parameter, which names the cpus
+ * dedicated to handling ingress packets.
+ *
+ * The parameter has the form "tile_net.cpus=m-n[,x-y]", where m, n, x,
+ * and y are integers naming cpus that can be neither a dedicated cpu
+ * nor a dataplane cpu.  On success the result is left in
+ * "network_cpus_map" and true is returned; if the parameter is absent,
+ * malformed, or selects no possible cpu, false is returned.
+ */
+static bool network_cpus_init(void)
+{
+	char *list = network_cpus_string;
+	char text[1024];
+
+	if (!list)
+		return false;
+
+	if (cpulist_parse_crop(list, &network_cpus_map) != 0) {
+		pr_warning("tile_net.cpus=%s: malformed cpu list\n", list);
+		return false;
+	}
+
+	/* Keep only the cpus that are actually possible. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warning("Ignoring empty tile_net.cpus='%s'.\n", list);
+		return false;
+	}
+
+	/* Report the cpus that will be used. */
+	cpulist_scnprintf(text, sizeof(text), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", text);
+
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.  The value is stored in "loopify_link_name".
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+
+#ifdef TILE_NET_DUMP_PACKETS
+/* Dump "length" bytes at "data" as hex rows of sixteen bytes, each
+ * prefixed by its offset.  "s" labels the dump in the banner line.
+ */
+static void dump_packet(unsigned char *data, unsigned long length, char *s)
+{
+	static unsigned int count;
+	unsigned long i;
+	char buf[128];
+	int n = 0;
+
+	pr_info("Dumping %s packet of 0x%lx bytes at %p [%u]\n",
+	       s, length, data, count++);
+	pr_info("\n");
+	for (i = 0; i < length; i++) {
+		if ((i & 0xf) == 0)
+			n = scnprintf(buf, sizeof(buf), "%8.8lx:", i);
+		n += scnprintf(buf + n, sizeof(buf) - n, " %02x", data[i]);
+		if ((i & 0xf) == 0xf || i == length - 1)
+			pr_info("%s\n", buf);
+	}
+
+	pr_info("\n");
+}
+#endif
+
+/* Allocate an skb and push its data buffer onto the small or large stack. */
+/* Returns false if the skb could not be allocated, true otherwise. */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+
+	/* Buffers must be aligned (to 128 bytes). */
+	const unsigned long align = 128;
+
+	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
+	 * and also "reserves" that many bytes.
+	 */
+	int len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
+
+	/* Allocate (or fail). */
+	struct sk_buff *skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb'. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+
+	/* Make sure we are aligned: skip to the next "align" boundary. */
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Save a back-pointer to 'skb', just below the aligned data. */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+
+/* Hand mPIPE the buffers this cpu still owes it, stopping at the first
+ * allocation failure (the unpaid remainder stays counted in "info").
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	bool ok = true;
+
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Handle one ingress descriptor: drop, filter, or hand it to the stack. */
+/* Return true if the packet was passed up, false if dropped or filtered. */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				    gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+
+	void *va;
+
+	uint8_t l2_offset = gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	void *buf;
+	unsigned long len;
+
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the raw buffer VA. */
+	va = tile_io_addr_to_va((unsigned long)gxio_mpipe_idesc_get_va(idesc));
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = gxio_mpipe_idesc_get_l2_length(idesc);
+
+	/* Back "va" up to the raw buffer ("skb->data", as checked below). */
+	va -= NET_IP_ALIGN;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	dump_packet(buf, len, "rx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 * FIXME: The classifier will soon detect "multicast".
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address. */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+
+		/* Acquire the associated "skb" via the stored back-pointer. */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia. */
+		if (skb->data != va)
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p\n",
+			      buf, skb, skb->data);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update stats (the counters are accessed as "atomic_t"). */
+		atomic_add(1, (atomic_t *)&priv->stats.rx_packets);
+		atomic_add(len, (atomic_t *)&priv->stats.rx_bytes);
+
+		/* Need a new buffer of the size that was just used up. */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+
+/* Handle some packets for the current CPU (the NAPI "poll" callback).
+ * NOTE: "napi" is not used directly; the per-cpu "info" supplies it.
+ * Handles up to TILE_NET_BATCH idescs per call and returns the number
+ * of packets that were passed up (filtered/dropped ones do not count).
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	/* Packets passed up so far; compared against "budget". */
+	unsigned int work = 0;
+
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets until the queue is empty or a limit is hit. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem by peeking once more. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	/* Replace the buffers used up by the packets handled above. */
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+
+/* Ingress irq handler: kick NAPI polling on the cpu that took the irq. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+
+/* Free completed comps, oldest first.  Interrupts must be blocked. */
+static void tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed = 0;
+	while (comps->comp_last < comps->comp_next) {
+		unsigned long slot = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[slot];
+		bool update = force_update || freed == 0;
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when, update))
+			break;
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		if (++freed == limit)
+			break;
+	}
+}
+
+
+/* Arm the egress timer for the next jiffy unless it is already armed.
+ *
+ * We deliberately "schedule if not scheduled" instead of always
+ * rescheduling, because rescheduling a timer is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+	mod_timer_pinned(&info->egress_timer, jiffies + 1);
+	info->egress_timer_scheduled = true;
+}
+
+
+/* The "function" for "info->egress_timer".
+ * "arg" is the "struct tile_net_info" that owns the timer.
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+
+	unsigned int i;
+	/* True if some completion could not be freed yet. */
+	bool pending = false;
+
+	unsigned long irqflags;
+	/* "tile_net_free_comps()" requires interrupts to be blocked. */
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+	/* NOTE(review): confirm comps_for_echannel[] has no NULL entries. */
+	/* Free all possible comps for this tile (no limit is passed). */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_egress *egress = &egress_for_echannel[i];
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+		tile_net_free_comps(egress->equeue, comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+
+/* Reset the per-cpu state of the cpu this runs on: record the cpu
+ * number, mark the iqueue as absent, and set up the egress timer.
+ */
+static void tile_net_prepare_cpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct timer_list *timer = &info->egress_timer;
+
+	info->my_cpu = smp_processor_id();
+	info->has_iqueue = false;
+
+	/* The timer callback receives "info" back through "data". */
+	init_timer(timer);
+	timer->function = tile_net_handle_egress_timer;
+	timer->data = (long)info;
+}
+
+
+/* Helper function for "tile_net_update()": start ingress on this cpu
+ * using "arg" (a net_device), or stop it if "arg" is NULL.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = arg;
+
+	if (!info->has_iqueue)
+		return;
+	if (dev == NULL) {
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+	/* FIXME: HACK: We use one of the devices.
+	 * ISSUE: We never call "netif_napi_del()".
+	 */
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi, tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+
+/* Helper for tile_net_open() and tile_net_stop(): rebuild the classifier
+ * rules and update every online cpu.  Returns 0, or -EIO on failure.
+ */
+static int tile_net_update(void)
+{
+	struct net_device *dev = NULL;
+	int channel;
+	long count = 0;
+	int cpu;
+
+	/* HACK: This is too big for the linux stack. */
+	static gxio_mpipe_rules_t rules;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	/* TODO: Add support for "dmac" splitting? */
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (dev == NULL) {
+			dev = tile_net_devs_for_channel[channel];
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+		count++;
+	}
+
+	/* NOTE: This can happen if there is no classifier.
+	 * ISSUE: Can anything else cause it to happen?
+	 */
+	if (gxio_mpipe_rules_commit(&rules) != 0) {
+		pr_warning("Failed to update classifier rules!\n");
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu, dev, 1);
+	/* HACK: Allow packets to flow once some channel is in use. */
+	if (count != 0)
+		sim_enable_mpipe_links(0, -1);
+	return 0;
+}
+
+
+/* Helper function for "tile_net_init_cpus()".
+ *
+ * Allocates and initializes the small and large mPIPE buffer stacks,
+ * registers their memory, and fills each stack with enough buffers for
+ * "network_cpus_count" cpus.  Any failure here panics.
+ */
+static void tile_net_init_stacks(int network_cpus_count)
+{
+	int err;
+	int i;
+
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+
+	int num_buffers;
+
+	size_t stack_bytes;
+
+	pte_t pte = { 0 };
+
+	void *mem;
+
+	/* Each cpu may hold a full iqueue plus one batch of idescs. */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes, honoring the 64KB minimum alignment. */
+	stack_bytes = ROUND_UP(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			       64 * 1024);
+	if (stack_bytes > HPAGE_SIZE)
+		panic("Cannot allocate %d physically contiguous buffers.",
+		      num_buffers);
+
+	/* Allocate two buffer stacks. */
+	small_buffer_stack = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (small_buffer_stack < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buffer_stacks()'");
+	large_buffer_stack = small_buffer_stack + 1;
+
+	/* Allocate the small buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					   small_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Allocate the large buffer stack. */
+	mem = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (mem == NULL)
+		panic("Could not allocate buffer memory!");
+	err = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					   large_buf_size,
+					   mem, stack_bytes, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_init_buffer_stack()'.", err);
+
+	/* Pin all the client memory. */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	err = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+	err = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+						pte, 0);
+	if (err != 0)
+		panic("Error %d in 'gxio_mpipe_register_client_memory()'.",
+		      err);
+
+	/* Provide initial buffers. */
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true))
+			panic("Cannot provide initial buffers!");
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false))
+			panic("Cannot provide initial buffers!");
+	}
+}
+
+
+/* Actually initialize the mPIPE state.
+ *
+ * Initializes the mPIPE context and the buffer stacks, gives every
+ * online cpu its "comps" memory, gives every network cpu a NotifRing
+ * backed iqueue, sets up one NotifGroup with its buckets, and finally
+ * creates the ingress irq and requests NotifRing interrupts on it.
+ *
+ * Returns 0 on success.  Returns -EIO if "hash_default" is not set, if
+ * mPIPE initialization fails, or if the NotifRings cannot be allocated.
+ * All later failures panic (or BUG).
+ */
+static int tile_net_init_cpus(void)
+{
+	int network_cpus_count;
+
+	int ring;
+	int group;
+
+	int next_ring;
+
+	int cpu;
+
+	int i;
+
+	int err;
+
+#ifdef TILE_NET_ROUND_ROBIN
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_ROUND_ROBIN;
+#else
+	/* Use random rebalancing. */
+	gxio_mpipe_bucket_mode_t mode = GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY;
+#endif
+
+	if (!hash_default) {
+		pr_warning("Networking requires hash_default!\n");
+		goto fail;
+	}
+
+	if (gxio_mpipe_init(&context, 0) != 0) {
+		pr_warning("Failed to initialize mPIPE!\n");
+		goto fail;
+	}
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	/* ISSUE: Handle failures more gracefully. */
+	tile_net_init_stacks(network_cpus_count);
+
+	/* Allocate one NotifRing for each network cpu. */
+	ring = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					    0, 0);
+	if (ring < 0) {
+		pr_warning("Failed to allocate notif rings.\n");
+		goto fail;
+	}
+
+	/* ISSUE: Handle failures below more cleanly. */
+
+	/* Init NotifRings. */
+	next_ring = ring;
+
+	for_each_online_cpu(cpu) {
+
+		size_t notif_ring_size =
+			IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		/* ISSUE: This is overkill. */
+		size_t comps_size =
+			TILE_NET_CHANNELS * sizeof(struct tile_net_comps);
+
+		/* Allocate the "comps". */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate comps memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		/* ISSUE: Is this needed? */
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL)
+			panic("Failed to allocate iqueue memory.");
+		addr = pfn_to_kaddr(page_to_pfn(page));
+
+		if (gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					   addr, notif_ring_size, 0) != 0)
+			panic("Failure in 'gxio_mpipe_iqueue_init()'.");
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0)
+		panic("Failure in 'gxio_mpipe_alloc_notif_groups()'.");
+
+#ifndef TILE_NET_ROUND_ROBIN
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+#endif
+
+	/* Allocate some buckets. */
+	first_bucket = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (first_bucket < 0)
+		panic("Failure in 'gxio_mpipe_alloc_buckets()'.");
+
+	/* Init group and buckets. */
+	if (gxio_mpipe_init_notif_group_and_buckets(&context, group, ring,
+						    network_cpus_count,
+						    first_bucket, num_buckets,
+						    mode) != 0)
+		panic("Fail in 'gxio_mpipe_init_notif_group_and_buckets().");
+
+
+	/* Create an irq and register it.  The request is made outside
+	 * of BUG_ON() so the assertion has no side effects.
+	 */
+	ingress_irq = create_irq();
+	if (ingress_irq < 0)
+		panic("Failed to create irq for ingress.");
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	err = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			  0, NULL, NULL);
+	BUG_ON(err != 0);
+
+	/* Route each iqueue's NotifRing interrupt to its own cpu. */
+	for_each_online_cpu(cpu) {
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		if (!info->has_iqueue)
+			continue;
+
+		gxio_mpipe_request_notif_ring_interrupt(&context,
+							cpu_x(cpu), cpu_y(cpu),
+							1, ingress_irq,
+							info->iqueue.ring);
+	}
+
+	return 0;
+
+fail:
+	return -EIO;
+}
+
+
+/* Set up the persistent egress state for "echannel", unless that was
+ * already done by an earlier call.
+ *
+ * The state may be shared between devices, say, "gbe0" and "xgbe0".
+ *
+ * Returns 0 on success (or if there was nothing to do), and -EIO if
+ * memory or an edma ring could not be allocated, in which case all
+ * pages allocated so far are released again.
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(int echannel)
+{
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+
+	gxio_mpipe_equeue_t *equeue;
+	gxio_mpipe_edesc_t *edescs;
+	unsigned char *headers;
+
+	struct page *headers_page;
+	struct page *edescs_page;
+	struct page *equeue_page;
+
+	size_t headers_order;
+	int edescs_order;
+	int equeue_order;
+
+	size_t edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	int edma;
+
+	/* An equeue pointer means this channel is already set up. */
+	if (egress->equeue != NULL)
+		return 0;
+
+	/* Header memory: HEADER_BYTES for each equeue entry. */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (!headers_page) {
+		pr_warning("Could not allocate memory for TSO headers.\n");
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Descriptor memory for the eDMA ring. */
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (!edescs_page) {
+		pr_warning("Could not allocate memory for eDMA ring.\n");
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Memory for the equeue bookkeeping structure itself. */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (!equeue_page) {
+		pr_warning("Could not allocate memory for equeue info.\n");
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Get one edma ring from mPIPE. */
+	edma = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (edma < 0) {
+		pr_warning("Could not allocate edma ring.\n");
+		goto fail_equeue;
+	}
+
+	/* Tie everything together.  This should not fail. */
+	if (gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				   edescs, edescs_size, 0) != 0)
+		panic("Failure in 'gxio_mpipe_equeue_init()'.");
+
+	/* Publish the result. */
+	egress->equeue = equeue;
+	egress->headers = headers;
+	return 0;
+
+	/* Unwind in reverse order of allocation. */
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return -EIO;
+}
+
+
+/* Activate the given network interface (the "ndo_open" handler).
+ *
+ * Under "tile_net_devs_for_channel_mutex" this initializes the mPIPE
+ * state if there is no ingress irq yet, opens the mPIPE link(s) for
+ * the device, sets up the egress state, records the device for its
+ * channel, and calls "tile_net_update()".  The transmit queue is only
+ * started once all of that has succeeded.
+ *
+ * Returns 0 on success, or a negative error code after closing any
+ * links that are open.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	bool loopify;
+	int result;
+
+	/* Is this the device named by "loopify_link_name"? */
+	loopify = loopify_link_name != NULL &&
+		strcmp(dev->name, loopify_link_name) == 0;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* No ingress irq yet means mPIPE has not been set up. */
+	if (ingress_irq < 0) {
+		result = tile_net_init_cpus();
+		if (result != 0)
+			goto fail;
+	}
+
+	/* Open the main link; the loopify device uses "loop0" for it. */
+	if (priv->channel < 0) {
+		const char *link_name = loopify ? "loop0" : dev->name;
+
+		if (gxio_mpipe_link_open(&priv->link, &context,
+					 link_name, 0) < 0) {
+			netdev_err(dev, "Failed to open '%s'.\n", link_name);
+			result = -EIO;
+			goto fail;
+		}
+		priv->channel = gxio_mpipe_link_channel(&priv->link);
+		BUG_ON(priv->channel < 0 ||
+		       priv->channel >= TILE_NET_CHANNELS);
+	}
+
+	/* The loopify device additionally opens "loop1". */
+	if (loopify && priv->loopify_channel < 0) {
+		if (gxio_mpipe_link_open(&priv->loopify_link,
+					 &context, "loop1", 0) < 0) {
+			netdev_err(dev, "Failed to open 'loop1'.\n");
+			result = -EIO;
+			goto fail;
+		}
+		priv->loopify_channel =
+			gxio_mpipe_link_channel(&priv->loopify_link);
+		BUG_ON(priv->loopify_channel < 0 ||
+			priv->loopify_channel >= TILE_NET_CHANNELS);
+	}
+
+	/* Egress goes out the loopify channel when there is one. */
+	if (priv->loopify_channel >= 0)
+		priv->echannel = priv->loopify_channel;
+	else
+		priv->echannel = priv->channel;
+
+	/* Initialize egress info (if needed). */
+	result = tile_net_init_egress(priv->echannel);
+	if (result != 0)
+		goto fail;
+
+	/* Record the device for its channel and push the update. */
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	result = tile_net_update();
+	if (result != 0)
+		goto fail_channel;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Everything is in place: let the stack transmit. */
+	netif_start_queue(dev);
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail_channel:
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+fail:
+	/* A channel is only forgotten if its link closed cleanly. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			pr_warning("Failed to close loopify link!\n");
+		else
+			priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			pr_warning("Failed to close link!\n");
+		else
+			priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+	return result;
+}
+
+
+
+/* Deactivate the given network interface (the "ndo_stop" handler).
+ *
+ * Stops the transmit queue, removes the device from the per-channel
+ * table, calls "tile_net_update()" (ignoring its result), and closes
+ * any open links.  A link that fails to close is only warned about;
+ * its channel is forgotten either way.  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* No more transmits from here on. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Drop the device from the table before pushing the update. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update();
+
+	/* Close the loopify link, if any. */
+	if (priv->loopify_channel >= 0) {
+		int err = gxio_mpipe_link_close(&priv->loopify_link);
+
+		if (err != 0)
+			pr_warning("Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	/* Close the main link, if any. */
+	if (priv->channel >= 0) {
+		int err = gxio_mpipe_link_close(&priv->link);
+
+		if (err != 0)
+			pr_warning("Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+
+/* Return the kernel VA of the first data byte of fragment "f", that
+ * is, the VA of the fragment's page plus the fragment's page offset.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	void *page_va = pfn_to_kaddr(page_to_pfn(skb_frag_page(f)));
+
+	return page_va + f->page_offset;
+}
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * NOTE(review): the BUG_ON() checks below require a non-empty "frags"
+ * and a linear area of exactly "sh_len" bytes, so that case does not
+ * appear to be handled by this function -- confirm.
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * The work is done in three passes over the segments, each of which
+ * walks the fragments in the same way: the first counts the edescs
+ * needed, the second (after reserving that many egress slots) builds
+ * the per-segment headers in "egress->headers", and the third writes
+ * the header and payload edescs into the reserved slots.
+ *
+ * Returns NETDEV_TX_BUSY if the slots cannot be reserved, otherwise
+ * NETDEV_TX_OK after recording "skb" in the completions array.
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh == iph", by definition. */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check". */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check". */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	int cid;
+
+	/* Empty packets (etc) would cause trouble below. */
+	BUG_ON(skb->data_len == 0);
+	BUG_ON(sh->nr_frags == 0);
+	BUG_ON(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	BUG_ON(skb_headlen(skb) != sh_len);
+	BUG_ON(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	BUG_ON(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	BUG_ON(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	BUG_ON(skb->protocol != htons(ETH_P_IP));
+	BUG_ON(ih->protocol != IPPROTO_TCP);
+	BUG_ON(skb->ip_summed != CHECKSUM_PARTIAL);
+	BUG_ON(csum_start != eh_len + ih_len);
+
+	/* NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+	edesc_head.stack_idx = large_buffer_stack;
+
+	/* Prepare to egress the body. */
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset the fragment cursor. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 1: Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	BUG_ON(f_id + 1 != sh->nr_frags);
+	BUG_ON(f_used != f_size);
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset the fragment cursor. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 2: Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum". */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jh->check = csum_long(jsum) ^ 0xffff;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags. */
+		if (!final)
+			uh->fin = uh->psh = 0;
+
+		/* Compute the tcp pseudo-header checksum. */
+		uh->check = csum_long(tsum_hack + htons(s_len));
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot" to the first reserved slot. */
+	slot -= num_edescs;
+
+	/* Flush the headers. */
+	wmb();
+
+	/* Reset the fragment cursor. */
+	f_id = f_size = f_used = -1;
+
+	/* Pass 3: Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload. */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats. */
+	atomic_add(tx_packets, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(tx_bytes, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Flatten a transmit request into an array of (buf, length) pieces.
+ *
+ * The body ("b_data", "b_len") comes first, if it is not empty,
+ * followed by one entry for each page fragment of "skb".  Returns the
+ * number of entries written to "frags".
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int count = 0;
+	unsigned int idx;
+
+	/* The body, if any, goes first. */
+	if (b_len != 0) {
+		frags[count].buf = b_data;
+		frags[count].length = b_len;
+		count++;
+	}
+
+	/* Then every page fragment, in order. */
+	for (idx = 0; idx < sh->nr_frags; idx++, count++) {
+		skb_frag_t *f = &sh->frags[idx];
+
+		frags[count].buf = tile_net_frag_buf(f);
+		frags[count].length = skb_frag_size(f);
+	}
+
+	return count;
+}
+
+
+/* Transmit a packet (the "ndo_start_xmit" handler).
+ *
+ * Packets with a non-zero "gso_size" are handed to "tile_net_tx_tso()".
+ * Otherwise one edesc is built for each piece of the packet, egress
+ * slots are reserved for them, and "skb" is recorded in the
+ * completions array.
+ *
+ * Returns NETDEV_TX_BUSY if the slots cannot be reserved, otherwise
+ * NETDEV_TX_OK (or whatever the TSO path returns).
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned int num_frags;
+	unsigned int wire_len;
+	unsigned int i;
+
+	unsigned long irqflags;
+	s64 slot;
+	int cid;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	/* ISSUE: Does not dump the "frags". */
+	dump_packet(data, skb_headlen(skb), "tx");
+#endif /* TILE_NET_DUMP_PACKETS */
+
+	/* Segmentation requests take the TSO path. */
+	if (sh->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* Build one edesc for each piece; only the last is "bound".
+	 * NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+	for (i = 0; i < num_frags; i++) {
+		gxio_mpipe_edesc_t edesc = { { 0 } };
+
+		edesc.bound = (i + 1 == num_frags);
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edesc.stack_idx = large_buffer_stack;
+
+		edescs[i] = edesc;
+	}
+
+	/* Checksum info, if any, goes on the first edesc. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Write the edescs into the reserved slots. */
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot + num_frags;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
+	wire_len = len;
+	if (wire_len < ETH_ZLEN)
+		wire_len = ETH_ZLEN;
+	atomic_add(1, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(wire_len, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+
+/* Handle a transmit timeout (the "ndo_tx_timeout" handler) by simply
+ * waking the transmit queue of "dev".
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+
+/* Ioctl handler (the "ndo_do_ioctl" handler).
+ *
+ * No ioctl commands are supported: every request gets -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+
+/* Get the network statistics for "dev" (the "ndo_get_stats" handler).
+ *
+ * Returns the address of the statistics structure kept in the
+ * device's private data.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct net_device_stats *stats = &priv->stats;
+
+	return stats;
+}
+
+
+/* Change the "mtu" (the "ndo_change_mtu" handler).
+ *
+ * Values from 68 to 1500 (inclusive) are accepted and stored in "dev".
+ * Returns 0 on success, or -EINVAL if "new_mtu" is out of range.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* Only accept values inside the supported range. */
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/* Change the Ethernet Address of the NIC (the "ndo_set_mac_address"
+ * handler).  "p" points to a "struct sockaddr" holding the new address.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * Returns 0 on success, or -EINVAL if the address is not a valid
+ * ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	/* Store the address in "dev" only if it is valid. */
+	if (is_valid_ether_addr(addr->sa_data)) {
+		memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' (the "ndo_poll_controller" handler) - used by
+ * things like netconsole to send skbs without having to re-enable
+ * interrupts.  It's not called while the interrupt routine is executing.
+ *
+ * The ingress handler is invoked directly, with the per-cpu ingress
+ * irq disabled around the call.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+
+/* The net_device operations implemented by this driver. */
+static const struct net_device_ops tile_net_ops = {
+	/* Bringing the interface up and down. */
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+
+	/* Transmit path. */
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+
+	/* Configuration and statistics. */
+	.ndo_do_ioctl		= tile_net_ioctl,
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_get_stats		= tile_net_get_stats,
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* The setup function, passed to alloc_netdev().
+ *
+ * This uses ether_setup() to assign various fields in dev, including
+ * setting IFF_BROADCAST and IFF_MULTICAST, then installs our ops and
+ * sets the watchdog timeout, feature flags, tx queue length and mtu.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = TILE_NET_TX_QUEUE_LEN;
+	dev->mtu = 1500;
+
+	/* We want lockless xmit, and we support hardware tx checksums
+	 * and scatter/gather.
+	 */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG;
+
+#ifdef TILE_NET_GSO
+	/* We support GSO. */
+	dev->features |= NETIF_F_GSO;
+#endif
+
+#ifdef TILE_NET_TSO
+	/* We support TSO. */
+	dev->features |= NETIF_F_TSO;
+#endif
+}
+
+
+/* Allocate the device structure for the link called "name", set its
+ * MAC address, and register the device.
+ *
+ * "mac" points to the 6-byte MAC address of the link; if it is all
+ * zeroes, a random address is used instead.  Links whose name starts
+ * with "loop" are ignored.  Failures are logged and otherwise ignored.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t* mac)
+{
+	int ret;
+	int i;
+	int nz_addr = 0;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	priv = netdev_priv(dev);
+
+	/* Initialize "priv". */
+
+	memset(priv, 0, sizeof(*priv));
+
+	priv->dev = dev;
+
+	priv->channel = priv->loopify_channel = priv->echannel = -1;
+
+	/* Set the MAC address in the device struct.  This must be done
+	 * before the device is opened, and the device can be opened as
+	 * soon as register_netdev() has made it visible, so it is done
+	 * before registering.  If the MAC is all zeroes, we use a random
+	 * address, since we're probably on the simulator.
+	 */
+	for (i = 0; i < 6; i++)
+		nz_addr |= mac[i];
+
+	if (nz_addr) {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+		return;
+	}
+}
+
+
+/* Module initialization.
+ *
+ * Prepares every cpu, creates a network device for each mPIPE link
+ * that can be enumerated, and sets up the map of network cpus.
+ * Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int link = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Walk the links in order until enumeration fails, and
+	 * initialize a device for each one found.
+	 */
+	while (gxio_mpipe_link_enumerate_mac(link, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		link++;
+	}
+
+	/* If network_cpus_init() returns zero, use all online cpus. */
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v3] tilegx network driver: initial support
  2012-05-03 15:45                         ` Chris Metcalf
@ 2012-05-03 17:07                           ` David Miller
  2012-05-03 17:25                             ` Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: David Miller @ 2012-05-03 17:07 UTC (permalink / raw)
  To: cmetcalf; +Cc: arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 3 May 2012 11:45:32 -0400

> Thanks, I've removed it from my branch.  (Since it's a trivial update, I
> won't repost the change on LKML unless I get any more feedback that needs
> addressing.)

Sorry, this approach doesn't work, you should post a new version
more expediently.

When I see something so terrible like I saw this time, I just provide
feedback on that major issue and stop reviewing.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v3] tilegx network driver: initial support
  2012-05-03 17:07                           ` David Miller
@ 2012-05-03 17:25                             ` Chris Metcalf
  0 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-05-03 17:25 UTC (permalink / raw)
  To: David Miller; +Cc: arnd, linux-kernel, netdev

On 5/3/2012 1:07 PM, David Miller wrote:
> From: Chris Metcalf <cmetcalf@tilera.com>
> Date: Thu, 3 May 2012 11:45:32 -0400
>
>> Thanks, I've removed it from my branch.  (Since it's a trivial update, I
>> won't repost the change on LKML unless I get any more feedback that needs
>> addressing.)
> Sorry, this approach doesn't work, you should post a new version
> more expediently.
>
> When I see something so terrible like I saw this time, I just provide
> feedback on that major issue and stop reviewing.

Thanks, good to know.  Will do.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v4] tilegx network driver: initial support
  2012-05-03 16:41                         ` [PATCH v4] " Chris Metcalf
@ 2012-05-04  6:42                           ` David Miller
  2012-05-09 10:42                             ` [PATCH v5] " Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: David Miller @ 2012-05-04  6:42 UTC (permalink / raw)
  To: cmetcalf; +Cc: arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 3 May 2012 12:41:56 -0400

> +/* First, "tile_net_init_module()" initializes each network cpu to
> + * handle incoming packets, and initializes all the network devices.
> + *
> + * Then, "ifconfig DEVICE up" calls "tile_net_open()", which will
> + * turn on packet processing, if needed.
> + *
> + * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to
> + * stop egress, and possibly turn off packet processing.
> + *
> + * We start out with the ingress IRQ enabled on each CPU.  When it
> + * fires, it is automatically disabled, and we call "napi_schedule()".
> + * This will cause "tile_net_poll()" to be called, which will pull
> + * packets from the netio queue, filtering them out, or passing them
> + * to "netif_receive_skb()".  If our budget is exhausted, we will
> + * return, knowing we will be called again later.  Otherwise, we
> + * reenable the ingress IRQ, and call "napi_complete()".

This is not the place where you document how the generic networking
brings devices up and down, and what driver methods are called during
those actions.

Imagine if every driver writer decided to do this.

> +#define TILE_NET_MAX_COMPS 64
> +
> +

Please get rid of all of these more-than-one empty line sequences.

> +#define ROUND_UP(n, align) (((n) + (align) - 1) & -(align))

This is ALIGN() from linux/kernel.h, please use it.

At this rate I anticipate at least 20 rounds of review, this driver
still needs quite a bit of work.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v5] tilegx network driver: initial support
  2012-05-04  6:42                           ` David Miller
@ 2012-05-09 10:42                             ` Chris Metcalf
  2012-05-11 13:54                               ` Ben Hutchings
  0 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-05-09 10:42 UTC (permalink / raw)
  To: David Miller, arnd, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version removes some "generic" comments on network driver
infrastructure, removes runs of multiple blank lines, and eliminates
the hand-rolled ROUND_UP() macro in favor of ALIGN().  There are also
a number of additional cleanups.

 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1928 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1931 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..1ba52a7
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1928 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Define to support GSO. */
+#undef TILE_NET_GSO
+
+/* Define to support TSO. */
+#define TILE_NET_TSO
+
+/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
+#define TILE_NET_TX_QUEUE_LEN 0
+
+/* Define to dump packets (prints out the whole packet on tx and rx). */
+#undef TILE_NET_DUMP_PACKETS
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+/* A single completion. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device. */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+/* Info for a specific cpu. */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel. */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* The "tile_net.cpus" argument specifies the cpus that are dedicated
+ * to handle ingress packets.
+ *
+ * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where
+ * m, n, x, y are integer numbers that represent the cpus that can be
+ * neither a dedicated cpu nor a dataplane cpu.
+ */
+static bool network_cpus_init(void)
+{
+	char buf[1024];
+	int rc;
+
+	if (network_cpus_string == NULL)
+		return false;
+
+	rc = cpulist_parse_crop(network_cpus_string, &network_cpus_map);
+	if (rc != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Remove dedicated cpus. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+#ifdef TILE_NET_DUMP_PACKETS
+/* Dump a packet. */
+static void dump_packet(unsigned char *data, unsigned long length, char *s)
+{
+	unsigned long i;
+	static unsigned int count;
+	char buf[128];
+
+	pr_info("Dumping %s packet of 0x%lx bytes at %p [%d]\n",
+		s, length, data, count++);
+
+	pr_info("\n");
+
+	for (i = 0; i < length; i++) {
+		if ((i & 0xf) == 0)
+			sprintf(buf, "%8.8lx:", i);
+		sprintf(buf + strlen(buf), " %02x", data[i]);
+		if ((i & 0xf) == 0xf || i == length - 1)
+			pr_info("%s\n", buf);
+	}
+
+	pr_info("\n");
+}
+#endif
+
+/* Allocate and push a buffer. */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+
+	/* Buffers must be aligned. */
+	const unsigned long align = 128;
+
+	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
+	 * and also "reserves" that many bytes.
+	 */
+	int len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
+
+	/* Allocate (or fail). */
+	struct sk_buff *skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb'. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+
+	/* Make sure we are aligned. */
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Save a back-pointer to 'skb'. */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+static void tile_net_pop_all_buffers(int stack)
+{
+	void *va;
+	while ((va = gxio_mpipe_pop_buffer(&context, stack)) != NULL) {
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+		dev_kfree_skb_irq(skb);
+	}
+}
+
+/* Provide linux buffers to mPIPE. */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	while (info->num_needed_small_buffers != 0) {
+		if (!tile_net_provide_buffer(true))
+			goto oops;
+		info->num_needed_small_buffers--;
+	}
+
+	while (info->num_needed_large_buffers != 0) {
+		if (!tile_net_provide_buffer(false))
+			goto oops;
+		info->num_needed_large_buffers--;
+	}
+
+	return;
+
+oops:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered". */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset = gxio_mpipe_idesc_get_l2_offset(idesc);
+	void *va;
+	void *buf;
+	unsigned long len;
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the raw buffer VA. */
+	va = tile_io_addr_to_va((unsigned long)gxio_mpipe_idesc_get_va(idesc));
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = gxio_mpipe_idesc_get_l2_length(idesc);
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	dump_packet(buf, len, "rx");
+#endif
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address. */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+
+		/* Acquire the associated "skb". */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia. */
+		if (skb->data != va) {
+			/* Panic here since there's a reasonable chance
+			 * that corrupt buffers means generic memory
+			 * corruption, with unpredictable system effects.
+			 */
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p",
+			      buf, skb, skb->data);
+		}
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update stats. */
+		atomic_add(1, (atomic_t *)&priv->stats.rx_packets);
+		atomic_add(len, (atomic_t *)&priv->stats.rx_bytes);
+
+		/* Need a new buffer. */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+/* Handle an ingress interrupt on the current cpu. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	napi_schedule(&info->napi);
+	return IRQ_HANDLED;
+}
+
+/* Free some completions.  This must be called with interrupts blocked. */
+static void tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int n = 0;
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[cid];
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
+						   force_update || n == 0))
+			return;
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		if (++n == limit)
+			return;
+	}
+}
+
+/* Make sure the egress timer is scheduled.
+ *
+ * Note that we use "schedule if not scheduled" logic instead of the more
+ * obvious "reschedule" logic, because "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (!info->egress_timer_scheduled) {
+		mod_timer_pinned(&info->egress_timer, jiffies + 1);
+		info->egress_timer_scheduled = true;
+	}
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+	unsigned long irqflags;
+	bool pending = false;
+	int i;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_egress *egress = &egress_for_echannel[i];
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+		tile_net_free_comps(egress->equeue, comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+/* Prepare each CPU. */
+static void tile_net_prepare_cpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int my_cpu = smp_processor_id();
+
+	info->has_iqueue = false;
+
+	info->my_cpu = my_cpu;
+
+	/* Initialize the egress timer. */
+	init_timer(&info->egress_timer);
+	info->egress_timer.data = (long)info;
+	info->egress_timer.function = tile_net_handle_egress_timer;
+}
+
+/* Helper function for "tile_net_update()". */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (info->has_iqueue) {
+		if (dev != NULL) {
+			if (!info->napi_added) {
+				netif_napi_add(dev, &info->napi,
+					       tile_net_poll, TILE_NET_WEIGHT);
+				info->napi_added = true;
+			}
+			if (!info->napi_enabled) {
+				napi_enable(&info->napi);
+				info->napi_enabled = true;
+			}
+			enable_percpu_irq(ingress_irq, 0);
+		} else {
+			disable_percpu_irq(ingress_irq);
+			if (info->napi_enabled) {
+				napi_disable(&info->napi);
+				info->napi_enabled = false;
+			}
+			/* FIXME: Drain the iqueue. */
+		}
+	}
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	int channel;
+	long count = 0;
+	int cpu;
+	int rc;
+	bool saw_channel;
+	static gxio_mpipe_rules_t rules;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	saw_channel = false;
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (!saw_channel) {
+			saw_channel = true;
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	rc = gxio_mpipe_rules_commit(&rules);
+	if (rc != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 (saw_channel ? dev : NULL), 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (count != 0)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed simply via gxio_mpipe_destroy().
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+	size_t stack_bytes;
+	pte_t pte = { 0 };
+	void *small = NULL;
+	void *large = NULL;
+	int i, num_buffers, rc;
+	int network_cpus_count, cpu;
+	int ring, group, next_ring;
+	size_t comps_size = 0;
+	size_t notif_ring_size = 0;
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc =  gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages() so we get the required 64KB alignment as well.
+	 */
+	stack_bytes = ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			    64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		goto fail;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small memory stack. */
+	small = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (small == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   stack_bytes);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  small_buf_size,
+					  small, stack_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack: %d\n", rc);
+		goto fail;
+	}
+
+	/* Allocate the large buffer stack. */
+	large = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (large == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   stack_bytes);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  large_buf_size,
+					  large, stack_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Register all the client memory in mpipe TLBs. */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_buffer_memory failed: %d\n",
+			   rc);
+		goto fail;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_buffer_memory failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Provide initial buffers. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail_pop;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail_pop;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					  0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail_pop;
+	}
+
+	/* Init NotifRings. */
+	ring = rc;
+	next_ring = rc;
+
+	/* ISSUE: This is more than strictly necessary. */
+	comps_size = TILE_NET_CHANNELS * sizeof(struct tile_net_comps);
+
+	notif_ring_size = IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+	for_each_online_cpu(cpu) {
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		/* Allocate the "comps". */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes comps memory\n",
+				   comps_size);
+			rc = -ENOMEM;
+			goto fail_pop;
+		}
+
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes iqueue memory\n",
+				   notif_ring_size);
+			rc = -ENOMEM;
+			goto fail_pop;
+		}
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					    addr, notif_ring_size, 0);
+		if (rc != 0) {
+			netdev_err(dev,
+				   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+			goto fail_pop;
+		}
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   rc);
+		goto fail_pop;
+	}
+	group = rc;
+
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		goto fail_pop;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+
+	if (rc != 0) {
+		netdev_err(
+			dev,
+			"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+			rc);
+		goto fail_pop;
+	}
+
+	/* Create an irq and register it. Note that "ingress_irq" being
+	 * initialized is how we know not to call this function again.
+	 */
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		goto fail_pop;
+
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		goto fail_pop;
+	}
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+
+fail_pop:
+	/* Do cleanups that require the mpipe context first. */
+	tile_net_pop_all_buffers(small_buffer_stack);
+	tile_net_pop_all_buffers(large_buffer_stack);
+
+fail:
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		free_pages((unsigned long)(info->comps_for_echannel[0]),
+			   get_order(comps_size));
+		info->comps_for_echannel[0] = NULL;
+		free_pages((unsigned long)(info->iqueue.idescs),
+			   get_order(notif_ring_size));
+		info->iqueue.idescs = NULL;
+	}
+
+	if (small)
+		free_pages_exact(small, stack_bytes);
+	if (large)
+		free_pages_exact(large, stack_bytes);
+
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ *
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ *
+ * Allocates page-backed memory for the TSO header area, the eDMA
+ * descriptor ring and the equeue structure, allocates one eDMA ring
+ * from the mPIPE context, initializes the equeue, and finally records
+ * the equeue and the header area in "egress_for_echannel[echannel]".
+ *
+ * Returns 0 on success (or if the echannel was already initialized),
+ * -ENOMEM if a page allocation fails, or the negative value returned
+ * by the failing gxio call.  On failure the pages allocated so far
+ * are freed in reverse order of allocation.
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	size_t headers_order;
+	struct page *headers_page;
+	unsigned char *headers;
+
+	size_t edescs_size;
+	int edescs_order;
+	struct page *edescs_page;
+	gxio_mpipe_edesc_t *edescs;
+
+	int equeue_order;
+	struct page *equeue_page;
+	gxio_mpipe_equeue_t *equeue;
+	int edma;
+
+	int rc = -ENOMEM;
+
+	/* Only initialize once; a non-NULL "equeue" marks a completed setup. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue.
+	 * NOTE(review): if this fails, the cleanup below frees only the
+	 * pages; the edma ring allocated above is not released here.
+	 */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done.  Publish the equeue and its header area for this echannel. */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Open the mPIPE link called "link_name" and return its channel number.
+ *
+ * A negative value is returned if the link cannot be opened (the gxio
+ * error is passed through), or -EINVAL if the reported channel is
+ * outside [0, TILE_NET_CHANNELS); in the latter case the link is
+ * closed again before returning.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int err = gxio_mpipe_link_open(link, &context, link_name, 0);
+
+	if (err < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return err;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Runs under "tile_net_devs_for_channel_mutex".  The first open of any
+ * device performs the one-time mPIPE setup (detected by "ingress_irq"
+ * still being negative).  The device's link(s) are then opened, the
+ * egress state for its echannel is created if needed, and the device
+ * is published in "tile_net_devs_for_channel[]" before the classifier
+ * rules are refreshed via tile_net_update().
+ *
+ * Returns 0 on success.  On failure every link opened here is closed,
+ * the channel fields in "priv" are reset to -1, and a negative errno
+ * is returned (raw gxio codes at or below -512 are mapped to -EIO).
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Start our transmit queue. */
+	netif_start_queue(dev);
+
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail:
+	/* Unpublish the device while "priv->channel" is still a valid
+	 * index.  It is reset to -1 below (and may already be -1 if we
+	 * failed before opening any link), so indexing the array after
+	 * that point would write to element -1.  Only clear the slot if
+	 * it actually refers to this device.
+	 */
+	if (priv->channel >= 0 &&
+	    tile_net_devs_for_channel[priv->channel] == dev)
+		tile_net_devs_for_channel[priv->channel] = NULL;
+
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, then, under
+ * "tile_net_devs_for_channel_mutex", removes the device from
+ * "tile_net_devs_for_channel[]", calls tile_net_update() (its result
+ * is deliberately ignored), closes whichever links are open, and
+ * resets the channel fields in "priv" to -1.  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Stop our transmit queue. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+	(void)tile_net_update(dev);
+
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel virtual address at which fragment "f" begins:
+ * the address of its page plus the fragment's offset within it.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	return pfn_to_kaddr(page_to_pfn(skb_frag_page(f))) + f->page_offset;
+}
+
+/* Used for paranoia to make sure we handle no ill-formed packets.
+ *
+ * If "cond" is true, WARN_ON() fires and the *enclosing function*
+ * returns NETDEV_TX_OK immediately, so this may only be used inside a
+ * transmit routine.
+ * NOTE(review): nothing here frees the packet being dropped -- confirm
+ * whether callers are expected to release the skb on this path.
+ */
+#define TSO_DROP_IF(cond) \
+	do { if (WARN_ON(cond)) return NETDEV_TX_OK; } while (0)
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * The work is done in three passes over the segments, each walking the
+ * fragment list with the same "f_id"/"f_size"/"f_used" cursor logic:
+ *   1. count the edescs needed (one per header plus one per piece of
+ *      payload),
+ *   2. after reserving that many equeue slots, build a per-segment
+ *      copy of the header in "egress->headers",
+ *   3. write the header and payload edescs into the reserved slots.
+ * Passes 2 and 3, and the completion bookkeeping that follows, run
+ * with local interrupts disabled.
+ *
+ * Returns NETDEV_TX_BUSY if the equeue slots cannot be reserved, and
+ * NETDEV_TX_OK otherwise (including when a TSO_DROP_IF() sanity check
+ * rejects the packet).
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh" and "ih" both refer to the network header. */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check". */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check". */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	int cid;
+
+	/* Empty packets (etc) would cause trouble below. */
+	TSO_DROP_IF(skb->data_len == 0);
+	TSO_DROP_IF(sh->nr_frags == 0);
+	TSO_DROP_IF(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	TSO_DROP_IF(skb_headlen(skb) != sh_len);
+	TSO_DROP_IF(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	TSO_DROP_IF(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	TSO_DROP_IF(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	TSO_DROP_IF(skb->protocol != htons(ETH_P_IP));
+	TSO_DROP_IF(ih->protocol != IPPROTO_TCP);
+	TSO_DROP_IF(skb->ip_summed != CHECKSUM_PARTIAL);
+	TSO_DROP_IF(csum_start != eh_len + ih_len);
+
+	/* NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+	edesc_head.stack_idx = large_buffer_stack;
+
+	/* Prepare to egress the body. */
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	TSO_DROP_IF(f_id + 1 != sh->nr_frags);
+	TSO_DROP_IF(f_used != f_size);
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum". */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jh->check = csum_long(jsum) ^ 0xffff;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags. */
+		if (!final) {
+			uh->fin = 0;
+			uh->psh = 0;
+		}
+
+		/* Compute the tcp pseudo-header checksum. */
+		uh->check = csum_long(tsum_hack + htons(s_len));
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot". */
+	slot -= num_edescs;
+
+	/* Flush the headers. */
+	wmb();
+
+	/* Reset. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload. */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats. */
+	atomic_add(tx_packets, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(tx_bytes, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Flatten a transmit request into the "frags" array.
+ *
+ * The linear part ("b_data"/"b_len") is recorded first, unless it is
+ * empty, followed by one entry per page fragment of "skb".  Returns
+ * the number of entries written.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	unsigned int count = 0;
+	unsigned int idx;
+
+	if (b_len != 0) {
+		frags[count].buf = b_data;
+		frags[count].length = b_len;
+		count++;
+	}
+
+	for (idx = 0; idx < shinfo->nr_frags; idx++) {
+		skb_frag_t *frag = &shinfo->frags[idx];
+
+		frags[count].buf = tile_net_frag_buf(frag);
+		frags[count].length = skb_frag_size(frag);
+		count++;
+	}
+
+	return count;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * Packets with a non-zero "gso_size" are handed to tile_net_tx_tso().
+ * Otherwise one edesc is built per buffer (the linear data plus each
+ * page fragment), checksum offload is requested on the first edesc
+ * for CHECKSUM_PARTIAL packets, and the edescs are written into
+ * freshly reserved equeue slots with local interrupts disabled.  The
+ * skb is then recorded in this cpu's completion queue for the
+ * echannel.
+ *
+ * Returns NETDEV_TX_BUSY if the equeue slots cannot be reserved, and
+ * NETDEV_TX_OK once the packet has been queued.
+ *
+ * NOTE(review): "frags" and "edescs" hold MAX_FRAGS entries and
+ * nothing here checks "num_frags" against that bound -- confirm that
+ * MAX_FRAGS covers the linear part plus every possible fragment.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	unsigned int num_frags;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+
+	unsigned int i;
+
+	int cid;
+
+	s64 slot;
+
+	unsigned long irqflags;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	/* ISSUE: Does not dump the "frags". */
+	dump_packet(data, skb_headlen(skb), "tx");
+#endif
+
+	if (sh->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_frags; i++) {
+
+		/* NOTE: ".hwb = 0", so ".size" is unused.
+		 * NOTE: ".stack_idx" determines the TLB.
+		 */
+
+		gxio_mpipe_edesc_t edesc = { { 0 } };
+
+		/* Prepare the basic command. */
+		edesc.bound = (i == num_frags - 1);
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edesc.stack_idx = large_buffer_stack;
+
+		edescs[i] = edesc;
+	}
+
+	/* Add checksum info if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb->csum_start - skb_headroom(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot + num_frags;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60).
+	 * NOTE(review): the stats fields are updated through a cast to
+	 * "atomic_t *"; confirm the field type and width make that valid.
+	 */
+	atomic_add(1, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add((len >= ETH_ZLEN) ? len : ETH_ZLEN,
+		   (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Deal with a transmit timeout.
+ *
+ * The only action taken is to wake the device's transmit queue; no
+ * hardware state is inspected or reset here.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+/* Ioctl commands.
+ *
+ * No device-specific ioctls are implemented: every request, whatever
+ * "cmd" is, is rejected with -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Get system network statistics for device.
+ *
+ * Returns a pointer to the "stats" member embedded in the device's
+ * private data; nothing is copied or recomputed here.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	return &priv->stats;
+}
+
+/* Set a new MTU for the device.
+ *
+ * Values from 68 to 1500 bytes inclusive are stored in "dev->mtu" and
+ * 0 is returned; anything else is rejected with -EINVAL and the
+ * current MTU is left untouched.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points to a "struct sockaddr" carrying the new address.  Returns
+ * 0 after copying it into "dev->dev_addr", or -EINVAL if it is not a
+ * valid Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (is_valid_ether_addr(sa->sa_data)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The per-cpu ingress irq is disabled, the ingress handler is invoked
+ * directly with a NULL cookie, and the irq is then enabled again.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* Callbacks the networking core uses to drive a tilegx device. */
+static const struct net_device_ops tile_net_ops = {
+	/* Bring-up and tear-down. */
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+	/* Transmit path. */
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+	/* Configuration and statistics. */
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_get_stats		= tile_net_get_stats,
+	.ndo_do_ioctl		= tile_net_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* Per-device setup callback handed to alloc_netdev().
+ *
+ * ether_setup() runs first to assign various fields in dev (including
+ * IFF_BROADCAST and IFF_MULTICAST); the driver-specific ops, timeout,
+ * queue length, MTU and feature flags are then applied on top.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = TILE_NET_TX_QUEUE_LEN;
+	dev->mtu = 1500;
+
+	/* Lockless xmit, hardware tx checksums, and scatter/gather. */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG;
+
+#ifdef TILE_NET_GSO
+	/* We support GSO. */
+	dev->features |= NETIF_F_GSO;
+#endif
+
+#ifdef TILE_NET_TSO
+	/* We support TSO. */
+	dev->features |= NETIF_F_TSO;
+#endif
+}
+
+/* Allocate the device structure, set its MAC address, and register
+ * the device.
+ *
+ * "name" is the link name to use for the device and "mac" points to
+ * its 6-byte MAC address.  Links whose name starts with "loop" are
+ * skipped.  Failures are logged and otherwise ignored; nothing is
+ * returned to the caller.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	int ret;
+	int i;
+	int nz_addr = 0;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	priv = netdev_priv(dev);
+
+	/* Initialize "priv". */
+
+	memset(priv, 0, sizeof(*priv));
+
+	priv->dev = dev;
+
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* Set the MAC address in the device struct.  This must happen
+	 * before register_netdev(), because the device can be opened as
+	 * soon as it is registered.  If the MAC is all zeroes, we use a
+	 * random address, since we're probably on the simulator.
+	 */
+	for (i = 0; i < 6; i++)
+		nz_addr |= mac[i];
+
+	if (nz_addr) {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+		return;
+	}
+}
+
+/* Module entry point.
+ *
+ * Initializes the channel-table mutex, runs the per-cpu preparation
+ * hook on every cpu, creates one network device for each link that
+ * gxio_mpipe_link_enumerate_mac() reports, and falls back to using all
+ * online cpus as network cpus when network_cpus_init() returns zero.
+ * Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int i = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Walk the link list until enumeration reports no more entries. */
+	while (gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		i++;
+	}
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v5] tilegx network driver: initial support
  2012-05-09 10:42                             ` [PATCH v5] " Chris Metcalf
@ 2012-05-11 13:54                               ` Ben Hutchings
  2012-05-20  4:42                                 ` [PATCH v6] " Chris Metcalf
  2012-05-20 16:35                                 ` [PATCH v5] " Chris Metcalf
  0 siblings, 2 replies; 61+ messages in thread
From: Ben Hutchings @ 2012-05-11 13:54 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: David Miller, arnd, linux-kernel, netdev

Here's another very incomplete review for you.

On Wed, 2012-05-09 at 06:42 -0400, Chris Metcalf wrote:
> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
[...]
> --- /dev/null
> +++ b/drivers/net/ethernet/tile/tilegx.c
[...]
> +/* Define to support GSO. */
> +#undef TILE_NET_GSO

GSO is always enabled by the networking core.

> +/* Define to support TSO. */
> +#define TILE_NET_TSO

No, put NETIF_F_TSO in hw_features so it can be switched at run-time.

(Currently that won't work if you don't set dev->ethtool_ops, but that's
a bug that can be fixed.)

> +/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
> +#define TILE_NET_TX_QUEUE_LEN 0

This can be changed through sysfs, so there is no need for a compile-
time option.

> +/* Define to dump packets (prints out the whole packet on tx and rx). */
> +#undef TILE_NET_DUMP_PACKETS

Should really be controlled through a 'debug' module parameter (see
netif_msg_init(), netif_msg_pktdata(), etc.)

[...]
> +/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
> + * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
> + * 60 bytes of actual TCP header.  We round up to align to cache lines.
> + */
> +#define HEADER_BYTES 128
> +
> +/* Maximum completions per cpu per device (must be a power of two).
> + * ISSUE: What is the right number here?
> + */
> +#define TILE_NET_MAX_COMPS 64
> +
> +#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)

Should be MAX_SKB_FRAGS + 1.

[...]
> +/* Help the kernel transmit a packet. */
> +static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct tile_net_priv *priv = netdev_priv(dev);
> +
> +	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
> +
> +	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
> +	gxio_mpipe_equeue_t *equeue = egress->equeue;
> +
> +	struct tile_net_comps *comps =
> +		info->comps_for_echannel[priv->echannel];
> +
> +	struct skb_shared_info *sh = skb_shinfo(skb);
> +
> +	unsigned int len = skb->len;
> +	unsigned char *data = skb->data;
> +
> +	unsigned int num_frags;
> +	struct frag frags[MAX_FRAGS];
> +	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
> +
> +	unsigned int i;
> +
> +	int cid;
> +
> +	s64 slot;
> +
> +	unsigned long irqflags;

Please, no blank lines between your declarations.

[...]
> +	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
> +	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
> +	if (slot < 0) {
> +		local_irq_restore(irqflags);
> +		/* ISSUE: "Virtual device xxx asks to queue packet". */
> +		return NETDEV_TX_BUSY;
> +	}

You're supposed to stop queues when they're full.  And since that state
appears to be per-CPU, I think this device needs to be multiqueue with
one TX queue per CPU and ndo_select_queue defined accordingly.

> +	for (i = 0; i < num_frags; i++)
> +		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
> +
> +	/* Wait for a free completion entry.
> +	 * ISSUE: Is this the best logic?
> +	 * ISSUE: Can this cause undesirable "blocking"?
> +	 */
> +	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
> +		tile_net_free_comps(equeue, comps, 32, false);

I'm not convinced you should be processing completions here at all.  But
certainly you should have stopped the queue earlier rather than having
to wait here.

> +	/* Update the completions array. */
> +	cid = comps->comp_next % TILE_NET_MAX_COMPS;
> +	comps->comp_queue[cid].when = slot + num_frags;
> +	comps->comp_queue[cid].skb = skb;
> +	comps->comp_next++;
> +
> +	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
> +	atomic_add(1, (atomic_t *)&priv->stats.tx_packets);
> +	atomic_add((len >= ETH_ZLEN) ? len : ETH_ZLEN,
> +		   (atomic_t *)&priv->stats.tx_bytes);

You mustn't treat random fields as atomic_t.  For one thing, atomic_t
contains an int while stats are unsigned long...

Also, you're adding cache contention between all your CPUs here.  You
should maintain these stats per-CPU and then sum them in
tile_net_get_stats().  Then you can just use ordinary additions.

[...]
> +/* Ioctl commands. */
> +static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
> +{
> +	return -EOPNOTSUPP;
> +}

So why define it at all?

[...]
> +static void tile_net_dev_init(const char *name, const uint8_t *mac)
> +{
[...]
> +	/* Register the network device. */
> +	ret = register_netdev(dev);
> +	if (ret) {
> +		netdev_err(dev, "register_netdev failed %d\n", ret);
> +		free_netdev(dev);
> +		return;
> +	}
> +
> +	/* Get the MAC address and set it in the device struct; this must
> +	 * be done before the device is opened.
[...]

So you had better do this before calling register_netdev(), as the
device can be opened immediately after that...

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v6] tilegx network driver: initial support
  2012-05-11 13:54                               ` Ben Hutchings
@ 2012-05-20  4:42                                 ` Chris Metcalf
  2012-05-20 20:55                                   ` David Miller
  2012-05-20 16:35                                 ` [PATCH v5] " Chris Metcalf
  1 sibling, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-05-20  4:42 UTC (permalink / raw)
  To: Ben Hutchings, David Miller, arnd, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1888 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1891 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..ab0d03e
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1888 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits).
+ * This sizes every per-channel array in this file.
+ */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll".  tile_net_poll()
+ * stops once it has walked this many idescs from a single peek.
+ */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll".  This is the NAPI
+ * weight passed to netif_napi_add().
+ */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * The completion queues are indexed modulo this value.
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+/* One more than MAX_SKB_FRAGS.  Presumably the extra entry covers the
+ * linear part of an skb -- confirm against the users of this macro.
+ */
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	/* Start of the chunk. */
+	void *buf;
+	/* Size of the chunk (presumably in bytes -- confirm at the users). */
+	size_t length;
+};
+
+/* A single completion: one transmitted skb waiting to be freed. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete.
+	 * This is the value handed to gxio_mpipe_equeue_is_complete().
+	 */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device.
+ *
+ * "comp_next" and "comp_last" only ever increase; an entry lives at
+ * index (counter % TILE_NET_MAX_COMPS) of "comp_queue", and the queue
+ * is empty when the two counters are equal.
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used (advanced by add_comp()). */
+	unsigned long comp_next;
+	/* The number of completions freed (advanced by
+	 * tile_net_free_comps()).
+	 */
+	unsigned long comp_last;
+};
+
+/* Info for a specific cpu.  One instance exists per cpu, in the
+ * "per_cpu_info" per-cpu variable.
+ */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags: whether "napi" has been added to a device, and
+	 * whether it is currently enabled.
+	 */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel.  All entries point into one
+	 * allocation made by tile_net_init_mpipe().
+	 */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel.  Both pointers are
+ * NULL until tile_net_init_egress() has set up the channel.
+ */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO (EQUEUE_ENTRIES slots of HEADER_BYTES). */
+	unsigned char *headers;
+};
+
+/* Info for a specific device (the netdev_priv() area of the device). */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1.  While open, this is
+	 * the device's index in "tile_net_devs_for_channel".
+	 */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel).  This is the
+	 * index into "egress_for_echannel".
+	 */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks" (-1 until allocated). */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets: index of the first one (-1 until allocated) and how
+ * many there are.
+ */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq.  This stays negative until tile_net_init_mpipe()
+ * has succeeded, which is how tile_net_open() knows whether the
+ * one-time setup is still needed.
+ */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* Parse the "tile_net.cpus" module parameter into "network_cpus_map".
+ *
+ * The parameter names the cpus dedicated to handling ingress packets
+ * and has the form "tile_net.cpus=m-n[,x-y]", where m, n, x, y are cpu
+ * numbers.  The original note says these cpus may be neither dedicated
+ * nor dataplane cpus; the only restriction applied here is that cpus
+ * outside cpu_possible_mask are dropped.
+ *
+ * Returns true if the parameter was given and at least one cpu is left
+ * in "network_cpus_map".  Returns false if it was absent, malformed or
+ * ended up empty (the last two cases also log a warning).
+ */
+static bool network_cpus_init(void)
+{
+	char cpulist_text[1024];
+
+	/* Nothing to do if the parameter was not supplied. */
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map)) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Keep only cpus that can actually exist on this system. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Report the final set in cpulist form. */
+	cpulist_scnprintf(cpulist_text, sizeof(cpulist_text),
+			  &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", cpulist_text);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically add "value" to the statistics counter at "field".
+ *
+ * The counter is a plain unsigned long that is updated through an
+ * atomic_long_t view of the same storage; the build fails if the two
+ * types ever differ in size.
+ *
+ * Note that on TILE-Gx, this operation is fire-and-forget on the
+ * issuing core (single-cycle dispatch) and takes only a few cycles
+ * longer than a regular store when the request reaches the home cache.
+ * No expensive bus management overhead is required.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	BUILD_BUG_ON(sizeof(*counter) != sizeof(*field));
+	atomic_long_add(value, counter);
+}
+
+/* Allocate one sk_buff and push its data buffer onto an mPIPE stack.
+ *
+ * "small" selects the small buffer stack (128-byte payload); otherwise
+ * the large stack (1664-byte payload) is used.  The data pointer is
+ * aligned to 128 bytes and a back-pointer to the sk_buff is stored
+ * immediately below it, so the sk_buff can be recovered later from the
+ * buffer address alone.
+ *
+ * Returns true on success, false if the sk_buff allocation failed.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	/* Buffers must be aligned. */
+	const unsigned long align = 128;
+	struct sk_buff *skb;
+	int stack, payload, len;
+
+	if (small) {
+		stack = small_buffer_stack;
+		payload = 128;
+	} else {
+		stack = large_buffer_stack;
+		payload = 1664;
+	}
+
+	/* Room for the back-pointer, worst-case alignment slack and the
+	 * payload.  Note that "dev_alloc_skb()" adds NET_SKB_PAD more
+	 * bytes, and also "reserves" that many bytes.
+	 */
+	len = sizeof(struct sk_buff **) + align + payload;
+
+	skb = dev_alloc_skb(len);
+	if (!skb)
+		return false;
+
+	/* Skip the slot that will hold the back-pointer ... */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+
+	/* ... then advance to the next aligned address. */
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Store the back-pointer just below the (aligned) data. */
+	((struct sk_buff **)skb->data)[-1] = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Pop every buffer from the given mPIPE buffer stack and free its skb.
+ *
+ * tile_net_provide_buffer() pushes buffers as tile I/O addresses (via
+ * va_to_tile_io_addr()), so each popped value is translated back with
+ * tile_io_addr_to_va() before the sk_buff back-pointer stored just
+ * below the buffer is read.  A popped value of NULL ends the loop.
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	void *io_addr;
+
+	while ((io_addr = gxio_mpipe_pop_buffer(&context, stack)) != NULL) {
+		void *va = tile_io_addr_to_va((unsigned long)io_addr);
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+
+		dev_kfree_skb_irq(*skb_ptr);
+	}
+}
+
+/* Hand mPIPE as many fresh linux buffers as this cpu still owes it.
+ *
+ * Small buffers are replenished first, then large ones.  Each "needed"
+ * counter is decremented only after a buffer was actually provided, so
+ * whatever could not be allocated now is retried on a later call.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	for (; info->num_needed_small_buffers != 0;
+	     info->num_needed_small_buffers--) {
+		if (!tile_net_provide_buffer(true))
+			goto short_of_buffers;
+	}
+
+	for (; info->num_needed_large_buffers != 0;
+	     info->num_needed_large_buffers--) {
+		if (!tile_net_provide_buffer(false))
+			goto short_of_buffers;
+	}
+
+	return;
+
+short_of_buffers:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered".
+ *
+ * "info" is the per-cpu state whose iqueue produced "idesc".  A packet
+ * is filtered when mPIPE had no buffer for it, when no device is up on
+ * its channel, or when a non-promiscuous device receives a unicast
+ * frame for another address.  Otherwise the packet is passed to the
+ * stack and a replacement buffer of the same size class is recorded as
+ * needed.  In every case the idesc is consumed before returning.
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer, i.e. the address that was pushed
+	 * by tile_net_provide_buffer() and that has the skb back-pointer
+	 * stored just below it.
+	 */
+	va -= NET_IP_ALIGN;
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address.  A nonzero
+			 * compare result means the addresses differ.
+			 */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+
+		/* Acquire the associated "skb". */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia. */
+		if (skb->data != va) {
+			/* Panic here since there's a reasonable chance
+			 * that corrupt buffers means generic memory
+			 * corruption, with unpredictable system effects.
+			 */
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p",
+			      buf, skb, skb->data);
+		}
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update stats. */
+		tile_net_stats_add(1, &priv->stats.rx_packets);
+		tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+		/* Need a new buffer of the same size class; it is supplied
+		 * later by tile_net_provide_needed_buffers().
+		 */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This is the NAPI poll callback.  It returns the number of packets
+ * that were "processed" (filtered packets are not counted).
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			/* Both exits below jump to "done", which skips
+			 * napi_complete() and the interrupt re-enable.
+			 * NOTE(review): the batch-limit exit can return
+			 * less than "budget" without completing NAPI --
+			 * confirm that this is intended.
+			 */
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem by re-checking the
+	 * queue after the interrupt has been re-enabled.
+	 */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	/* Replace the buffers consumed by the packets handled above. */
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+/* Ingress interrupt handler: schedule NAPI polling on the cpu that
+ * took the interrupt.  Both arguments are unused.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+/* Free some completions.  This must be called with interrupts blocked.
+ *
+ * Walks "comps" from the oldest entry and frees each skb whose egress
+ * has completed on "equeue".  It stops at the first entry that is not
+ * yet complete, when the queue is empty, or after "limit" entries have
+ * been freed (a negative "limit" never matches, i.e. no limit).
+ *
+ * The update flag passed to gxio_mpipe_equeue_is_complete() is set for
+ * the first entry examined, and for every entry if "force_update".
+ */
+static void tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed;
+
+	for (freed = 0; comps->comp_last < comps->comp_next; ) {
+		unsigned int slot = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *oldest = &comps->comp_queue[slot];
+
+		if (!gxio_mpipe_equeue_is_complete(equeue, oldest->when,
+						   force_update || freed == 0))
+			break;
+
+		dev_kfree_skb_irq(oldest->skb);
+		comps->comp_last++;
+
+		freed++;
+		if (freed == limit)
+			break;
+	}
+}
+
+/* Add a completion.  This must be called with interrupts blocked.
+ *
+ * Records that "skb" may be freed once "equeue" has completed up to
+ * "when".  If the queue is (nearly) full this spins, freeing finished
+ * completions in batches of up to 32, until a slot is available.
+ *
+ * FIXME: We should probably have stopped the queue earlier rather
+ * than having to wait here.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	struct tile_net_comp *slot;
+
+	/* Wait for a free completion entry, if needed. */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Fill in the next slot and publish it. */
+	slot = &comps->comp_queue[comps->comp_next % TILE_NET_MAX_COMPS];
+	slot->when = when;
+	slot->skb = skb;
+	comps->comp_next++;
+}
+
+/* Make sure the egress timer is scheduled.
+ *
+ * If the timer is not already marked as scheduled, arm it to fire one
+ * jiffy from now and mark it.  We use this "schedule if not scheduled"
+ * logic instead of the more obvious "reschedule" logic, because
+ * "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+
+	mod_timer_pinned(&info->egress_timer, jiffies + 1);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * "arg" is the owning struct tile_net_info.  With interrupts blocked,
+ * this frees every finished completion on every egress channel of that
+ * tile, and re-arms the timer if any completion is still outstanding.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+	unsigned long flags;
+	bool more_pending = false;
+	int echannel;
+
+	local_irq_save(flags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (echannel = 0; echannel < TILE_NET_CHANNELS; echannel++) {
+		struct tile_net_comps *comps =
+			info->comps_for_echannel[echannel];
+
+		/* Only channels with outstanding completions matter. */
+		if (comps->comp_last < comps->comp_next) {
+			tile_net_free_comps(
+				egress_for_echannel[echannel].equeue,
+				comps, -1, true);
+			if (comps->comp_last < comps->comp_next)
+				more_pending = true;
+		}
+	}
+
+	/* Reschedule timer if needed. */
+	if (more_pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(flags);
+}
+
+/* Prepare the per-cpu info of the cpu this runs on: record the cpu
+ * number, mark the iqueue as not yet valid, and set up (but do not
+ * arm) the egress timer.  The argument is unused.
+ */
+static void tile_net_prepare_cpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	info->my_cpu = smp_processor_id();
+	info->has_iqueue = false;
+
+	/* The timer callback receives "info" back as its argument. */
+	init_timer(&info->egress_timer);
+	info->egress_timer.function = tile_net_handle_egress_timer;
+	info->egress_timer.data = (long)info;
+}
+
+/* Helper function for "tile_net_update()"; runs on each cpu in turn.
+ *
+ * "arg" is a net_device when at least one channel is in use, or NULL
+ * when none is.  Cpus without an iqueue do nothing.  Otherwise, with a
+ * device, NAPI is added (once) and enabled and the ingress irq is
+ * enabled; with NULL, the ingress irq and NAPI are disabled.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		/* Nothing is open: quiesce ingress on this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ *
+ * Rebuilds the classifier rules from the channels that currently have
+ * a device, commits them, and then runs tile_net_update_cpu() on every
+ * online cpu.
+ *
+ * Returns 0 on success, or -EIO if the rules could not be committed.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	/* This is too big to fit on the stack. */
+	static gxio_mpipe_rules_t rules;
+
+	struct net_device *cpu_arg;
+	bool any_channel = false;
+	int chan, cpu, err;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	for (chan = 0; chan < TILE_NET_CHANNELS; chan++) {
+		if (!tile_net_devs_for_channel[chan])
+			continue;
+
+		/* Open the (single) rule when the first channel shows up. */
+		if (!any_channel) {
+			any_channel = true;
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+
+		gxio_mpipe_rules_add_channel(&rules, chan);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	err = gxio_mpipe_rules_commit(&rules);
+	if (err) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", err);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()").
+	 * Each cpu is told about "dev" only if some channel is in use.
+	 */
+	cpu_arg = any_channel ? dev : NULL;
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 cpu_arg, 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (any_channel)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed simply via gxio_mpipe_destroy().
+ *
+ * "dev" is only used for log messages.  Returns 0 on success (and sets
+ * "ingress_irq", which marks the setup as done), or a negative value
+ * on failure.  On failure the per-cpu state touched here is returned
+ * to its pre-call values so that a later open can retry.
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+	size_t stack_bytes;
+	pte_t pte = { 0 };
+	void *small = NULL;
+	void *large = NULL;
+	int i, num_buffers, rc;
+	int network_cpus_count, cpu;
+	int ring, group, next_ring;
+	/* Both sizes are compile-time constants.  They are set up front
+	 * because the failure path uses them to free memory.
+	 * ISSUE: "comps_size" is more than strictly necessary.
+	 */
+	size_t comps_size = TILE_NET_CHANNELS * sizeof(struct tile_net_comps);
+	size_t notif_ring_size = IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	/* Enough for each network cpu to fill its iqueue plus one batch. */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages() so we get the required 64KB alignment as well.
+	 */
+	stack_bytes = ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			    64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		goto fail;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small buffer stack. */
+	small = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (small == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   stack_bytes);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  small_buf_size,
+					  small, stack_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Allocate the large buffer stack. */
+	large = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (large == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   stack_bytes);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  large_buf_size,
+					  large, stack_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Register all the client memory in mpipe TLBs. */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		goto fail;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Provide initial buffers. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail_pop;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail_pop;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					  0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed: %d\n",
+			   rc);
+		goto fail_pop;
+	}
+
+	/* Init NotifRings: "ring" stays at the first ring index, while
+	 * "next_ring" advances as rings are handed to cpus.
+	 */
+	ring = rc;
+	next_ring = rc;
+
+	for_each_online_cpu(cpu) {
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		/* Allocate the "comps". */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zu bytes comps memory\n",
+				   comps_size);
+			rc = -ENOMEM;
+			goto fail_pop;
+		}
+
+		/* Carve the one allocation into per-channel slices. */
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zu bytes iqueue memory\n",
+				   notif_ring_size);
+			rc = -ENOMEM;
+			goto fail_pop;
+		}
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					    addr, notif_ring_size, 0);
+		if (rc != 0) {
+			/* NOTE(review): whether "page" is still reachable
+			 * via "info->iqueue.idescs" (and so freed below)
+			 * after a failed init cannot be told from here.
+			 */
+			netdev_err(dev,
+				   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+			goto fail_pop;
+		}
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   rc);
+		goto fail_pop;
+	}
+	group = rc;
+
+	/* Use more buckets when more cpus share the load. */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		goto fail_pop;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+
+	if (rc != 0) {
+		netdev_err(
+			dev,
+			"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+			rc);
+		goto fail_pop;
+	}
+
+	/* Create an irq and register it. Note that "ingress_irq" being
+	 * initialized is how we know not to call this function again.
+	 */
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		goto fail_pop;
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		goto fail_pop;
+	}
+
+	/* Route each NotifRing's interrupt to the cpu that owns it. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+
+fail_pop:
+	/* Do cleanups that require the mpipe context first. */
+	tile_net_pop_all_buffers(small_buffer_stack);
+	tile_net_pop_all_buffers(large_buffer_stack);
+
+fail:
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		free_pages((unsigned long)(info->comps_for_echannel[0]),
+			   get_order(comps_size));
+		/* Every entry pointed into the block that was just freed. */
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] = NULL;
+		free_pages((unsigned long)(info->iqueue.idescs),
+			   get_order(notif_ring_size));
+		info->iqueue.idescs = NULL;
+		/* The iqueue (if any) is gone along with the context. */
+		info->has_iqueue = false;
+	}
+
+	if (small)
+		free_pages_exact(small, stack_bytes);
+	if (large)
+		free_pages_exact(large, stack_bytes);
+
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ *
+ * Allocates the TSO header area, the eDMA descriptor ring and the
+ * equeue bookkeeping structure, allocates an eDMA ring, initializes
+ * the equeue, and records the result in "egress_for_echannel".  Does
+ * nothing if the channel already has an equeue.
+ *
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ *
+ * Returns 0 on success (or if already set up), -ENOMEM if memory could
+ * not be allocated, or the gxio error code of the failing call.
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct page *hdr_page, *ring_page, *queue_page;
+	int hdr_order, ring_order, queue_order;
+	gxio_mpipe_equeue_t *equeue;
+	gxio_mpipe_edesc_t *edescs;
+	unsigned char *headers;
+	size_t edescs_size;
+	int edma;
+	int rc = -ENOMEM;
+
+	/* Only initialize once. */
+	if (egress->equeue != NULL)
+		return 0;
+
+	/* Memory for the "headers": one slot per equeue entry. */
+	hdr_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	hdr_page = alloc_pages(GFP_KERNEL, hdr_order);
+	if (!hdr_page) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << hdr_order);
+		goto out;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(hdr_page));
+
+	/* Memory for the "edescs" ring. */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	ring_order = get_order(edescs_size);
+	ring_page = alloc_pages(GFP_KERNEL, ring_order);
+	if (!ring_page) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto out_free_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(ring_page));
+
+	/* Memory for the "equeue" structure itself. */
+	queue_order = get_order(sizeof(*equeue));
+	queue_page = alloc_pages(GFP_KERNEL, queue_order);
+	if (!queue_page) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << queue_order);
+		goto out_free_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(queue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto out_free_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue. */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto out_free_equeue;
+	}
+
+	/* Publish the finished channel state. */
+	egress->equeue = equeue;
+	egress->headers = headers;
+	return 0;
+
+	/* Unwind in reverse order of allocation. */
+out_free_equeue:
+	__free_pages(queue_page, queue_order);
+
+out_free_edescs:
+	__free_pages(ring_page, ring_order);
+
+out_free_headers:
+	__free_pages(hdr_page, hdr_order);
+
+out:
+	return rc;
+}
+
+/* Open the mPIPE link called "link_name" into "link".
+ *
+ * Returns the link's channel number (0 .. TILE_NET_CHANNELS - 1) on
+ * success.  Returns the negative gxio error if the link could not be
+ * opened, or -EINVAL (after closing the link again) if the reported
+ * channel is out of range.  "dev" is only used for log messages.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	/* The channel cannot index our per-channel arrays; back out. */
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Runs under tile_net_devs_for_channel_mutex.  Performs the one-time
+ * mPIPE initialization if "ingress_irq" is still negative, opens the
+ * link(s) for this device (the "loop0"/"loop1" pair when the device
+ * name matches "loopify_link_name"), initializes egress state for the
+ * egress channel, publishes the device in tile_net_devs_for_channel[],
+ * and calls tile_net_update().
+ *
+ * Returns 0 on success.  On failure, closes any link opened here,
+ * resets the channel fields in "priv" to -1, and returns a negative
+ * error code (-EIO in place of values at or below -512).
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Start our transmit queue. */
+	netif_start_queue(dev);
+
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Unpublish the device while "priv->channel" is still a
+		 * valid index; once it has been reset to -1 it must not
+		 * be used to index tile_net_devs_for_channel[].
+		 */
+		tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, then, under
+ * tile_net_devs_for_channel_mutex, removes the device from
+ * tile_net_devs_for_channel[], calls tile_net_update() (ignoring its
+ * result), closes whichever links are open, and resets the channel
+ * fields in "priv" to -1.  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int err;
+
+	/* No more transmits from the stack. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	/* Close the loopify link first, if there is one. */
+	if (priv->loopify_channel >= 0) {
+		err = gxio_mpipe_link_close(&priv->loopify_link);
+		if (err != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	/* Then the main link. */
+	if (priv->channel >= 0) {
+		err = gxio_mpipe_link_close(&priv->link);
+		if (err != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel virtual address of the first byte of fragment "f":
+ * the address of the fragment's page plus the fragment's page offset.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	void *page_va = pfn_to_kaddr(page_to_pfn(skb_frag_page(f)));
+
+	return page_va + f->page_offset;
+}
+
+/* Used for paranoia to make sure we handle no ill-formed packets.
+ *
+ * If "cond" holds, warn, free the packet, and return NETDEV_TX_OK from
+ * the enclosing function.  Returning NETDEV_TX_OK tells the stack that
+ * the driver has consumed the skb, so it must be freed here or it is
+ * leaked.  The enclosing function must have a "struct sk_buff *skb"
+ * in scope.
+ */
+#define TSO_DROP_IF(cond) \
+	do { \
+		if (WARN_ON(cond)) { \
+			dev_kfree_skb_any(skb); \
+			return NETDEV_TX_OK; \
+		} \
+	} while (0)
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * NOTE(review): the sanity checks below drop any skb with no frags or
+ * with "skb->data_len == 0", so that case is not egressed here.
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ *
+ * The work is done in three passes over the segments, each of which
+ * restarts the walk over "sh->frags" from the beginning:
+ *   1. count the edescs needed (one per header plus one per piece of
+ *      payload), so that the slots can be reserved in one request;
+ *   2. build the per-segment headers in "egress->headers", indexed by
+ *      the slot that the header edesc will occupy;
+ *   3. post the header and payload edescs into the reserved slots.
+ *
+ * Returns NETDEV_TX_BUSY if the slots cannot be reserved, otherwise
+ * NETDEV_TX_OK (including when a sanity check rejects the packet).
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh" and "ih" are both the network header of "skb". */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check": the template ip header's fields that
+	 * change per segment, complemented so they can be replaced.
+	 */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check". */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	/* Empty packets (etc) would cause trouble below. */
+	TSO_DROP_IF(skb->data_len == 0);
+	TSO_DROP_IF(sh->nr_frags == 0);
+	TSO_DROP_IF(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	TSO_DROP_IF(skb_headlen(skb) != sh_len);
+	TSO_DROP_IF(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	TSO_DROP_IF(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	TSO_DROP_IF(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	TSO_DROP_IF(skb->protocol != htons(ETH_P_IP));
+	TSO_DROP_IF(ih->protocol != IPPROTO_TCP);
+	TSO_DROP_IF(skb->ip_summed != CHECKSUM_PARTIAL);
+	TSO_DROP_IF(csum_start != eh_len + ih_len);
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset the fragment cursor for the first (counting) pass. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	TSO_DROP_IF(f_id + 1 != sh->nr_frags);
+	TSO_DROP_IF(f_used != f_size);
+
+	local_irq_save(irqflags);
+
+	/* See comment in tile_net_tx(). */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset the fragment cursor for the second (header) pass. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum". */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jh->check = csum_long(jsum) ^ 0xffff;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags: only the final segment keeps them. */
+		if (!final) {
+			uh->fin = 0;
+			uh->psh = 0;
+		}
+
+		/* Compute the tcp pseudo-header checksum. */
+		uh->check = csum_long(tsum_hack + htons(s_len));
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot" back to the first reserved slot. */
+	slot -= num_edescs;
+
+	/* Flush the headers (presumably so the header writes are visible
+	 * before the edescs referring to them are posted below).
+	 */
+	wmb();
+
+	/* Reset the fragment cursor for the third (egress) pass. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload; "bound" is set on
+			 * the last piece of this segment.
+			 */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	/* Add a completion record for the last slot used. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Fill "frags" with one entry per piece of the packet to transmit.
+ *
+ * The linear body ("b_data", "b_len") comes first when "b_len" is
+ * non-zero, followed by one entry for every page fragment of "skb".
+ * Returns the number of entries written.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int k;
+
+	/* The linear part, if any. */
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	/* The paged parts. */
+	for (k = 0; k < shinfo->nr_frags; k++, out++) {
+		skb_frag_t *frag = &shinfo->frags[k];
+
+		out->buf = tile_net_frag_buf(frag);
+		out->length = skb_frag_size(frag);
+	}
+
+	return out - frags;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO packets (non-zero "gso_size") are handed to tile_net_tx_tso().
+ * Otherwise one edesc is built per piece of the packet, slots are
+ * reserved on the egress queue, the edescs are posted, and a
+ * completion record is added for "skb".
+ *
+ * Returns NETDEV_TX_BUSY if no slots could be reserved, otherwise
+ * NETDEV_TX_OK (or whatever tile_net_tx_tso() returns).
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue =
+		egress_for_echannel[priv->echannel].equeue;
+	gxio_mpipe_edesc_t tmpl = { { 0 } };
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	struct frag frags[MAX_FRAGS];
+	unsigned int tx_bytes;
+	unsigned int nfrags;
+	unsigned int k;
+	unsigned long flags;
+	s64 slot;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+	/* Segmentation offload takes a separate path. */
+	if (skb_shinfo(skb)->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
+	tx_bytes = (skb->len >= ETH_ZLEN) ? skb->len : ETH_ZLEN;
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	nfrags = tile_net_tx_frags(frags, skb, skb->data, skb_headlen(skb));
+
+	/* The stack index is only used to specify the TLB. */
+	tmpl.stack_idx = large_buffer_stack;
+
+	/* Build one edesc per piece from the template. */
+	for (k = 0; k < nfrags; k++) {
+		tmpl.va = va_to_tile_io_addr(frags[k].buf);
+		tmpl.xfer_size = frags[k].length;
+		edescs[k] = tmpl;
+	}
+
+	/* The final edesc ends the packet. */
+	edescs[nfrags - 1].bound = 1;
+
+	/* The initial edesc carries the checksum request, if any. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(flags);
+
+	/* Try to reserve slots for egress.  If the queue is full we
+	 * return NETDEV_TX_BUSY, which may lead to "Virtual device xxx
+	 * asks to queue packet" warnings.
+	 *
+	 * We might consider retrying briefly here, since in principle
+	 * egress slots become available quickly as the hardware engine
+	 * drains packets into the network.
+	 *
+	 * FIXME (bug# 9593): We should stop queues when they're full.
+	 * We may want to consider making tile_net be multiqueue with
+	 * one TX queue per CPU and ndo_select_queue defined
+	 * accordingly.  Initially we saw bad things happen when
+	 * stopping the queue, so we are continuing to work on this
+	 * for a future fix.
+	 */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, nfrags);
+	if (slot < 0) {
+		local_irq_restore(flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Post the edescs into consecutive reserved slots. */
+	k = 0;
+	while (k < nfrags) {
+		gxio_mpipe_equeue_put_at(equeue, edescs[k], slot);
+		slot++;
+		k++;
+	}
+
+	/* Add a completion record for the last slot used. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+
+	local_irq_restore(flags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Deal with a transmit timeout.
+ *
+ * The only action taken is to wake the device's transmit queue; no
+ * hardware state is inspected or reset here.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+/* Ioctl commands.
+ *
+ * No commands are implemented: every request, whatever "cmd" is,
+ * is rejected with -EOPNOTSUPP and "rq" is never examined.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Return the statistics block kept in this device's private data. */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *tn = netdev_priv(dev);
+	struct net_device_stats *stats = &tn->stats;
+
+	return stats;
+}
+
+/* Change the MTU.
+ *
+ * Values from 68 to 1500 inclusive are stored in "dev->mtu" and 0 is
+ * returned; anything else is rejected with -EINVAL.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver cannot change the MAC address, but the
+ * hardware does nothing with it either, so the address used on
+ * outgoing packets and accepted on incoming packets is entirely up
+ * to us.
+ *
+ * "p" points to a struct sockaddr holding the new address.  Returns 0
+ * after copying it into "dev->dev_addr", or -EINVAL if it is not a
+ * valid Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (is_valid_ether_addr(sa->sa_data)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The per-cpu ingress irq is disabled around a direct call to the
+ * ingress irq handler (passed NULL as its second argument), and is
+ * enabled again afterwards.  "dev" itself is not used.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* Net device operations implemented by this driver. */
+static const struct net_device_ops tile_net_ops = {
+	/* Bring-up and tear-down. */
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+
+	/* Transmit path. */
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+
+	/* Configuration and statistics. */
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_do_ioctl		= tile_net_ioctl,
+	.ndo_get_stats		= tile_net_get_stats,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* The setup function.
+ *
+ * Calls ether_setup() to assign various fields in dev, including
+ * setting IFF_BROADCAST and IFF_MULTICAST, and then applies the
+ * driver's own settings on top.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = 0;
+	dev->mtu = 1500;
+
+	/* Lockless xmit, hardware tx checksums, scatter/gather and TSO. */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM |
+			 NETIF_F_SG | NETIF_F_TSO;
+}
+
+/* Allocate the device structure for link "name", give it the MAC
+ * address "mac" (or a random one if "mac" is all zeroes), and register
+ * it.  Links whose name starts with "loop" are ignored.  Failures are
+ * logged; nothing is returned to the caller.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct tile_net_priv *priv;
+	struct net_device *dev;
+	int have_mac = 0;
+	int err;
+	int k;
+
+	/* HACK: Ignore "loop" links. */
+	if (!strncmp(name, "loop", 4))
+		return;
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (dev == NULL) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	/* Start "priv" out zeroed, with no channels assigned. */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* The MAC address must be set in the device struct before the
+	 * device is opened.  If the MAC is all zeroes, we use a random
+	 * address, since we're probably on the simulator.
+	 */
+	for (k = 0; k < 6 && !have_mac; k++)
+		have_mac = (mac[k] != 0);
+
+	if (!have_mac) {
+		random_ether_addr(dev->dev_addr);
+	} else {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	}
+
+	/* Register the network device; give the memory back on failure. */
+	err = register_netdev(dev);
+	if (err != 0) {
+		netdev_err(dev, "register_netdev failed %d\n", err);
+		free_netdev(dev);
+	}
+}
+
+/* Module initialization.
+ *
+ * Prepares every cpu, creates a net device for each link the mPIPE
+ * enumerates, and falls back to using all online cpus for networking
+ * when network_cpus_init() returns zero.  Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int idx = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Walk the links until enumeration fails, initializing each. */
+	while (gxio_mpipe_link_enumerate_mac(idx, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		idx++;
+	}
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v5] tilegx network driver: initial support
  2012-05-11 13:54                               ` Ben Hutchings
  2012-05-20  4:42                                 ` [PATCH v6] " Chris Metcalf
@ 2012-05-20 16:35                                 ` Chris Metcalf
  1 sibling, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-05-20 16:35 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, arnd, linux-kernel, netdev

On 5/11/2012 9:54 AM, Ben Hutchings wrote:
> Here's another very incomplete review for you.

Thanks, I (we) appreciate it!

>> +/* Define to support GSO. */
>> +#undef TILE_NET_GSO
> GSO is always enabled by the networking core.
>
>> +/* Define to support TSO. */
>> +#define TILE_NET_TSO
> No, put NETIF_F_TSO in hw_features so it can be switched at run-time.

We already had that; the TSO define was just to decide whether the driver
would even offer TSO support at all.  But on reflection it seems pointless
not to offer TSO, so I've made it true unconditionally and deleted the
define.  Similarly I got rid of the (totally pointless) GSO define and let
the core control whether it switches GSO on or not.

We are looking at GRO support for a following change, but obviously we need
to set up ethtool_ops for that first, so we'll be doing that as well.

>> +/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
>> +#define TILE_NET_TX_QUEUE_LEN 0
> This can be changed through sysfs, so there is no need for a compile-
> time option.

Fair enough, and in practice we don't change this default anyway, so I
eliminated it.

>> +/* Define to dump packets (prints out the whole packet on tx and rx). */
>> +#undef TILE_NET_DUMP_PACKETS
> Should really be controlled through a 'debug' module parameter (see
> netif_msg_init(), netif_msg_pktdata(), etc.)

We almost never use this functionality anyway, so for now, I've just
removed it.  If we want to reintroduce something like it, we'll use the
netif_msg stuff.

>> +	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
>> +	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
>> +	if (slot < 0) {
>> +		local_irq_restore(irqflags);
>> +		/* ISSUE: "Virtual device xxx asks to queue packet". */
>> +		return NETDEV_TX_BUSY;
>> +	}
> You're supposed to stop queues when they're full.  And since that state
> appears to be per-CPU, I think this device needs to be multiqueue with
> one TX queue per CPU and ndo_select_queue defined accordingly.
>
> [...]
>
> I'm not convinced you should be processing completions here at all.  But
> certainly you should have stopped the queue earlier rather than having
> to wait here.

This is a larger issue.  We are working on improving performance in the
driver overall, and how we handle per-cpu or global queueing, how we stop
and restart the driver, etc., will be part of it.  (The underlying mpipe
resources are not per-cpu, so it may or may not make sense to have the
driver believe it's multiqueue.)  I added some placeholder comments and a
reference to our internal bug ID on this issue.

> You mustn't treat random fields to atomic_t.  For one thing, atomic_t
> contains an int while stats are unsigned long...
>
> Also, you're adding cache contention between all your CPUs here.  You
> should maintain these stats per-CPU and then sum them in
> tile_net_get_stats().  Then you can just use ordinary additions.

Oops, you're right that atomic_t is the wrong size.  What I've done is
switch to atomic_long_t, but moved the cast to a separate
tile_net_stats_add() function that has a BUILD_BUG_ON() to validate that
the sizes match, and also a long comment explaining why tilegx's memory
network architecture makes atomic adds exactly the right kind of thing to
do here.  It's easy to forget that 99% of the world has a model of atomics
based on the Intel architecture.

> [...]
>> +/* Ioctl commands. */
>> +static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
>> +{
>> +	return -EOPNOTSUPP;
>> +}
> So why define it at all?

Because a following patch (not yet posted to LKML) adds support for
SIOCSHWTSTAMP and the ioctl was originally written that way to put the
framework in place.

The few suggestions I didn't respond to directly were pretty
straightforward and I just implemented them as you suggested.

Thanks again!  The revised patch will follow momentarily.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v6] tilegx network driver: initial support
  2012-05-20  4:42                                 ` [PATCH v6] " Chris Metcalf
@ 2012-05-20 20:55                                   ` David Miller
  2012-05-23 20:42                                     ` [PATCH v7] " Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: David Miller @ 2012-05-20 20:55 UTC (permalink / raw)
  To: cmetcalf; +Cc: bhutchings, arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Sun, 20 May 2012 00:42:03 -0400

> +static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)

This function has 80 lines of local variable declarations,
that's absolutely ridiculous.

A comment and an empty line interspacing many of these local
variable declarations, also ridiculous, and part of why it
is 80 lines long.

This function is completely unreadable, if I have to scan
multiple pages before I get to real code, the function is
broken.

You either need to compartmentalize these variable declarations
and/or write helper functions to spread it out.

This is some of the most unpleasant code I've had to review in quite
some time.  Look at other networking drivers in the tree, such as
drivers/net/ethernet/broadcom/tg3.c, and try to mimic their style and
layout.  Anything in your driver that is different is likely to get
you into trouble and make your driver hard to review.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v7] tilegx network driver: initial support
  2012-05-20 20:55                                   ` David Miller
@ 2012-05-23 20:42                                     ` Chris Metcalf
  2012-05-24  4:31                                       ` David Miller
  0 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-05-23 20:42 UTC (permalink / raw)
  To: bhutchings, arnd, David Miller, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
I worked with the original author of this driver to refactor the
code and conform more closely to conventional Linux coding style.
I'd appreciate any additional feedback - thanks!

 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1798 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1801 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..abfff7f
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1798 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+/* A single completion: an skb to be freed once egress has progressed
+ * far enough.
+ */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete
+	 * (the value handed to gxio_mpipe_equeue_is_complete()).
+	 */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device.
+ *
+ * "comp_queue" is used as a ring: "comp_next" and "comp_last" are
+ * running counts that are reduced modulo TILE_NET_MAX_COMPS to find a
+ * slot, so the entries from "comp_last" up to (but not including)
+ * "comp_next" are the ones still pending.
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+/* Info for a specific cpu. */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel. */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* The "tile_net.cpus" argument specifies the cpus that are dedicated
+ * to handle ingress packets.
+ *
+ * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where
+ * m, n, x, y are integer numbers that represent the cpus that can be
+ * neither a dedicated cpu nor a dataplane cpu.
+ */
+static bool network_cpus_init(void)
+{
+	char cpulist[1024];
+
+	/* Nothing to do if the "cpus" parameter was not supplied. */
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map)) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Remove dedicated cpus (keep only those in cpu_possible_mask). */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Report the cpus that were finally selected. */
+	cpulist_scnprintf(cpulist, sizeof(cpulist), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", cpulist);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically update a statistics field.
+ * Note that on TILE-Gx, this operation is fire-and-forget on the
+ * issuing core (single-cycle dispatch) and takes only a few cycles
+ * longer than a regular store when the request reaches the home cache.
+ * No expensive bus management overhead is required.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	/* View the plain counter as an atomic of the same size. */
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	BUILD_BUG_ON(sizeof(*counter) != sizeof(*field));
+	atomic_long_add(value, counter);
+}
+
+/* Allocate and push a buffer.
+ *
+ * Allocates one skb, aligns its data pointer, stores a back-pointer to
+ * the skb immediately below the data, and pushes the data address onto
+ * the small or large mPIPE buffer stack (selected by "small").
+ *
+ * Returns false if the skb could not be allocated, true otherwise.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+	const unsigned long buffer_alignment = 128;
+	struct sk_buff *skb;
+	int len;
+
+	/* Payload size, plus room for the back-pointer and for the
+	 * bytes that may be skipped to reach an aligned address.
+	 */
+	len = sizeof(struct sk_buff **) + buffer_alignment;
+	len += (small ? 128 : 1664);
+	skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb' and guarantee alignment. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+	skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
+
+	/* Save a back-pointer to 'skb'. */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	/* The hardware is given the I/O address, not the kernel VA. */
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Pop every buffer from the given mPIPE buffer stack and free its skb.
+ *
+ * Buffers are pushed as I/O addresses (see tile_net_provide_buffer()),
+ * so each popped value must be converted back to a kernel VA before
+ * the skb back-pointer stored just below the buffer can be read.
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	void *addr;
+
+	while ((addr = gxio_mpipe_pop_buffer(&context, stack)) != NULL) {
+		void *va = tile_io_addr_to_va((unsigned long)addr);
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+
+		dev_kfree_skb_irq(*skb_ptr);
+	}
+}
+
+/* Provide linux buffers to mPIPE.
+ *
+ * Replenishes the small buffers first, then the large ones, stopping at
+ * the first allocation failure; the "needed" counts on "info" are left
+ * holding whatever is still outstanding.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	bool ok = true;
+
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Return true if the packet at "buf" should be dropped for "dev". */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	bool wanted;
+
+	/* Filter packets received before we're up. */
+	if (!dev || (dev->flags & IFF_UP) == 0)
+		return true;
+
+	/* Keep everything in promiscuous mode, all multicast frames,
+	 * and frames addressed to this device; filter the rest.
+	 */
+	wanted = (dev->flags & IFF_PROMISC) ||
+		 is_multicast_ether_addr(buf) ||
+		 compare_ether_addr(dev->dev_addr, buf) == 0;
+
+	return !wanted;
+}
+
+/* Convert a raw mpipe buffer to its matching skb pointer.
+ *
+ * The skb pointer is read from the word stored immediately below "va".
+ */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	struct sk_buff **back_ptr = (struct sk_buff **)va - 1;
+	struct sk_buff *owner = *back_ptr;
+
+	/* Paranoia: a mismatch suggests the buffer was corrupted, which
+	 * may well mean generic memory corruption with unpredictable
+	 * system effects, so panic rather than continue.
+	 */
+	if (owner->data != va)
+		panic("Corrupt linux buffer! "
+		      "va=%p, skb=%p, skb->data=%p",
+		      va, owner, owner->data);
+
+	return owner;
+}
+
+/* Deliver one received packet to the network stack.
+ *
+ * "len" is the packet length to record in "skb".  Also bumps the rx
+ * stats of "dev" and notes on "info" that a replacement buffer of the
+ * same size class (small or large) must be provided later.
+ */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 struct tile_net_info *info,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Encode the actual packet length. */
+	skb_put(skb, len);
+
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Acknowledge "good" hardware checksums. */
+	if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	netif_receive_skb(skb);
+
+	/* Update stats. */
+	tile_net_stats_add(1, &priv->stats.rx_packets);
+	tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+	/* Need a new buffer. */
+	if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+		info->num_needed_small_buffers++;
+	else
+		info->num_needed_large_buffers++;
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered".
+ *
+ * In every case the idesc is consumed from the iqueue of "info".
+ * A packet flagged "be" (no buffer was available) is reported as
+ * filtered.  The owning device is looked up by ingress channel and may
+ * be NULL, in which case filter_packet() rejects the packet.
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		/* FIXME: Update "drop" statistics. */
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, info, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ *
+ * Returns the number of packets that were processed (filtered packets
+ * are not counted against "budget").  When the batch or budget limit
+ * is hit, the function leaves via "done" without calling
+ * napi_complete() or re-enabling the ring interrupt.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	/* Replace the buffers consumed by the packets delivered above. */
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+/* Handle an ingress interrupt on the current cpu by scheduling the
+ * NAPI instance that belongs to this cpu.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	struct tile_net_info *this_cpu = &__get_cpu_var(per_cpu_info);
+
+	napi_schedule(&this_cpu->napi);
+
+	return IRQ_HANDLED;
+}
+
+/* Free some completions.  This must be called with interrupts blocked.
+ *
+ * Walks the pending completions oldest-first, freeing each skb whose
+ * completion has been reached, and stops at the first one that has not,
+ * or after "limit" entries have been freed (a negative "limit" never
+ * matches, so it means "no limit").  "force_update" is passed through
+ * to gxio_mpipe_equeue_is_complete(); it is also forced on for the
+ * first entry examined.
+ */
+static void tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed = 0;
+
+	for (;;) {
+		struct tile_net_comp *oldest;
+
+		/* Stop once nothing is pending. */
+		if (comps->comp_last >= comps->comp_next)
+			break;
+
+		oldest = &comps->comp_queue[comps->comp_last %
+					    TILE_NET_MAX_COMPS];
+		if (!gxio_mpipe_equeue_is_complete(equeue, oldest->when,
+						   force_update || freed == 0))
+			break;
+
+		dev_kfree_skb_irq(oldest->skb);
+		comps->comp_last++;
+		freed++;
+		if (freed == limit)
+			break;
+	}
+}
+
+/* Add a completion.  This must be called with interrupts blocked.
+ *
+ * Records that "skb" may be freed once the equeue reaches "when".  If
+ * the completions array is nearly full, this spins, freeing finished
+ * completions, until a slot becomes available.
+ *
+ * FIXME: We should probably have stopped the queue earlier rather
+ * than having to wait here.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	struct tile_net_comp *slot;
+
+	/* Wait for a free completion entry, if needed. */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Fill in the next slot of the completions array. */
+	slot = &comps->comp_queue[comps->comp_next % TILE_NET_MAX_COMPS];
+	slot->when = when;
+	slot->skb = skb;
+	comps->comp_next++;
+}
+
+/* Make sure the egress timer is scheduled.
+ *
+ * An already-scheduled timer is left alone rather than re-armed,
+ * because "reschedule" is fairly expensive compared with this
+ * "schedule if not scheduled" check.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+
+	mod_timer_pinned(&info->egress_timer, jiffies + 1);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * "arg" is the "struct tile_net_info" pointer for the tile, cast to
+ * unsigned long.
+ *
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+	unsigned long irqflags;
+	bool pending = false;
+	int i;
+
+	/* tile_net_free_comps() must run with interrupts blocked. */
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_egress *egress = &egress_for_echannel[i];
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+		/* Skip channels with nothing pending. */
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+		/* A limit of -1 never matches, i.e. free without limit. */
+		tile_net_free_comps(egress->equeue, comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+/* Helper function for "tile_net_update()".
+ * "dev" (i.e. arg) is the device being brought up or down,
+ * or NULL if all devices are now down.
+ *
+ * Operates on the per-cpu info of the cpu it runs on, and does nothing
+ * on cpus without an iqueue.  With a non-NULL "dev", NAPI is added
+ * (once) and enabled and the ingress irq is enabled for this cpu;
+ * with NULL, the irq and NAPI are disabled.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev != NULL) {
+		/* The NAPI struct is registered only the first time. */
+		if (!info->napi_added) {
+			netif_napi_add(dev, &info->napi,
+				       tile_net_poll, TILE_NET_WEIGHT);
+			info->napi_added = true;
+		}
+		if (!info->napi_enabled) {
+			napi_enable(&info->napi);
+			info->napi_enabled = true;
+		}
+		enable_percpu_irq(ingress_irq, 0);
+	} else {
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+	}
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ *
+ * Rebuilds the classifier rules from the channels that currently have
+ * a device in "tile_net_devs_for_channel", commits them, and then runs
+ * tile_net_update_cpu() on every online cpu.
+ *
+ * Returns 0 on success, or -EIO if the rules could not be committed.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	static gxio_mpipe_rules_t rules;  /* too big to fit on the stack */
+	bool saw_channel = false;
+	int channel;
+	int rc;
+	int cpu;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	/* A single rule is begun lazily, on the first active channel,
+	 * and every active channel is then added to it.
+	 */
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (!saw_channel) {
+			saw_channel = true;
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	rc = gxio_mpipe_rules_commit(&rules);
+	if (rc != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 (saw_channel ? dev : NULL), 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (saw_channel)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Returns 0 on success or a negative error code.  Nothing is undone
+ * here on failure; the globals that were set are left in place for
+ * the caller to clean up.
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact() so we get the required 64KB alignment as well.
+	 */
+	buffer_stack_size =
+		ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+		      64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small buffer stack. */
+	small_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (small_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_128,
+					  small_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	/* Allocate the large buffer stack. */
+	large_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (large_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_1664,
+					  large_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs).
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * The completions memory is allocated for every cpu passed in.  The
+ * iqueue, and therefore "ring", is only used when "cpu" is in
+ * "network_cpus_map".
+ *
+ * Returns 0 on success or a negative error code; memory allocated
+ * before a failure is not released here.
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+					int cpu, int ring)
+{
+	struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+	int order, i, rc;
+	struct page *page;
+	void *addr;
+
+	/* Allocate the "comps". */
+	order = get_order(COMPS_SIZE);
+	page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+	if (page == NULL) {
+		netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+			   COMPS_SIZE);
+		return -ENOMEM;
+	}
+	addr = pfn_to_kaddr(page_to_pfn(page));
+	memset(addr, 0, COMPS_SIZE);
+	/* Carve the one allocation into a per-echannel array; entry 0
+	 * is therefore also the base address of the whole allocation.
+	 */
+	for (i = 0; i < TILE_NET_CHANNELS; i++)
+		info->comps_for_echannel[i] =
+			addr + i * sizeof(struct tile_net_comps);
+
+	/* If this is a network cpu, create an iqueue. */
+	if (cpu_isset(cpu, network_cpus_map)) {
+		order = get_order(NOTIF_RING_SIZE);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes iqueue memory\n",
+				   NOTIF_RING_SIZE);
+			return -ENOMEM;
+		}
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring,
+					    addr, NOTIF_RING_SIZE, 0);
+		if (rc != 0) {
+			netdev_err(dev,
+				   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+			return rc;
+		}
+		info->has_iqueue = true;
+	}
+
+	return 0;
+}
+
+/* Initialize NotifGroup and buckets.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * "ring" is the first of "network_cpus_count" NotifRings to attach to
+ * the group.  Sets the globals "num_buckets" and "first_bucket".
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group, rc;
+
+	/* Allocate one NotifGroup. */
+	rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   rc);
+		return rc;
+	}
+	group = rc;
+
+	/* Initialize global num_buckets value: 256 for more than four
+	 * network cpus, 16 for two to four, else the existing value.
+	 */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets, and set global first_bucket value. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		return rc;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (rc != 0) {
+		netdev_err(
+			dev,
+			"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+			rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Create an irq and register it, then activate the irq and request
+ * interrupts on all cores.  Note that "ingress_irq" being initialized
+ * is how we know not to call tile_net_init_mpipe() again.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Returns 0 on success or a negative error code, in which case
+ * "ingress_irq" is left at (or reset to) -1.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu, rc;
+
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		return rc;
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		/* Undo create_irq() so that a later open retries. */
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return rc;
+	}
+
+	/* Ask for the interrupt from each cpu's NotifRing; only cpus
+	 * that own an iqueue have one.
+	 */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to tile_net_init_mpipe.
+ *
+ * Frees any skbs still held in the buffer stacks, destroys the mpipe
+ * context, releases the per-cpu completions and iqueue memory and the
+ * buffer stack memory, and resets the related globals.
+ */
+static void tile_net_init_mpipe_fail(void)
+{
+	int cpu;
+
+	/* Do cleanups that require the mpipe context first. */
+	if (small_buffer_stack >= 0)
+		tile_net_pop_all_buffers(small_buffer_stack);
+	if (large_buffer_stack >= 0)
+		tile_net_pop_all_buffers(large_buffer_stack);
+
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		/* Entry 0 is the base of the whole comps allocation. */
+		free_pages((unsigned long)(info->comps_for_echannel[0]),
+			   get_order(COMPS_SIZE));
+		info->comps_for_echannel[0] = NULL;
+		free_pages((unsigned long)(info->iqueue.idescs),
+			   get_order(NOTIF_RING_SIZE));
+		info->iqueue.idescs = NULL;
+	}
+
+	if (small_buffer_stack_va)
+		free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+	if (large_buffer_stack_va)
+		free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+
+	/* Reset the globals to their initial "not set up" values. */
+	small_buffer_stack_va = NULL;
+	large_buffer_stack_va = NULL;
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ *
+ * Returns 0 on success or a negative error code; on failure after
+ * gxio_mpipe_init() succeeded, tile_net_init_mpipe_fail() is called
+ * to undo the partial setup.
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int i, num_buffers, rc;
+	int cpu;
+	int first_ring, ring;
+	int network_cpus_count = cpus_weight(network_cpus_map);
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Set up the buffer stacks.  Each network cpu may hold a full
+	 * iqueue plus one batch of idescs before buffers are replaced.
+	 */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init NotifRings per-cpu.  Only network cpus consume a ring, so
+	 * only advance "ring" for them; otherwise the ring ids handed out
+	 * could run past the "network_cpus_count" rings allocated above.
+	 */
+	first_ring = rc;
+	ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
+		if (rc != 0)
+			goto fail;
+		if (cpu_isset(cpu, network_cpus_map))
+			ring++;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts. */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ *
+ * Does nothing if the channel already has an equeue.  Otherwise
+ * allocates the TSO headers, the edescs ring and the equeue itself,
+ * allocates an eDMA ring, and initializes the equeue on it.
+ *
+ * Returns 0 on success or a negative error code; on failure the three
+ * memory allocations made so far are released.
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct page *headers_page, *edescs_page, *equeue_page;
+	gxio_mpipe_edesc_t *edescs;
+	gxio_mpipe_equeue_t *equeue;
+	unsigned char *headers;
+	int headers_order, edescs_order, equeue_order;
+	size_t edescs_size;
+	int edma;
+	int rc = -ENOMEM;
+
+	/* Only initialize once. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue. */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done. */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+	/* The labels below fall through, releasing the allocations in
+	 * the reverse of the order in which they were made.
+	 */
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Open the mPIPE link called "link_name" and look up its channel.
+ *
+ * Returns the channel number (0 .. TILE_NET_CHANNELS - 1) on success.
+ * Returns the negative gxio_mpipe_link_open() result if the open fails,
+ * or -EINVAL (after closing the link again) if the channel reported for
+ * the link is out of range.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc;
+
+	rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Performs the one-time mPIPE initialization while ingress_irq is still
+ * negative, opens the link(s) for this device ("loop0"/"loop1" when the
+ * device is the one named by loopify_link_name), initializes egress for
+ * the egress channel, and records the device in
+ * tile_net_devs_for_channel[] before calling tile_net_update().
+ *
+ * Returns 0 on success.  On failure every link opened here is closed
+ * again, the channel fields are reset to -1, and a negative error is
+ * returned (values of -512 or below are reported as -EIO).
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	netif_start_queue(dev);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Unpublish the device while "channel" is still a valid
+		 * index.  This must happen before the field is reset to -1
+		 * (and must be skipped entirely if no link was opened), or
+		 * the store would land at tile_net_devs_for_channel[-1].
+		 */
+		if (tile_net_devs_for_channel[priv->channel] == dev)
+			tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, removes the device from
+ * tile_net_devs_for_channel[], calls tile_net_update(), and closes
+ * whichever links are open.  Always returns 0; a failure to close a
+ * link is only logged.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Clear our table entry first; tile_net_update() runs afterwards. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	/* The "loopify" link is only open for the loopify device. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link))
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link))
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel virtual address of the first byte of fragment "f",
+ * i.e. the address of the fragment's page plus its page_offset.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	struct page *page = skb_frag_page(f);
+
+	return pfn_to_kaddr(page_to_pfn(page)) + f->page_offset;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care.
+ *
+ * The payload is walked as: any bytes of the linear area that follow
+ * the headers, then each page fragment in order.  Every segment carries
+ * gso_size bytes of payload except the last, which carries whatever is
+ * left.  tso_headers_prepare() and tso_egress() walk the payload in
+ * exactly the same way; the three loops must be kept in sync, since the
+ * value returned here is the number of equeue slots the other two use.
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	unsigned int data_len = skb->len - sh_len;  /* payload bytes left */
+	unsigned int p_len = sh->gso_size;
+	long f_id = -1;    /* id of the current fragment (-1: linear area) */
+	long f_size;       /* size of the current fragment */
+	long f_used = 0;   /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int num_edescs = 0;
+	int segment;
+
+	/* Start with the payload (if any) that follows the headers in the
+	 * linear area; a size <= 0 makes the walk move straight to frag 0.
+	 */
+	f_size = (long)skb_headlen(skb) - (long)sh_len;
+
+	/* Even the first segment cannot exceed the available payload. */
+	if (data_len < p_len)
+		p_len = data_len;
+
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		unsigned int p_used = 0;
+
+		/* One edesc for header and for each piece of the payload. */
+		for (num_edescs++; p_used < p_len; num_edescs++) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = skb_frag_size(&sh->frags[f_id]);
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+		}
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	return num_edescs;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ * FIXME: add support for IPv6.
+ *
+ * One copy of the original headers is written per segment into the
+ * "headers" area, at the HEADER_BYTES-sized entry that corresponds to
+ * the equeue slot the segment's header edesc will occupy; "slot" is the
+ * first slot reserved for this skb.  In each copy the IP total length,
+ * id and checksum, and the TCP sequence number and checksum field, are
+ * rewritten for that segment, and FIN/PSH are cleared on all but the
+ * last segment.
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+				s64 slot)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct iphdr *ih;
+	struct tcphdr *th;
+	unsigned char *data = skb->data;
+	unsigned int ih_off, th_off, sh_len, data_len, p_len;
+	unsigned int isum_start, tsum_start, id, seq;
+	long f_id = -1;    /* id of the current fragment (-1: linear area) */
+	long f_size;       /* size of the current fragment */
+	long f_used = 0;   /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int segment;
+
+	/* Locate original headers and compute various lengths. */
+	ih = ip_hdr(skb);
+	th = tcp_hdr(skb);
+	ih_off = (unsigned char *)ih - data;
+	th_off = (unsigned char *)th - data;
+	sh_len = th_off + tcp_hdrlen(skb);
+	data_len = skb->len - sh_len;
+	p_len = sh->gso_size;
+	if (data_len < p_len)
+		p_len = data_len;
+
+	/* Payload walk starts with what follows the headers in "data". */
+	f_size = (long)skb_headlen(skb) - (long)sh_len;
+
+	/* Set up seed values for IP and TCP csum and initialize id and seq. */
+	isum_start = ((0xFFFF - ih->check) +
+		      (0xFFFF - ih->tot_len) +
+		      (0xFFFF - ih->id));
+	tsum_start = th->check + (0xFFFF ^ htons(skb->len));
+	id = ntohs(ih->id);
+	seq = ntohl(th->seq);
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		unsigned char *buf;
+		unsigned int total_len = sh_len + p_len;
+		unsigned int p_used = 0;
+
+		/* Copy to the header memory for this segment. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		memcpy(buf, data, sh_len);
+
+		/* Update copied ip header. */
+		ih = (struct iphdr *)(buf + ih_off);
+		ih->tot_len = htons(total_len - ih_off);
+		ih->id = htons(id);
+		ih->check = csum_long(isum_start + htons(total_len - ih_off) +
+				      htons(id)) ^ 0xffff;
+
+		/* Update copied tcp header. */
+		th = (struct tcphdr *)(buf + th_off);
+		th->seq = htonl(seq);
+		th->check = csum_long(tsum_start + htons(total_len));
+		if (segment != sh->gso_segs - 1) {
+			th->fin = 0;
+			th->psh = 0;
+		}
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = skb_frag_size(&sh->frags[f_id]);
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Flush the headers so they are ready for hardware DMA. */
+	wmb();
+}
+
+/* Pass all the data to mpipe for egress.
+ *
+ * For each segment this writes one edesc pointing at the prepared header
+ * copy in "headers", followed by one edesc per contiguous piece of
+ * payload, starting at equeue slot "slot".  The final payload edesc of
+ * a segment has "bound" set.  The device's tx_packets/tx_bytes counters
+ * are updated once at the end.
+ */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+		       struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int p_len = sh->gso_size;
+	unsigned int data_len;
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+	long f_id = -1;    /* id of the current fragment (-1: linear area) */
+	long f_size;       /* size of the current fragment */
+	long f_used = 0;   /* bytes used from the current fragment */
+	void *f_data;      /* start of the current fragment */
+	long n;            /* size of the current piece of payload */
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	unsigned int csum_start, sh_len;
+	int segment;
+
+	/* Prepare to egress the headers: set up header edesc. */
+	csum_start = skb_checksum_start_offset(skb);
+	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Payload walk starts with what follows the headers in "data". */
+	data_len = skb->len - sh_len;
+	if (data_len < p_len)
+		p_len = data_len;
+	f_size = (long)skb_headlen(skb) - (long)sh_len;
+	f_data = skb->data + sh_len;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		void *va;
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Egress the header. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = skb_frag_size(&sh->frags[f_id]);
+				f_data = tile_net_frag_buf(&sh->frags[f_id]);
+				f_used = 0;
+			}
+
+			va = f_data + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload. */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += sh_len + p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Do TSO handling for egress.
+ *
+ * Counts the edescs the skb needs, reserves that many equeue slots,
+ * writes the per-segment headers and the edescs, and queues a completion
+ * record for the last slot.  Runs with local interrupts disabled between
+ * the reservation and the completion record.
+ *
+ * Returns NETDEV_TX_OK, or NETDEV_TX_BUSY if the slots could not be
+ * reserved (in which case nothing has been written and the skb is
+ * untouched).
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int channel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[channel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned int num_edescs;
+	unsigned long irqflags;
+	s64 slot;
+
+	/* Determine how many mpipe edesc's are needed. */
+	num_edescs = tso_count_edescs(skb);
+
+	local_irq_save(irqflags);
+
+	/* Set first reserved egress slot; see comment in tile_net_tx().
+	 * A negative value means the reservation failed; it must not be
+	 * used as a slot number, since it would index before the start of
+	 * the header area and be handed to the equeue as a slot.
+	 */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Set up copies of header data properly. */
+	tso_headers_prepare(skb, egress->headers, slot);
+
+	/* Actually pass the data to the network hardware. */
+	tso_egress(dev, equeue, skb, egress->headers, slot);
+
+	/* Add a completion record. */
+	add_comp(equeue, info->comps_for_echannel[channel],
+		 slot + num_edescs - 1, skb);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Flatten a transmit request into an array of memory chunks.
+ *
+ * If "b_len" is nonzero, the first entry describes the body
+ * ("b_data", "b_len"); one entry per page fragment of "skb" follows.
+ * Returns the number of entries written to "frags".
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int i;
+
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	for (i = 0; i < sh->nr_frags; i++, out++) {
+		skb_frag_t *f = &sh->frags[i];
+
+		out->buf = tile_net_frag_buf(f);
+		out->length = skb_frag_size(f);
+	}
+
+	return out - frags;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO packets are handed to tile_net_tx_tso().  Otherwise one edesc is
+ * built per chunk of the skb (linear area plus page fragments), equeue
+ * slots are reserved for them, and a completion record is queued for
+ * the last slot.
+ *
+ * Returns NETDEV_TX_OK if the packet was queued (or dropped because it
+ * contained no data at all), or NETDEV_TX_BUSY if no egress slots were
+ * available.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+	unsigned int num_frags;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned long irqflags;
+	gxio_mpipe_edesc_t edesc = { { 0 } };
+	unsigned int i;
+	s64 slot;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* An skb with no data at all has nothing to send.  Drop it here:
+	 * "edescs[num_frags - 1]" below would otherwise index with a
+	 * wrapped-around unsigned value.
+	 */
+	if (unlikely(num_frags == 0)) {
+		tile_net_stats_add(1, &priv->stats.tx_dropped);
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* This is only used to specify the TLB. */
+	edesc.stack_idx = large_buffer_stack;
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_frags; i++) {
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edescs[i] = edesc;
+	}
+
+	/* Mark the final edesc. */
+	edescs[num_frags - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Try to reserve slots for egress.  If we fail due to the
+	 * queue being full, we return NETDEV_TX_BUSY.  This may lead
+	 * to "Virtual device xxx asks to queue packet" warnings.
+	 *
+	 * We might consider retrying briefly here since we expect in
+	 * principle that egress slots become available quickly as the
+	 * hardware engine drains packets into the network.
+	 *
+	 * FIXME: We should stop queues when they're full.
+	 * We may want to consider making tile_net be multiqueue with
+	 * one TX queue per CPU and ndo_select_queue defined
+	 * accordingly.  Initially we saw bad things happen when
+	 * stopping the queue, so we are continuing to work on this
+	 * for a future fix.
+	 */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);
+
+	/* Add a completion record. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max(len, (unsigned int)ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Transmit-timeout callback: the only recovery done is to wake the
+ * device's transmit queue again.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	netif_wake_queue(dev);
+}
+
+/* Ioctl callback: no device-private ioctl commands are implemented, so
+ * every request is rejected with -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Return the statistics block kept in this device's private data. */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv;
+
+	priv = netdev_priv(dev);
+
+	return &priv->stats;
+}
+
+/* Change the MTU.
+ *
+ * Accepts values from 68 to 1500 inclusive and stores them in dev->mtu.
+ * Returns 0 on success, or -EINVAL if "new_mtu" is out of range.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points to a struct sockaddr whose sa_data holds the new address;
+ * it is copied into dev->dev_addr only.
+ *
+ * Returns 0 on success, or -EINVAL if the address is not a valid
+ * Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (is_valid_ether_addr(addr->sa_data)) {
+		memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress handler is invoked directly, with the per-cpu ingress
+ * irq disabled around the call.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* The net_device callbacks implemented by this driver. */
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+	.ndo_get_stats		= tile_net_get_stats,
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_do_ioctl		= tile_net_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* The setup function.
+ *
+ * Starts from the ether_setup() defaults, then installs this driver's
+ * callbacks and watchdog timeout, advertises the LLTX, HW_CSUM, SG and
+ * TSO features, and sets a zero tx_queue_len and an MTU of 1500.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM |
+			 NETIF_F_SG | NETIF_F_TSO;
+	dev->tx_queue_len = 0;
+	dev->mtu = 1500;
+}
+
+/* Allocate the device structure, set its MAC address, and register it.
+ *
+ * "name" is the link name, used verbatim as the device name; links whose
+ * name starts with "loop" are ignored.  "mac" is the 6-byte MAC address
+ * reported for the link; if it is all zeroes a random address is used
+ * instead.  Failures are logged and otherwise ignored (no device is
+ * created).
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+	int ret;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	/* Initialize "priv". */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* Get the MAC address and set it in the device struct; this must
+	 * be done before the device is opened.  If the MAC is all zeroes,
+	 * we use a random address, since we're probably on the simulator.
+	 */
+	if (!is_zero_ether_addr(mac)) {
+		memcpy(dev->dev_addr, mac, ETH_ALEN);
+		dev->addr_len = ETH_ALEN;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+	}
+}
+
+/* Per-cpu module initialization.
+ *
+ * Runs on the cpu whose per_cpu_info it initializes: records the cpu
+ * number, marks the iqueue as not yet valid, and prepares (without
+ * arming) the egress timer, whose callback receives "info" as its data.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct timer_list *timer = &info->egress_timer;
+
+	info->has_iqueue = false;
+	info->my_cpu = smp_processor_id();
+
+	/* Initialize the egress timer. */
+	init_timer(timer);
+	timer->data = (long)info;
+	timer->function = tile_net_handle_egress_timer;
+}
+
+/* Module initialization.
+ *
+ * Initializes the per-cpu state on every cpu, creates a net_device for
+ * each link that gxio_mpipe_link_enumerate_mac() reports, and falls back
+ * to all online cpus when network_cpus_init() does not provide a cpu set.
+ * Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int idx = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them. */
+	while (gxio_mpipe_link_enumerate_mac(idx, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		idx++;
+	}
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v7] tilegx network driver: initial support
  2012-05-23 20:42                                     ` [PATCH v7] " Chris Metcalf
@ 2012-05-24  4:31                                       ` David Miller
  2012-05-25 14:42                                         ` [PATCH v8] " Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: David Miller @ 2012-05-24  4:31 UTC (permalink / raw)
  To: cmetcalf; +Cc: bhutchings, arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Wed, 23 May 2012 16:42:03 -0400

> + * FIXME (bug 11489): add support for IPv6.
 ...
> +	 * FIXME (bug# 11479): We should stop queues when they're full.
 ...

Mentioning bug numbers in the driver source is not appropriate.

This second problem looks extremely serious, rather than some minor
issue to look into at some time in the future.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v8] tilegx network driver: initial support
  2012-05-24  4:31                                       ` David Miller
@ 2012-05-25 14:42                                         ` Chris Metcalf
  2012-06-04 20:12                                           ` [PATCH v9] " Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-05-25 14:42 UTC (permalink / raw)
  To: bhutchings, arnd, David Miller, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version of the patch fixes the issue where we were failing
to properly stop the net_device queue when the mpipe egress queue
filled up.  I also removed the internal bug numbers from the sources.

 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1854 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1857 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..cc00ba5
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1854 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+/* Timeout to wake the per-device TX timer after we stop the queue.
+ * We don't want the timeout too short (adds overhead, and might end
+ * up causing stop/wake/stop/wake cycles) or too long (affects performance).
+ * For the 10 Gb NIC, 30 usec means roughly 30+ 1500-byte packets.
+ */
+#define TX_TIMER_DELAY_USEC 30
+
+/* Timeout to wake the per-cpu egress timer to free completions. */
+#define EGRESS_TIMER_DELAY_USEC 1000
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory).
+ * "buf" points at the chunk and "length" is its size (presumably in
+ * bytes, given the size_t type; confirm against the users).
+ */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+/* A single completion.
+ * This is the element type of tile_net_comps.comp_queue; it pairs an
+ * skb with the completion count at which that skb may be freed.
+ */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device.
+ * Each tile_net_info holds one pointer to such a structure per egress
+ * channel, in its comps_for_echannel[] array.
+ */
+struct tile_net_comps {
+	/* The completions (TILE_NET_MAX_COMPS entries). */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+/* Info for a specific cpu.
+ * One instance exists per cpu, as the per-cpu variable "per_cpu_info".
+ */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct hrtimer egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel (TILE_NET_CHANNELS pointers). */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel.
+ * Instances live in the egress_for_echannel[] array, which is indexed
+ * by the egress channel number.
+ */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device.
+ * The "channel" and "loopify_channel" fields hold -1 whenever the
+ * corresponding link is not open, as noted on each field below.
+ */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+	/* Timer to wake up tx queue */
+	struct hrtimer tx_wake_timer;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* The "tile_net.cpus" argument specifies the cpus that are dedicated
+ * to handle ingress packets.
+ *
+ * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", i.e. a
+ * cpu list.  It is parsed into network_cpus_map and then restricted to
+ * the cpus present in cpu_possible_mask.
+ *
+ * Returns true if a non-empty cpu set was obtained (and logged); returns
+ * false if the parameter was not given, could not be parsed, or selects
+ * no cpu.
+ */
+static bool network_cpus_init(void)
+{
+	char buf[1024];
+
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map) != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Keep only the cpus that are also in cpu_possible_mask. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+
+	return true;
+}
+
+/* The "tile_net.cpus" argument is stored as a string in
+ * "network_cpus_string" and parsed by network_cpus_init().
+ */
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.  The name is compared against dev->name in
+ * tile_net_open().
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".  Only whether
+ * the string is non-NULL is tested (see tile_net_handle_packet()).
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically add "value" to the statistics counter at "field".
+ *
+ * Note that on TILE-Gx, this operation is fire-and-forget on the
+ * issuing core (single-cycle dispatch) and takes only a few cycles
+ * longer than a regular store when the request reaches the home cache.
+ * No expensive bus management overhead is required.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	/* The cast above relies on both types having the same size. */
+	BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(unsigned long));
+
+	atomic_long_add(value, counter);
+}
+
+/* Allocate one receive buffer and push it onto an mPIPE buffer stack.
+ *
+ * "small" selects the small buffer stack (128 bytes of payload);
+ * otherwise the large buffer stack (1664 bytes) is used.  The skb is
+ * over-allocated so that a back-pointer to the skb can be stored
+ * immediately below a 128-byte-aligned data area.
+ *
+ * Returns false if the skb could not be allocated, true otherwise.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	const unsigned long buffer_alignment = 128;
+	struct sk_buff **backptr;
+	struct sk_buff *skb;
+	int stack;
+	int len;
+
+	if (small) {
+		stack = small_buffer_stack;
+		len = 128;
+	} else {
+		stack = large_buffer_stack;
+		len = 1664;
+	}
+	len += sizeof(struct sk_buff **) + buffer_alignment;
+
+	skb = dev_alloc_skb(len);
+	if (!skb)
+		return false;
+
+	/* Leave space for the back-pointer, then pad up to the alignment. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+	skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
+
+	/* Record the owning skb in the slot just below the data area. */
+	backptr = (struct sk_buff **)skb->data - 1;
+	*backptr = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	/* The hardware is given the I/O address of the data area. */
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Pop every buffer left on mPIPE buffer stack "stack" and free the skb
+ * that owns it.
+ *
+ * tile_net_provide_buffer() pushes buffers as I/O addresses obtained
+ * from va_to_tile_io_addr(), so each popped value has to be converted
+ * back with tile_io_addr_to_va() (as tile_net_handle_packet() does for
+ * idesc->va) before the back-pointer stored just below the buffer
+ * start can be read.
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	void *addr;
+
+	while ((addr = gxio_mpipe_pop_buffer(&context, stack)) != NULL) {
+		void *va = tile_io_addr_to_va((unsigned long)addr);
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+
+		dev_kfree_skb_irq(*skb_ptr);
+	}
+}
+
+/* Replenish mPIPE with as many small and large buffers as "info" says
+ * are owed.  Each counter is decremented only after a buffer has been
+ * provided, so a failed allocation leaves the remaining debt recorded
+ * for a later call.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	for (; info->num_needed_small_buffers != 0;
+	     info->num_needed_small_buffers--) {
+		if (!tile_net_provide_buffer(true))
+			goto short_of_buffers;
+	}
+
+	for (; info->num_needed_large_buffers != 0;
+	     info->num_needed_large_buffers--) {
+		if (!tile_net_provide_buffer(false))
+			goto short_of_buffers;
+	}
+
+	return;
+
+short_of_buffers:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Decide whether an ingress packet should be discarded.
+ *
+ * "buf" points at the start of the ethernet header (the destination
+ * address).  Returns true if the packet must be filtered out: there is
+ * no device, the device is not up, or the device is not promiscuous
+ * and the destination is neither multicast nor the device's address.
+ */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	/* Filter packets received before we're up. */
+	if (dev == NULL)
+		return true;
+	if (!(dev->flags & IFF_UP))
+		return true;
+
+	/* Promiscuous devices and multicast destinations accept it. */
+	if (dev->flags & IFF_PROMISC)
+		return false;
+	if (is_multicast_ether_addr(buf))
+		return false;
+
+	/* Otherwise the packet must be addressed to us. */
+	return compare_ether_addr(dev->dev_addr, buf) != 0;
+}
+
+/* Map a raw mpipe buffer address back to the skb that owns it.
+ *
+ * "va" is the start of the buffer's data area; the owning skb pointer
+ * is read from the pointer-sized slot immediately below it.  Panics if
+ * the skb found there does not point back at "va".
+ */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	struct sk_buff **slot = (struct sk_buff **)va - 1;
+	struct sk_buff *skb = *slot;
+
+	/* Paranoia: panic here since there's a reasonable chance
+	 * that corrupt buffers means generic memory corruption,
+	 * with unpredictable system effects.
+	 */
+	if (skb->data != va)
+		panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
+		      va, skb, skb->data);
+
+	return skb;
+}
+
+/* Deliver a received packet to the network stack.
+ *
+ * "skb" must already have its data pointer at the start of the packet;
+ * "len" is the packet length.  After delivery the device's rx counters
+ * are updated and "info" records that a replacement buffer of the same
+ * size class is owed to mPIPE.
+ */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 struct tile_net_info *info,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Encode the actual packet length. */
+	skb_put(skb, len);
+
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Acknowledge "good" hardware checksums. */
+	if (idesc->cs) {
+		if (idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+
+	netif_receive_skb(skb);
+
+	/* Update stats. */
+	tile_net_stats_add(1, &priv->stats.rx_packets);
+	tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+	/* Need a new buffer of the size class that was just consumed. */
+	if (idesc->size != GXIO_MPIPE_BUFFER_SIZE_128)
+		info->num_needed_large_buffers++;
+	else
+		info->num_needed_small_buffers++;
+}
+
+/* Handle one ingress descriptor.
+ *
+ * Looks up the device for the descriptor's channel, drops the packet
+ * if no buffer was available or if filter_packet() rejects it, and
+ * otherwise passes the skb to tile_net_receive_skb().  The descriptor
+ * is always consumed from the iqueue.
+ *
+ * Return true if "processed", false if "filtered" (or dropped).
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		/* The channel may have no device registered (see
+		 * filter_packet()), so only touch stats when it does.
+		 */
+		if (dev != NULL) {
+			struct tile_net_priv *priv = netdev_priv(dev);
+			tile_net_stats_add(1, &priv->stats.rx_dropped);
+		}
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, info, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* NAPI poll handler: handle some packets for the current CPU.
+ *
+ * Descriptors are peeked from this cpu's iqueue and handled one at a
+ * time.  Processing stops early once TILE_NET_BATCH idescs from a
+ * single peek have been handled, or once "budget" packets have been
+ * accepted (filtered packets do not count toward the budget).
+ *
+ * Returns the number of accepted packets.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			/* Early exits skip napi_complete() below. */
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem by re-checking the
+	 * queue after the interrupt has been re-enabled.
+	 */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	/* Replace the buffers consumed by the packets handled above. */
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+/* Ingress interrupt handler: schedule NAPI polling on the current cpu.
+ * Both "irq" and "unused" are ignored.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+/* Free the skbs of completed egress operations, oldest first.
+ * This must be called with interrupts blocked.
+ *
+ * Stops at the first entry the equeue does not report as complete, or
+ * after "limit" entries have been freed (a negative limit never
+ * matches, so it means "no limit").  "force_update" is passed through
+ * to gxio_mpipe_equeue_is_complete() for every entry; otherwise only
+ * the first check of the call requests an update.
+ *
+ * Returns the number of completions freed.
+ */
+static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int freed = 0;
+
+	for (;;) {
+		unsigned int idx;
+		struct tile_net_comp *entry;
+		bool update;
+
+		if (comps->comp_last >= comps->comp_next)
+			break;
+
+		idx = comps->comp_last % TILE_NET_MAX_COMPS;
+		entry = &comps->comp_queue[idx];
+		update = force_update || freed == 0;
+		if (!gxio_mpipe_equeue_is_complete(equeue, entry->when,
+						   update))
+			break;
+
+		dev_kfree_skb_irq(entry->skb);
+		comps->comp_last++;
+		freed++;
+		if (freed == limit)
+			break;
+	}
+
+	return freed;
+}
+
+/* Record a pending completion for "skb", to be freed once the equeue
+ * reports "when" as complete.  This must be called with interrupts
+ * blocked.  tile_net_equeue_try_reserve() will have ensured a free
+ * completion entry.  "equeue" is not used here.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	int cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	struct tile_net_comp *entry = &comps->comp_queue[cid];
+
+	entry->when = when;
+	entry->skb = skb;
+
+	/* Publish the entry by advancing the producer index. */
+	comps->comp_next++;
+}
+
+/* Arm the device's tx_wake timer to fire TX_TIMER_DELAY_USEC
+ * microseconds from now.
+ */
+static void tile_net_schedule_tx_wake_timer(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	ktime_t delay = ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL);
+
+	hrtimer_start(&priv->tx_wake_timer, delay, HRTIMER_MODE_REL);
+}
+
+/* The "function" for "priv->tx_wake_timer": wake the device's transmit
+ * queue if it is currently stopped.  The timer is not restarted.
+ */
+static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
+{
+	struct tile_net_priv *priv =
+		container_of(t, struct tile_net_priv, tx_wake_timer);
+	struct net_device *dev = priv->dev;
+
+	if (netif_queue_stopped(dev))
+		netif_wake_queue(dev);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Make sure the egress timer is scheduled.
+ *
+ * Note that we use "schedule if not scheduled" logic instead of the more
+ * obvious "reschedule" logic, because "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	/* Nothing to do if the timer is already pending. */
+	if (info->egress_timer_scheduled)
+		return;
+
+	hrtimer_start(&info->egress_timer,
+		      ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * Frees whatever completions have finished on every egress channel for
+ * this tile, and re-arms the timer (via the schedule helper, not the
+ * hrtimer return value) as long as any completions remain pending.
+ */
+static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned long irqflags;
+	bool pending = false;
+	int chan;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (chan = 0; chan < TILE_NET_CHANNELS; chan++) {
+		struct tile_net_comps *comps = info->comps_for_echannel[chan];
+
+		if (comps->comp_last < comps->comp_next) {
+			tile_net_free_comps(egress_for_echannel[chan].equeue,
+					    comps, -1, true);
+			if (comps->comp_last < comps->comp_next)
+				pending = true;
+		}
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Per-cpu helper for "tile_net_update()"; runs on the cpu being updated.
+ * "dev" (i.e. arg) is the device being brought up or down,
+ * or NULL if all devices are now down.
+ *
+ * With a device, NAPI is added (once) and enabled and the ingress irq
+ * is enabled on this cpu; with NULL, the irq and NAPI are disabled.
+ * Cpus without an iqueue are left untouched.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = arg;
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		/* All devices are down: quiesce ingress on this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	/* At least one device is up: make sure ingress is running. */
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ *
+ * Rebuilds the mPIPE classifier rules from the channels that currently
+ * have a device in tile_net_devs_for_channel[], commits them, and then
+ * runs tile_net_update_cpu() on every online cpu.
+ *
+ * Returns 0 on success, or -EIO if the rules could not be committed.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	static gxio_mpipe_rules_t rules;  /* too big to fit on the stack */
+	bool saw_channel = false;
+	int channel;
+	int rc;
+	int cpu;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		/* Begin the (single) rule lazily, on the first active
+		 * channel, so no rule is created when all are down.
+		 */
+		if (!saw_channel) {
+			saw_channel = true;
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	rc = gxio_mpipe_rules_commit(&rules);
+	if (rc != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()").
+	 * Each cpu is passed "dev" if any channel is active, else NULL.
+	 */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 (saw_channel ? dev : NULL), 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (saw_channel)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * "num_buffers" is the number of buffers each stack must be able to
+ * hold.  Returns 0 on success or a negative error code.  Nothing is
+ * freed here on failure; the caller is expected to clean up via
+ * tile_net_init_mpipe_fail().
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact() for the backing memory.
+	 * NOTE(review): the 64KB alignment requirement is assumed to be
+	 * satisfied by the allocator for this size -- confirm.
+	 */
+	buffer_stack_size =
+		ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+		      64 * 1024);
+
+	/* Allocate two buffer stack indices; they are consecutive. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small memory stack. */
+	small_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (small_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_128,
+					  small_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack: %d\n", rc);
+		return rc;
+	}
+	/* Register hash-for-home memory for buffers on the small stack. */
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_buffer_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	/* Allocate the large buffer stack. */
+	large_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (large_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_1664,
+					  large_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	/* Register hash-for-home memory for buffers on the large stack. */
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_buffer_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs).
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Every cpu gets a zeroed block of completion queues, one per egress
+ * channel.  Cpus in "network_cpus_map" additionally get an iqueue
+ * bound to NotifRing "ring"; "ring" is unused for other cpus.
+ *
+ * Returns 0 on success or a negative error code.  Memory allocated
+ * before a failure is not freed here; see tile_net_init_mpipe_fail().
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+					int cpu, int ring)
+{
+	struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+	int order, i, rc;
+	struct page *page;
+	void *addr;
+
+	/* Allocate the "comps". */
+	order = get_order(COMPS_SIZE);
+	page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+	if (page == NULL) {
+		netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+			   COMPS_SIZE);
+		return -ENOMEM;
+	}
+	addr = pfn_to_kaddr(page_to_pfn(page));
+	memset(addr, 0, COMPS_SIZE);
+	/* Carve the block into one tile_net_comps per egress channel;
+	 * entry 0 is therefore the base address of the allocation.
+	 */
+	for (i = 0; i < TILE_NET_CHANNELS; i++)
+		info->comps_for_echannel[i] =
+			addr + i * sizeof(struct tile_net_comps);
+
+	/* If this is a network cpu, create an iqueue. */
+	if (cpu_isset(cpu, network_cpus_map)) {
+		order = get_order(NOTIF_RING_SIZE);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes iqueue memory\n",
+				   NOTIF_RING_SIZE);
+			return -ENOMEM;
+		}
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring,
+					    addr, NOTIF_RING_SIZE, 0);
+		if (rc != 0) {
+			netdev_err(dev,
+				   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+			return rc;
+		}
+		info->has_iqueue = true;
+	}
+
+	return 0;
+}
+
+/* Allocate one NotifGroup plus a set of buckets and bind them to the
+ * "network_cpus_count" NotifRings starting at "ring".
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Sets the global "num_buckets" (256 for more than four network cpus,
+ * 16 for more than one, otherwise left unchanged) and, once the
+ * buckets are allocated, the global "first_bucket".
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group;
+	int rc;
+
+	/* Allocate one NotifGroup. */
+	rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   rc);
+		return rc;
+	}
+	group = rc;
+
+	/* Initialize global num_buckets value. */
+	if (network_cpus_count > 1)
+		num_buckets = (network_cpus_count > 4) ? 256 : 16;
+
+	/* Allocate some buckets, and set global first_bucket value. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		return rc;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (rc == 0)
+		return 0;
+
+	netdev_err(dev,
+		   "gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+		   rc);
+	return rc;
+}
+
+/* Create an irq and register it, then activate the irq and request
+ * interrupts on all cores.  Note that "ingress_irq" being initialized
+ * is how we know not to call tile_net_init_mpipe() again.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Returns 0 on success or a negative error code; on a request_irq()
+ * failure the irq is destroyed and "ingress_irq" is reset to -1.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu, rc;
+
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		return rc;
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	/* NOTE(review): the irq is requested with a NULL name and a NULL
+	 * dev_id -- confirm this is acceptable for a per-cpu irq here.
+	 */
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return rc;
+	}
+
+	/* Ask mPIPE to interrupt each cpu that owns an iqueue whenever
+	 * that cpu's NotifRing needs attention.
+	 */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to tile_net_init_mpipe.
+ *
+ * Frees any skbs still held on the buffer stacks, destroys the mpipe
+ * context, releases the per-cpu completion and iqueue memory and the
+ * buffer stack memory, and resets the related globals.
+ */
+static void tile_net_init_mpipe_fail(void)
+{
+	int cpu;
+
+	/* Do cleanups that require the mpipe context first. */
+	if (small_buffer_stack >= 0)
+		tile_net_pop_all_buffers(small_buffer_stack);
+	if (large_buffer_stack >= 0)
+		tile_net_pop_all_buffers(large_buffer_stack);
+
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	/* NOTE(review): these pointers may still be NULL if the failure
+	 * happened before the per-cpu allocations -- this relies on
+	 * free_pages() tolerating a zero address; confirm.
+	 */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		free_pages((unsigned long)(info->comps_for_echannel[0]),
+			   get_order(COMPS_SIZE));
+		info->comps_for_echannel[0] = NULL;
+		free_pages((unsigned long)(info->iqueue.idescs),
+			   get_order(NOTIF_RING_SIZE));
+		info->iqueue.idescs = NULL;
+	}
+
+	if (small_buffer_stack_va)
+		free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+	if (large_buffer_stack_va)
+		free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+
+	/* Return the globals to their "not initialized" values. */
+	small_buffer_stack_va = NULL;
+	large_buffer_stack_va = NULL;
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ *
+ * Returns 0 on success or a negative error code; on failure after the
+ * mpipe context exists, tile_net_init_mpipe_fail() undoes partial state.
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int i, num_buffers, rc;
+	int cpu;
+	int first_ring, ring;
+	int network_cpus_count = cpus_weight(network_cpus_map);
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Set up the buffer stacks, sized so that every network cpu can
+	 * fill its iqueue and its batched idescs (see tile_net_poll()).
+	 */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init NotifRings per-cpu. */
+	first_ring = rc;
+	ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
+		if (rc != 0)
+			goto fail;
+
+		/* Only network cpus consume a NotifRing, and only
+		 * "network_cpus_count" rings were allocated, so the ring
+		 * index must not advance for the other cpus.
+		 */
+		if (cpu_isset(cpu, network_cpus_map))
+			ring++;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts. */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ *
+ * Allocates the TSO header area, the eDMA descriptor ring and the
+ * equeue bookkeeping structure, then binds them to a newly allocated
+ * eDMA ring.  Does nothing if the channel was already initialized.
+ *
+ * Returns 0 on success or a negative error code; the three memory
+ * allocations are released on failure.
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct page *headers_page, *edescs_page, *equeue_page;
+	gxio_mpipe_edesc_t *edescs;
+	gxio_mpipe_equeue_t *equeue;
+	unsigned char *headers;
+	int headers_order, edescs_order, equeue_order;
+	size_t edescs_size;
+	int edma;
+	int rc = -ENOMEM;  /* default for the allocation failures below */
+
+	/* Only initialize once. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers": HEADER_BYTES per equeue
+	 * entry (indexed by slot in tso_headers_prepare()).
+	 */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue.  On failure the edma ring allocated
+	 * above is not released by the cleanup path below.
+	 */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done: publish the egress state for this channel. */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+	/* Unwind in reverse order of allocation. */
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Open the mPIPE link called "link_name" and return its channel number.
+ *
+ * Returns the channel (0 <= channel < TILE_NET_CHANNELS) on success.
+ * On failure returns the negative code from gxio_mpipe_link_open(), or
+ * -EINVAL (after closing the link) if the channel is out of range.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc;
+
+	rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Performs the one-time mpipe initialization if needed, opens the
+ * link(s) for this device, sets up egress state for its channel,
+ * publishes the device in tile_net_devs_for_channel[] and reprograms
+ * the classifier.
+ *
+ * Returns 0 on success or a negative errno; on failure any links that
+ * were opened are closed and the device is unpublished again.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	netif_start_queue(dev);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Unpublish the device while priv->channel is still a
+		 * valid index; it is reset to -1 just below, and the
+		 * table must never be indexed with that value.
+		 */
+		if (tile_net_devs_for_channel[priv->channel] == dev)
+			tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, unpublishes the device from
+ * tile_net_devs_for_channel[], reprograms the classifier (errors are
+ * ignored) and closes the device's link(s).  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Remove the device from ingress before touching its links. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link))
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link))
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel virtual address of the data in fragment "f": the
+ * address of the fragment's page plus the fragment's page offset.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	return pfn_to_kaddr(page_to_pfn(skb_frag_page(f))) + f->page_offset;
+}
+
+/* Acquire a completion entry and "num_edescs" egress slots, or if we
+ * can't, stop the queue and schedule the tx_wake timer.
+ *
+ * Returns the first reserved slot (>= 0) on success, or -1 after
+ * stopping the queue.
+ */
+static s64 tile_net_equeue_try_reserve(struct net_device *dev,
+				       struct tile_net_comps *comps,
+				       gxio_mpipe_equeue_t *equeue,
+				       int num_edescs)
+{
+	s64 slot;
+
+	/* Try to acquire a completion entry: either one is already
+	 * free, or freeing some finished completions makes room.
+	 */
+	if (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1 &&
+	    tile_net_free_comps(equeue, comps, 32, false) == 0)
+		goto stall;
+
+	/* Try to acquire an egress slot. */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot >= 0)
+		return slot;
+
+	/* Freeing some completions gives the equeue time to drain. */
+	tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
+
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot >= 0)
+		return slot;
+
+stall:
+	/* Still nothing; give up and stop the queue for a short while. */
+	netif_stop_queue(dev);
+	tile_net_schedule_tx_wake_timer(dev);
+	return -1;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care.
+ *
+ * The count is one edesc per segment for the header plus one for each
+ * piece of fragment data the segment's payload is carved from.  The
+ * same segment/fragment walk is repeated in tso_headers_prepare(), so
+ * the two must be kept in lockstep.
+ *
+ * NOTE(review): payload is taken only from sh->frags[]; the "frags
+ * being empty" case described above does not appear to be handled by
+ * this walk -- confirm.
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int len = skb->len;
+	unsigned int p_len = sh->gso_size;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int num_edescs = 0;
+	int segment;
+
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		unsigned int p_used = 0;
+
+		/* The last segment may be less than gso_size.
+		 * NOTE(review): "len" starts as skb->len (headers
+		 * included) and is reduced before this segment is
+		 * walked -- confirm this yields the intended payload
+		 * size for every segment.
+		 */
+		len -= p_len;
+		if (len < p_len)
+			p_len = len;
+
+		/* One edesc for header and for each piece of the payload. */
+		for (num_edescs++; p_used < p_len; num_edescs++) {
+
+			/* Advance as needed.  The initial -1/-1 values
+			 * force a step to fragment 0 on first use.
+			 */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment: as much
+			 * as the segment still needs, capped by what
+			 * the fragment has left.
+			 */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+		}
+	}
+
+	return num_edescs;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ * FIXME: add support for IPv6.
+ *
+ * For each TSO segment, the original headers are copied into the
+ * per-slot header area at "headers" (HEADER_BYTES per equeue slot,
+ * offset by NET_IP_ALIGN) and the IP length/id/checksum and TCP
+ * sequence/checksum/flags are patched for that segment.  "slot" is the
+ * first reserved equeue slot; it advances by one for each header and
+ * each payload piece, mirroring the walk in tso_count_edescs().
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+				s64 slot)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct iphdr *ih;
+	struct tcphdr *th;
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+	unsigned int ih_off, th_off, sh_len, total_len, p_len;
+	unsigned int isum_start, tsum_start, id, seq;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int segment;
+
+	/* Locate original headers and compute various lengths:
+	 * "sh_len" is the total header length copied per segment, and
+	 * "total_len" is header plus payload for a full-size segment.
+	 */
+	ih = ip_hdr(skb);
+	th = tcp_hdr(skb);
+	ih_off = (unsigned char *)ih - data;
+	th_off = (unsigned char *)th - data;
+	sh_len = th_off + tcp_hdrlen(skb);
+	p_len = sh->gso_size;
+	total_len = p_len + sh_len;
+
+	/* Set up seed values for IP and TCP csum and initialize id and seq.
+	 * The IP seed backs the original tot_len and id out of the
+	 * original checksum so per-segment values can be added back in.
+	 */
+	isum_start = ((0xFFFF - ih->check) +
+		      (0xFFFF - ih->tot_len) +
+		      (0xFFFF - ih->id));
+	tsum_start = th->check + (0xFFFF ^ htons(len));
+	id = ntohs(ih->id);
+	seq = ntohl(th->seq);
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* The last segment may be less than gso_size.
+		 * NOTE(review): same arithmetic as tso_count_edescs();
+		 * "len" starts as skb->len with headers included --
+		 * confirm the per-segment payload sizes are as intended.
+		 */
+		len -= p_len;
+		if (len < p_len) {
+			p_len = len;
+			total_len = p_len + sh_len;
+		}
+
+		/* Copy to the header memory for this segment. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		memcpy(buf, data, sh_len);
+
+		/* Update copied ip header. */
+		ih = (struct iphdr *)(buf + ih_off);
+		ih->tot_len = htons(total_len - ih_off);
+		ih->id = htons(id);
+		ih->check = csum_long(isum_start + htons(total_len - ih_off) +
+				      htons(id)) ^ 0xffff;
+
+		/* Update copied tcp header.  FIN and PSH are kept only
+		 * on the final segment.
+		 */
+		th = (struct tcphdr *)(buf + th_off);
+		th->seq = htonl(seq);
+		th->check = csum_long(tsum_start + htons(total_len));
+		if (segment != sh->gso_segs - 1) {
+			th->fin = 0;
+			th->psh = 0;
+		}
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload: one slot per payload piece. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		/* Next segment: next IP id, sequence past this payload. */
+		id++;
+		seq += p_len;
+	}
+
+	/* Flush the headers so they are ready for hardware DMA. */
+	wmb();
+}
+
+/* Pass all the data to mpipe for egress.
+ *
+ * Starting at "slot", emit for every GSO segment one header edesc
+ * (pointing at the per-slot header copy inside "headers", with
+ * checksum generation requested) followed by one body edesc per piece
+ * of fragment payload; the last body edesc of a segment has "bound"
+ * set.  Finally add the segment and byte counts to the device stats.
+ *
+ * NOTE(review): payload is taken only from "sh->frags"; payload that
+ * might sit in the linear area after the headers does not appear to
+ * be handled here -- confirm.
+ */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+		       struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int len = skb->len;
+	unsigned int p_len = sh->gso_size;
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	unsigned int csum_start, sh_len;
+	int segment;
+
+	/* Prepare to egress the headers: set up header edesc.
+	 * Only "va" changes from segment to segment.
+	 */
+	csum_start = skb_checksum_start_offset(skb);
+	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		void *va;
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* The last segment may be less than gso_size.
+		 * This must stay in step with tso_headers_prepare().
+		 */
+		len -= p_len;
+		if (len < p_len)
+			p_len = len;
+
+		/* Egress the header. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload; "bound" marks the
+			 * piece that completes this segment's payload.
+			 */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += sh_len + p_len;
+	}
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Transmit a GSO skb by segmenting it in the driver.
+ *
+ * Counts the edescs the skb needs, reserves that many consecutive
+ * equeue slots with interrupts disabled, fills them in, and records a
+ * completion on the last slot so the skb can be freed later.
+ *
+ * Returns NETDEV_TX_BUSY if the slots could not be reserved, else
+ * NETDEV_TX_OK.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int echannel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct tile_net_comps *comps = info->comps_for_echannel[echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	int needed = tso_count_edescs(skb);
+	int status = NETDEV_TX_BUSY;
+	unsigned long flags;
+	s64 first;
+
+	local_irq_save(flags);
+
+	first = tile_net_equeue_try_reserve(dev, comps, equeue, needed);
+	if (first >= 0) {
+		/* Build the per-segment header copies, then hand headers
+		 * and payload to the hardware.
+		 */
+		tso_headers_prepare(skb, egress->headers, first);
+		tso_egress(dev, equeue, skb, egress->headers, first);
+
+		/* The skb is done once the final slot has been sent. */
+		add_comp(equeue, comps, first + needed - 1, skb);
+
+		status = NETDEV_TX_OK;
+	}
+
+	local_irq_restore(flags);
+
+	/* Only a successful send needs the completion-reaping timer. */
+	if (status == NETDEV_TX_OK)
+		tile_net_schedule_egress_timer(info);
+
+	return status;
+}
+
+/* Flatten a transmit request into an array of memory chunks.
+ *
+ * Writes into "frags" one entry for the linear area ("b_data"/"b_len",
+ * skipped when "b_len" is zero) followed by one entry per page
+ * fragment of "skb".  Returns the number of entries written.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int idx;
+
+	/* The linear area, when present, always comes first. */
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	for (idx = 0; idx < shinfo->nr_frags; idx++, out++) {
+		skb_frag_t *pf = &shinfo->frags[idx];
+
+		out->buf = tile_net_frag_buf(pf);
+		out->length = skb_frag_size(pf);
+	}
+
+	return out - frags;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO skbs are handed to tile_net_tx_tso().  Otherwise one edesc is
+ * built per chunk (linear area plus each page fragment), the last one
+ * is marked "bound", checksum offload info is attached to the first
+ * one when the stack requested it, and the edescs are written to
+ * consecutive reserved equeue slots with interrupts disabled.
+ *
+ * Returns NETDEV_TX_BUSY if the equeue slots could not be reserved,
+ * else NETDEV_TX_OK.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+	unsigned int num_edescs;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned long irqflags;
+	gxio_mpipe_edesc_t edesc = { { 0 } };
+	unsigned int i;
+	s64 slot;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	num_edescs = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* This is only used to specify the TLB. */
+	edesc.stack_idx = large_buffer_stack;
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_edescs; i++) {
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edescs[i] = edesc;
+	}
+
+	/* Mark the final edesc.
+	 * NOTE(review): this indexes edescs[-1] if num_edescs is 0 (an skb
+	 * with no linear data and no frags) -- confirm that cannot happen.
+	 */
+	edescs[num_edescs - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Set first reserved egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_edescs; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);
+
+	/* Add a completion record, keyed on the last slot just written. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Deal with a transmit timeout.
+ *
+ * No hardware recovery is attempted; the transmit queue is simply
+ * woken so the stack can try again.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	netif_wake_queue(dev);
+}
+
+/* Ioctl commands.
+ *
+ * No device-private ioctls are implemented: every request is rejected
+ * with -EOPNOTSUPP regardless of "cmd".
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Get system network statistics for device.
+ *
+ * Returns a pointer to the counters kept in the device's private data
+ * (the ones updated through tile_net_stats_add()); nothing is copied.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	return &priv->stats;
+}
+
+/* Change the MTU.
+ *
+ * Accepts values from 68 to 1500 inclusive.  Returns 0 on success or
+ * -EINVAL (leaving the MTU untouched) for anything outside that range.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	bool in_range = (new_mtu >= 68) && (new_mtu <= 1500);
+
+	if (!in_range)
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver cannot change the MAC address, but the
+ * hardware itself ignores it: the address placed on outgoing packets
+ * and matched against incoming ones is entirely under our control, so
+ * updating the software copy in "dev" is all that is required.
+ *
+ * "p" points to a struct sockaddr carrying the new address.
+ * Returns 0 on success, -EINVAL if the address is not a valid
+ * Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (is_valid_ether_addr(sa->sa_data)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress irq is disabled on this cpu, the ingress handler is run
+ * by hand (which schedules NAPI for this cpu), and the irq is then
+ * re-enabled.  "dev" is not consulted, since the ingress irq is shared
+ * by all devices.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open = tile_net_open,
+	.ndo_stop = tile_net_stop,
+	.ndo_start_xmit = tile_net_tx,	/* hands GSO skbs to tile_net_tx_tso() */
+	.ndo_do_ioctl = tile_net_ioctl,	/* always -EOPNOTSUPP */
+	.ndo_get_stats = tile_net_get_stats,
+	.ndo_change_mtu = tile_net_change_mtu,	/* accepts 68..1500 */
+	.ndo_tx_timeout = tile_net_tx_timeout,	/* just wakes the queue */
+	.ndo_set_mac_address = tile_net_set_mac_address,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* The setup function, passed to alloc_netdev().
+ *
+ * ether_setup() fills in the generic Ethernet defaults (including
+ * IFF_BROADCAST and IFF_MULTICAST); the driver-specific ops, watchdog
+ * timeout, feature flags, queue length and MTU are then applied on
+ * top of those defaults.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = 0;
+	dev->mtu = 1500;
+
+	/* Lockless tx, hardware checksum, scatter/gather and TSO. */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM |
+			 NETIF_F_SG | NETIF_F_TSO;
+}
+
+/* Create and register the net_device for one mPIPE link.
+ *
+ * "name" is used verbatim as the interface name and "mac" is the
+ * 6-byte address reported for the link.  Links whose name starts with
+ * "loop" are skipped.  Failures are logged and otherwise ignored, so
+ * the caller cannot tell whether a device was actually created.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+	int any_nonzero = 0;
+	int idx;
+	int err;
+
+	/* HACK: Ignore "loop" links. */
+	if (!strncmp(name, "loop", 4))
+		return;
+
+	/* Normally the name given to alloc_netdev() is a template that
+	 * register_netdev() instantiates, but here it is the final name.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (dev == NULL) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	/* Start with no channels open. */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* The address must be in place before the device is opened.  An
+	 * all-zero MAC probably means we are on the simulator, so pick a
+	 * random address in that case.
+	 */
+	for (idx = 0; idx < 6; idx++)
+		any_nonzero |= mac[idx];
+
+	if (!any_nonzero) {
+		random_ether_addr(dev->dev_addr);
+	} else {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	}
+
+	/* Timer used to wake the tx queue after it has been stopped. */
+	hrtimer_init(&priv->tx_wake_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	priv->tx_wake_timer.function = tile_net_handle_tx_wake_timer;
+
+	err = register_netdev(dev);
+	if (err != 0) {
+		netdev_err(dev, "register_netdev failed %d\n", err);
+		free_netdev(dev);
+	}
+}
+
+/* Per-cpu module initialization, run on every cpu via on_each_cpu().
+ *
+ * Records the cpu number, marks the cpu as having no iqueue yet, and
+ * sets up (but does not start) the egress completion timer.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	info->my_cpu = smp_processor_id();
+	info->has_iqueue = false;
+
+	hrtimer_init(&info->egress_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	info->egress_timer.function = tile_net_handle_egress_timer;
+}
+
+/* Module initialization.
+ *
+ * Sets up the per-cpu state, creates one net_device per link that
+ * mPIPE enumerates, and settles the set of cpus that handle network
+ * traffic.  Always returns 0: failures while creating individual
+ * devices are logged by tile_net_dev_init() and not propagated.
+ */
+static int __init tile_net_init_module(void)
+{
+	int i;
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them.
+	 * Enumeration stops at the first index that reports an error.
+	 */
+	for (i = 0; gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0; i++)
+		tile_net_dev_init(name, mac);
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* [PATCH v9] tilegx network driver: initial support
  2012-05-25 14:42                                         ` [PATCH v8] " Chris Metcalf
@ 2012-06-04 20:12                                           ` Chris Metcalf
  2012-06-06 16:41                                             ` David Miller
                                                               ` (3 more replies)
  0 siblings, 4 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-06-04 20:12 UTC (permalink / raw)
  To: bhutchings, arnd, David Miller, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This change fixes some bugs that were discovered during additional
testing of the TSO refactoring.  In addition, I added a comment
explaining why we provide TSO support as essentially driver-side GSO.

The previous v8 version of the patch (from 10 days ago) received no
feedback; if anyone would care to provide feedback on this version of
the driver, it would be much appreciated.  Thanks!

 drivers/net/ethernet/tile/Kconfig  |    2 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1875 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1879 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..098b1c4 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,8 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
+	select HIGH_RES_TIMERS if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..a729499
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1875 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
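+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes of (untagged Ethernet) L2
+ * header, plus up to 60 bytes of actual TCP header.  We round up to
+ * align to cache lines.
+ * NOTE(review): the IP header (20 to 60 bytes) is not listed in this
+ * budget; confirm that the worst-case headers still fit in the slot.
+ */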
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+/* Timeout to wake the per-device TX timer after we stop the queue.
+ * We don't want the timeout too short (adds overhead, and might end
+ * up causing stop/wake/stop/wake cycles) or too long (affects performance).
+ * At 10 Gb/s a 1500-byte packet takes about 1.2 usec on the wire, so
+ * 30 usec corresponds to roughly 25 such packets.
+ */
+#define TX_TIMER_DELAY_USEC 30
+
+/* Timeout to wake the per-cpu egress timer to free completions. */
+#define EGRESS_TIMER_DELAY_USEC 1000
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory), described by its start
+ * address and its length.
+ */
+struct frag {
+	void *buf;	/* Start of the chunk. */
+	size_t length;	/* Length of the chunk. */
+};
+
+/* A single completion: an skb that has been handed to the hardware
+ * and must be freed once the hardware is done with it.
+ */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete.
+	 * Filled in by add_comp() and tested with
+	 * gxio_mpipe_equeue_is_complete() in tile_net_free_comps().
+	 */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device.
+ *
+ * "comp_queue" is used as a ring: "comp_next" and "comp_last" are
+ * free-running counters that are reduced modulo TILE_NET_MAX_COMPS
+ * when indexing, and entries in [comp_last, comp_next) are pending.
+ */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used (total ever added). */
+	unsigned long comp_next;
+	/* The number of completions freed (total ever released). */
+	unsigned long comp_last;
+};
+
+/* Info for a specific cpu.  One instance exists per cpu (see
+ * "per_cpu_info"); it holds that cpu's ingress queue, NAPI context,
+ * buffer replenishment counters and egress completion state.
+ */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided.
+	 * Incremented per received packet, drained by
+	 * tile_net_provide_needed_buffers().
+	 */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct hrtimer egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel.  Instances live in
+ * "egress_for_echannel" and are indexed by "priv->echannel".
+ */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device; this is the private area obtained with
+ * netdev_priv().
+ */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats; updated atomically via tile_net_stats_add(). */
+	struct net_device_stats stats;
+	/* Timer to wake up tx queue after it has been stopped. */
+	struct hrtimer tx_wake_timer;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* The "tile_net.cpus" argument specifies the cpus that are dedicated
+ * to handle ingress packets.
+ *
+ * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where
+ * m, n, x and y are cpu numbers.  The listed cpus should be neither
+ * dedicated cpus nor dataplane cpus.
+ *
+ * Parses the string into "network_cpus_map".  Returns true if the
+ * result is a usable, non-empty mask; returns false if the parameter
+ * was not given, was malformed, or selected no cpus, in which case
+ * the caller is expected to choose a default.
+ */
+static bool network_cpus_init(void)
+{
+	char buf[1024];
+	int rc;
+
+	if (network_cpus_string == NULL)
+		return false;
+
+	rc = cpulist_parse_crop(network_cpus_string, &network_cpus_map);
+	if (rc != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Remove dedicated cpus.
+	 * NOTE(review): this intersects the list with cpu_possible_mask;
+	 * presumably dedicated cpus are absent from that mask -- confirm.
+	 */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically add "value" to the statistics counter at "field".
+ *
+ * The counter is a plain unsigned long that is reinterpreted as an
+ * atomic_long_t; the build fails if the two types differ in size.
+ * Per the original author, on TILE-Gx the atomic add is
+ * fire-and-forget on the issuing core and costs only a few cycles
+ * more than a regular store at the home cache, with no expensive bus
+ * management overhead.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	BUILD_BUG_ON(sizeof(*counter) != sizeof(*field));
+	atomic_long_add(value, counter);
+}
+
+/* Allocate one receive buffer and push it onto an mPIPE buffer stack.
+ *
+ * "small" selects the small stack (128-byte buffers); otherwise the
+ * large stack (1664-byte buffers) is used.  The skb is over-allocated
+ * so that its data can be aligned to 128 bytes with a pointer back to
+ * the skb stored immediately in front of the data.
+ *
+ * Returns false if the skb could not be allocated, true otherwise.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	const unsigned long align = 128;
+	const size_t backptr_size = sizeof(struct sk_buff **);
+	struct sk_buff **backptr;
+	struct sk_buff *skb;
+	int payload;
+	int stack;
+
+	if (small) {
+		stack = small_buffer_stack;
+		payload = 128;
+	} else {
+		stack = large_buffer_stack;
+		payload = 1664;
+	}
+
+	skb = dev_alloc_skb(backptr_size + align + payload);
+	if (!skb)
+		return false;
+
+	/* Leave room for the back-pointer, then round the data pointer
+	 * up to the required alignment.
+	 */
+	skb_reserve(skb, backptr_size);
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Stash the owning skb just below the data. */
+	backptr = (struct sk_buff **)(skb->data - backptr_size);
+	*backptr = skb;
+
+	/* The skb and back-pointer must be visible before the hardware
+	 * can hand this buffer back to us.
+	 */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Map a raw mPIPE buffer address back to the skb that owns it.
+ *
+ * Reads the back-pointer stored immediately below "va" and checks
+ * that the skb it names really has "va" as its data pointer.  A
+ * mismatch is treated as fatal: corrupt buffers quite possibly mean
+ * general memory corruption with unpredictable effects, so we panic
+ * rather than carry on.
+ */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	struct sk_buff **backptr = va - sizeof(*backptr);
+	struct sk_buff *owner = *backptr;
+
+	/* Paranoia. */
+	if (owner->data != va)
+		panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
+		      va, owner, owner->data);
+
+	return owner;
+}
+
+/* Drain buffer stack "stack", freeing the skb behind every buffer
+ * popped, until the stack reports no more buffers (address 0).
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	tile_io_addr_t addr;
+
+	while ((addr = (tile_io_addr_t)
+			gxio_mpipe_pop_buffer(&context, stack)) != 0) {
+		void *va = tile_io_addr_to_va(addr);
+
+		dev_kfree_skb_irq(mpipe_buf_to_skb(va));
+	}
+}
+
+/* Provide linux buffers to mPIPE.
+ *
+ * Replaces the buffers this cpu has consumed: pushes small and then
+ * large buffers until both "needed" counters in "info" reach zero.
+ * If an allocation fails, the remaining deficit is left in the
+ * counters for a later call and a notice is logged.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	while (info->num_needed_small_buffers != 0) {
+		if (!tile_net_provide_buffer(true))
+			goto oops;
+		info->num_needed_small_buffers--;
+	}
+
+	while (info->num_needed_large_buffers != 0) {
+		if (!tile_net_provide_buffer(false))
+			goto oops;
+		info->num_needed_large_buffers--;
+	}
+
+	return;
+
+oops:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Decide whether an incoming packet should be dropped.
+ *
+ * "buf" points at the start of the Ethernet header.  Returns true
+ * (drop) if there is no device or it is not up, or if the device is
+ * not promiscuous and the destination is neither multicast nor our
+ * own address.  Returns false if the packet should be delivered.
+ */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	/* Anything received before we're up is dropped. */
+	if (dev == NULL)
+		return true;
+	if (!(dev->flags & IFF_UP))
+		return true;
+
+	/* Promiscuous mode and multicast destinations always pass. */
+	if (dev->flags & IFF_PROMISC)
+		return false;
+	if (is_multicast_ether_addr(buf))
+		return false;
+
+	/* Otherwise the packet must be addressed to us. */
+	return compare_ether_addr(dev->dev_addr, buf) != 0;
+}
+
+/* Deliver one received packet to the network stack.
+ *
+ * "skb" already points at the L2 header and "len" is the packet
+ * length.  Sets the length and protocol, marks the checksum as
+ * verified when the hardware reports a good one, passes the skb up,
+ * updates the rx counters of "dev", and notes in "info" that a buffer
+ * of the size class just consumed must be replaced.
+ */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 struct tile_net_info *info,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	unsigned int *needed;
+
+	/* Record the real packet length in the skb. */
+	skb_put(skb, len);
+
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Trust a hardware checksum only when it was computed and good. */
+	if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	netif_receive_skb(skb);
+
+	tile_net_stats_add(1, &priv->stats.rx_packets);
+	tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+	/* Replace the buffer that was just handed to the stack. */
+	if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+		needed = &info->num_needed_small_buffers;
+	else
+		needed = &info->num_needed_large_buffers;
+	(*needed)++;
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered".
+ *
+ * "idesc" describes one packet from this cpu's iqueue.  Packets that
+ * arrived without a buffer ("be" set) are counted as dropped and
+ * consumed.  Otherwise the packet is either dropped by the filter or
+ * converted back into its skb and passed up the stack.  In every case
+ * the idesc is consumed before returning.
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		/* The channel's entry can be NULL (e.g. after ifconfig
+		 * down), in which case there is no device to charge the
+		 * drop to and netdev_priv() must not be used.
+		 */
+		if (dev != NULL) {
+			struct tile_net_priv *priv = netdev_priv(dev);
+			tile_net_stats_add(1, &priv->stats.rx_dropped);
+		}
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	/* filter_packet() rejects a NULL "dev", so "dev" is known to be
+	 * valid on the delivery path below.
+	 */
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, info, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This is the NAPI poll function.  It returns the number of packets
+ * delivered to the stack ("processed", not "filtered"), and stops
+ * early once that reaches "budget".
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ * NOTE(review): the TILE_NET_BATCH test is applied to the index
+ * within each batch returned by a single peek, not to a running
+ * total across peeks -- confirm this is the intended limit.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left.  (The early exits above skip this,
+	 * so NAPI stays scheduled and interrupts stay disabled.)
+	 */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem by re-checking the
+	 * queue after interrupts are back on.
+	 */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+/* Handle an ingress interrupt on the current cpu.
+ *
+ * No packet work is done here: the handler only schedules this cpu's
+ * NAPI context and always reports the interrupt as handled.  Neither
+ * "irq" nor "unused" is examined.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	napi_schedule(&info->napi);
+	return IRQ_HANDLED;
+}
+
+/* Free some completions.  This must be called with interrupts blocked.
+ *
+ * Walks the pending completions of "comps" oldest first, freeing each
+ * skb whose slot the equeue reports as complete, and stops at the
+ * first one that is not complete or after "limit" have been freed.
+ * Since the test is "++n == limit", a "limit" of zero or less never
+ * matches and therefore means "no limit".
+ *
+ * The last argument to gxio_mpipe_equeue_is_complete() is true for
+ * the first check of each call, and for every check when
+ * "force_update" is set; presumably it asks for the completion count
+ * to be refreshed -- confirm against the gxio API.
+ *
+ * Returns the number of completions freed.
+ */
+static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int n = 0;
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[cid];
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
+						   force_update || n == 0))
+			break;
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		if (++n == limit)
+			break;
+	}
+	return n;
+}
+
+/* Record that "skb" may be freed once egress slot "when" completes.
+ *
+ * The entry is written at "comp_next" (modulo TILE_NET_MAX_COMPS) and
+ * "comp_next" is then advanced.  No fullness check is made here;
+ * tile_net_equeue_try_reserve() will have ensured a free completion
+ * entry.  "equeue" is not used.
+ *
+ * This must be called with interrupts blocked.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	struct tile_net_comp *entry =
+		&comps->comp_queue[comps->comp_next % TILE_NET_MAX_COMPS];
+
+	entry->when = when;
+	entry->skb = skb;
+	comps->comp_next++;
+}
+
+/* Arm the device's tx_wake timer to fire TX_TIMER_DELAY_USEC
+ * microseconds from now.
+ */
+static void tile_net_schedule_tx_wake_timer(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	ktime_t delay = ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL);
+
+	hrtimer_start(&priv->tx_wake_timer, delay, HRTIMER_MODE_REL);
+}
+
+/* The "function" for "priv->tx_wake_timer".
+ *
+ * Wakes the device's transmit queue if it is currently stopped.
+ * The timer is one-shot; it is not restarted from here.
+ */
+static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
+{
+	struct tile_net_priv *priv =
+		container_of(t, struct tile_net_priv, tx_wake_timer);
+
+	if (netif_queue_stopped(priv->dev))
+		netif_wake_queue(priv->dev);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Arm the per-cpu egress timer unless it is already armed.
+ *
+ * "Schedule if not scheduled" is used rather than the more obvious
+ * "reschedule", because "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+
+	hrtimer_start(&info->egress_timer,
+		      ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL_PINNED);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * With interrupts blocked, frees every completed completion on every
+ * egress channel for the current cpu.  If any channel still has
+ * outstanding completions afterwards, the timer is armed again via
+ * tile_net_schedule_egress_timer(); the hrtimer itself always returns
+ * HRTIMER_NORESTART.
+ */
+static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned long flags;
+	bool more = false;
+	int ch;
+
+	local_irq_save(flags);
+
+	/* Mark the timer idle so it can be re-armed below. */
+	info->egress_timer_scheduled = false;
+
+	for (ch = 0; ch < TILE_NET_CHANNELS; ch++) {
+		struct tile_net_comps *comps = info->comps_for_echannel[ch];
+
+		/* Only touch channels that have something outstanding. */
+		if (comps->comp_last < comps->comp_next) {
+			tile_net_free_comps(egress_for_echannel[ch].equeue,
+					    comps, -1, true);
+			if (comps->comp_last < comps->comp_next)
+				more = true;
+		}
+	}
+
+	if (more)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(flags);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Per-cpu helper for "tile_net_update()".
+ *
+ * "arg" is the device being brought up or down, or NULL if all
+ * devices are now down.  Cpus without an iqueue are left alone.
+ * With a device, NAPI is added (once) and enabled (if not already)
+ * and the per-cpu ingress irq is enabled; with NULL, the irq is
+ * disabled and NAPI is disabled if it was enabled.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = arg;
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev == NULL) {
+		/* Everything is down: quiesce this cpu. */
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ *
+ * Rebuilds the mPIPE classifier rules from the channels that currently
+ * have a device in tile_net_devs_for_channel[], commits them, and then
+ * runs tile_net_update_cpu() on each online cpu in turn.
+ * Returns 0 on success, or -EIO if the rules could not be committed.
+ *
+ * Always called under tile_net_devs_for_channel_mutex.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	static gxio_mpipe_rules_t rules;  /* too big to fit on the stack */
+	struct net_device *cpu_arg;
+	bool any_up = false;
+	int ch, cpu, err;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	for (ch = 0; ch < TILE_NET_CHANNELS; ch++) {
+		if (tile_net_devs_for_channel[ch] != NULL) {
+			/* Open the single rule on the first live channel. */
+			if (!any_up) {
+				gxio_mpipe_rules_begin(&rules, first_bucket,
+						       num_buckets, NULL);
+				gxio_mpipe_rules_set_headroom(&rules,
+							      NET_IP_ALIGN);
+				any_up = true;
+			}
+			gxio_mpipe_rules_add_channel(&rules, ch);
+		}
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	err = gxio_mpipe_rules_commit(&rules);
+	if (err != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", err);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()").
+	 * Each cpu is told about "dev" only while some channel is live.
+	 */
+	cpu_arg = any_up ? dev : NULL;
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 cpu_arg, 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (any_up)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * "num_buffers" sizes each stack.  On success the globals
+ * buffer_stack_size, small_buffer_stack, large_buffer_stack and the
+ * two *_buffer_stack_va pointers are set and 0 is returned.  On
+ * failure a negative value is returned and nothing is released here;
+ * any memory already stored in the globals is left for the caller to
+ * clean up.
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact(), which is relied upon to provide the
+	 * required 64KB alignment as well.
+	 */
+	buffer_stack_size =
+		ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+		      64 * 1024);
+
+	/* Allocate two consecutive buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small buffer stack. */
+	small_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (small_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_128,
+					  small_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		/* Name the function that was actually called. */
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	/* Allocate the large buffer stack. */
+	large_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (large_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_1664,
+					  large_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs).
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Every cpu gets a zeroed block of COMPS_SIZE bytes, carved into one
+ * "struct tile_net_comps" per egress channel.  A cpu that is in
+ * network_cpus_map additionally gets an iqueue bound to NotifRing
+ * "ring".  Returns the next unused ring number (i.e. "ring" or
+ * "ring + 1"), or a negative value on failure.
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+					int cpu, int ring)
+{
+	struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+	struct tile_net_comps *comps_base;
+	struct page *pg;
+	void *ring_mem;
+	int ch, rc;
+
+	/* Allocate the "comps". */
+	pg = homecache_alloc_pages(GFP_KERNEL, get_order(COMPS_SIZE), cpu);
+	if (pg == NULL) {
+		netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+			   COMPS_SIZE);
+		return -ENOMEM;
+	}
+	comps_base = pfn_to_kaddr(page_to_pfn(pg));
+	memset(comps_base, 0, COMPS_SIZE);
+	for (ch = 0; ch < TILE_NET_CHANNELS; ch++)
+		info->comps_for_echannel[ch] = comps_base + ch;
+
+	/* Only network cpus need an iqueue. */
+	if (!cpu_isset(cpu, network_cpus_map))
+		return ring;
+
+	pg = homecache_alloc_pages(GFP_KERNEL, get_order(NOTIF_RING_SIZE),
+				   cpu);
+	if (pg == NULL) {
+		netdev_err(dev,
+			   "Failed to alloc %zd bytes iqueue memory\n",
+			   NOTIF_RING_SIZE);
+		return -ENOMEM;
+	}
+	ring_mem = pfn_to_kaddr(page_to_pfn(pg));
+
+	rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring,
+				    ring_mem, NOTIF_RING_SIZE, 0);
+	if (rc < 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+		return rc;
+	}
+	info->has_iqueue = true;
+
+	/* This cpu consumed one ring. */
+	return ring + 1;
+}
+
+/* Initialize NotifGroup and buckets.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Allocates one NotifGroup and "num_buckets" buckets, records the
+ * first bucket in the global "first_bucket", and binds the group to
+ * "network_cpus_count" rings starting at "ring".  The global
+ * "num_buckets" is raised to 16 for more than one network cpu and to
+ * 256 for more than four; otherwise it keeps its current value.
+ * Returns 0 on success or a negative value on failure.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group, bucket, err;
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   group);
+		return group;
+	}
+
+	/* Scale the global num_buckets value with the cpu count. */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate the buckets and publish the first index. */
+	bucket = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (bucket < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n",
+			   bucket);
+		return bucket;
+	}
+	first_bucket = bucket;
+
+	/* Init group and buckets. */
+	err = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (err != 0) {
+		netdev_err(
+			dev,
+			"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+			err);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Create an irq and register it, then activate the irq and request
+ * interrupts on all cores that own an iqueue.  Note that "ingress_irq"
+ * being initialized is how we know not to call tile_net_init_mpipe()
+ * again; it is reset to -1 if request_irq() fails.
+ * This routine supports tile_net_init_mpipe(), below.
+ * Returns 0 on success or a negative value on failure.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu, irq, err;
+
+	irq = create_irq();
+	if (irq < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", irq);
+		return irq;
+	}
+	ingress_irq = irq;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+
+	err = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			  0, NULL, NULL);
+	if (err != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", err);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return err;
+	}
+
+	/* Route each ring's notifications to the cpu that owns it. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		if (!info->has_iqueue)
+			continue;
+		gxio_mpipe_request_notif_ring_interrupt(
+			&context, cpu_x(cpu), cpu_y(cpu),
+			1, ingress_irq, info->iqueue.ring);
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to
+ * tile_net_init_mpipe(): pop buffers from both stacks, destroy the
+ * mpipe context, free the per-cpu comps and iqueue pages and the
+ * buffer stack memory, and reset the related globals.
+ */
+static void tile_net_init_mpipe_fail(void)
+{
+	int cpu;
+
+	/* Do cleanups that require the mpipe context first. */
+	if (small_buffer_stack >= 0)
+		tile_net_pop_all_buffers(small_buffer_stack);
+	if (large_buffer_stack >= 0)
+		tile_net_pop_all_buffers(large_buffer_stack);
+
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	/* Release the per-cpu pages and forget their addresses. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		unsigned long comps_addr =
+			(unsigned long)(info->comps_for_echannel[0]);
+		unsigned long idescs_addr =
+			(unsigned long)(info->iqueue.idescs);
+
+		free_pages(comps_addr, get_order(COMPS_SIZE));
+		info->comps_for_echannel[0] = NULL;
+		free_pages(idescs_addr, get_order(NOTIF_RING_SIZE));
+		info->iqueue.idescs = NULL;
+	}
+
+	/* Release the buffer stack memory, if it was allocated. */
+	if (small_buffer_stack_va) {
+		free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+		small_buffer_stack_va = NULL;
+	}
+	if (large_buffer_stack_va) {
+		free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+		large_buffer_stack_va = NULL;
+	}
+
+	/* Mark the indices as unallocated. */
+	small_buffer_stack = -1;
+	large_buffer_stack = -1;
+	first_bucket = -1;
+}
+
+/* One-time global mpipe initialization, run the first time any tilegx
+ * network device is opened.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ *
+ * Returns 0 on success.  Failures before the mpipe context exists
+ * return -EIO directly; later failures go through
+ * tile_net_init_mpipe_fail() and return the failing step's code.
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int network_cpus_count = cpus_weight(network_cpus_map);
+	int num_buffers, first_ring, ring;
+	int cpu, pass, i, rc;
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Set up the buffer stacks. */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers: one full round with the flag true,
+	 * then one full round with it false (presumably small buffers,
+	 * then large ones -- confirm against tile_net_provide_buffer()).
+	 */
+	rc = -ENOMEM;
+	for (pass = 0; pass < 2; pass++) {
+		bool first_pass = (pass == 0);
+
+		for (i = 0; i < num_buffers; i++) {
+			if (tile_net_provide_buffer(first_pass))
+				continue;
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	first_ring = gxio_mpipe_alloc_notif_rings(&context,
+						  network_cpus_count, 0, 0);
+	if (first_ring < 0) {
+		rc = first_ring;
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init NotifRings per-cpu; each call hands back the next ring. */
+	ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
+		if (rc < 0)
+			goto fail;
+		ring = rc;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts. */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ *
+ * Does nothing if the channel already has an equeue.  Otherwise
+ * allocates the TSO header area, the edesc ring and the equeue
+ * structure, allocates one eDMA ring and initializes the equeue on it.
+ * Returns 0 on success or a negative value on failure, in which case
+ * the pages allocated here are released again.
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct page *hdr_page, *ring_page, *queue_page;
+	gxio_mpipe_edesc_t *ring;
+	gxio_mpipe_equeue_t *queue;
+	unsigned char *hdrs;
+	int hdr_order, ring_order, queue_order;
+	size_t ring_bytes;
+	int edma;
+	int rc = -ENOMEM;
+
+	/* Only initialize once. */
+	if (egress->equeue != NULL)
+		return 0;
+
+	/* One HEADER_BYTES area per equeue entry, for the "headers". */
+	hdr_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	hdr_page = alloc_pages(GFP_KERNEL, hdr_order);
+	if (hdr_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << hdr_order);
+		goto out;
+	}
+	hdrs = pfn_to_kaddr(page_to_pfn(hdr_page));
+
+	/* Memory for the "edescs" ring. */
+	ring_bytes = EQUEUE_ENTRIES * sizeof(*ring);
+	ring_order = get_order(ring_bytes);
+	ring_page = alloc_pages(GFP_KERNEL, ring_order);
+	if (ring_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    ring_bytes);
+		goto free_hdrs;
+	}
+	ring = pfn_to_kaddr(page_to_pfn(ring_page));
+
+	/* Memory for the "equeue" bookkeeping structure. */
+	queue_order = get_order(sizeof(*queue));
+	queue_page = alloc_pages(GFP_KERNEL, queue_order);
+	if (queue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << queue_order);
+		goto free_ring;
+	}
+	queue = pfn_to_kaddr(page_to_pfn(queue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	edma = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (edma < 0) {
+		rc = edma;
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto free_queue;
+	}
+
+	/* Initialize the equeue. */
+	rc = gxio_mpipe_equeue_init(queue, &context, edma, echannel,
+				    ring, ring_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto free_queue;
+	}
+
+	/* Publish the finished egress state. */
+	egress->equeue = queue;
+	egress->headers = hdrs;
+	return 0;
+
+free_queue:
+	__free_pages(queue_page, queue_order);
+
+free_ring:
+	__free_pages(ring_page, ring_order);
+
+free_hdrs:
+	__free_pages(hdr_page, hdr_order);
+
+out:
+	return rc;
+}
+
+/* Open the mPIPE link called "link_name" and return its channel.
+ *
+ * Returns the channel number (0 .. TILE_NET_CHANNELS - 1) on success.
+ * Returns the gxio error if the link cannot be opened, or -EINVAL
+ * (after closing the link again) if the channel is out of range.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc;
+
+	rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Under tile_net_devs_for_channel_mutex this performs the one-time
+ * mpipe initialization (if "ingress_irq" is still negative), opens the
+ * link(s) for the device, sets up egress state for its echannel,
+ * publishes the device in tile_net_devs_for_channel[] and pushes the
+ * new configuration out with tile_net_update().  On success the
+ * transmit queue is started, the carrier is turned on, and 0 is
+ * returned.
+ *
+ * On failure any link opened here is closed, the device is removed
+ * from tile_net_devs_for_channel[] and a negative errno is returned.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		/* Ingress on "loop0", egress on "loop1". */
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		/* Ordinary device: one link for both directions. */
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	netif_start_queue(dev);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Unpublish the device while the channel index is still
+		 * valid; indexing the table after the reset to -1 below
+		 * would write outside the array.
+		 */
+		if (tile_net_devs_for_channel[priv->channel] == dev)
+			tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, then, under
+ * tile_net_devs_for_channel_mutex, removes the device from
+ * tile_net_devs_for_channel[], pushes the new configuration out with
+ * tile_net_update() (its result is ignored) and closes the device's
+ * link(s).  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Unpublish first so the rebuilt rules skip this channel. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	/* Close the loopify link, if this device had one. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	/* Close the main link. */
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel VA of the first byte of fragment "f": the address
+ * of the fragment's page plus the fragment's offset within it.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	struct page *page = skb_frag_page(f);
+
+	return pfn_to_kaddr(page_to_pfn(page)) + f->page_offset;
+}
+
+/* Acquire a completion entry and "num_edescs" egress slots.
+ *
+ * Returns the first reserved slot on success.  If no completion entry
+ * or no egress slots can be obtained, stops the device's transmit
+ * queue, schedules the tx_wake timer and returns -1.
+ */
+static s64 tile_net_equeue_try_reserve(struct net_device *dev,
+				       struct tile_net_comps *comps,
+				       gxio_mpipe_equeue_t *equeue,
+				       int num_edescs)
+{
+	s64 slot;
+	bool have_comp;
+
+	/* A completion entry is free already, or becomes free once some
+	 * finished completions have been released.
+	 */
+	have_comp = (comps->comp_next - comps->comp_last <
+		     TILE_NET_MAX_COMPS - 1);
+	if (!have_comp)
+		have_comp = (tile_net_free_comps(equeue, comps,
+						 32, false) != 0);
+
+	if (have_comp) {
+		/* First attempt at the egress slots. */
+		slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+		if (slot >= 0)
+			return slot;
+
+		/* Freeing some completions gives the equeue time to drain. */
+		tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
+
+		/* Second and last attempt. */
+		slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+		if (slot >= 0)
+			return slot;
+	}
+
+	/* Still nothing; give up and stop the queue for a short while. */
+	netif_stop_queue(dev);
+	tile_net_schedule_tx_wake_timer(dev);
+	return -1;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Each segment needs one edesc for its header plus one for every
+ * piece of payload, where a piece ends at either a segment boundary
+ * or a fragment boundary.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care.
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int remaining = skb->data_len;
+	unsigned int seg_len = sh->gso_size;
+	long frag_idx = -1;  /* index of the current fragment */
+	long frag_len = -1;  /* size of the current fragment */
+	long frag_pos = -1;  /* bytes consumed from the current fragment */
+	long chunk;          /* size of the current piece of payload */
+	int count = 0;
+	int seg;
+
+	for (seg = 0; seg < sh->gso_segs; seg++) {
+		unsigned int seg_pos = 0;
+
+		/* One edesc for the header. */
+		count++;
+
+		/* One edesc for each piece of the payload. */
+		while (seg_pos < seg_len) {
+
+			/* Step to the next fragment once this one is used up. */
+			while (frag_pos >= frag_len) {
+				frag_idx++;
+				frag_len = sh->frags[frag_idx].size;
+				frag_pos = 0;
+			}
+
+			/* Take as much as both segment and fragment allow. */
+			chunk = seg_len - seg_pos;
+			if (chunk > frag_len - frag_pos)
+				chunk = frag_len - frag_pos;
+			frag_pos += chunk;
+			seg_pos += chunk;
+
+			count++;
+		}
+
+		/* The last segment may be less than gso_size. */
+		remaining -= seg_len;
+		if (remaining < seg_len)
+			seg_len = remaining;
+	}
+
+	return count;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ *
+ * For each of the skb's gso_segs segments, the first "sh_len" bytes of
+ * skb->data (everything up to the end of the TCP header) are copied
+ * into the header area that belongs to the segment's first egress
+ * slot, and the IP and TCP fields that differ per segment are
+ * rewritten in the copy.  "slot" is the first reserved egress slot;
+ * it is advanced by one for the header and by one for each piece of
+ * payload, using the same fragment walk as tso_count_edescs(), so that
+ * each header lands in the area matching the slot used for it.
+ *
+ * FIXME: add support for IPv6.
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+				s64 slot)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct iphdr *ih;
+	struct tcphdr *th;
+	unsigned int data_len = skb->data_len;
+	unsigned char *data = skb->data;
+	unsigned int ih_off, th_off, sh_len, p_len;
+	unsigned int isum_seed, tsum_seed, id, seq;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int segment;
+
+	/* Locate original headers and compute various lengths.
+	 * "sh_len" covers everything from the start of the packet data
+	 * through the end of the TCP header.
+	 */
+	ih = ip_hdr(skb);
+	th = tcp_hdr(skb);
+	ih_off = skb_network_offset(skb);
+	th_off = skb_transport_offset(skb);
+	sh_len = th_off + tcp_hdrlen(skb);
+	p_len = sh->gso_size;
+
+	/* Set up seed values for IP and TCP csum and initialize id and seq.
+	 * The IP seed sums the complements of the original check, tot_len
+	 * and id, so the per-segment tot_len and id can be added in below.
+	 */
+	isum_seed = ((0xFFFF - ih->check) +
+		     (0xFFFF - ih->tot_len) +
+		     (0xFFFF - ih->id));
+	tsum_seed = th->check + (0xFFFF ^ htons(skb->len));
+	id = ntohs(ih->id);
+	seq = ntohl(th->seq);
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Copy to the header memory for this segment.  The area is
+		 * selected by the slot (modulo EQUEUE_ENTRIES) and the copy
+		 * starts NET_IP_ALIGN bytes into it.
+		 */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		memcpy(buf, data, sh_len);
+
+		/* Update copied ip header: length of this segment from the
+		 * IP header onward, this segment's id, and the checksum
+		 * rebuilt from the seed (csum_long() presumably folds the
+		 * sum to 16 bits -- confirm against its definition).
+		 */
+		ih = (struct iphdr *)(buf + ih_off);
+		ih->tot_len = htons(sh_len + p_len - ih_off);
+		ih->id = htons(id);
+		ih->check = csum_long(isum_seed + ih->tot_len +
+				      ih->id) ^ 0xffff;
+
+		/* Update copied tcp header.  FIN and PSH are cleared on
+		 * every segment except the last one.
+		 */
+		th = (struct tcphdr *)(buf + th_off);
+		th->seq = htonl(seq);
+		th->check = csum_long(tsum_seed + htons(sh_len + p_len));
+		if (segment != sh->gso_segs - 1) {
+			th->fin = 0;
+			th->psh = 0;
+		}
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload: one slot per piece, where a piece
+		 * ends at a segment or fragment boundary.
+		 */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		/* The next segment gets the next IP id and a sequence
+		 * number advanced by this segment's payload length.
+		 */
+		id++;
+		seq += p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Flush the headers so they are ready for hardware DMA. */
+	wmb();
+}
+
+/* Pass all the data to mpipe for egress.
+ *
+ * Starting at egress slot "slot", writes one header edesc per segment
+ * (pointing at the header copy in "headers" for that slot) followed by
+ * one body edesc per piece of payload taken from the skb's fragments.
+ * The walk over segments and fragments is the same one used by
+ * tso_count_edescs() and tso_headers_prepare().  Afterwards the
+ * device's tx_packets and tx_bytes counters are updated.
+ */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+		       struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	unsigned int csum_start, sh_len;
+	int segment;
+
+	/* Prepare to egress the headers: set up header edesc.  These
+	 * fields are the same for every segment; only "va" changes.
+	 */
+	csum_start = skb_checksum_start_offset(skb);
+	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		void *va;
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Egress the header: the copy stored for this slot
+		 * (modulo EQUEUE_ENTRIES), NET_IP_ALIGN bytes in.
+		 */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Address of the first unused byte of the fragment;
+			 * taken before f_used is advanced below.
+			 */
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload.  "bound" is set on
+			 * the piece that completes this segment's payload.
+			 */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		/* Each segment counts as one packet of header + payload. */
+		tx_packets++;
+		tx_bytes += sh_len + p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Do "TSO" handling for egress.
+ *
+ * Counts the edescs the skb needs, reserves that many egress slots
+ * plus a completion entry, writes the per-segment headers and the
+ * edescs, and records a completion on the last slot so the skb is
+ * freed once the hardware is done with it.  If nothing can be
+ * reserved the packet is dropped: it is counted in tx_dropped and
+ * freed here, because returning NETDEV_TX_OK tells the stack that the
+ * driver has taken ownership of the skb.  Always returns NETDEV_TX_OK.
+ *
+ * Normally drivers set NETIF_F_TSO only to support hardware TSO;
+ * otherwise the stack uses scatter-gather to implement GSO in software.
+ * On our testing, enabling GSO support (via NETIF_F_SG) drops network
+ * performance down to around 7.5 Gbps on the 10G interfaces, although
+ * also dropping cpu utilization way down, to under 8%.  But
+ * implementing "TSO" in the driver brings performance back up to line
+ * rate, while dropping cpu usage even further, to less than 4%.  In
+ * practice, profiling of GSO shows that skb_segment() is what causes
+ * the performance overheads; we benefit in the driver from using
+ * preallocated memory to duplicate the TCP/IP headers.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int channel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[channel];
+	struct tile_net_comps *comps = info->comps_for_echannel[channel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned long irqflags;
+	int num_edescs;
+	s64 slot;
+
+	/* Determine how many mpipe edesc's are needed. */
+	num_edescs = tso_count_edescs(skb);
+
+	local_irq_save(irqflags);
+
+	/* Set first reserved egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* We set the tx_queue_len as 0, so there's no tx packet
+		 * enqueuing. We simply drop the packet when the tx queue
+		 * is full.
+		 */
+		tile_net_stats_add(1, &priv->stats.tx_dropped);
+		/* No completion was recorded, so nobody else will ever
+		 * free this skb; release it now to avoid leaking it.
+		 */
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* Set up copies of header data properly. */
+	tso_headers_prepare(skb, egress->headers, slot);
+
+	/* Actually pass the data to the network hardware. */
+	tso_egress(dev, equeue, skb, egress->headers, slot);
+
+	/* Add a completion record on the last slot used by this skb. */
+	add_comp(equeue, comps, slot + num_edescs - 1, skb);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Flatten a transmit request into an array of (buffer, length) pairs.
+ *
+ * If "b_len" is nonzero the linear body ("b_data", "b_len") becomes
+ * the first entry; one entry per page fragment of "skb" follows.
+ * Returns the number of entries written to "frags".
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int i;
+
+	/* The linear part, when present, goes first. */
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	/* Then every page fragment, in order. */
+	for (i = 0; i < sh->nr_frags; i++, out++) {
+		skb_frag_t *f = &sh->frags[i];
+
+		out->buf = tile_net_frag_buf(f);
+		out->length = skb_frag_size(f);
+	}
+
+	return out - frags;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO packets are handed to tile_net_tx_tso().  For everything else
+ * one edesc is built per buffer (linear body plus page fragments), the
+ * needed egress slots and a completion entry are reserved, the edescs
+ * are written, and a completion is recorded on the last slot so the
+ * skb is freed once the hardware is done with it.  If nothing can be
+ * reserved the packet is dropped: it is counted in tx_dropped and
+ * freed here, because returning NETDEV_TX_OK tells the stack that the
+ * driver has taken ownership of the skb.  Always returns NETDEV_TX_OK.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+	unsigned int num_edescs;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned long irqflags;
+	gxio_mpipe_edesc_t edesc = { { 0 } };
+	unsigned int i;
+	s64 slot;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	num_edescs = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* This is only used to specify the TLB. */
+	edesc.stack_idx = large_buffer_stack;
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_edescs; i++) {
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edescs[i] = edesc;
+	}
+
+	/* Mark the final edesc. */
+	edescs[num_edescs - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Set first reserved egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* We set the tx_queue_len as 0, so there's no tx packet
+		 * enqueuing. We simply drop the packet when the tx queue
+		 * is full.
+		 */
+		tile_net_stats_add(1, &priv->stats.tx_dropped);
+		/* No completion was recorded, so nobody else will ever
+		 * free this skb; release it now to avoid leaking it.
+		 */
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	for (i = 0; i < num_edescs; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);
+
+	/* Add a completion record on the last slot used by this skb. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Deal with a transmit timeout.
+ *
+ * The only recovery performed is to wake the device's transmit queue
+ * with netif_wake_queue(); nothing else is reset here.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	netif_wake_queue(dev);
+}
+
+/* Ioctl commands.
+ *
+ * No device-specific ioctls are implemented: "rq" and "cmd" are ignored
+ * and every request is rejected with -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Get system network statistics for device.
+ *
+ * Returns a pointer to the "stats" member embedded in the device's
+ * private data; no copy is made.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	return &priv->stats;
+}
+
+/* Change the MTU.
+ *
+ * Values from 68 through 1500 inclusive are accepted and stored in
+ * dev->mtu; anything else is rejected with -EINVAL and the current MTU
+ * is left untouched.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver has no notion of changing the MAC address, but
+ * the hardware does nothing with the MAC address either, so the address
+ * placed on outgoing packets and accepted on incoming packets is
+ * entirely our choice.
+ *
+ * "p" points to a struct sockaddr holding the new address.  Returns 0
+ * on success, or -EINVAL if the address is not a valid Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (is_valid_ether_addr(sa->sa_data)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The per-cpu ingress IRQ is disabled around a direct call to the
+ * ingress interrupt handler (with a NULL second argument), and is
+ * re-enabled afterwards.  "dev" itself is not consulted.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* Entry points handed to the network core for every tile_net device. */
+static const struct net_device_ops tile_net_ops = {
+	/* Bring-up and tear-down. */
+	.ndo_open		= tile_net_open,
+	.ndo_stop		= tile_net_stop,
+	/* Transmit path. */
+	.ndo_start_xmit		= tile_net_tx,
+	.ndo_tx_timeout		= tile_net_tx_timeout,
+	/* Configuration and statistics. */
+	.ndo_get_stats		= tile_net_get_stats,
+	.ndo_change_mtu		= tile_net_change_mtu,
+	.ndo_set_mac_address	= tile_net_set_mac_address,
+	.ndo_do_ioctl		= tile_net_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tile_net_netpoll,
+#endif
+};
+
+/* The setup function.
+ *
+ * ether_setup() fills in the generic Ethernet fields of "dev"
+ * (including IFF_BROADCAST and IFF_MULTICAST); the driver-specific
+ * ops, watchdog timeout, queue length, MTU and feature flags are then
+ * layered on top.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->tx_queue_len = 0;
+	dev->mtu = 1500;
+
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM |
+			 NETIF_F_SG | NETIF_F_TSO;
+}
+
+/* Create and register the net_device for one link.
+ *
+ * "name" is used verbatim as the interface name and "mac" supplies the
+ * six-byte hardware address; an all-zero "mac" is replaced by a random
+ * address.  Links whose name starts with "loop" are skipped.  Failures
+ * are logged and otherwise ignored, since nothing is returned.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct tile_net_priv *priv;
+	struct net_device *dev;
+	int mac_bits = 0;
+	int err;
+	int n;
+
+	/* HACK: Ignore "loop" links. */
+	if (!strncmp(name, "loop", 4))
+		return;
+
+	/* Normally "name" would be a template that register_netdev()
+	 * instantiates, but here it is already the final name.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (dev == NULL) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	/* Start "priv" from a clean slate, with every channel set to -1. */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* Initialize the transmit wake timer. */
+	hrtimer_init(&priv->tx_wake_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	priv->tx_wake_timer.function = tile_net_handle_tx_wake_timer;
+
+	/* The MAC address must be in place before the device is opened.
+	 * An all-zero MAC probably means we are on the simulator, so
+	 * fall back to a random address in that case.
+	 */
+	for (n = 0; n < 6; n++)
+		mac_bits |= mac[n];
+
+	if (mac_bits == 0) {
+		random_ether_addr(dev->dev_addr);
+	} else {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	}
+
+	/* Register the network device; on success we are done. */
+	err = register_netdev(dev);
+	if (err == 0)
+		return;
+
+	netdev_err(dev, "register_netdev failed %d\n", err);
+	free_netdev(dev);
+}
+
+/* Per-cpu module initialization.
+ *
+ * Resets this cpu's tile_net_info: records the cpu number, marks it as
+ * having no iqueue, and prepares (but does not start) the egress timer.
+ * "unused" is ignored.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *self = &__get_cpu_var(per_cpu_info);
+
+	self->my_cpu = smp_processor_id();
+	self->has_iqueue = false;
+
+	/* Initialize the egress timer. */
+	hrtimer_init(&self->egress_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	self->egress_timer.function = tile_net_handle_egress_timer;
+}
+
+/* Module initialization.
+ *
+ * Sets up the channel mutex and the per-cpu state, creates a device for
+ * every link that gxio_mpipe_link_enumerate_mac() reports, and falls
+ * back to using all online cpus when network_cpus_init() returns zero.
+ * Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int link = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Walk the links by index until enumeration reports an error,
+	 * initializing a device for each one found.
+	 */
+	while (gxio_mpipe_link_enumerate_mac(link, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		link++;
+	}
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-04 20:12                                           ` [PATCH v9] " Chris Metcalf
@ 2012-06-06 16:41                                             ` David Miller
  2012-06-06 17:31                                             ` Eric Dumazet
                                                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 61+ messages in thread
From: David Miller @ 2012-06-06 16:41 UTC (permalink / raw)
  To: cmetcalf; +Cc: bhutchings, arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Mon, 4 Jun 2012 16:12:03 -0400

> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
> ---
> This change fixes some bugs that were discovered during additional
> testing of the TSO refactoring.  In addition, I added a comment
> explaining why we provide TSO support as essentially driver-side GSO.
> 
> The previous v8 version of the patch (from 10 days ago) received no
> feedback; if anyone would care to provide feedback on this version of
> the driver, it would be much appreciated.  Thanks!

Someone other than me please review this driver.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-04 20:12                                           ` [PATCH v9] " Chris Metcalf
  2012-06-06 16:41                                             ` David Miller
@ 2012-06-06 17:31                                             ` Eric Dumazet
  2012-06-06 17:40                                             ` Eric Dumazet
  2012-06-06 18:10                                             ` [PATCH v9] " Eric Dumazet
  3 siblings, 0 replies; 61+ messages in thread
From: Eric Dumazet @ 2012-06-06 17:31 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: bhutchings, arnd, David Miller, linux-kernel, netdev

On Mon, 2012-06-04 at 16:12 -0400, Chris Metcalf wrote:
> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.

> +static void tile_net_setup(struct net_device *dev)
> +{
> +	ether_setup(dev);
> +	dev->netdev_ops = &tile_net_ops;
> +	dev->watchdog_timeo = TILE_NET_TIMEOUT;
> +	dev->features |= NETIF_F_LLTX;
> +	dev->features |= NETIF_F_HW_CSUM;
> +	dev->features |= NETIF_F_SG;
> +	dev->features |= NETIF_F_TSO;
> +	dev->tx_queue_len = 0;
> +	dev->mtu = 1500;
> +}

Why is tx_queue_len set to 0 ?



^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-04 20:12                                           ` [PATCH v9] " Chris Metcalf
  2012-06-06 16:41                                             ` David Miller
  2012-06-06 17:31                                             ` Eric Dumazet
@ 2012-06-06 17:40                                             ` Eric Dumazet
  2012-06-06 18:36                                               ` Chris Metcalf
  2012-06-06 18:10                                             ` [PATCH v9] " Eric Dumazet
  3 siblings, 1 reply; 61+ messages in thread
From: Eric Dumazet @ 2012-06-06 17:40 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: bhutchings, arnd, David Miller, linux-kernel, netdev

On Mon, 2012-06-04 at 16:12 -0400, Chris Metcalf wrote:

> +/* Allocate and push a buffer. */
> +static bool tile_net_provide_buffer(bool small)
> +{
> +	int stack = small ? small_buffer_stack : large_buffer_stack;
> +	const unsigned long buffer_alignment = 128;
> +	struct sk_buff *skb;
> +	int len;
> +
> +	len = sizeof(struct sk_buff **) + buffer_alignment;
> +	len += (small ? 128 : 1664);

1664 is a magic number, it should be a nice define

#define ..... ( ETH_DATA_LEN + .... )

> +	skb = dev_alloc_skb(len);
> +	if (skb == NULL)
> +		return false;
> +
> +	/* Make room for a back-pointer to 'skb' and guarantee alignment. */
> +	skb_reserve(skb, sizeof(struct sk_buff **));
> +	skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
> +
> +	/* Save a back-pointer to 'skb'. */
> +	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
> +
> +	/* Make sure "skb" and the back-pointer have been flushed. */
> +	wmb();

Interesting, have you considered using build_skb() instead of this
convoluted thing ?

This could save some cache misses...




^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-04 20:12                                           ` [PATCH v9] " Chris Metcalf
                                                               ` (2 preceding siblings ...)
  2012-06-06 17:40                                             ` Eric Dumazet
@ 2012-06-06 18:10                                             ` Eric Dumazet
  2012-06-06 18:17                                               ` David Miller
  2012-06-06 18:19                                               ` Ben Hutchings
  3 siblings, 2 replies; 61+ messages in thread
From: Eric Dumazet @ 2012-06-06 18:10 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: bhutchings, arnd, David Miller, linux-kernel, netdev

On Mon, 2012-06-04 at 16:12 -0400, Chris Metcalf wrote:
> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
> 

> +
> +/* Do "TSO" handling for egress.
> + *
> + * Normally drivers set NETIF_F_TSO only to support hardware TSO;
> + * otherwise the stack uses scatter-gather to implement GSO in software.
> + * On our testing, enabling GSO support (via NETIF_F_SG) drops network
> + * performance down to around 7.5 Gbps on the 10G interfaces, although
> + * also dropping cpu utilization way down, to under 8%.  But
> + * implementing "TSO" in the driver brings performance back up to line
> + * rate, while dropping cpu usage even further, to less than 4%.  In
> + * practice, profiling of GSO shows that skb_segment() is what causes
> + * the performance overheads; we benefit in the driver from using
> + * preallocated memory to duplicate the TCP/IP headers.
> + */

All this stuff cost about 300 lines of code in this driver, without IPv6
support.

I am pretty sure this performance problem should be solved in net/{core|
ipv4|ipv6} instead

What TCP performance do you get with TSO/GSO and SG off ?




^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-06 18:10                                             ` [PATCH v9] " Eric Dumazet
@ 2012-06-06 18:17                                               ` David Miller
  2012-06-06 18:19                                               ` Ben Hutchings
  1 sibling, 0 replies; 61+ messages in thread
From: David Miller @ 2012-06-06 18:17 UTC (permalink / raw)
  To: eric.dumazet; +Cc: cmetcalf, bhutchings, arnd, linux-kernel, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 06 Jun 2012 20:10:23 +0200

> I am pretty sure this performance problem should be solved in net/{core|
> ipv4|ipv6} instead
> 
> What TCP performance do you get with TSO/GSO and SG off ?

We have other drivers already doing this.

I tried a few years ago to make this generic, because NIU could
benefit from it as well, but I couldn't figure out a clean enough
way to abstract this.

Therefore it is absolutely reasonable to continue to let drivers
do this locally until we actually have a reasonable solution.

The gains are definitely significant for chips that lack real TSO
hardware, I absolutely do not require "proof" of this, it is clearly
evident to anyone who considers the issue.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-06 18:10                                             ` [PATCH v9] " Eric Dumazet
  2012-06-06 18:17                                               ` David Miller
@ 2012-06-06 18:19                                               ` Ben Hutchings
  1 sibling, 0 replies; 61+ messages in thread
From: Ben Hutchings @ 2012-06-06 18:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Chris Metcalf, arnd, David Miller, linux-kernel, netdev

On Wed, 2012-06-06 at 20:10 +0200, Eric Dumazet wrote:
> On Mon, 2012-06-04 at 16:12 -0400, Chris Metcalf wrote:
> > This change adds support for the tilegx network driver based on the
> > GXIO IORPC support in the tilegx software stack, using the on-chip
> > mPIPE packet processing engine.
> > 
> 
> > +
> > +/* Do "TSO" handling for egress.
> > + *
> > + * Normally drivers set NETIF_F_TSO only to support hardware TSO;
> > + * otherwise the stack uses scatter-gather to implement GSO in software.
> > + * On our testing, enabling GSO support (via NETIF_F_SG) drops network
> > + * performance down to around 7.5 Gbps on the 10G interfaces, although
> > + * also dropping cpu utilization way down, to under 8%.  But
> > + * implementing "TSO" in the driver brings performance back up to line
> > + * rate, while dropping cpu usage even further, to less than 4%.  In
> > + * practice, profiling of GSO shows that skb_segment() is what causes
> > + * the performance overheads; we benefit in the driver from using
> > + * preallocated memory to duplicate the TCP/IP headers.
> > + */
> 
> All this stuff cost about 300 lines of code in this driver, without IPv6
> support.
> 
> I am pretty sure this performance problem should be solved in net/{core|
> ipv4|ipv6} instead
> 
> What TCP performance do you get with TSO/GSO and SG off ?

It's a real problem and we have soft-TSO in the sfc driver for the same
reason.  GSO means more allocation, more DMA mapping, more calls into
the driver and more register writes.

If drivers could use GSO explicitly from their ndo_start_xmit function,
more like they do with GRO, much of this would presumably be avoidable.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-06 17:40                                             ` Eric Dumazet
@ 2012-06-06 18:36                                               ` Chris Metcalf
  2012-06-06 18:54                                                 ` David Miller
  0 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-06-06 18:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: bhutchings, arnd, David Miller, linux-kernel, netdev

On 6/6/2012 1:40 PM, Eric Dumazet wrote:
> On Mon, 2012-06-04 at 16:12 -0400, Chris Metcalf wrote:
>
>> +/* Allocate and push a buffer. */
>> +static bool tile_net_provide_buffer(bool small)
>> +{
>> +	int stack = small ? small_buffer_stack : large_buffer_stack;
>> +	const unsigned long buffer_alignment = 128;
>> +	struct sk_buff *skb;
>> +	int len;
>> +
>> +	len = sizeof(struct sk_buff **) + buffer_alignment;
>> +	len += (small ? 128 : 1664);
> 1664 is a magic number, it should be a nice define
>
> #define ..... ( ETH_DATA_LEN + .... )

Fair enough.  However, the magic-ness comes from the hardware header code
in arch/tile/gxio/mpipe.h, which provides a limited set of allowed buffer
sizes, including 1664.  But I can add these #defines at the top of this driver:

/* Buffer sizes and mpipe enum codes for buffer stacks.
 * See arch/tile/include/gxio/mpipe.h for the set of possible values.
 */
#define BUFFER_SIZE_SMALL_ENUM GXIO_MPIPE_BUFFER_SIZE_128
#define BUFFER_SIZE_SMALL 128
#define BUFFER_SIZE_LARGE_ENUM GXIO_MPIPE_BUFFER_SIZE_1664
#define BUFFER_SIZE_LARGE 1664


>> +	skb = dev_alloc_skb(len);
>> +	if (skb == NULL)
>> +		return false;
>> +
>> +	/* Make room for a back-pointer to 'skb' and guarantee alignment. */
>> +	skb_reserve(skb, sizeof(struct sk_buff **));
>> +	skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
>> +
>> +	/* Save a back-pointer to 'skb'. */
>> +	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
>> +
>> +	/* Make sure "skb" and the back-pointer have been flushed. */
>> +	wmb();
> Interesting, have you considered using build_skb() instead of this
> convoluted thing ?
>
> This could save some cache misses...

I hadn't looked at build_skb() before; we built up this driver mostly on a
base of 2.6.38, where it doesn't exist.  That said, it doesn't seem like it
matters; dev_alloc_skb() will just end up calling down to build_skb()
anyway, as far as I can tell.

The two skb_reserve() calls, the stuffed-in backpointer, and the barrier
are there because we track the skbuffs in hardware: hardware ignores the
low 7 bits of the address (thus the "buffer_alignment" part), and we need
to be able to pull the actual skb address out of the data when the
hardware returns a pointer to the data to us.

By the way, your question about tx_queue_len is a good one; I'm roping in
our other network developer folks to figure it out.  Originally it was a
performance optimization, I believe; I'm not sure it's still required. 
I'll follow up on that one when we've tracked it down.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v9] tilegx network driver: initial support
  2012-06-06 18:36                                               ` Chris Metcalf
@ 2012-06-06 18:54                                                 ` David Miller
  2001-09-17  4:00                                                   ` [PATCH v10] " Chris Metcalf
                                                                     ` (2 more replies)
  0 siblings, 3 replies; 61+ messages in thread
From: David Miller @ 2012-06-06 18:54 UTC (permalink / raw)
  To: cmetcalf; +Cc: eric.dumazet, bhutchings, arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Wed, 6 Jun 2012 14:36:02 -0400

> By the way, your question about tx_queue_len is a good one; I'm roping in
> our other network developer folks to figure it out.  Originally it was a
> performance optimization, I believe; I'm not sure it's still required. 

It's illegal, you cannot do this.

If you set the TX queue length to zero, amongst other very serious
and grave problems, your device cannot be used with the various
packet scheduler queueing disciplines.

Zero TX queue lengths should only be used for layering drivers which
are purely software entities rather than for real actual hardware.

As stated before, all of the areas where the tilegx driver tries to be
different end up being bugs.  It would therefore be nice, if, as I
suggested before, the driver is audited by you against a known
gold-standard Linux driver such as tg3 to spot inconsistencies like
this.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-04-06 20:42                                                   ` Chris Metcalf
@ 2012-06-07 20:39                                                     ` David Miller
  2012-06-07 20:44                                                       ` Chris Metcalf
  2012-06-07 20:52                                                     ` Joe Perches
  1 sibling, 1 reply; 61+ messages in thread
From: David Miller @ 2012-06-07 20:39 UTC (permalink / raw)
  To: cmetcalf; +Cc: eric.dumazet, bhutchings, arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Fri, 6 Apr 2012 16:42:03 -0400

> Date: Fri, 6 Apr 2012 16:42:03 -0400

You did not commit this file on April 6th.

Please don't use the date emitted by the GIT tools, just
let the email use the natural correct date which is the
one at the time you send the email out.

Otherwise your patch gets misordered as automated tools like
patchwork think this file should go all the way at the back
of the patch queue because of its old date relative to
other pending patches.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-06-07 20:39                                                     ` David Miller
@ 2012-06-07 20:44                                                       ` Chris Metcalf
  2012-06-07 20:47                                                         ` Chris Metcalf
  2012-06-07 20:50                                                         ` Ben Hutchings
  0 siblings, 2 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-06-07 20:44 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, bhutchings, arnd, linux-kernel, netdev

On 6/7/2012 4:39 PM, David Miller wrote:
> From: Chris Metcalf <cmetcalf@tilera.com>
> Date: Fri, 6 Apr 2012 16:42:03 -0400
>
>> Date: Fri, 6 Apr 2012 16:42:03 -0400
> You did not commit this file on April 6th.
>
> Please don't use the date emitted by the GIT tools, just
> let the email use the natural correct date which is the
> one at the time you send the email out.
>
> Otherwise your patch gets misordered as automated tools like
> patchwork think this file should go all the way at the back
> of the patch queue because of its old date relative to
> other pending patches.

Yes, when I use "git rebase" to merge changes into the earlier patch, this
is the behavior I see.  I don't know if there's some way to tell git to
take the date on the later change instead when I "squash" them.  Or if,
perhaps, there is some other workflow I should be using.  It does seem like
the git history should reflect the latest time.

The issue of the date on the email is separate.  I tend to use "git
format-patch" to start with, munge up the headers to jam in some
"In-Reply-To" and "References" lines, manually update the "Date:", then
feed it to "sendmail -t".  Perhaps there's a different workflow I should be
using there, too.  (I tried deleting the "Date", but the one time I tried
that I ended up with some surprisingly bogus date in the email that hit
LKML, so I've been avoiding that approach.)

I'll resend the patch without a Date: line and see how it ends up this time.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* [PATCH v10] tilegx network driver: initial support
  2012-06-06 18:54                                                 ` David Miller
  2001-09-17  4:00                                                   ` [PATCH v10] " Chris Metcalf
  2012-04-06 20:42                                                   ` Chris Metcalf
@ 2012-06-07 20:45                                                   ` Chris Metcalf
  2012-06-12  0:03                                                     ` David Miller
  2 siblings, 1 reply; 61+ messages in thread
From: Chris Metcalf @ 2012-06-07 20:45 UTC (permalink / raw)
  To: David Miller, eric.dumazet, bhutchings, arnd, linux-kernel, netdev

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version makes the driver multi-queued and supports a non-zero
tx_queue_len.  I also made a couple of magic numbers into #defines.
I skimmed the tg3.c driver, but didn't see any other obvious
changes that would be appropriate.

 drivers/net/ethernet/tile/Kconfig  |    2 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1898 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1902 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..098b1c4 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,8 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
+	select HIGH_RES_TIMERS if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..ee7556a
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1898 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?  If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+/* Timeout to wake the per-cpu, per-echannel TX timer after we stop the queue.
+ * We don't want the timeout too short (adds overhead, and might end
+ * up causing stop/wake/stop/wake cycles) or too long (affects performance).
+ * At 10 Gb/s, 30 usec is the wire time of about 25 1500-byte packets.
+ */
+#define TX_TIMER_DELAY_USEC 30
+
+/* Timeout to wake the per-cpu egress timer to free completions. */
+#define EGRESS_TIMER_DELAY_USEC 1000
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+/* A single completion. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and echannel. */
+struct tile_net_comps {
+	/* Ring of completions, indexed by a counter % TILE_NET_MAX_COMPS. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* Running count of completions added (index of the next slot). */
+	unsigned long comp_next;
+	/* Running count of completions freed; [last, next) are pending. */
+	unsigned long comp_last;
+};
+
+/* The transmit wake timer for a given cpu and echannel. */
+struct tile_net_tx_wake {
+	struct hrtimer timer;
+	struct net_device *dev;
+};
+
+/* Info for a specific cpu. */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Ingress packet queue; only meaningful when "has_iqueue" is set. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid (set only for cpus in network_cpus_map). */
+	bool has_iqueue;
+	/* NAPI flags: netif_napi_add() / napi_enable() already done. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct hrtimer egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel (slices of one per-cpu allocation). */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+	/* Transmit wake timer for each egress channel. */
+	struct tile_net_tx_wake tx_wake[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel. */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* Buffer sizes and mpipe enum codes for buffer stacks.
+ * See arch/tile/include/gxio/mpipe.h for the set of possible values.
+ */
+#define BUFFER_SIZE_SMALL_ENUM GXIO_MPIPE_BUFFER_SIZE_128
+#define BUFFER_SIZE_SMALL 128
+#define BUFFER_SIZE_LARGE_ENUM GXIO_MPIPE_BUFFER_SIZE_1664
+#define BUFFER_SIZE_LARGE 1664
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* Parse "tile_net.cpus=m-n[,x-y]", the cpus dedicated to handling
+ * ingress packets (m, n, x, y are integers; the cpus named can be
+ * neither a dedicated cpu nor a dataplane cpu).
+ *
+ * Returns true only if the list was given, parsed cleanly into
+ * "network_cpus_map", and still names at least one possible cpu.
+ */
+static bool network_cpus_init(void)
+{
+	char buf[1024];
+	int rc;
+
+	if (network_cpus_string == NULL)
+		return false;
+
+	rc = cpulist_parse_crop(network_cpus_string, &network_cpus_map);
+	if (rc != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Remove dedicated cpus (by masking with cpu_possible_mask). */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically update a statistics field.
+ * Note that on TILE-Gx, this operation is fire-and-forget on the
+ * issuing core (single-cycle dispatch) and takes only a few cycles
+ * longer than a regular store when the request reaches the home cache.
+ * No expensive bus management overhead is required.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(unsigned long));
+	atomic_long_add(value, (atomic_long_t *)field);
+}
+
+/* Allocate an skb and push its buffer to mPIPE; false if allocation fails. */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+	const unsigned long buffer_alignment = 128;
+	struct sk_buff *skb;
+	int len;
+
+	len = sizeof(struct sk_buff **) + buffer_alignment;
+	len += (small ? BUFFER_SIZE_SMALL : BUFFER_SIZE_LARGE);
+	skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb' and guarantee alignment. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+	skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
+
+	/* Save a back-pointer to 'skb' just below the aligned data start. */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Convert a raw mpipe buffer address "va" to its matching skb pointer. */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	/* The back-pointer to the "skb" is stored just below "va". */
+	struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+	struct sk_buff *skb = *skb_ptr;
+
+	/* Paranoia: the skb's data pointer must still equal "va". */
+	if (skb->data != va) {
+		/* Panic here since there's a reasonable chance
+		 * that corrupt buffers means generic memory
+		 * corruption, with unpredictable system effects.
+		 */
+		panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
+		      va, skb, skb->data);
+	}
+
+	return skb;
+}
+
+/* Pop and free the buffers on "stack" until a zero address is returned. */
+static void tile_net_pop_all_buffers(int stack)
+{
+	tile_io_addr_t io_addr;
+
+	while ((io_addr = (tile_io_addr_t)
+			gxio_mpipe_pop_buffer(&context, stack)) != 0) {
+		dev_kfree_skb_irq(mpipe_buf_to_skb(tile_io_addr_to_va(io_addr)));
+	}
+}
+
+/* Push the small and large buffers this cpu still owes to mPIPE. */
+static void tile_net_provide_needed_buffers(void)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	while (info->num_needed_small_buffers != 0) {
+		if (!tile_net_provide_buffer(true))
+			goto oops;
+		info->num_needed_small_buffers--;
+	}
+
+	while (info->num_needed_large_buffers != 0) {
+		if (!tile_net_provide_buffer(false))
+			goto oops;
+		info->num_needed_large_buffers--;
+	}
+
+	return;
+
+oops:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Return true if the packet at "buf" should be dropped, not delivered. */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	/* Filter packets received before we're up. */
+	if (dev == NULL || !(dev->flags & IFF_UP))
+		return true;
+
+	/* Promiscuous mode and multicast packets are never filtered. */
+	if ((dev->flags & IFF_PROMISC) || is_multicast_ether_addr(buf))
+		return false;
+
+	/* Filter out unicast packets that aren't for us. */
+	return compare_ether_addr(dev->dev_addr, buf) != 0;
+}
+
+/* Pass "skb", holding a "len"-byte packet for "dev", up the stack. */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	/* Encode the actual packet length and the L2 protocol. */
+	skb_put(skb, len);
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Acknowledge "good" hardware checksums. */
+	if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	netif_receive_skb(skb);
+
+	/* Update stats. */
+	tile_net_stats_add(1, &priv->stats.rx_packets);
+	tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+	/* The stack this buffer came from is now owed a replacement. */
+	if (idesc->size != BUFFER_SIZE_SMALL_ENUM)
+		info->num_needed_large_buffers++;
+	else
+		info->num_needed_small_buffers++;
+}
+
+/* Handle one idesc.  Return true if "processed", false if dropped/filtered. */
+static bool tile_net_handle_packet(gxio_mpipe_idesc_t *idesc)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va, *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available (heavy load). */
+	if (idesc->be) {
+		/* The channel's device may already be gone (NULL). */
+		if (dev != NULL) {
+			struct tile_net_priv *priv = netdev_priv(dev);
+			tile_net_stats_add(1, &priv->stats.rx_dropped);
+		}
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * Handles at most TILE_NET_BATCH idescs per peek, "budget" packets per call.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets; only unfiltered ones count against "budget". */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left (the "done" exits skip this step). */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	tile_net_provide_needed_buffers();
+
+	return work;
+}
+
+/* Ingress irq handler: schedule NAPI polling on the current cpu. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	napi_schedule(&info->napi);
+	return IRQ_HANDLED;
+}
+
+/* Free up to "limit" completions (<= 0: no cap).  Irqs must be blocked. */
+static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int n = 0;
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[cid];
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
+						   force_update || n == 0))
+			break;
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		if (++n == limit)
+			break;
+	}
+	return n;	/* number of completions freed */
+}
+
+/* Queue "skb" to be freed once "when" completes.  Call with interrupts
+ * blocked; tile_net_equeue_try_reserve() will have ensured a free entry.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	int cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = when;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+}
+
+/* Arm this cpu's tx-wake timer for the egress channel used by "dev". */
+static void tile_net_schedule_tx_wake_timer(struct net_device *dev)
+{
+	int echannel = ((struct tile_net_priv *)netdev_priv(dev))->echannel;
+	ktime_t delay = ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL);
+
+	hrtimer_start(&__get_cpu_var(per_cpu_info).tx_wake[echannel].timer,
+		      delay, HRTIMER_MODE_REL_PINNED);
+}
+
+/* Timer callback: wake the current cpu's tx subqueue on the timer's device. */
+static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
+{
+	struct tile_net_tx_wake *w = container_of(t, typeof(*w), timer);
+	netif_wake_subqueue(w->dev, smp_processor_id());
+	return HRTIMER_NORESTART;
+}
+
+/* Arm this cpu's egress timer unless it is already pending. */
+static void tile_net_schedule_egress_timer(void)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (info->egress_timer_scheduled)
+		return;
+	hrtimer_start(&info->egress_timer,
+		      ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL_PINNED);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.
+ */
+static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned long irqflags;
+	bool pending = false;
+	int i;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile (limit -1 means no cap). */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_egress *egress = &egress_for_echannel[i];
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+		tile_net_free_comps(egress->equeue, comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Re-arm by hand if needed; we always return HRTIMER_NORESTART. */
+	if (pending)
+		tile_net_schedule_egress_timer();
+
+	local_irq_restore(irqflags);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Helper function for "tile_net_update()", run on each online cpu.
+ * "dev" (i.e. arg) is the device being brought up or down,
+ * or NULL if all devices are now down.  No-op without an iqueue.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = arg;
+
+	if (!info->has_iqueue)
+		return;
+
+	if (dev != NULL) {
+		if (!info->napi_added) {
+			netif_napi_add(dev, &info->napi,
+				       tile_net_poll, TILE_NET_WEIGHT);
+			info->napi_added = true;
+		}
+		if (!info->napi_enabled) {
+			napi_enable(&info->napi);
+			info->napi_enabled = true;
+		}
+		enable_percpu_irq(ingress_irq, 0);
+	} else {
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+	}
+}
+
+/* Helper for tile_net_open()/tile_net_stop(): recommits mPIPE rules and
+ * updates each cpu.  Always called under tile_net_devs_for_channel_mutex.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	static gxio_mpipe_rules_t rules;  /* too big to fit on the stack */
+	bool saw_channel = false;
+	int channel;
+	int rc;
+	int cpu;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (!saw_channel) {
+			saw_channel = true;
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	rc = gxio_mpipe_rules_commit(&rules);
+	if (rc != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 (saw_channel ? dev : NULL), 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (saw_channel)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.  Returns 0 or
+ * a negative error; tile_net_init_mpipe() cleans up after a failure.
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact() so we get the required 64KB alignment as well.
+	 */
+	buffer_stack_size = gxio_mpipe_calc_buffer_stack_bytes(num_buffers);
+	buffer_stack_size = ALIGN(buffer_stack_size, 64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small buffer stack. */
+	small_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (small_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  BUFFER_SIZE_SMALL_ENUM,
+					  small_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	/* Allocate the large buffer stack. */
+	large_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (large_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  BUFFER_SIZE_LARGE_ENUM,
+					  large_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs) for
+ * tile_net_init_mpipe(); returns the next free ring, or a negative error.
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+					int cpu, int ring)
+{
+	struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+	int order, i, rc;
+	struct page *page;
+	void *addr;
+
+	/* Allocate one zeroed block of "comps", sliced per echannel. */
+	order = get_order(COMPS_SIZE);
+	page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+	if (page == NULL) {
+		netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+			   COMPS_SIZE);
+		return -ENOMEM;
+	}
+	addr = pfn_to_kaddr(page_to_pfn(page));
+	memset(addr, 0, COMPS_SIZE);
+	for (i = 0; i < TILE_NET_CHANNELS; i++)
+		info->comps_for_echannel[i] =
+			addr + i * sizeof(struct tile_net_comps);
+
+	/* If this is a network cpu, create an iqueue (uses up "ring"). */
+	if (cpu_isset(cpu, network_cpus_map)) {
+		order = get_order(NOTIF_RING_SIZE);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes iqueue memory\n",
+				   NOTIF_RING_SIZE);
+			return -ENOMEM;
+		}
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring++,
+					    addr, NOTIF_RING_SIZE, 0);
+		if (rc < 0) {
+			netdev_err(dev,
+				   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+			return rc;
+		}
+		info->has_iqueue = true;
+	}
+
+	return ring;
+}
+
+/* Initialize NotifGroup and buckets; returns 0 or a negative error.
+ * This routine supports tile_net_init_mpipe(), below.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group, rc;
+
+	/* Allocate one NotifGroup. */
+	rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   rc);
+		return rc;
+	}
+	group = rc;
+
+	/* Scale global num_buckets with the cpu count (else unchanged). */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets, and set global first_bucket value. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		return rc;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (rc != 0) {
+		netdev_err(
+			dev,
+			"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+			rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Create and activate a per-cpu irq, register its handler, and route
+ * each iqueue's NotifRing interrupt to it.  Note that "ingress_irq"
+ * being initialized is how we know not to call tile_net_init_mpipe()
+ * again.  Returns 0, or a negative error with "ingress_irq" at -1.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu, rc;
+
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		return rc;
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, "tile_net", NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return rc;
+	}
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to tile_net_init_mpipe. */
+static void tile_net_init_mpipe_fail(void)
+{
+	int cpu;
+
+	/* Do cleanups that require the mpipe context first. */
+	if (small_buffer_stack >= 0)
+		tile_net_pop_all_buffers(small_buffer_stack);
+	if (large_buffer_stack >= 0)
+		tile_net_pop_all_buffers(large_buffer_stack);
+
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		free_pages((unsigned long)(info->comps_for_echannel[0]),
+			   get_order(COMPS_SIZE));
+		info->comps_for_echannel[0] = NULL;	/* [0] was the base */
+		free_pages((unsigned long)(info->iqueue.idescs),
+			   get_order(NOTIF_RING_SIZE));
+		info->iqueue.idescs = NULL;
+	}
+
+	if (small_buffer_stack_va)
+		free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+	if (large_buffer_stack_va)
+		free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+
+	small_buffer_stack_va = NULL;
+	large_buffer_stack_va = NULL;
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int i, num_buffers, rc;
+	int cpu;
+	int first_ring, ring;
+	int network_cpus_count = cpus_weight(network_cpus_map);
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Size the buffer stacks to fill every iqueue plus one batch. */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers; "rc" is preset for the failures below. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init per-cpu state; each call returns the next free ring. */
+	first_ring = rc;
+	ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
+		if (rc < 0)
+			goto fail;
+		ring = rc;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts. */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	struct page *headers_page, *edescs_page, *equeue_page;
+	gxio_mpipe_edesc_t *edescs;
+	gxio_mpipe_equeue_t *equeue;
+	unsigned char *headers;
+	int headers_order, edescs_order, equeue_order;
+	size_t edescs_size;
+	int edma;
+	int rc = -ENOMEM;
+
+	/* Only initialize once; a repeat call just returns 0. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue (the error path does not release "edma"). */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done: publish the equeue and headers for this echannel. */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Open the mPIPE link called "link_name" and report its channel.
+ *
+ * Returns the channel number (0 .. TILE_NET_CHANNELS - 1) on success.
+ * If the link cannot be opened, the negative gxio error is returned.
+ * If the link opens but reports an out-of-range channel, the link is
+ * closed again and -EINVAL is returned.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int err;
+
+	err = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (err < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return err;
+	}
+
+	/* Happy path: the channel is usable as an array index. */
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Performs the one-time mPIPE initialization if no device has been
+ * opened yet (ingress_irq < 0), opens the link(s) for this device,
+ * sets up the egress state for its echannel, publishes the device in
+ * tile_net_devs_for_channel[], and starts the per-cpu tx subqueues.
+ *
+ * Returns 0 on success or a negative errno on failure.  On failure
+ * any link opened here is closed again and the channel fields in
+ * "priv" are reset to -1.
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int cpu, rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		/* Ingress comes from "loop0", egress goes to "loop1". */
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize the transmit wake timer for this device for each cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		struct tile_net_tx_wake *tx_wake =
+			&info->tx_wake[priv->echannel];
+
+		hrtimer_init(&tx_wake->timer, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL);
+		tx_wake->timer.function = tile_net_handle_tx_wake_timer;
+		tx_wake->dev = dev;
+	}
+
+	for_each_online_cpu(cpu)
+		netif_start_subqueue(dev, cpu);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		/* Unpublish the device while priv->channel is still a
+		 * valid index.  Doing this after the reset to -1 would
+		 * write to tile_net_devs_for_channel[-1].  Only clear
+		 * the slot if it is actually ours; we may have failed
+		 * before publishing.
+		 */
+		if (tile_net_devs_for_channel[priv->channel] == dev)
+			tile_net_devs_for_channel[priv->channel] = NULL;
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Cancels the per-cpu tx wake timers for this device's echannel and
+ * stops every per-cpu subqueue, then (under the channel mutex)
+ * unpublishes the device, calls tile_net_update(), closes the
+ * link(s), and resets the channel fields in "priv" to -1.
+ *
+ * Always returns 0; link-close failures are only logged.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int cpu;
+
+	/* Quiesce transmit before touching shared state. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		struct tile_net_tx_wake *tx_wake =
+			&info->tx_wake[priv->echannel];
+
+		hrtimer_cancel(&tx_wake->timer);
+		netif_stop_subqueue(dev, cpu);
+	}
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+	/* NOTE(review): indexes with priv->channel before the ">= 0"
+	 * check below; presumably safe because stop only follows a
+	 * successful open -- confirm.
+	 */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	/* The result is deliberately ignored: stop cannot fail. */
+	(void)tile_net_update(dev);
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel virtual address of the first byte of fragment "f":
+ * the address of the fragment's page plus the fragment's page offset.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	return pfn_to_kaddr(page_to_pfn(skb_frag_page(f))) + f->page_offset;
+}
+
+/* Acquire a completion entry and an egress slot, or if we can't,
+ * stop the queue and schedule the tx_wake timer.
+ *
+ * Returns the first reserved equeue slot (>= 0) on success, or -1
+ * after stopping this cpu's subqueue and arming the tx wake timer.
+ * "num_edescs" is the number of consecutive slots to reserve.
+ */
+static s64 tile_net_equeue_try_reserve(struct net_device *dev,
+				       struct tile_net_comps *comps,
+				       gxio_mpipe_equeue_t *equeue,
+				       int num_edescs)
+{
+	/* Try to acquire a completion entry. */
+	/* Either there is already room in the completion array, or we
+	 * try to reclaim up to 32 entries (presumably a nonzero return
+	 * from tile_net_free_comps() means some were freed -- confirm).
+	 */
+	if (comps->comp_next - comps->comp_last < TILE_NET_MAX_COMPS - 1 ||
+	    tile_net_free_comps(equeue, comps, 32, false) != 0) {
+
+		/* Try to acquire an egress slot. */
+		s64 slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+		if (slot >= 0)
+			return slot;
+
+		/* Freeing some completions gives the equeue time to drain. */
+		tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
+
+		/* Second and final attempt after the drain. */
+		slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+		if (slot >= 0)
+			return slot;
+	}
+
+	/* Still nothing; give up and stop the queue for a short while. */
+	netif_stop_subqueue(dev, smp_processor_id());
+	tile_net_schedule_tx_wake_timer(dev);
+	return -1;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care.
+ *
+ * The count is one edesc per segment header, plus one edesc for every
+ * piece of payload, where a piece ends at either a fragment boundary
+ * or the end of the segment's payload.  The walk here mirrors the ones
+ * in tso_headers_prepare() and tso_egress(); all three must advance
+ * identically.
+ *
+ * NOTE(review): payload is taken only from sh->frags[], and the index
+ * is not checked against nr_frags; the "frags being empty" case
+ * mentioned above does not appear to be handled here -- confirm.
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	/* All three start at -1 so that the first "advance" step below
+	 * (f_used >= f_size) fires and moves to fragment 0.
+	 */
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int num_edescs = 0;
+	int segment;
+
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		unsigned int p_used = 0;
+
+		/* One edesc for header and for each piece of the payload. */
+		/* The loop initializer counts the header; each pass
+		 * through the body counts one payload piece.
+		 */
+		for (num_edescs++; p_used < p_len; num_edescs++) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			/* n = min(bytes left in segment, bytes left in frag) */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+		}
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	return num_edescs;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ * FIXME: add support for IPv6.
+ *
+ * For each GSO segment, copies the original headers (link header
+ * through the end of the TCP header) into the staging buffer that
+ * belongs to the equeue slot which will carry that segment's header,
+ * then patches the IP total length, IP id, IP checksum, TCP sequence
+ * number, TCP checksum and the FIN/PSH flags in the copy.
+ *
+ * "headers" is the base of the per-echannel staging area and "slot" is
+ * the first reserved equeue slot.  The slot counter is advanced here
+ * exactly as in tso_egress() so that buffer indices line up.
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+				s64 slot)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct iphdr *ih;
+	struct tcphdr *th;
+	unsigned int data_len = skb->data_len;
+	unsigned char *data = skb->data;
+	unsigned int ih_off, th_off, sh_len, p_len;
+	unsigned int isum_seed, tsum_seed, id, seq;
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	int segment;
+
+	/* Locate original headers and compute various lengths. */
+	ih = ip_hdr(skb);
+	th = tcp_hdr(skb);
+	ih_off = skb_network_offset(skb);
+	th_off = skb_transport_offset(skb);
+	/* sh_len: bytes from skb->data through the end of the TCP header. */
+	sh_len = th_off + tcp_hdrlen(skb);
+	p_len = sh->gso_size;
+
+	/* Set up seed values for IP and TCP csum and initialize id and seq. */
+	/* The IP seed removes the original check, tot_len and id
+	 * contributions; the per-segment values are added back below.
+	 */
+	isum_seed = ((0xFFFF - ih->check) +
+		     (0xFFFF - ih->tot_len) +
+		     (0xFFFF - ih->id));
+	tsum_seed = th->check + (0xFFFF ^ htons(skb->len));
+	id = ntohs(ih->id);
+	seq = ntohl(th->seq);
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Copy to the header memory for this segment. */
+		/* The buffer is selected by the slot's position in the ring. */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		memcpy(buf, data, sh_len);
+
+		/* Update copied ip header. */
+		ih = (struct iphdr *)(buf + ih_off);
+		ih->tot_len = htons(sh_len + p_len - ih_off);
+		ih->id = htons(id);
+		ih->check = csum_long(isum_seed + ih->tot_len +
+				      ih->id) ^ 0xffff;
+
+		/* Update copied tcp header. */
+		th = (struct tcphdr *)(buf + th_off);
+		th->seq = htonl(seq);
+		th->check = csum_long(tsum_seed + htons(sh_len + p_len));
+		/* FIN and PSH are kept only on the last segment. */
+		if (segment != sh->gso_segs - 1) {
+			th->fin = 0;
+			th->psh = 0;
+		}
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		/* Nothing is written here; the walk only counts the
+		 * slots that the payload pieces will occupy.
+		 */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		/* Each segment gets the next IP id and the next sequence
+		 * number after its payload.
+		 */
+		id++;
+		seq += p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Flush the headers so they are ready for hardware DMA. */
+	wmb();
+}
+
+/* Pass all the data to mpipe for egress.
+ *
+ * For each GSO segment, posts one header edesc (pointing at the
+ * staging buffer filled in by tso_headers_prepare()) followed by one
+ * body edesc per payload piece, starting at equeue slot "slot".  The
+ * final body edesc of each segment has "bound" set.  Finally the
+ * device tx_packets/tx_bytes statistics are updated.
+ */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+		       struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	unsigned int data_len = skb->data_len;
+	unsigned int p_len = sh->gso_size;
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+	long f_id = -1;    /* id of the current fragment */
+	long f_size = -1;  /* size of the current fragment */
+	long f_used = -1;  /* bytes used from the current fragment */
+	long n;            /* size of the current piece of payload */
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	unsigned int csum_start, sh_len;
+	int segment;
+
+	/* Prepare to egress the headers: set up header edesc. */
+	/* These fields are identical for every segment; only "va"
+	 * changes per segment below.
+	 */
+	csum_start = skb_checksum_start_offset(skb);
+	sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+
+	/* This is only used to specify the TLB. */
+	edesc_head.stack_idx = large_buffer_stack;
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+		void *va;
+		unsigned char *buf;
+		unsigned int p_used = 0;
+
+		/* Egress the header. */
+		/* Same buffer selection as in tso_headers_prepare(). */
+		buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+			NET_IP_ALIGN;
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Address of the next unused byte; must be taken
+			 * before f_used is advanced below.
+			 */
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload. */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			/* Set on the piece that completes this segment. */
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += sh_len + p_len;
+
+		/* The last segment may be less than gso_size. */
+		data_len -= p_len;
+		if (data_len < p_len)
+			p_len = data_len;
+	}
+
+	/* Update stats. */
+	tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+	tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Do "TSO" handling for egress.
+ *
+ * Normally drivers set NETIF_F_TSO only to support hardware TSO;
+ * otherwise the stack uses scatter-gather to implement GSO in software.
+ * In our testing, enabling GSO support (via NETIF_F_SG) drops network
+ * performance down to around 7.5 Gbps on the 10G interfaces, although
+ * also dropping cpu utilization way down, to under 8%.  But
+ * implementing "TSO" in the driver brings performance back up to line
+ * rate, while dropping cpu usage even further, to less than 4%.  In
+ * practice, profiling of GSO shows that skb_segment() is what causes
+ * the performance overheads; we benefit in the driver from using
+ * preallocated memory to duplicate the TCP/IP headers.
+ *
+ * Returns NETDEV_TX_OK once all edescs for the skb have been posted,
+ * or NETDEV_TX_BUSY if no completion entry / egress slots could be
+ * reserved (in which case the queue has been stopped).
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int channel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[channel];
+	struct tile_net_comps *comps = info->comps_for_echannel[channel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned long irqflags;
+	int num_edescs;
+	s64 slot;
+
+	/* Determine how many mpipe edesc's are needed. */
+	num_edescs = tso_count_edescs(skb);
+
+	/* Local interrupts stay off from the reservation until the
+	 * completion record has been added.
+	 */
+	local_irq_save(irqflags);
+
+	/* Try to acquire a completion entry and an egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Set up copies of header data properly. */
+	tso_headers_prepare(skb, egress->headers, slot);
+
+	/* Actually pass the data to the network hardware. */
+	tso_egress(dev, equeue, skb, egress->headers, slot);
+
+	/* Add a completion record. */
+	/* Keyed on the last slot used by this skb. */
+	add_comp(equeue, comps, slot + num_edescs - 1, skb);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer();
+
+	return NETDEV_TX_OK;
+}
+
+/* Flatten an skb into a list of (buffer, length) pairs.
+ *
+ * If "b_len" is nonzero, the first entry describes the linear area
+ * ("b_data", "b_len"); one further entry follows for each page
+ * fragment of the skb.  Returns the number of entries written to
+ * "frags".
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int idx;
+
+	/* The linear area, when present, always comes first. */
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	for (idx = 0; idx < shinfo->nr_frags; idx++, out++) {
+		skb_frag_t *src = &shinfo->frags[idx];
+
+		out->buf = tile_net_frag_buf(src);
+		out->length = skb_frag_size(src);
+	}
+
+	return out - frags;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO packets are handed to tile_net_tx_tso().  Otherwise one edesc is
+ * built per buffer (linear area plus page fragments), a completion
+ * entry and equeue slots are reserved with interrupts disabled, the
+ * edescs are posted, and the tx statistics are updated.
+ *
+ * Returns NETDEV_TX_OK on success or NETDEV_TX_BUSY if nothing could
+ * be reserved (the queue has then been stopped).
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+	unsigned int num_edescs;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	unsigned long irqflags;
+	gxio_mpipe_edesc_t edesc = { { 0 } };
+	unsigned int i;
+	s64 slot;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	num_edescs = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* This is only used to specify the TLB. */
+	edesc.stack_idx = large_buffer_stack;
+
+	/* Prepare the edescs. */
+	/* Built in a local array first so that the interrupts-off
+	 * section below only has to copy them into the ring.
+	 */
+	for (i = 0; i < num_edescs; i++) {
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edescs[i] = edesc;
+	}
+
+	/* Mark the final edesc. */
+	/* NOTE(review): assumes num_edescs >= 1, i.e. the skb has a
+	 * non-empty linear area or at least one fragment -- confirm.
+	 */
+	edescs[num_edescs - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Try to acquire a completion entry and an egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_edescs; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot++);
+
+	/* Add a completion record. */
+	/* "slot" is now one past the last slot used, hence the -1. */
+	add_comp(equeue, comps, slot - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer();
+
+	return NETDEV_TX_OK;
+}
+
+/* Pick the tx subqueue for "skb": always the one whose index equals
+ * the id of the cpu we are currently running on.
+ */
+static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	return smp_processor_id();
+}
+
+/* Transmit watchdog handler: wake the tx subqueue of every online cpu. */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	int queue;
+
+	for_each_online_cpu(queue) {
+		netif_wake_subqueue(dev, queue);
+	}
+}
+
+/* Device ioctl hook.  No commands are implemented, so every request
+ * is rejected with -EOPNOTSUPP.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Return a pointer to the statistics block kept in the device's
+ * private data.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *np = netdev_priv(dev);
+	struct net_device_stats *stats = &np->stats;
+
+	return stats;
+}
+
+/* Set a new MTU.  Values from 68 to 1500 inclusive are accepted and
+ * stored in dev->mtu; anything else is rejected with -EINVAL.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver has no support for changing the MAC address,
+ * but since the hardware does nothing with it, the address placed on
+ * outgoing packets and accepted on incoming ones is entirely ours to
+ * choose.  "p" points to a struct sockaddr holding the new address.
+ *
+ * Returns 0 on success, or -EINVAL if the address is not a valid
+ * Ethernet address.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (is_valid_ether_addr(sa->sa_data)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The per-cpu ingress irq is disabled around a direct call to the
+ * ingress handler and re-enabled afterwards.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	/* The handler is invoked by hand with a NULL second argument. */
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* The net_device callbacks implemented by this driver; installed on
+ * each device by tile_net_setup().  The poll-controller hook is only
+ * present when CONFIG_NET_POLL_CONTROLLER is set.
+ */
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open = tile_net_open,
+	.ndo_stop = tile_net_stop,
+	.ndo_start_xmit = tile_net_tx,
+	.ndo_select_queue = tile_net_select_queue,
+	.ndo_do_ioctl = tile_net_ioctl,
+	.ndo_get_stats = tile_net_get_stats,
+	.ndo_change_mtu = tile_net_change_mtu,
+	.ndo_tx_timeout = tile_net_tx_timeout,
+	.ndo_set_mac_address = tile_net_set_mac_address,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* Device setup callback passed to alloc_netdev_mqs().
+ *
+ * ether_setup() fills in the generic Ethernet fields of "dev"
+ * (including IFF_BROADCAST and IFF_MULTICAST); the driver-specific
+ * ops table, watchdog timeout, MTU and feature flags are set on top.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->mtu = 1500;
+
+	/* Lockless tx, checksum offload, scatter-gather and TSO. */
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM |
+			 NETIF_F_SG | NETIF_F_TSO;
+}
+
+/* Create and register the net_device for the link called "name".
+ *
+ * "mac" is the six-byte MAC address reported for the link.  Links
+ * whose name starts with "loop" are skipped.  Failures are logged and
+ * otherwise ignored; nothing is returned to the caller.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+	int mac_bits = 0;
+	int idx;
+	int err;
+
+	/* HACK: Ignore "loop" links. */
+	if (!strncmp(name, "loop", 4))
+		return;
+
+	/* "name" is normally a template that register_netdev() expands,
+	 * but here it is used as the literal device name.
+	 */
+	dev = alloc_netdev_mqs(sizeof(*priv), name, tile_net_setup,
+			       NR_CPUS, 1);
+	if (dev == NULL) {
+		pr_err("alloc_netdev_mqs(%s) failed\n", name);
+		return;
+	}
+
+	/* Start with clean private state; -1 marks "no channel". */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->echannel = -1;
+	priv->loopify_channel = -1;
+	priv->channel = -1;
+
+	/* The MAC address must be in place before the device is opened.
+	 * An all-zero MAC most likely means we are on the simulator, so
+	 * fall back to a random address in that case.
+	 */
+	for (idx = 0; idx < 6; idx++)
+		mac_bits |= mac[idx];
+
+	if (mac_bits == 0) {
+		random_ether_addr(dev->dev_addr);
+	} else {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	}
+
+	/* Hand the device over to the networking core. */
+	err = register_netdev(dev);
+	if (err == 0)
+		return;
+
+	netdev_err(dev, "register_netdev failed %d\n", err);
+	free_netdev(dev);
+}
+
+/* Initialize this cpu's tile_net_info; run on each cpu at module load.
+ * Records the cpu id, marks the cpu as having no iqueue yet, and
+ * prepares (but does not start) the egress timer.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct hrtimer *timer = &info->egress_timer;
+
+	info->has_iqueue = false;
+	info->my_cpu = smp_processor_id();
+
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer->function = tile_net_handle_egress_timer;
+}
+
+/* Module entry point.
+ *
+ * Sets up the channel mutex and the per-cpu state, creates one
+ * net_device per link reported by gxio_mpipe_link_enumerate_mac(),
+ * and finally settles the set of network cpus.  Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char link_name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t link_mac[6];
+	int idx = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Run the per-cpu setup everywhere and wait for it to finish. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Walk the link list until enumeration reports an error. */
+	while (gxio_mpipe_link_enumerate_mac(idx, link_name, link_mac) >= 0) {
+		tile_net_dev_init(link_name, link_mac);
+		idx++;
+	}
+
+	/* Fall back to all online cpus if network_cpus_init() says no. */
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-06-07 20:44                                                       ` Chris Metcalf
@ 2012-06-07 20:47                                                         ` Chris Metcalf
  2012-06-07 20:50                                                         ` Ben Hutchings
  1 sibling, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-06-07 20:47 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, bhutchings, arnd, linux-kernel, netdev

On 6/7/2012 4:44 PM, Chris Metcalf wrote:
> On 6/7/2012 4:39 PM, David Miller wrote:
>> From: Chris Metcalf <cmetcalf@tilera.com>
>> Date: Fri, 6 Apr 2012 16:42:03 -0400
>>
>>> Date: Fri, 6 Apr 2012 16:42:03 -0400
>> You did not commit this file on April 6th.
>>
>> Please don't use the date emitted by the GIT tools, just
>> let the email use the natural correct date which is the
>> one at the time you send the email out.
>>
>> Otherwise your patch gets misordered as automated tools like
>> patchwork think this file should go all the way at the back
>> of the patch queue because of its old date relative to
>> other pending patches.
> Yes, when I use "git rebase" to merge changes into the earlier patch, this
> is the behavior I see.  I don't know if there's some way to tell git to
> take the date on the later change instead when I "squash" them.  Or if,
> perhaps, there is some other workflow I should be using.  It does seem like
> the git history should reflect the latest time.
>
> The issue of the date on the email is separate.  I tend to use "git
> format-patch" to start with, munge up the headers to jam in some
> "In-Reply-To" and "References" lines, manually update the "Date:", then
> feed it to "sendmail -t".  Perhaps there's a different workflow I should be
> using there, too.  (I tried deleting the "Date", but the one time I tried
> that I ended up with some surprisingly bogus date in the email that hit
> LKML, so I've been avoiding that approach.)
>
> I'll resend the patch without a Date: line and see how it ends up this time.

Well, I see where the sendmail "Date:" weirdness was coming from; for some
reason "git format-patch" was emitting a first line like this: "From
4d76049b3a48f1b32aed1eeb17b4d3a2cb1b1ff6 Mon Sep 17 00:00:00 2001", and
sendmail was helpfully pulling the "Date:" line from there.  Deleting that
line as well does the right thing, as I see from the third version of this
patch on LKML.  Why git is doing this is a good question.

Sorry for the spam, but hopefully that will avoid the issue in the future.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-06-07 20:44                                                       ` Chris Metcalf
  2012-06-07 20:47                                                         ` Chris Metcalf
@ 2012-06-07 20:50                                                         ` Ben Hutchings
  1 sibling, 0 replies; 61+ messages in thread
From: Ben Hutchings @ 2012-06-07 20:50 UTC (permalink / raw)
  To: Chris Metcalf; +Cc: David Miller, eric.dumazet, arnd, linux-kernel, netdev

On Thu, 2012-06-07 at 16:44 -0400, Chris Metcalf wrote:
> On 6/7/2012 4:39 PM, David Miller wrote:
> > From: Chris Metcalf <cmetcalf@tilera.com>
> > Date: Fri, 6 Apr 2012 16:42:03 -0400
> >
> >> Date: Fri, 6 Apr 2012 16:42:03 -0400
> > You did not commit this file on April 6th.
> >
> > Please don't use the date emitted by the GIT tools, just
> > let the email use the natural correct date which is the
> > one at the time you send the email out.
> >
> > Otherwise your patch gets misordered as automated tools like
> > patchwork think this file should go all the way at the back
> > of the patch queue because of its old date relative to
> > other pending patches.
> 
> Yes, when I use "git rebase" to merge changes into the earlier patch, this
> is the behavior I see.  I don't know if there's some way to tell git to
> take the date on the later change instead when I "squash" them.
[...]

git commit --amend --date="$(date)"

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-04-06 20:42                                                   ` Chris Metcalf
  2012-06-07 20:39                                                     ` David Miller
@ 2012-06-07 20:52                                                     ` Joe Perches
  1 sibling, 0 replies; 61+ messages in thread
From: Joe Perches @ 2012-06-07 20:52 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: David Miller, eric.dumazet, bhutchings, arnd, linux-kernel, netdev

On Fri, 2012-04-06 at 16:42 -0400, Chris Metcalf wrote:
> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
> ---
> This version makes the driver multi-queued and support non-zero
> tx_queue_len.  I also made a couple of magic numbers into #defines.
> I skimmed the tg3.c driver, but didn't see any other obvious
> changes that would be appropriate.
> 

Hi Chris.

Can you please use git send-email instead of
generating odd header dates like April 6, 2012
and Sept 16, 2001?

git send-email will use your system date.



^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-06-07 20:45                                                   ` Chris Metcalf
@ 2012-06-12  0:03                                                     ` David Miller
  2012-06-12 13:14                                                       ` Chris Metcalf
  0 siblings, 1 reply; 61+ messages in thread
From: David Miller @ 2012-06-12  0:03 UTC (permalink / raw)
  To: cmetcalf; +Cc: eric.dumazet, bhutchings, arnd, linux-kernel, netdev

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 7 Jun 2012 16:45:02 -0400

> This change adds support for the tilegx network driver based on the
> GXIO IORPC support in the tilegx software stack, using the on-chip
> mPIPE packet processing engine.
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

Applied.

Although:

> +/* The transmit wake timer for a given cpu and echannel. */
> +struct tile_net_tx_wake {
> +	struct hrtimer timer;
> +	struct net_device *dev;
> +};
> +	
   ^^^^^^^

I had to remove that trailing whitespace when I applied this.

^ permalink raw reply	[flat|nested] 61+ messages in thread

* Re: [PATCH v10] tilegx network driver: initial support
  2012-06-12  0:03                                                     ` David Miller
@ 2012-06-12 13:14                                                       ` Chris Metcalf
  0 siblings, 0 replies; 61+ messages in thread
From: Chris Metcalf @ 2012-06-12 13:14 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, bhutchings, arnd, linux-kernel, netdev

On 6/11/2012 8:03 PM, David Miller wrote:
> From: Chris Metcalf <cmetcalf@tilera.com>
> Date: Thu, 7 Jun 2012 16:45:02 -0400
>
>> This change adds support for the tilegx network driver based on the
>> GXIO IORPC support in the tilegx software stack, using the on-chip
>> mPIPE packet processing engine.
>>
>> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
> Applied.

Glad to hear it.  Thanks for bearing with the multiple revisions.  I (we)
appreciate all your feedback, and that of Ben, Eric, Arnd, and other folks
who contributed their time.

The driver does depend on the tilegx iorpc framework code that is currently
only in linux-next, but it should all come together properly for 3.6.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com




^ permalink raw reply	[flat|nested] 61+ messages in thread

end of thread, other threads:[~2012-06-12 13:14 UTC | newest]

Thread overview: 61+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-04 20:39 [PATCH 0/6] arch/tile: provide tilegx networking support Chris Metcalf
2012-04-04 20:39 ` [PATCH 1/6] arch/tile: introduce GXIO IORPC framework for tilegx Chris Metcalf
2012-04-04 20:58 ` [PATCH 4/6] arch/tile: common DMA code for the GXIO IORPC subsystem Chris Metcalf
2012-04-06 17:41 ` [PATCH 2/6] arch/tile: fix set_pte() to properly handle kernel MMIO mappings Chris Metcalf
2012-04-06 17:52 ` [PATCH 3/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
2012-04-09 13:24   ` Arnd Bergmann
2012-04-09 20:53     ` Chris Metcalf
2012-04-06 20:38 ` [PATCH 5/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
2012-04-09 13:34   ` Arnd Bergmann
2012-04-09 21:04     ` Chris Metcalf
2012-04-06 20:42 ` [PATCH 6/6] tilegx network driver: initial support Chris Metcalf
2012-04-09 13:49   ` Arnd Bergmann
2012-04-09 21:30     ` Chris Metcalf
2012-04-10 10:42       ` Arnd Bergmann
2012-04-12 23:23         ` Chris Metcalf
2012-04-13 10:34           ` Arnd Bergmann
2012-04-28 22:07             ` Chris Metcalf
2012-04-04 20:39               ` [PATCH v2 0/6] arch/tile: networking support for tilegx Chris Metcalf
2012-04-04 20:39                 ` [PATCH v2 1/6] arch/tile: introduce GXIO IORPC framework " Chris Metcalf
2012-04-04 20:58                 ` [PATCH v2 3/6] arch/tile: common DMA code for the GXIO IORPC subsystem Chris Metcalf
2012-04-06 17:52                 ` [PATCH v2 2/6] arch/tile: support MMIO-based readb/writeb etc Chris Metcalf
2012-04-06 20:38                 ` [PATCH v2 4/6] arch/tile: provide kernel support for the tilegx mPIPE shim Chris Metcalf
2012-04-06 20:42                 ` [PATCH v2 6/6] tilegx network driver: initial support Chris Metcalf
2012-04-30 14:35                   ` Arnd Bergmann
2001-09-17  4:00                     ` [PATCH v3] " Chris Metcalf
2012-05-03  5:41                       ` David Miller
2012-05-03 15:45                         ` Chris Metcalf
2012-05-03 17:07                           ` David Miller
2012-05-03 17:25                             ` Chris Metcalf
2012-05-03 16:41                         ` [PATCH v4] " Chris Metcalf
2012-05-04  6:42                           ` David Miller
2012-05-09 10:42                             ` [PATCH v5] " Chris Metcalf
2012-05-11 13:54                               ` Ben Hutchings
2012-05-20  4:42                                 ` [PATCH v6] " Chris Metcalf
2012-05-20 20:55                                   ` David Miller
2012-05-23 20:42                                     ` [PATCH v7] " Chris Metcalf
2012-05-24  4:31                                       ` David Miller
2012-05-25 14:42                                         ` [PATCH v8] " Chris Metcalf
2012-06-04 20:12                                           ` [PATCH v9] " Chris Metcalf
2012-06-06 16:41                                             ` David Miller
2012-06-06 17:31                                             ` Eric Dumazet
2012-06-06 17:40                                             ` Eric Dumazet
2012-06-06 18:36                                               ` Chris Metcalf
2012-06-06 18:54                                                 ` David Miller
2001-09-17  4:00                                                   ` [PATCH v10] " Chris Metcalf
2012-04-06 20:42                                                   ` Chris Metcalf
2012-06-07 20:39                                                     ` David Miller
2012-06-07 20:44                                                       ` Chris Metcalf
2012-06-07 20:47                                                         ` Chris Metcalf
2012-06-07 20:50                                                         ` Ben Hutchings
2012-06-07 20:52                                                     ` Joe Perches
2012-06-07 20:45                                                   ` Chris Metcalf
2012-06-12  0:03                                                     ` David Miller
2012-06-12 13:14                                                       ` Chris Metcalf
2012-06-06 18:10                                             ` [PATCH v9] " Eric Dumazet
2012-06-06 18:17                                               ` David Miller
2012-06-06 18:19                                               ` Ben Hutchings
2012-05-20 16:35                                 ` [PATCH v5] " Chris Metcalf
2012-04-28 19:41                 ` [PATCH v2 5/6] arch/tile: break out the "csum a long" function to <asm/checksum.h> Chris Metcalf
2012-04-29 11:15               ` [PATCH 6/6] tilegx network driver: initial support Arnd Bergmann
2012-04-15 23:06         ` Chris Metcalf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.