linux-kernel.vger.kernel.org archive mirror
* [PATCH][v3][6/21] Add InfiniBand SA (Subnet Administration) query support
       [not found] <20041213109.3tK6alRLJABxH4bu@topspin.com>
@ 2004-12-13 18:09 ` Roland Dreier
  2004-12-13 18:09   ` [PATCH][v3][7/21] Add Mellanox HCA low-level driver Roland Dreier
  2004-12-14 17:10 ` [openib-general] [PATCH][v3][5/21] Add InfiniBand MAD (management datagram) support Tom Duffy
  1 sibling, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add support for sending queries to the SA (Subnet Administration).  In
particular, the PathRecord and MCMember (multicast group member) record
queries used by the IP-over-InfiniBand driver are implemented.
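
To illustrate how a consumer would use this interface, here is a rough
sketch (not part of the patch) of a path record lookup; device,
port_num, local_gid, remote_gid and my_context are placeholders and
error handling is kept minimal:

	static void my_path_callback(int status, struct ib_sa_path_rec *resp,
				     void *context)
	{
		if (status)
			printk(KERN_WARNING "path query failed: %d\n", status);
		else
			/* resp is only valid for the duration of the callback */
			printk(KERN_INFO "path found, dlid 0x%x\n", resp->dlid);
	}

	/* ... in code that has a struct ib_device *device and a port number: */
	struct ib_sa_path_rec rec = {
		.sgid      = local_gid,
		.dgid      = remote_gid,
		.numb_path = 1,
	};
	struct ib_sa_query *query;
	int id;

	id = ib_sa_path_rec_get(device, port_num, &rec,
				IB_SA_PATH_REC_SGID |
				IB_SA_PATH_REC_DGID |
				IB_SA_PATH_REC_NUMB_PATH,
				1000 /* ms */, GFP_KERNEL,
				my_path_callback, my_context, &query);
	if (id < 0)
		return id;
	/* ib_sa_cancel_query(id, query) can cancel the request later */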

Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/drivers/infiniband/core/Makefile	2004-12-13 09:44:42.966395657 -0800
+++ linux-bk/drivers/infiniband/core/Makefile	2004-12-13 09:44:43.579305364 -0800
@@ -2,7 +2,8 @@
 
 obj-$(CONFIG_INFINIBAND) += \
     ib_core.o \
-    ib_mad.o
+    ib_mad.o \
+    ib_sa.o
 
 ib_core-objs := \
     packer.o \
@@ -17,3 +18,5 @@
     mad.o \
     smi.o \
     agent.o
+
+ib_sa-objs := sa_query.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/core/sa_query.c	2004-12-13 09:44:43.603301828 -0800
@@ -0,0 +1,855 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+
+#include <ib_pack.h>
+#include <ib_sa.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * These two structures must be packed because they have 64-bit fields
+ * that are only 32-bit aligned.  64-bit architectures will lay them
+ * out wrong otherwise.  (And unfortunately they are sent on the wire
+ * so we can't change the layout)
+ */
+struct ib_sa_hdr {
+	u64			sm_key;
+	u16			attr_offset;
+	u16			reserved;
+	ib_sa_comp_mask		comp_mask;
+} __attribute__ ((packed));
+
+struct ib_sa_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	struct ib_sa_hdr	sa_hdr;
+	u8			data[200];
+} __attribute__ ((packed));
+
+struct ib_sa_sm_ah {
+	struct ib_ah        *ah;
+	struct kref          ref;
+};
+
+struct ib_sa_port {
+	struct ib_mad_agent *agent;
+	struct ib_mr        *mr;
+	struct ib_sa_sm_ah  *sm_ah;
+	struct work_struct   update_task;
+	spinlock_t           ah_lock;
+	u8                   port_num;
+};
+
+struct ib_sa_device {
+	int                     start_port, end_port;
+	struct ib_event_handler event_handler;
+	struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	void (*release)(struct ib_sa_query *);
+	struct ib_sa_port  *port;
+	struct ib_sa_mad   *mad;
+	struct ib_sa_sm_ah *sm_ah;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+	int                 id;
+};
+
+struct ib_sa_path_query {
+	void (*callback)(int, struct ib_sa_path_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+	.name   = "sa",
+	.add    = ib_sa_add_one,
+	.remove = ib_sa_remove_one
+};
+
+static spinlock_t idr_lock;
+static DEFINE_IDR(query_idr);
+
+static spinlock_t tid_lock;
+static u32 tid;
+
+enum {
+	IB_SA_ATTR_CLASS_PORTINFO    = 0x01,
+	IB_SA_ATTR_NOTICE	     = 0x02,
+	IB_SA_ATTR_INFORM_INFO	     = 0x03,
+	IB_SA_ATTR_NODE_REC	     = 0x11,
+	IB_SA_ATTR_PORT_INFO_REC     = 0x12,
+	IB_SA_ATTR_SL2VL_REC	     = 0x13,
+	IB_SA_ATTR_SWITCH_REC	     = 0x14,
+	IB_SA_ATTR_LINEAR_FDB_REC    = 0x15,
+	IB_SA_ATTR_RANDOM_FDB_REC    = 0x16,
+	IB_SA_ATTR_MCAST_FDB_REC     = 0x17,
+	IB_SA_ATTR_SM_INFO_REC	     = 0x18,
+	IB_SA_ATTR_LINK_REC	     = 0x20,
+	IB_SA_ATTR_GUID_INFO_REC     = 0x30,
+	IB_SA_ATTR_SERVICE_REC	     = 0x31,
+	IB_SA_ATTR_PARTITION_REC     = 0x33,
+	IB_SA_ATTR_RANGE_REC	     = 0x34,
+	IB_SA_ATTR_PATH_REC	     = 0x35,
+	IB_SA_ATTR_VL_ARB_REC	     = 0x36,
+	IB_SA_ATTR_MC_GROUP_REC	     = 0x37,
+	IB_SA_ATTR_MC_MEMBER_REC     = 0x38,
+	IB_SA_ATTR_TRACE_REC	     = 0x39,
+	IB_SA_ATTR_MULTI_PATH_REC    = 0x3a,
+	IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b
+};
+
+#define PATH_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_path_rec, field),		\
+	.struct_size_bytes   = sizeof ((struct ib_sa_path_rec *) 0)->field,	\
+	.field_name          = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+	{ RESERVED,
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ PATH_REC_FIELD(dgid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(sgid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(dlid),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(slid),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(raw_traffic),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 11,
+	  .offset_bits  = 1,
+	  .size_bits    = 3 },
+	{ PATH_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ PATH_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(traffic_class),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(reversible),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ PATH_REC_FIELD(numb_path),
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 7 },
+	{ PATH_REC_FIELD(pkey),
+	  .offset_words = 12,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 13,
+	  .offset_bits  = 0,
+	  .size_bits    = 12 },
+	{ PATH_REC_FIELD(sl),
+	  .offset_words = 13,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ PATH_REC_FIELD(mtu_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(mtu),
+	  .offset_words = 13,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(rate_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(rate),
+	  .offset_words = 13,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 14,
+	  .offset_bits  = 0,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(packet_life_time),
+	  .offset_words = 14,
+	  .offset_bits  = 2,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(preference),
+	  .offset_words = 14,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ RESERVED,
+	  .offset_words = 14,
+	  .offset_bits  = 16,
+	  .size_bits    = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_mcmember_rec *) 0)->field,	\
+	.field_name          = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+	{ MCMEMBER_REC_FIELD(mgid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(port_gid),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(qkey),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ MCMEMBER_REC_FIELD(mlid),
+	  .offset_words = 9,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(mtu_selector),
+	  .offset_words = 9,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(mtu),
+	  .offset_words = 9,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(traffic_class),
+	  .offset_words = 9,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(pkey),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(rate_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(rate),
+	  .offset_words = 10,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(packet_life_time),
+	  .offset_words = 10,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(sl),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ MCMEMBER_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(scope),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(join_state),
+	  .offset_words = 12,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(proxy_join),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 23 },
+};
+
+static void free_sm_ah(struct kref *kref)
+{
+	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+	ib_destroy_ah(sm_ah->ah);
+	kfree(sm_ah);
+}
+
+static void update_sm_ah(void *port_ptr)
+{
+	struct ib_sa_port *port = port_ptr;
+	struct ib_sa_sm_ah *new_ah, *old_ah;
+	struct ib_port_attr port_attr;
+	struct ib_ah_attr   ah_attr;
+
+	if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+		printk(KERN_WARNING "Couldn't query port\n");
+		return;
+	}
+
+	new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+	if (!new_ah) {
+		printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+		return;
+	}
+
+	kref_init(&new_ah->ref);
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = port_attr.sm_lid;
+	ah_attr.sl       = port_attr.sm_sl;
+	ah_attr.port_num = port->port_num;
+
+	new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+	if (IS_ERR(new_ah->ah)) {
+		printk(KERN_WARNING "Couldn't create new SM AH\n");
+		kfree(new_ah);
+		return;
+	}
+
+	spin_lock_irq(&port->ah_lock);
+	old_ah = port->sm_ah;
+	port->sm_ah = new_ah;
+	spin_unlock_irq(&port->ah_lock);
+
+	if (old_ah)
+		kref_put(&old_ah->ref, free_sm_ah);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE) {
+		struct ib_sa_device *sa_dev =
+			ib_get_client_data(event->device, &sa_client);
+
+		schedule_work(&sa_dev->port[event->element.port_num -
+					    sa_dev->start_port].update_task);
+	}
+}
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query.  If the id and query don't match up or
+ * the query has already completed, nothing is done.  Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_mad_agent *agent;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	if (idr_find(&query_idr, id) != query) {
+		spin_unlock_irqrestore(&idr_lock, flags);
+		return;
+	}
+	agent = query->port->agent;
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	ib_cancel_mad(agent, id);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+	unsigned long flags;
+
+	memset(mad, 0, sizeof *mad);
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+	spin_lock_irqsave(&tid_lock, flags);
+	mad->mad_hdr.tid           =
+		cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+	spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms)
+{
+	struct ib_sa_port *port = query->port;
+	unsigned long flags;
+	int ret;
+	struct ib_sge      gather_list;
+	struct ib_send_wr *bad_wr, wr = {
+		.opcode      = IB_WR_SEND,
+		.sg_list     = &gather_list,
+		.num_sge     = 1,
+		.send_flags  = IB_SEND_SIGNALED,
+		.wr	     = {
+			 .ud = {
+				 .mad_hdr     = &query->mad->mad_hdr,
+				 .remote_qpn  = 1,
+				 .remote_qkey = IB_QP1_QKEY,
+				 .timeout_ms  = timeout_ms
+			 }
+		 }
+	};
+
+retry:
+	if (!idr_pre_get(&query_idr, GFP_ATOMIC))
+		return -ENOMEM;
+	spin_lock_irqsave(&idr_lock, flags);
+	ret = idr_get_new(&query_idr, query, &query->id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		return ret;
+
+	wr.wr_id = query->id;
+
+	spin_lock_irqsave(&port->ah_lock, flags);
+	kref_get(&port->sm_ah->ref);
+	query->sm_ah = port->sm_ah;
+	wr.wr.ud.ah  = port->sm_ah->ah;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	gather_list.addr   = dma_map_single(port->agent->device->dma_device,
+					    query->mad,
+					    sizeof (struct ib_sa_mad),
+					    DMA_TO_DEVICE);
+	gather_list.length = sizeof (struct ib_sa_mad);
+	gather_list.lkey   = port->mr->lkey;
+	pci_unmap_addr_set(query, mapping, gather_list.addr);
+
+	ret = ib_post_send_mad(port->agent, &wr, &bad_wr);
+	if (ret) {
+		dma_unmap_single(port->agent->device->dma_device,
+				 pci_unmap_addr(query, mapping),
+				 sizeof (struct ib_sa_mad),
+				 DMA_TO_DEVICE);
+		kref_put(&query->sm_ah->ref, free_sm_ah);
+		spin_lock_irqsave(&idr_lock, flags);
+		idr_remove(&query_idr, query->id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+	}
+
+	return ret;
+}
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_path_query *query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_path_rec rec;
+
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(sa_query->mad);
+	kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path.  The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, int gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port   = &sa_dev->port[port_num - sa_dev->start_port];
+	struct ib_mad_agent *agent  = port->agent;
+	int ret;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+	query->sa_query.mad = kmalloc(sizeof *query->sa_query.mad, gfp_mask);
+	if (!query->sa_query.mad) {
+		kfree(query);
+		return -ENOMEM;
+	}
+
+	query->callback = callback;
+	query->context  = context;
+
+	init_mad(query->sa_query.mad, agent);
+
+	query->sa_query.callback              = ib_sa_path_rec_callback;
+	query->sa_query.release               = ib_sa_path_rec_release;
+	query->sa_query.port                  = port;
+	query->sa_query.mad->mad_hdr.method   = IB_MGMT_METHOD_GET;
+	query->sa_query.mad->mad_hdr.attr_id  = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	query->sa_query.mad->sa_hdr.comp_mask = comp_mask;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+		rec, query->sa_query.mad->data);
+
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms);
+	if (ret) {
+		*sa_query = NULL;
+		kfree(query->sa_query.mad);
+		kfree(query);
+	}
+
+	return ret ? ret : query->sa_query.id;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_mcmember_query *query =
+		container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_mcmember_rec rec;
+
+		ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(sa_query->mad);
+	kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec, 
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, int gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query)
+{
+	struct ib_sa_mcmember_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port   = &sa_dev->port[port_num - sa_dev->start_port];
+	struct ib_mad_agent *agent  = port->agent;
+	int ret;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+	query->sa_query.mad = kmalloc(sizeof *query->sa_query.mad, gfp_mask);
+	if (!query->sa_query.mad) {
+		kfree(query);
+		return -ENOMEM;
+	}
+
+	query->callback = callback;
+	query->context  = context;
+
+	init_mad(query->sa_query.mad, agent);
+
+	query->sa_query.callback              = ib_sa_mcmember_rec_callback;
+	query->sa_query.release               = ib_sa_mcmember_rec_release;
+	query->sa_query.port                  = port;
+	query->sa_query.mad->mad_hdr.method   = method;
+	query->sa_query.mad->mad_hdr.attr_id  = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	query->sa_query.mad->sa_hdr.comp_mask = comp_mask;
+
+	ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		rec, query->sa_query.mad->data);
+
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms);
+	if (ret) {
+		*sa_query = NULL;
+		kfree(query->sa_query.mad);
+		kfree(query);
+	}
+
+	return ret ? ret : query->sa_query.id;
+}
+EXPORT_SYMBOL(ib_sa_mcmember_rec_query);
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_sa_query *query;
+	unsigned long flags;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	query = idr_find(&query_idr, mad_send_wc->wr_id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	if (!query)
+		return;
+
+	switch (mad_send_wc->status) {
+	case IB_WC_SUCCESS:
+		/* No callback -- already got recv */
+		break;
+	case IB_WC_RESP_TIMEOUT_ERR:
+		query->callback(query, -ETIMEDOUT, NULL);
+		break;
+	case IB_WC_WR_FLUSH_ERR:
+		query->callback(query, -EINTR, NULL);
+		break;
+	default:
+		query->callback(query, -EIO, NULL);
+		break;
+	}
+
+	dma_unmap_single(agent->device->dma_device,
+			 pci_unmap_addr(query, mapping),
+			 sizeof (struct ib_sa_mad),
+			 DMA_TO_DEVICE);
+	kref_put(&query->sm_ah->ref, free_sm_ah);
+
+	query->release(query);
+
+	spin_lock_irqsave(&idr_lock, flags);
+	idr_remove(&query_idr, mad_send_wc->wr_id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_query *query;
+	unsigned long flags;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	query = idr_find(&query_idr, mad_recv_wc->wc->wr_id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	if (query) {
+		if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+			query->callback(query,
+					mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+					-EINVAL : 0,
+					(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+		else
+			query->callback(query, -EIO, NULL);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev;
+	int s, e, i;
+
+	if (device->node_type == IB_NODE_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	sa_dev = kmalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof (struct ib_sa_port),
+			 GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i) {
+		sa_dev->port[i].mr       = NULL;
+		sa_dev->port[i].sm_ah    = NULL;
+		sa_dev->port[i].port_num = i + s;
+		spin_lock_init(&sa_dev->port[i].ah_lock);
+
+		sa_dev->port[i].agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      NULL, 0, send_handler,
+					      recv_handler, sa_dev);
+		if (IS_ERR(sa_dev->port[i].agent))
+			goto err;
+
+		sa_dev->port[i].mr = ib_get_dma_mr(sa_dev->port[i].agent->qp->pd,
+						   IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(sa_dev->port[i].mr)) {
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+			goto err;
+		}
+
+		INIT_WORK(&sa_dev->port[i].update_task,
+			  update_sm_ah, &sa_dev->port[i]);
+	}
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler))
+		goto err;
+
+	for (i = 0; i <= e - s; ++i)
+		update_sm_ah(&sa_dev->port[i]);
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	return;
+
+err:
+	while (--i >= 0) {
+		ib_dereg_mr(sa_dev->port[i].mr);
+		ib_unregister_mad_agent(sa_dev->port[i].agent);
+	}
+
+	kfree(sa_dev);
+
+	return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	int i;
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+
+	for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+		ib_unregister_mad_agent(sa_dev->port[i].agent);
+		kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+	}
+
+	kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+	int ret;
+
+	spin_lock_init(&idr_lock);
+	spin_lock_init(&tid_lock);
+
+	get_random_bytes(&tid, sizeof tid);
+
+	ret = ib_register_client(&sa_client);
+	if (ret)
+		printk(KERN_ERR "Couldn't register ib_sa client\n");
+		
+	return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+	ib_unregister_client(&sa_client);
+}
+
+module_init(ib_sa_init);
+module_exit(ib_sa_cleanup);
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/include/ib_sa.h	2004-12-13 09:44:43.630297851 -0800
@@ -0,0 +1,269 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/compiler.h>
+
+#include <ib_verbs.h>
+#include <ib_mad.h>
+
+enum {
+	IB_SA_CLASS_VERSION	= 2,	/* IB spec version 1.1/1.2 */
+
+	IB_SA_METHOD_DELETE	= 0x15
+};
+
+enum ib_sa_selector {
+	IB_SA_GTE  = 0,
+	IB_SA_LTE  = 1,
+	IB_SA_EQ   = 2,
+	/*
+	 * The meaning of "best" depends on the attribute: for
+	 * example, for MTU best will return the largest available
+	 * MTU, while for packet life time, best will return the
+	 * smallest available life time.
+	 */
+	IB_SA_BEST = 3
+};
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n)	((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec."  No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+/* reserved:								 0 */
+/* reserved:								 1 */
+#define IB_SA_PATH_REC_DGID				IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID				IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID				IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID				IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC			IB_SA_COMP_MASK( 6)
+/* reserved:								 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL       		IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT			IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS			IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE			IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH			IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY				IB_SA_COMP_MASK(13)
+/* reserved:								14 */
+#define IB_SA_PATH_REC_SL				IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR			IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU				IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR			IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE				IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME			IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE			IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+	/* reserved */
+	/* reserved */
+	union ib_gid dgid;
+	union ib_gid sgid;
+	u16          dlid;
+	u16          slid;
+	int          raw_traffic;
+	/* reserved */
+	u32          flow_label;
+	u8           hop_limit;
+	u8           traffic_class;
+	int          reversible;
+	u8           numb_path;
+	u16          pkey;
+	/* reserved */
+	u8           sl;
+	u8           mtu_selector;
+	enum ib_mtu  mtu;
+	u8           rate_selector;
+	u8           rate;
+	u8           packet_life_time_selector;
+	u8           packet_life_time;
+	u8           preference;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID			IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY				IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID				IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR			IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU				IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS		IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY				IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR		IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE				IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME		IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL				IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL			IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT			IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE			IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE			IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN			IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+	union ib_gid mgid;
+	union ib_gid port_gid;
+	u32          qkey;
+	u16          mlid;
+	u8           mtu_selector;
+	enum         ib_mtu mtu;
+	u8           traffic_class;
+	u16          pkey;
+	u8 	     rate_selector;
+	u8 	     rate;
+	u8 	     packet_life_time_selector;
+	u8 	     packet_life_time;
+	u8           sl;
+	u32          flow_label;
+	u8           hop_limit;
+	u8           scope;
+	u8           join_state;
+	int          proxy_join;
+};
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec, 
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, int gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **query);
+
+int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, int gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **query);
+
+/**
+ * ib_sa_mcmember_rec_set - Start an MCMember set query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:MCMember Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send an MCMember Set query to the SA (eg to join a multicast
+ * group).  The callback function will be called when the query
+ * completes (or fails); status is 0 for a successful response, -EINTR
+ * if the query is canceled, -ETIMEDOUT if the query timed out, or
+ * -EIO if an error occurred sending the query.  The resp parameter of
+ * the callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_mcmember_rec_set() is negative, it is
+ * an error code.  Otherwise it is a query ID that can be used to
+ * cancel the query.
+ */
+static inline int
+ib_sa_mcmember_rec_set(struct ib_device *device, u8 port_num,
+		       struct ib_sa_mcmember_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, int gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_mcmember_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **query)
+{
+	return ib_sa_mcmember_rec_query(device, port_num,
+					IB_MGMT_METHOD_SET,
+					rec, comp_mask,
+					timeout_ms, gfp_mask, callback,
+					context, query);
+}
+
+/**
+ * ib_sa_mcmember_rec_delete - Start an MCMember delete query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:MCMember Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send an MCMember Delete query to the SA (eg to leave a multicast
+ * group).  The callback function will be called when the query
+ * completes (or fails); status is 0 for a successful response, -EINTR
+ * if the query is canceled, -ETIMEDOUT if the query timed out, or
+ * -EIO if an error occurred sending the query.  The resp parameter of
+ * the callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_mcmember_rec_delete() is negative, it
+ * is an error code.  Otherwise it is a query ID that can be used to
+ * cancel the query.
+ */
+static inline int
+ib_sa_mcmember_rec_delete(struct ib_device *device, u8 port_num,
+			  struct ib_sa_mcmember_rec *rec,
+			  ib_sa_comp_mask comp_mask,
+			  int timeout_ms, int gfp_mask,
+			  void (*callback)(int status,
+					   struct ib_sa_mcmember_rec *resp,
+					   void *context),
+			  void *context,
+			  struct ib_sa_query **query)
+{
+	return ib_sa_mcmember_rec_query(device, port_num,
+					IB_SA_METHOD_DELETE,
+					rec, comp_mask,
+					timeout_ms, gfp_mask, callback,
+					context, query);
+}
+
+
+#endif /* IB_SA_H */



* [PATCH][v3][7/21] Add Mellanox HCA low-level driver
  2004-12-13 18:09 ` [PATCH][v3][6/21] Add InfiniBand SA (Subnet Administration) query support Roland Dreier
@ 2004-12-13 18:09   ` Roland Dreier
  2004-12-13 18:09     ` [PATCH][v3][8/21] Add Mellanox HCA low-level driver (midlayer interface) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add a low-level driver for Mellanox MT23108 and MT25208 HCAs.  The
MT25208 is only fully supported when in MT23108 compatibility mode;
only the very beginnings of support for native MT25208 mode (required
for HCAs without local memory) are present.

(As a side note, I believe this driver would be the first in-tree
consumer of the PCI MSI/MSI-X API)
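
For reference, enabling MSI-X boils down to something like the
following sketch (pdev is the device's struct pci_dev; use_vectors()
and fall_back_to_intx() are placeholder helpers, not code from this
driver):

	struct msix_entry entries[3];
	int i, err;

	for (i = 0; i < 3; ++i)
		entries[i].entry = i;	/* MSI-X table slots we want */

	err = pci_enable_msix(pdev, entries, 3);
	if (!err) {
		/* entries[i].vector now holds the IRQ to pass to request_irq() */
		use_vectors(entries);
	} else {
		/* err > 0 means only that many vectors are available */
		fall_back_to_intx(pdev);
	}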

Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/drivers/infiniband/Kconfig	2004-12-13 09:44:40.230798660 -0800
+++ linux-bk/drivers/infiniband/Kconfig	2004-12-13 09:44:43.936252779 -0800
@@ -8,4 +8,6 @@
 	  any protocols you wish to use as well as drivers for your
 	  InfiniBand hardware.
 
+source "drivers/infiniband/hw/mthca/Kconfig"
+
 endmenu
--- linux-bk.orig/drivers/infiniband/Makefile	2004-12-13 09:44:40.278791590 -0800
+++ linux-bk/drivers/infiniband/Makefile	2004-12-13 09:44:43.909256756 -0800
@@ -1 +1,2 @@
 obj-$(CONFIG_INFINIBAND)		+= core/
+obj-$(CONFIG_INFINIBAND_MTHCA)		+= hw/mthca/
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/Kconfig	2004-12-13 09:44:43.962248949 -0800
@@ -0,0 +1,26 @@
+config INFINIBAND_MTHCA
+	tristate "Mellanox HCA support"
+	depends on PCI && INFINIBAND
+	---help---
+	  This is a low-level driver for Mellanox InfiniHost host
+	  channel adapters (HCAs), including the MT23108 PCI-X HCA
+	  ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_MTHCA
+	default n
+	---help---
+	  This option causes the mthca driver to produce a bunch of debug
+	  messages.  Select this if you are developing the driver or
+	  trying to diagnose a problem.
+
+config INFINIBAND_MTHCA_SSE_DOORBELL
+	bool "SSE doorbell code"
+	depends on INFINIBAND_MTHCA && X86 && !X86_64
+	default n
+	---help---
+	  This option will have the mthca driver use SSE instructions
+	  to ring hardware doorbell registers.  This may improve
+	  performance for some workloads, but the driver will not run
+	  on processors without SSE instructions.
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/Makefile	2004-12-13 09:44:43.990244825 -0800
@@ -0,0 +1,12 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+EXTRA_CFLAGS += -DDEBUG
+endif
+
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y :=	mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+		mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+		mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+		mthca_provider.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_allocator.c	2004-12-13 09:44:44.017240848 -0800
@@ -0,0 +1,168 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_allocator.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h> 
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+	u32 obj;
+
+	spin_lock(&alloc->lock);
+	obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+	if (obj >= alloc->max) {
+		alloc->top = (alloc->top + alloc->max) & alloc->mask;
+		obj = find_first_zero_bit(alloc->table, alloc->max);
+	}
+
+	if (obj < alloc->max) {
+		set_bit(obj, alloc->table);
+		obj |= alloc->top;
+	} else
+		obj = -1;
+
+	spin_unlock(&alloc->lock);
+
+	return obj;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+	obj &= alloc->max - 1;
+	spin_lock(&alloc->lock);
+	clear_bit(obj, alloc->table);
+	alloc->last = min(alloc->last, obj);
+	alloc->top = (alloc->top + alloc->max) & alloc->mask;
+	spin_unlock(&alloc->lock);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved)
+{
+	int i;
+
+	/* num must be a power of 2 */
+	if (num != 1 << (ffs(num) - 1))
+		return -EINVAL;
+
+	alloc->last = 0;
+	alloc->top  = 0;
+	alloc->max  = num;
+	alloc->mask = mask;
+	spin_lock_init(&alloc->lock);
+	alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+			       GFP_KERNEL);
+	if (!alloc->table)
+		return -ENOMEM;
+
+	bitmap_zero(alloc->table, num);
+	for (i = 0; i < reserved; ++i)
+		set_bit(i, alloc->table);
+
+	return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+	kfree(alloc->table);
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages.  Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (array->page_list[p].page) {
+		int i = index & (PAGE_SIZE / sizeof (void *) - 1);
+		return array->page_list[p].page[i];
+	} else
+		return NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	/* Allocate with GFP_ATOMIC because we'll be called with locks held. */
+	if (!array->page_list[p].page)
+		array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC);
+
+	if (!array->page_list[p].page)
+		return -ENOMEM;
+
+	array->page_list[p].page[index & (PAGE_SIZE / sizeof (void *) - 1)] =
+		value;
+	++array->page_list[p].used;
+
+	return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (--array->page_list[p].used == 0) {
+		free_page((unsigned long) array->page_list[p].page);
+		array->page_list[p].page = NULL;
+	}
+
+	if (array->page_list[p].used < 0)
+		pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+			 array, index, p, array->page_list[p].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+	int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int i;
+
+	array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL);
+	if (!array->page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < npage; ++i) {
+		array->page_list[i].page = NULL;
+		array->page_list[i].used = 0;
+	}
+
+	return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+	int i;
+
+	for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+		free_page((unsigned long) array->page_list[i].page);
+
+	kfree(array->page_list);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_config_reg.h	2004-12-13 09:44:44.041237312 -0800
@@ -0,0 +1,44 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_config_reg.h 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#include <asm/page.h>
+
+#define MTHCA_HCR_BASE         0x80680
+#define MTHCA_HCR_SIZE         0x0001c
+#define MTHCA_ECR_BASE         0x80700
+#define MTHCA_ECR_SIZE         0x00008
+#define MTHCA_ECR_CLR_BASE     0x80708
+#define MTHCA_ECR_CLR_SIZE     0x00008
+#define MTHCA_ECR_OFFSET       (MTHCA_ECR_BASE     - MTHCA_HCR_BASE)
+#define MTHCA_ECR_CLR_OFFSET   (MTHCA_ECR_CLR_BASE - MTHCA_HCR_BASE)
+#define MTHCA_CLR_INT_BASE     0xf00d8
+#define MTHCA_CLR_INT_SIZE     0x00008
+
+#define MTHCA_MAP_HCR_SIZE     (MTHCA_ECR_CLR_BASE   + \
+			        MTHCA_ECR_CLR_SIZE   - \
+			        MTHCA_HCR_BASE)
+
+#endif /* MTHCA_CONFIG_REG_H */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_dev.h	2004-12-13 09:44:44.069233188 -0800
@@ -0,0 +1,380 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_dev.h 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/semaphore.h>
+#include <asm/scatterlist.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME	"ib_mthca"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"0.06-pre"
+#define DRV_RELDATE	"November 8, 2004"
+
+/* Types of supported HCA */
+enum {
+	TAVOR,			/* MT23108                        */
+	ARBEL_COMPAT,		/* MT25208 in Tavor compat mode   */
+	ARBEL_NATIVE		/* MT25208 with extended features */
+};
+
+enum {
+	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+	MTHCA_FLAG_SRQ        = 1 << 2,
+	MTHCA_FLAG_MSI        = 1 << 3,
+	MTHCA_FLAG_MSI_X      = 1 << 4,
+	MTHCA_FLAG_NO_LAM     = 1 << 5
+};
+
+enum {
+	MTHCA_KAR_PAGE  = 1,
+	MTHCA_MAX_PORTS = 2
+};
+
+enum {
+	MTHCA_MPT_ENTRY_SIZE  =  0x40,
+	MTHCA_EQ_CONTEXT_SIZE =  0x40,
+	MTHCA_CQ_CONTEXT_SIZE =  0x40,
+	MTHCA_QP_CONTEXT_SIZE = 0x200,
+	MTHCA_AV_SIZE         =  0x20,
+	MTHCA_MGM_ENTRY_SIZE  =  0x40
+};
+
+enum {
+	MTHCA_EQ_CMD,
+	MTHCA_EQ_ASYNC,
+	MTHCA_EQ_COMP,
+	MTHCA_NUM_EQ
+};
+
+struct mthca_cmd {
+	int                       use_events;
+	struct semaphore          hcr_sem;
+	struct semaphore 	  poll_sem;
+	struct semaphore 	  event_sem;
+	int              	  max_cmds;
+	spinlock_t                context_lock;
+	int                       free_head;
+	struct mthca_cmd_context *context;
+	u16                       token_mask;
+};
+
+struct mthca_limits {
+	int      num_ports;
+	int      vl_cap;
+	int      mtu_cap;
+	int      gid_table_len;
+	int      pkey_table_len;
+	int      local_ca_ack_delay;
+	int      max_sg;
+	int      num_qps;
+	int      reserved_qps;
+	int      num_srqs;
+	int      reserved_srqs;
+	int      num_eecs;
+	int      reserved_eecs;
+	int      num_cqs;
+	int      reserved_cqs;
+	int      num_eqs;
+	int      reserved_eqs;
+	int      num_mpts;
+	int      num_mtt_segs;
+	int      mtt_seg_size;
+	int      reserved_mtts;
+	int      reserved_mrws;
+	int      num_rdbs;
+	int      reserved_uars;
+	int      num_mgms;
+	int      num_amgms;
+	int      reserved_mcgs;
+	int      num_pds;
+	int      reserved_pds;
+};
+
+struct mthca_alloc {
+	u32            last;
+	u32            top;
+	u32            max;
+	u32            mask;
+	spinlock_t     lock;
+	unsigned long *table;
+};
+
+struct mthca_array {
+	struct {
+		void    **page;
+		int       used;
+	} *page_list;
+};
+
+struct mthca_pd_table {
+	struct mthca_alloc alloc;
+};
+
+struct mthca_mr_table {
+	struct mthca_alloc mpt_alloc;
+	int                max_mtt_order;
+	unsigned long    **mtt_buddy;
+	u64                mtt_base;
+};
+
+struct mthca_eq_table {
+	struct mthca_alloc alloc;
+	void __iomem      *clr_int;
+	u32                clr_mask;
+	struct mthca_eq    eq[MTHCA_NUM_EQ];
+	int                have_irq;
+	u8                 inta_pin;
+};
+
+struct mthca_cq_table {
+	struct mthca_alloc alloc;
+	spinlock_t         lock;
+	struct mthca_array cq;
+};
+
+struct mthca_qp_table {
+	struct mthca_alloc alloc;
+	int                sqp_start;
+	spinlock_t         lock;
+	struct mthca_array qp;
+};
+
+struct mthca_av_table {
+	struct pci_pool   *pool;
+	int                num_ddr_avs;
+	u64                ddr_av_base;
+	void __iomem      *av_map;
+	struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+	struct semaphore   sem;
+	struct mthca_alloc alloc;
+};
+
+struct mthca_dev {
+	struct ib_device  ib_dev;
+	struct pci_dev   *pdev;
+
+	int          	 hca_type;
+	unsigned long	 mthca_flags;
+
+	u32              rev_id;
+
+	/* firmware info */
+	u64              fw_ver;
+	union {
+		struct {
+			u64 fw_start;
+			u64 fw_end;
+		}        tavor;
+		struct {
+			u64 clr_int_base;
+			u64 eq_arm_base;
+			u64 eq_set_ci_base;
+			struct scatterlist *mem;
+			u16 fw_pages;
+		}        arbel;
+	}                fw;
+
+	u64              ddr_start;
+	u64              ddr_end;
+
+	MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
+
+	void __iomem    *hcr;
+	void __iomem    *clr_base;
+	void __iomem    *kar;
+
+	struct mthca_cmd    cmd;
+	struct mthca_limits limits;
+
+	struct mthca_pd_table  pd_table;
+	struct mthca_mr_table  mr_table;
+	struct mthca_eq_table  eq_table;
+	struct mthca_cq_table  cq_table;
+	struct mthca_qp_table  qp_table;
+	struct mthca_av_table  av_table;
+	struct mthca_mcg_table mcg_table;
+
+	struct mthca_pd       driver_pd;
+	struct mthca_mr       driver_mr;
+
+	struct ib_mad_agent  *send_agent[MTHCA_MAX_PORTS][2];
+	struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
+	spinlock_t            sm_lock;
+};
+
+#define mthca_dbg(mdev, format, arg...) \
+	dev_dbg(&mdev->pdev->dev, format, ## arg)
+#define mthca_err(mdev, format, arg...) \
+	dev_err(&mdev->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+	dev_info(&mdev->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+	dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset)                               \
+	do {                                                          \
+		void *__p = (char *) (source) + (offset);             \
+		switch (sizeof (dest)) {                              \
+			case 1: (dest) = *(u8 *) __p;       break;    \
+			case 2: (dest) = be16_to_cpup(__p); break;    \
+			case 4: (dest) = be32_to_cpup(__p); break;    \
+			case 8: (dest) = be64_to_cpup(__p); break;    \
+			default: __buggy_use_of_MTHCA_GET();          \
+		}                                                     \
+	} while (0)
+
+#define MTHCA_PUT(dest, source, offset)                               \
+	do {                                                          \
+		__typeof__(source) *__p =                             \
+			(__typeof__(source) *) ((char *) (dest) + (offset)); \
+		switch (sizeof(source)) {                             \
+			case 1: *__p = (source);            break;    \
+			case 2: *__p = cpu_to_be16(source); break;    \
+			case 4: *__p = cpu_to_be32(source); break;    \
+			case 8: *__p = cpu_to_be64(source); break;    \
+			default: __buggy_use_of_MTHCA_PUT();          \
+		}                                                     \
+	} while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry);
+void mthca_arm_cq(struct mthca_dev *dev, struct mthca_cq *cq,
+		  int solicited);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask);
+int mthca_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		    struct ib_send_wr **bad_wr);
+int mthca_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		       struct ib_recv_wr **bad_wr);
+int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
+		       int index, int *dbd, u32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   enum ib_sig_type recv_policy,
+		   struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    enum ib_sig_type recv_policy,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      u16 slid,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mthca_dev, ib_dev);
+}
+
+#endif /* MTHCA_DEV_H */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_doorbell.h	2004-12-13 09:44:44.095229358 -0800
@@ -0,0 +1,112 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_doorbell.h 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/preempt.h>
+
+#define MTHCA_RD_DOORBELL      0x00
+#define MTHCA_SEND_DOORBELL    0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL      0x20
+#define MTHCA_EQ_DOORBELL      0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *) val, dest);
+}
+
+#elif defined(CONFIG_INFINIBAND_MTHCA_SSE_DOORBELL)
+/* Use SSE to write 64 bits atomically without a lock. */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline unsigned long mthca_get_fpu(void)
+{
+	unsigned long cr0;
+
+	preempt_disable();
+	asm volatile("mov %%cr0,%0; clts" : "=r" (cr0));
+	return cr0;
+}
+
+static inline void mthca_put_fpu(unsigned long cr0)
+{
+	asm volatile("mov %0,%%cr0" : : "r" (cr0));
+	preempt_enable();
+}
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	/* i386 stack is aligned to 8 bytes, so this should be OK: */
+	u8 xmmsave[8] __attribute__((aligned(8)));
+	unsigned long cr0;
+
+	cr0 = mthca_get_fpu();
+
+	asm volatile (
+		"movlps %%xmm0,(%0); \n\t"
+		"movlps (%1),%%xmm0; \n\t"
+		"movlps %%xmm0,(%2); \n\t"
+		"movlps (%0),%%xmm0; \n\t"
+		:
+		: "r" (xmmsave), "r" (val), "r" (dest)
+		: "memory" );
+
+	mthca_put_fpu(cr0);
+}
+
+#else
+/* Just fall back to a spinlock to protect the doorbell */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel(val[0], dest);
+	__raw_writel(val[1], dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_main.c	2004-12-13 09:44:44.121225529 -0800
@@ -0,0 +1,922 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_main.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+
+#ifdef CONFIG_INFINIBAND_MTHCA_SSE_DOORBELL
+#include <asm/cpufeature.h>
+#endif
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 0;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+static int msi = 0;
+module_param(msi, int, 0444);
+MODULE_PARM_DESC(msi, "attempt to use MSI if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+#define msi   (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static const char mthca_version[] __devinitdata =
+	"ib_mthca: Mellanox InfiniBand HCA driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static int __devinit mthca_tune_pci(struct mthca_dev *mdev)
+{
+	int cap;
+	u16 val;
+
+	/* First try to max out Read Byte Count */
+	cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
+	if (cap) {
+		if (pci_read_config_word(mdev->pdev, cap + PCI_X_CMD, &val)) {
+			mthca_err(mdev, "Couldn't read PCI-X command register, "
+				  "aborting.\n");
+			return -ENODEV;
+		}
+		val = (val & ~PCI_X_CMD_MAX_READ) | (3 << 2);
+		if (pci_write_config_word(mdev->pdev, cap + PCI_X_CMD, val)) {
+			mthca_err(mdev, "Couldn't write PCI-X command register, "
+				  "aborting.\n");
+			return -ENODEV;
+		}
+	} else if (mdev->hca_type == TAVOR)
+		mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+
+	cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP);
+	if (cap) {
+		if (pci_read_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, &val)) {
+			mthca_err(mdev, "Couldn't read PCI Express device control "
+				  "register, aborting.\n");
+			return -ENODEV;
+		}
+		val = (val & ~PCI_EXP_DEVCTL_READRQ) | (5 << 12);
+		if (pci_write_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, val)) {
+			mthca_err(mdev, "Couldn't write PCI Express device control "
+				  "register, aborting.\n");
+			return -ENODEV;
+		}
+	} else if (mdev->hca_type == ARBEL_NATIVE ||
+		   mdev->hca_type == ARBEL_COMPAT)
+		mthca_info(mdev, "No PCI Express capability, "
+			   "not setting Max Read Request Size.\n");
+
+	return 0;
+}
+
+static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+	int err;
+	u8 status;
+
+	err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+	if (dev_lim->min_page_sz > PAGE_SIZE) {
+		mthca_err(mdev, "HCA minimum page size of %d bigger than "
+			  "kernel PAGE_SIZE of %ld, aborting.\n",
+			  dev_lim->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+		mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+			  "aborting.\n",
+			  dev_lim->num_ports, MTHCA_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	mdev->limits.num_ports      	= dev_lim->num_ports;
+	mdev->limits.vl_cap             = dev_lim->max_vl;
+	mdev->limits.mtu_cap            = dev_lim->max_mtu;
+	mdev->limits.gid_table_len  	= dev_lim->max_gids;
+	mdev->limits.pkey_table_len 	= dev_lim->max_pkeys;
+	mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+	mdev->limits.max_sg             = dev_lim->max_sg;
+	mdev->limits.reserved_qps       = dev_lim->reserved_qps;
+	mdev->limits.reserved_srqs      = dev_lim->reserved_srqs;
+	mdev->limits.reserved_eecs      = dev_lim->reserved_eecs;
+	mdev->limits.reserved_cqs       = dev_lim->reserved_cqs;
+	mdev->limits.reserved_eqs       = dev_lim->reserved_eqs;
+	mdev->limits.reserved_mtts      = dev_lim->reserved_mtts;
+	mdev->limits.reserved_mrws      = dev_lim->reserved_mrws;
+	mdev->limits.reserved_uars      = dev_lim->reserved_uars;
+	mdev->limits.reserved_pds       = dev_lim->reserved_pds;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+		mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+
+	return 0;
+}
+
+static int __devinit mthca_init_tavor(struct mthca_dev *mdev)
+{
+	u8 status;
+	int err;
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_init_hca_param init_hca;
+	struct mthca_adapter        adapter;
+
+	err = mthca_SYS_EN(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "SYS_EN command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "SYS_EN returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_QUERY_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+		goto err_out_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_out_disable;
+	}
+	err = mthca_QUERY_DDR(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_DDR command failed, aborting.\n");
+		goto err_out_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_DDR returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_out_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err)
+		goto err_out_disable;
+
+	err = mthca_make_profile(mdev, &dev_lim, &init_hca);
+	if (err)
+		goto err_out_disable;
+
+	err = mthca_INIT_HCA(mdev, &init_hca, &status);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+		goto err_out_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_out_disable;
+	}
+
+	err = mthca_QUERY_ADAPTER(mdev, &adapter, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+		goto err_out_close;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_out_close;
+	}
+
+	mdev->eq_table.inta_pin = adapter.inta_pin;
+	mdev->rev_id            = adapter.revision_id;
+
+	return 0;
+
+err_out_close:
+	mthca_CLOSE_HCA(mdev, 0, &status);
+
+err_out_disable:
+	mthca_SYS_DIS(mdev, &status);
+
+	return err;
+}
+
+static int __devinit mthca_load_fw(struct mthca_dev *mdev)
+{
+	u8 status;
+	int err;
+	int num_ent, num_sg, fw_pages, cur_order;
+	int i;
+
+	/* FIXME: use HCA-attached memory for FW if present */
+
+	mdev->fw.arbel.mem = kmalloc(sizeof *mdev->fw.arbel.mem *
+				     mdev->fw.arbel.fw_pages,
+				     GFP_KERNEL);
+	if (!mdev->fw.arbel.mem) {
+		mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+		return -ENOMEM;
+	}
+
+	memset(mdev->fw.arbel.mem, 0,
+	       sizeof *mdev->fw.arbel.mem * mdev->fw.arbel.fw_pages);
+
+	fw_pages = mdev->fw.arbel.fw_pages;
+	num_ent = 0;
+	/*
+	 * We allocate in as big chunks as we can, up to a maximum of
+	 * 256 KB per chunk.
+	 */
+	cur_order = get_order(1 << 18);
+	while (fw_pages > 0) {
+		while (1 << cur_order > fw_pages)
+			--cur_order;
+
+		/*
+		 * We allocate with GFP_HIGHUSER because only the
+		 * firmware is going to touch these pages, so there's
+		 * no need for a kernel virtual address.  We use
+		 * __GFP_NOWARN because we'll deal with any allocation
+		 * failures ourselves.
+		 */
+		mdev->fw.arbel.mem[num_ent].page   = alloc_pages(GFP_HIGHUSER | __GFP_NOWARN,
+								 cur_order);
+		mdev->fw.arbel.mem[num_ent].length = PAGE_SIZE << cur_order;
+		if (!mdev->fw.arbel.mem[num_ent].page) {
+			--cur_order;
+			if (cur_order < 0) {
+				mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+				err = -ENOMEM;
+				goto err_free;
+			}
+		} else {
+			++num_ent;
+			fw_pages -= 1 << cur_order;
+		}
+	}
+	num_sg = pci_map_sg(mdev->pdev, mdev->fw.arbel.mem, num_ent,
+			    PCI_DMA_BIDIRECTIONAL);
+	if (num_sg <= 0) {
+		mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	err = mthca_MAP_FA(mdev, num_sg, mdev->fw.arbel.mem, &status);
+	if (err) {
+		mthca_err(mdev, "MAP_FA command failed, aborting.\n");
+		goto err_unmap;
+	}
+	if (status) {
+		mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_unmap;
+	}
+	err = mthca_RUN_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "RUN_FW command failed, aborting.\n");
+		goto err_unmap_fa;
+	}
+	if (status) {
+		mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_unmap_fa;
+	}
+
+	return 0;
+
+err_unmap_fa:
+	mthca_UNMAP_FA(mdev, &status);
+
+err_unmap:
+	pci_unmap_sg(mdev->pdev, mdev->fw.arbel.mem,
+		   mdev->fw.arbel.fw_pages, PCI_DMA_BIDIRECTIONAL);
+err_free:
+	for (i = 0; i < mdev->fw.arbel.fw_pages; ++i)
+		if (mdev->fw.arbel.mem[i].page)
+			__free_pages(mdev->fw.arbel.mem[i].page,
+				     get_order(mdev->fw.arbel.mem[i].length));
+	kfree(mdev->fw.arbel.mem);
+	return err;
+}
+
+static int __devinit mthca_init_arbel(struct mthca_dev *mdev)
+{
+	struct mthca_dev_lim dev_lim;
+	u8 status;
+	int err;
+
+	err = mthca_QUERY_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_ENABLE_LAM(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n");
+		return err;
+	}
+	if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) {
+		mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+		mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+	} else if (status) {
+		mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_load_fw(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to start FW, aborting.\n");
+		goto err_out_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		goto err_out_disable;
+	}
+
+	mthca_warn(mdev, "Sorry, native MT25208 mode support is not done, "
+		   "aborting.\n");
+	err = -ENODEV;
+
+err_out_disable:
+	if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+		mthca_DISABLE_LAM(mdev, &status);
+	return err;
+}
+
+static int __devinit mthca_init_hca(struct mthca_dev *mdev)
+{
+	if (mdev->hca_type == ARBEL_NATIVE)
+		return mthca_init_arbel(mdev);
+	else
+		return mthca_init_tavor(mdev);
+}
+
+static int __devinit mthca_setup_hca(struct mthca_dev *dev)
+{
+	int err;
+
+	MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+	err = mthca_init_pd_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "protection domain table, aborting.\n");
+		return err;
+	}
+
+	err = mthca_init_mr_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "memory region table, aborting.\n");
+		goto err_out_pd_table_free;
+	}
+
+	err = mthca_pd_alloc(dev, &dev->driver_pd);
+	if (err) {
+		mthca_err(dev, "Failed to create driver PD, "
+			  "aborting.\n");
+		goto err_out_mr_table_free;
+	}
+
+	err = mthca_init_eq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "event queue table, aborting.\n");
+		goto err_out_pd_free;
+	}
+
+	err = mthca_cmd_use_events(dev);
+	if (err) {
+		mthca_err(dev, "Failed to switch to event-driven "
+			  "firmware commands, aborting.\n");
+		goto err_out_eq_table_free;
+	}
+
+	err = mthca_init_cq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "completion queue table, aborting.\n");
+		goto err_out_cmd_poll;
+	}
+
+	err = mthca_init_qp_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "queue pair table, aborting.\n");
+		goto err_out_cq_table_free;
+	}
+
+	err = mthca_init_av_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "address vector table, aborting.\n");
+		goto err_out_qp_table_free;
+	}
+
+	err = mthca_init_mcg_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "multicast group table, aborting.\n");
+		goto err_out_av_table_free;
+	}
+
+	return 0;
+
+err_out_av_table_free:
+	mthca_cleanup_av_table(dev);
+
+err_out_qp_table_free:
+	mthca_cleanup_qp_table(dev);
+
+err_out_cq_table_free:
+	mthca_cleanup_cq_table(dev);
+
+err_out_cmd_poll:
+	mthca_cmd_use_polling(dev);
+
+err_out_eq_table_free:
+	mthca_cleanup_eq_table(dev);
+
+err_out_pd_free:
+	mthca_pd_free(dev, &dev->driver_pd);
+
+err_out_mr_table_free:
+	mthca_cleanup_mr_table(dev);
+
+err_out_pd_table_free:
+	mthca_cleanup_pd_table(dev);
+	return err;
+}
+
+static int __devinit mthca_request_regions(struct pci_dev *pdev,
+					   int ddr_hidden)
+{
+	int err;
+
+	/*
+	 * We request our first BAR in two chunks, since the MSI-X
+	 * vector table is right in the middle.
+	 *
+	 * This is why we can't just use pci_request_regions() -- if
+	 * we did then setting up MSI-X would fail, since the PCI core
+	 * wants to do request_mem_region on the MSI-X vector table.
+	 */
+	if (!request_mem_region(pci_resource_start(pdev, 0) +
+				MTHCA_HCR_BASE,
+				MTHCA_MAP_HCR_SIZE,
+				DRV_NAME))
+		return -EBUSY;
+
+	if (!request_mem_region(pci_resource_start(pdev, 0) +
+				MTHCA_CLR_INT_BASE,
+				MTHCA_CLR_INT_SIZE,
+				DRV_NAME)) {
+		err = -EBUSY;
+		goto err_out_bar0_beg;
+	}
+
+	err = pci_request_region(pdev, 2, DRV_NAME);
+	if (err)
+		goto err_out_bar0_end;
+
+	if (!ddr_hidden) {
+		err = pci_request_region(pdev, 4, DRV_NAME);
+		if (err)
+			goto err_out_bar2;
+	}
+
+	return 0;
+
+	/* Unwind in the reverse order of the requests above */
+err_out_bar2:
+	pci_release_region(pdev, 2);
+
+err_out_bar0_end:
+	release_mem_region(pci_resource_start(pdev, 0) +
+			   MTHCA_CLR_INT_BASE,
+			   MTHCA_CLR_INT_SIZE);
+
+err_out_bar0_beg:
+	release_mem_region(pci_resource_start(pdev, 0) +
+			   MTHCA_HCR_BASE,
+			   MTHCA_MAP_HCR_SIZE);
+	return err;
+}
+
+static void mthca_release_regions(struct pci_dev *pdev,
+				  int ddr_hidden)
+{
+	release_mem_region(pci_resource_start(pdev, 0) +
+			   MTHCA_HCR_BASE,
+			   MTHCA_MAP_HCR_SIZE);
+	release_mem_region(pci_resource_start(pdev, 0) +
+			   MTHCA_CLR_INT_BASE,
+			   MTHCA_CLR_INT_SIZE);
+	pci_release_region(pdev, 2);
+	if (!ddr_hidden)
+		pci_release_region(pdev, 4);
+}
+
+static int __devinit mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+	struct msix_entry entries[3];
+	int err;
+
+	entries[0].entry = 0;
+	entries[1].entry = 1;
+	entries[2].entry = 2;
+
+	err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries));
+	if (err) {
+		if (err > 0)
+			mthca_info(mdev, "Only %d MSI-X vectors available, "
+				   "not using MSI-X\n", err);
+		return err;
+	}
+
+	mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;
+	mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector;
+	mdev->eq_table.eq[MTHCA_EQ_CMD  ].msi_x_vector = entries[2].vector;
+
+	return 0;
+}
+
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+	u8 status;
+	int i;
+
+	mthca_CLOSE_HCA(mdev, 0, &status);
+
+	if (mdev->hca_type == ARBEL_NATIVE) {
+		mthca_UNMAP_FA(mdev, &status);
+
+		pci_unmap_sg(mdev->pdev, mdev->fw.arbel.mem,
+			     mdev->fw.arbel.fw_pages, PCI_DMA_BIDIRECTIONAL);
+
+		for (i = 0; i < mdev->fw.arbel.fw_pages; ++i)
+			if (mdev->fw.arbel.mem[i].page)
+				__free_pages(mdev->fw.arbel.mem[i].page,
+					     get_order(mdev->fw.arbel.mem[i].length));
+		kfree(mdev->fw.arbel.mem);
+
+		if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+			mthca_DISABLE_LAM(mdev, &status);
+	} else
+		mthca_SYS_DIS(mdev, &status);
+}
+
+static int __devinit mthca_init_one(struct pci_dev *pdev,
+				    const struct pci_device_id *id)
+{
+	static int mthca_version_printed = 0;
+	int ddr_hidden = 0;
+	int err;
+	unsigned long mthca_base;
+	struct mthca_dev *mdev;
+
+	if (!mthca_version_printed) {
+		printk(KERN_INFO "%s", mthca_version);
+		++mthca_version_printed;
+	}
+
+	printk(KERN_INFO PFX "Initializing %s (%s)\n",
+	       pci_pretty_name(pdev), pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, "
+			"aborting.\n");
+		return err;
+	}
+
+	/*
+	 * Check for BARs.  We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+	 * be present)
+	 */
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 0) != 1 << 20) {
+		dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+		err = -ENODEV;
+		goto err_out_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 2) != 1 << 23) {
+		dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+		err = -ENODEV;
+		goto err_out_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+		ddr_hidden = 1;
+
+	err = mthca_request_regions(pdev, ddr_hidden);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+			"aborting.\n");
+		goto err_out_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			goto err_out_free_res;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+			 "consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+				"aborting.\n");
+			goto err_out_free_res;
+		}
+	}
+
+	mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+	if (!mdev) {
+		dev_err(&pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_out_free_res;
+	}
+
+	mdev->pdev     = pdev;
+	mdev->hca_type = id->driver_data;
+
+	if (ddr_hidden)
+		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+	/*
+	 * Now reset the HCA before we touch the PCI capabilities or
+	 * attempt a firmware command, since a boot ROM may have left
+	 * the HCA in an undefined state.
+	 */
+	err = mthca_reset(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+		goto err_out_free_dev;
+	}
+
+	if (msi_x && !mthca_enable_msi_x(mdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+	if (msi && !(mdev->mthca_flags & MTHCA_FLAG_MSI_X) &&
+	    !pci_enable_msi(pdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI;
+
+	sema_init(&mdev->cmd.hcr_sem, 1);
+	sema_init(&mdev->cmd.poll_sem, 1);
+	mdev->cmd.use_events = 0;
+
+	mthca_base = pci_resource_start(pdev, 0);
+	mdev->hcr = ioremap(mthca_base + MTHCA_HCR_BASE, MTHCA_MAP_HCR_SIZE);
+	if (!mdev->hcr) {
+		mthca_err(mdev, "Couldn't map command register, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_out_free_dev;
+	}
+	mdev->clr_base = ioremap(mthca_base + MTHCA_CLR_INT_BASE,
+				 MTHCA_CLR_INT_SIZE);
+	if (!mdev->clr_base) {
+		mthca_err(mdev, "Couldn't map interrupt clear register, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_out_iounmap;
+	}
+
+	mthca_base = pci_resource_start(pdev, 2);
+	mdev->kar = ioremap(mthca_base + PAGE_SIZE * MTHCA_KAR_PAGE, PAGE_SIZE);
+	if (!mdev->kar) {
+		mthca_err(mdev, "Couldn't map kernel access region, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_out_iounmap_clr;
+	}
+
+	err = mthca_tune_pci(mdev);
+	if (err)
+		goto err_out_iounmap_kar;
+
+	err = mthca_init_hca(mdev);
+	if (err)
+		goto err_out_iounmap_kar;
+
+	err = mthca_setup_hca(mdev);
+	if (err)
+		goto err_out_close;
+
+	err = mthca_register_device(mdev);
+	if (err)
+		goto err_out_cleanup;
+
+	err = mthca_create_agents(mdev);
+	if (err)
+		goto err_out_unregister;
+
+	pci_set_drvdata(pdev, mdev);
+
+	return 0;
+
+err_out_unregister:
+	mthca_unregister_device(mdev);
+
+err_out_cleanup:
+	mthca_cleanup_mcg_table(mdev);
+	mthca_cleanup_av_table(mdev);
+	mthca_cleanup_qp_table(mdev);
+	mthca_cleanup_cq_table(mdev);
+	mthca_cmd_use_polling(mdev);
+	mthca_cleanup_eq_table(mdev);
+
+	mthca_pd_free(mdev, &mdev->driver_pd);
+
+	mthca_cleanup_mr_table(mdev);
+	mthca_cleanup_pd_table(mdev);
+
+err_out_close:
+	mthca_close_hca(mdev);
+
+err_out_iounmap_kar:
+	iounmap(mdev->kar);
+
+err_out_iounmap_clr:
+	iounmap(mdev->clr_base);
+
+err_out_iounmap:
+	iounmap(mdev->hcr);
+
+err_out_free_dev:
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+		pci_disable_msi(pdev);
+
+	ib_dealloc_device(&mdev->ib_dev);
+
+err_out_free_res:
+	mthca_release_regions(pdev, ddr_hidden);
+
+err_out_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static void __devexit mthca_remove_one(struct pci_dev *pdev)
+{
+	struct mthca_dev *mdev = pci_get_drvdata(pdev);
+	u8 status;
+	int p;
+
+	if (mdev) {
+		mthca_free_agents(mdev);
+		mthca_unregister_device(mdev);
+
+		for (p = 1; p <= mdev->limits.num_ports; ++p)
+			mthca_CLOSE_IB(mdev, p, &status);
+
+		mthca_cleanup_mcg_table(mdev);
+		mthca_cleanup_av_table(mdev);
+		mthca_cleanup_qp_table(mdev);
+		mthca_cleanup_cq_table(mdev);
+		mthca_cmd_use_polling(mdev);
+		mthca_cleanup_eq_table(mdev);
+
+		mthca_pd_free(mdev, &mdev->driver_pd);
+
+		mthca_cleanup_mr_table(mdev);
+		mthca_cleanup_pd_table(mdev);
+
+		mthca_close_hca(mdev);
+
+		iounmap(mdev->hcr);
+		iounmap(mdev->clr_base);
+
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+			pci_disable_msi(pdev);
+
+		ib_dealloc_device(&mdev->ib_dev);
+		mthca_release_regions(pdev, mdev->mthca_flags &
+				      MTHCA_FLAG_DDR_HIDDEN);
+		pci_disable_device(pdev);
+		pci_set_drvdata(pdev, NULL);
+	}
+}
+
+static struct pci_device_id mthca_pci_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+static struct pci_driver mthca_driver = {
+	.name		= "ib_mthca",
+	.id_table	= mthca_pci_table,
+	.probe		= mthca_init_one,
+	.remove		= __devexit_p(mthca_remove_one)
+};
+
+static int __init mthca_init(void)
+{
+	int ret;
+
+	/*
+	 * TODO: measure whether dynamically choosing doorbell code at
+	 * runtime affects our performance.  Is there a "magic" way to
+	 * choose without having to follow a function pointer every
+	 * time we ring a doorbell?
+	 */
+#ifdef CONFIG_INFINIBAND_MTHCA_SSE_DOORBELL
+	if (!cpu_has_xmm) {
+		printk(KERN_ERR PFX "mthca was compiled with SSE doorbell code, but\n");
+		printk(KERN_ERR PFX "the current CPU does not support SSE.\n");
+		printk(KERN_ERR PFX "Turn off CONFIG_INFINIBAND_MTHCA_SSE_DOORBELL "
+		       "and recompile.\n");
+		return -ENODEV;
+	}
+#endif
+
+	ret = pci_register_driver(&mthca_driver);
+	return ret < 0 ? ret : 0;
+}
+
+static void __exit mthca_cleanup(void)
+{
+	pci_unregister_driver(&mthca_driver);
+}
+
+module_init(mthca_init);
+module_exit(mthca_cleanup);
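
The doorbell macros in mthca_doorbell.h above encode a simple policy: a
64-bit doorbell can be written with one atomic 64-bit store on 64-bit
CPUs (or via SSE on i386), otherwise the two 32-bit halves have to be
written under a lock so that concurrent rings cannot interleave.  A
minimal stand-alone user-space sketch of that policy follows; it uses an
ordinary memory word instead of an MMIO register, and the names
fake_doorbell and ring_doorbell are invented for the illustration.

/*
 * Stand-alone model of the doorbell-write policy: one 64-bit store
 * where that is atomic, otherwise two 32-bit stores under a lock.
 * Build with: cc -pthread doorbell_demo.c
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static union {
	uint64_t whole;
	uint32_t half[2];
} fake_doorbell;				/* stands in for the MMIO doorbell */

static pthread_mutex_t doorbell_lock = PTHREAD_MUTEX_INITIALIZER;

static void ring_doorbell(const uint32_t val[2])
{
#if UINTPTR_MAX > 0xffffffffu
	/* 64-bit build: a single aligned 64-bit store, like __raw_writeq(). */
	memcpy(&fake_doorbell.whole, val, sizeof fake_doorbell.whole);
#else
	/* 32-bit fallback: two 32-bit stores; hold the lock so another
	 * caller cannot slip its own halves in between them. */
	pthread_mutex_lock(&doorbell_lock);
	fake_doorbell.half[0] = val[0];
	fake_doorbell.half[1] = val[1];
	pthread_mutex_unlock(&doorbell_lock);
#endif
}

int main(void)
{
	uint32_t db[2] = { 0x00010002, 0x00030004 };

	ring_doorbell(db);
	printf("doorbell word: %#llx\n", (unsigned long long) fake_doorbell.whole);
	return 0;
}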



* [PATCH][v3][8/21] Add Mellanox HCA low-level driver (midlayer interface)
  2004-12-13 18:09   ` [PATCH][v3][7/21] Add Mellanox HCA low-level driver Roland Dreier
@ 2004-12-13 18:09     ` Roland Dreier
  2004-12-13 18:09       ` [PATCH][v3][9/21] Add Mellanox HCA low-level driver (FW commands) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add midlayer interface code for Mellanox HCA driver.

Signed-off-by: Roland Dreier <roland@topspin.com>
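
The provider code below embeds each InfiniBand core object (ib_pd,
ib_cq, ib_qp, ...) inside a driver-private structure and recovers the
driver structure with container_of() in helpers such as to_mpd(),
to_mcq() and to_mdev().  Here is a minimal stand-alone sketch of that
pattern; the types toy_pd/my_pd and the helper to_mypd() are invented
stand-ins for the real ib_verbs structures, not part of the patch.

/*
 * Illustration of the container_of() pattern used by the provider:
 * the core only ever sees a pointer to the embedded member, and the
 * driver recovers its enclosing structure from that pointer.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct toy_pd {				/* stands in for struct ib_pd */
	int handle;
};

struct my_pd {				/* stands in for struct mthca_pd */
	struct toy_pd ibpd;		/* embedded core object */
	unsigned int  pd_num;		/* driver-private state */
};

static struct my_pd *to_mypd(struct toy_pd *ibpd)
{
	return container_of(ibpd, struct my_pd, ibpd);
}

int main(void)
{
	struct my_pd pd = { .ibpd = { .handle = 7 }, .pd_num = 42 };
	struct toy_pd *core_view = &pd.ibpd;	/* what the core hands back */

	/* A driver callback gets core_view and recovers pd_num from it. */
	printf("pd_num = %u\n", to_mypd(core_view)->pd_num);
	return 0;
}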


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_provider.c	2004-12-13 09:44:44.599155121 -0800
@@ -0,0 +1,622 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_provider.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/* Temporary until we get core support straightened out */
+enum {
+	IB_SMP_ATTRIB_NODE_INFO        = 0x0011,
+	IB_SMP_ATTRIB_GUID_INFO        = 0x0014,
+	IB_SMP_ATTRIB_PORT_INFO        = 0x0015,
+	IB_SMP_ATTRIB_PKEY_TABLE       = 0x0016
+};
+
+static int mthca_query_device(struct ib_device *ibdev,
+			      struct ib_device_attr *props)
+{
+	struct ib_mad *in_mad  = NULL;
+	struct ib_mad *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	props->fw_ver        = to_mdev(ibdev)->fw_ver;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->mad_hdr.base_version       = 1;
+	in_mad->mad_hdr.mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->mad_hdr.class_version  	   = 1;
+	in_mad->mad_hdr.method         	   = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id   	   = cpu_to_be16(IB_SMP_ATTRIB_NODE_INFO);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1,
+			    1, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	props->vendor_id      = be32_to_cpup((u32 *) (out_mad->data + 76)) &
+		0xffffff;
+	props->vendor_part_id = be16_to_cpup((u16 *) (out_mad->data + 70));
+	props->hw_ver         = be16_to_cpup((u16 *) (out_mad->data + 72));
+	memcpy(&props->sys_image_guid, out_mad->data + 44, 8);
+	memcpy(&props->node_guid,      out_mad->data + 52, 8);
+
+	err = 0;
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_query_port(struct ib_device *ibdev,
+			    u8 port, struct ib_port_attr *props)
+{
+	struct ib_mad *in_mad  = NULL;
+	struct ib_mad *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->mad_hdr.base_version       = 1;
+	in_mad->mad_hdr.mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->mad_hdr.class_version  	   = 1;
+	in_mad->mad_hdr.method         	   = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id   	   = cpu_to_be16(IB_SMP_ATTRIB_PORT_INFO);
+	in_mad->mad_hdr.attr_mod           = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1,
+			    port, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	props->lid               = be16_to_cpup((u16 *) (out_mad->data + 56));
+	props->lmc               = (*(u8 *) (out_mad->data + 74)) & 0x7;
+	props->sm_lid            = be16_to_cpup((u16 *) (out_mad->data + 58));
+	props->sm_sl             = (*(u8 *) (out_mad->data + 76)) & 0xf;
+	props->state             = (*(u8 *) (out_mad->data + 72)) & 0xf;
+	props->port_cap_flags    = be32_to_cpup((u32 *) (out_mad->data + 60));
+	props->gid_tbl_len       = to_mdev(ibdev)->limits.gid_table_len;
+	props->pkey_tbl_len      = to_mdev(ibdev)->limits.pkey_table_len;
+	props->qkey_viol_cntr    = be16_to_cpup((u16 *) (out_mad->data + 88));
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_modify_port(struct ib_device *ibdev,
+			     u8 port, int port_modify_mask,
+			     struct ib_port_modify *props)
+{
+	return 0;
+}
+
+static int mthca_query_pkey(struct ib_device *ibdev,
+			    u8 port, u16 index, u16 *pkey)
+{
+	struct ib_mad *in_mad  = NULL;
+	struct ib_mad *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->mad_hdr.base_version       = 1;
+	in_mad->mad_hdr.mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->mad_hdr.class_version  	   = 1;
+	in_mad->mad_hdr.method         	   = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id   	   = cpu_to_be16(IB_SMP_ATTRIB_PKEY_TABLE);
+	in_mad->mad_hdr.attr_mod           = cpu_to_be32(index / 32);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1,
+			    port, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	*pkey = be16_to_cpu(((u16 *) (out_mad->data + 40))[index % 32]);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+			   int index, union ib_gid *gid)
+{
+	struct ib_mad *in_mad  = NULL;
+	struct ib_mad *out_mad = NULL;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->mad_hdr.base_version       = 1;
+	in_mad->mad_hdr.mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->mad_hdr.class_version  	   = 1;
+	in_mad->mad_hdr.method         	   = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id   	   = cpu_to_be16(IB_SMP_ATTRIB_PORT_INFO);
+	in_mad->mad_hdr.attr_mod           = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1,
+			    port, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	memcpy(gid->raw, out_mad->data + 48, 8);
+
+	memset(in_mad, 0, sizeof *in_mad);
+	in_mad->mad_hdr.base_version       = 1;
+	in_mad->mad_hdr.mgmt_class     	   = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	in_mad->mad_hdr.class_version  	   = 1;
+	in_mad->mad_hdr.method         	   = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id   	   = cpu_to_be16(IB_SMP_ATTRIB_GUID_INFO);
+	in_mad->mad_hdr.attr_mod           = cpu_to_be32(index / 8);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1,
+			    port, in_mad, out_mad,
+			    &status);
+	if (err)
+		goto out;
+	if (status) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	memcpy(gid->raw + 8, out_mad->data + 40 + (index % 8) * 16, 8);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev)
+{
+	struct mthca_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_pd_alloc(to_mdev(ibdev), pd);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	return &pd->ibpd;
+}
+
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+	mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+	kfree(pd);
+
+	return 0;
+}
+
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+				     struct ib_ah_attr *ah_attr)
+{
+	int err;
+	struct mthca_ah *ah;
+
+	ah = kmalloc(sizeof *ah, GFP_KERNEL);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
+	if (err) {
+		kfree(ah);
+		return ERR_PTR(err);
+	}
+
+	return &ah->ibah;
+}
+
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+	mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+	kfree(ah);
+
+	return 0;
+}
+
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+				     struct ib_qp_init_attr *init_attr)
+{
+	struct mthca_qp *qp;
+	int err;
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	{
+		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		qp->sq.max    = init_attr->cap.max_send_wr;
+		qp->rq.max    = init_attr->cap.max_recv_wr;
+		qp->sq.max_gs = init_attr->cap.max_send_sge;
+		qp->rq.max_gs = init_attr->cap.max_recv_sge;
+
+		err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+				     to_mcq(init_attr->send_cq),
+				     to_mcq(init_attr->recv_cq),
+				     init_attr->qp_type, init_attr->sq_sig_type,
+				     init_attr->rq_sig_type, qp);
+		qp->ibqp.qp_num = qp->qpn;
+		break;
+	}
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		qp->sq.max    = init_attr->cap.max_send_wr;
+		qp->rq.max    = init_attr->cap.max_recv_wr;
+		qp->sq.max_gs = init_attr->cap.max_send_sge;
+		qp->rq.max_gs = init_attr->cap.max_recv_sge;
+
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+				      to_mcq(init_attr->send_cq),
+				      to_mcq(init_attr->recv_cq),
+				      init_attr->sq_sig_type, init_attr->rq_sig_type,
+				      qp->ibqp.qp_num, init_attr->port_num,
+				      to_msqp(qp));
+		break;
+	}
+	default:
+		/* Don't support raw QPs */
+		return ERR_PTR(-ENOSYS);
+	}
+
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+        init_attr->cap.max_inline_data = 0;
+
+	return &qp->ibqp;
+}
+
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+	mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
+	kfree(qp);
+	return 0;
+}
+
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries)
+{
+	struct mthca_cq *cq;
+	int nent;
+	int err;
+
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	for (nent = 1; nent < entries; nent <<= 1)
+		; /* nothing */
+
+	err = mthca_init_cq(to_mdev(ibdev), nent, cq);
+	if (err) {
+		kfree(cq);
+		cq = ERR_PTR(err);
+	} else
+		cq->ibcq.cqe = nent;
+
+	return &cq->ibcq;
+}
+
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+	kfree(cq);
+
+	return 0;
+}
+
+static int mthca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify notify)
+{
+	mthca_arm_cq(to_mdev(cq->device), to_mcq(cq),
+		     notify == IB_CQ_SOLICITED);
+	return 0;
+}
+
+static inline u32 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MTHCA_MPT_FLAG_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MTHCA_MPT_FLAG_LOCAL_WRITE  : 0) |
+	       MTHCA_MPT_FLAG_LOCAL_READ;
+}
+
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mthca_mr *mr;
+	int err;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_mr_alloc_notrans(to_mdev(pd->device),
+				     to_mpd(pd)->pd_num,
+				     convert_access(acc), mr);
+
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
+				       struct ib_phys_buf *buffer_list,
+				       int                 num_phys_buf,
+				       int                 acc,
+				       u64                *iova_start)
+{
+	struct mthca_mr *mr;
+	u64 *page_list;
+	u64 total_size;
+	u64 mask;
+	int shift;
+	int npages;
+	int err;
+	int i, j, n;
+
+	/* First check that we have enough alignment */
+	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK))
+		return ERR_PTR(-EINVAL);
+
+	if (num_phys_buf > 1 &&
+	    ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK))
+		return ERR_PTR(-EINVAL);
+
+	mask = 0;
+	total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (buffer_list[i].addr & ~PAGE_MASK)
+			return ERR_PTR(-EINVAL);
+		if (i != 0 && i != num_phys_buf - 1 &&
+		    (buffer_list[i].size & ~PAGE_MASK))
+			return ERR_PTR(-EINVAL);
+
+		total_size += buffer_list[i].size;
+		if (i > 0)
+			mask |= buffer_list[i].addr;
+	}
+
+	/* Find largest page shift we can use to cover buffers */
+	for (shift = PAGE_SHIFT; shift < 31; ++shift)
+		if (num_phys_buf > 1) {
+			if ((1ULL << shift) & mask)
+				break;
+		} else {
+			if (1ULL << shift >= 
+			    buffer_list[0].size + 
+			    (buffer_list[0].addr & ((1ULL << shift) - 1)))
+				break;
+		}
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+	buffer_list[0].addr &= ~0ull << shift;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+	if (!npages)
+		return &mr->ibmr;
+
+	page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list) {
+		kfree(mr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+		     ++j)
+			page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+	mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+		  "in PD %x; shift %d, npages %d.\n",
+		  (unsigned long long) buffer_list[0].addr,
+		  (unsigned long long) *iova_start,
+		  to_mpd(pd)->pd_num,
+		  shift, npages);
+
+	err = mthca_mr_alloc_phys(to_mdev(pd->device),
+				  to_mpd(pd)->pd_num,
+				  page_list, shift, npages,
+				  *iova_start, total_size,
+				  convert_access(acc), mr);
+
+	if (err) {
+		kfree(page_list);
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	kfree(page_list);
+	return &mr->ibmr;
+}
+
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+	mthca_free_mr(to_mdev(mr->device), to_mmr(mr));
+	kfree(mr);
+	return 0;
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+	return sprintf(buf, "%x\n", dev->rev_id);
+}
+
+static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
+{
+	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+	return sprintf(buf, "%x.%x.%x\n", (int) (dev->fw_ver >> 32),
+		       (int) (dev->fw_ver >> 16) & 0xffff,
+		       (int) dev->fw_ver & 0xffff);
+}
+
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+	struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+	switch (dev->hca_type) {
+	case TAVOR:        return sprintf(buf, "MT23108\n");
+	case ARBEL_COMPAT: return sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+	case ARBEL_NATIVE: return sprintf(buf, "MT25208\n");
+	default:           return sprintf(buf, "unknown\n");
+	}
+}
+
+static CLASS_DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static CLASS_DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+
+static struct class_device_attribute *mthca_class_attributes[] = {
+	&class_device_attr_hw_rev,
+	&class_device_attr_fw_ver,
+	&class_device_attr_hca_type
+};
+
+int mthca_register_device(struct mthca_dev *dev)
+{
+	int ret;
+	int i;
+
+	strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.node_type            = IB_NODE_CA;
+	dev->ib_dev.phys_port_cnt        = dev->limits.num_ports;
+	dev->ib_dev.dma_device           = &dev->pdev->dev;
+	dev->ib_dev.class_dev.dev        = &dev->pdev->dev;
+	dev->ib_dev.query_device         = mthca_query_device;
+	dev->ib_dev.query_port           = mthca_query_port;
+	dev->ib_dev.modify_port          = mthca_modify_port;
+	dev->ib_dev.query_pkey           = mthca_query_pkey;
+	dev->ib_dev.query_gid            = mthca_query_gid;
+	dev->ib_dev.alloc_pd             = mthca_alloc_pd;
+	dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
+	dev->ib_dev.create_ah            = mthca_ah_create;
+	dev->ib_dev.destroy_ah           = mthca_ah_destroy;
+	dev->ib_dev.create_qp            = mthca_create_qp;
+	dev->ib_dev.modify_qp            = mthca_modify_qp;
+	dev->ib_dev.destroy_qp           = mthca_destroy_qp;
+	dev->ib_dev.post_send            = mthca_post_send;
+	dev->ib_dev.post_recv            = mthca_post_receive;
+	dev->ib_dev.create_cq            = mthca_create_cq;
+	dev->ib_dev.destroy_cq           = mthca_destroy_cq;
+	dev->ib_dev.poll_cq              = mthca_poll_cq;
+	dev->ib_dev.req_notify_cq        = mthca_req_notify_cq;
+	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
+	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
+	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
+	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
+	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
+	dev->ib_dev.process_mad          = mthca_process_mad;
+
+	ret = ib_register_device(&dev->ib_dev);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(mthca_class_attributes); ++i) {
+		ret = class_device_create_file(&dev->ib_dev.class_dev,
+					       mthca_class_attributes[i]);
+		if (ret) {
+			ib_unregister_device(&dev->ib_dev);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+	ib_unregister_device(&dev->ib_dev);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_provider.h	2004-12-13 09:44:44.636149671 -0800
@@ -0,0 +1,214 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_provider.h 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <ib_verbs.h>
+#include <ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC        (1 << 14)
+#define MTHCA_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct mthca_buf_list {
+	void *buf;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+struct mthca_mr {
+	struct ib_mr ibmr;
+	int order;
+	u32 first_seg;
+};
+
+struct mthca_pd {
+	struct ib_pd    ibpd;
+	u32             pd_num;
+	atomic_t        sqp_count;
+	struct mthca_mr ntmr;
+};
+
+struct mthca_eq {
+	struct mthca_dev      *dev;
+	int                    eqn;
+	u32                    ecr_mask;
+	u16                    msi_x_vector;
+	u16                    msi_x_entry;
+	int                    have_irq;
+	int                    nent;
+	int                    cons_index;
+	struct mthca_buf_list *page_list;
+	struct mthca_mr        mr;
+};
+
+struct mthca_av;
+
+struct mthca_ah {
+	struct ib_ah     ibah;
+	int              on_hca;
+	u32              key;
+	struct mthca_av *av;
+	dma_addr_t       avdma;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table.  Each
+ * struct mthca_cq/qp also has its own lock.  An individual qp lock
+ * may be taken inside of an individual cq lock.  Both cqs attached to
+ * a qp may be locked, with the send cq locked first.  No other
+ * nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has an atomic_t ref count.  The
+ * pointer from the cq/qp_table to the struct counts as one reference.
+ * This reference also is good for access through the consumer API, so
+ * modifying the CQ/QP etc doesn't need to take another reference.
+ * Access because of a completion being polled does need a reference.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock
+ * - decrement ref count
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibility to make sure that no QP
+ * operations (WQE posting or state modification) are pending when the
+ * QP is destroyed.  Also, the consumer must make sure that calls to
+ * qp_modify are serialized.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ *   indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ *   send queue and one for the receive queue)
+ */
+
+struct mthca_cq {
+	struct ib_cq           ibcq;
+	spinlock_t             lock;
+	atomic_t               refcount;
+	int                    cqn;
+	int                    cons_index;
+	int                    is_direct;
+	union {
+		struct mthca_buf_list direct;
+		struct mthca_buf_list *page_list;
+	}                      queue;
+	struct mthca_mr        mr;
+	wait_queue_head_t      wait;
+};
+
+struct mthca_wq {
+	int   max;
+	int   cur;
+	int   next;
+	int   last_comp;
+	void *last;
+	int   max_gs;
+	int   wqe_shift;
+	enum ib_sig_type policy;
+};
+
+struct mthca_qp {
+	struct ib_qp           ibqp;
+	spinlock_t             lock;
+	atomic_t               refcount;
+	u32                    qpn;
+	int                    transport;
+	enum ib_qp_state       state;
+	int                    is_direct;
+	struct mthca_mr        mr;
+
+	struct mthca_wq        rq;
+	struct mthca_wq        sq;
+	int                    send_wqe_offset;
+
+	u64                   *wrid;
+	union {
+		struct mthca_buf_list direct;
+		struct mthca_buf_list *page_list;
+	}                      queue;
+
+	wait_queue_head_t      wait;
+};
+
+struct mthca_sqp {
+	struct mthca_qp qp;
+	int             port;
+	int             pkey_index;
+	u32             qkey;
+	u32             send_psn;
+	struct ib_ud_header ud_header;
+	int             header_buf_size;
+	void           *header_buf;
+	dma_addr_t      header_dma;
+};
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{
+	return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
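
The locking comment in mthca_provider.h above describes how CQ/QP
structures are kept alive: look the object up under the table lock,
take a reference, drop the table lock, work under the object's own
lock, then drop the reference and wake any waiter; destroy removes the
table pointer, drops that reference and sleeps until the count reaches
zero.  A stand-alone pthreads sketch of the same pattern follows;
obj_get/obj_put/obj_destroy are invented names, and the mutex/condvar
pair stands in for the driver's atomic_t refcount and wait queue.

/*
 * Stand-alone model of the reference-count teardown scheme described
 * in the mthca_provider.h locking comment.
 * Build with: cc -pthread refcount_demo.c
 */
#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;		/* protects refcount */
	pthread_cond_t  zero;		/* signalled when refcount hits 0 */
	int             refcount;
};

static void obj_init(struct obj *o)
{
	pthread_mutex_init(&o->lock, NULL);
	pthread_cond_init(&o->zero, NULL);
	o->refcount = 1;		/* the table's pointer counts as one ref */
}

static void obj_get(struct obj *o)	/* e.g. a completion event handler */
{
	pthread_mutex_lock(&o->lock);
	++o->refcount;
	pthread_mutex_unlock(&o->lock);
}

static void obj_put(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	if (--o->refcount == 0)
		pthread_cond_broadcast(&o->zero);	/* wake up destroy */
	pthread_mutex_unlock(&o->lock);
}

static void obj_destroy(struct obj *o)
{
	/* The driver removes the cq/qp_table pointer here, then waits. */
	pthread_mutex_lock(&o->lock);
	--o->refcount;			/* drop the table's reference */
	while (o->refcount)
		pthread_cond_wait(&o->zero, &o->lock);
	pthread_mutex_unlock(&o->lock);
	pthread_cond_destroy(&o->zero);
	pthread_mutex_destroy(&o->lock);
}

int main(void)
{
	struct obj o;

	obj_init(&o);
	obj_get(&o);		/* a completion is being processed */
	obj_put(&o);		/* ... processing done */
	obj_destroy(&o);	/* would sleep if references were still held */
	printf("object torn down\n");
	return 0;
}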



* [PATCH][v3][9/21] Add Mellanox HCA low-level driver (FW commands)
  2004-12-13 18:09     ` [PATCH][v3][8/21] Add Mellanox HCA low-level driver (midlayer interface) Roland Dreier
@ 2004-12-13 18:09       ` Roland Dreier
  2004-12-13 18:09         ` [PATCH][v3][10/21] Add Mellanox HCA low-level driver (EQ) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add firmware command processing code for Mellanox HCA driver.

Signed-off-by: Roland Dreier <roland@topspin.com>
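
The command interface added below drives the HCA through a single
command register (HCR): mthca_cmd_post() waits for the GO bit to be
clear, writes the parameters, then writes the opcode with the GO bit
set; mthca_cmd_poll() polls until the firmware clears GO again and
pulls the status byte out of the same word.  A stand-alone sketch of
that handshake against a fake in-memory register follows; hcr_post(),
hcr_poll() and the fake_firmware() step are invented for the
illustration, and the real HCR is a larger, big-endian MMIO block.

/*
 * Stand-alone model of the GO-bit mailbox handshake used by
 * mthca_cmd_post()/mthca_cmd_poll(): GO is bit 23 of the status word
 * and the completion status lives in its top byte.
 */
#include <stdint.h>
#include <stdio.h>

#define GO_BIT		(1u << 23)	/* 1: command owned by the firmware */
#define STATUS_SHIFT	24

static uint32_t fake_hcr;		/* stands in for the HCR status word */

static int hcr_post(uint16_t op)
{
	if (fake_hcr & GO_BIT)		/* previous command still in flight */
		return -1;
	/* The real code writes the parameter words first; the opcode and
	 * GO bit are written last so the firmware sees a complete command. */
	fake_hcr = GO_BIT | op;
	return 0;
}

static void fake_firmware(uint8_t status)
{
	/* Firmware's half of the handshake: clear GO, report a status. */
	fake_hcr = (fake_hcr & 0x007fffff) | ((uint32_t) status << STATUS_SHIFT);
}

static int hcr_poll(uint8_t *status)
{
	if (fake_hcr & GO_BIT)		/* still busy: the driver would retry */
		return -1;
	*status = fake_hcr >> STATUS_SHIFT;
	return 0;
}

int main(void)
{
	uint8_t status;

	if (hcr_post(0x0004 /* e.g. CMD_QUERY_FW */))
		return 1;
	fake_firmware(0);		/* 0 means success */
	if (hcr_poll(&status))
		return 1;
	printf("command completed, status %#x\n", status);
	return 0;
}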


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_cmd.c	2004-12-13 09:44:45.011094434 -0800
@@ -0,0 +1,1562 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_cmd.c 1321 2004-12-10 19:38:54Z roland $
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+	HCR_IN_PARAM_OFFSET    = 0x00,
+	HCR_IN_MODIFIER_OFFSET = 0x08,
+	HCR_OUT_PARAM_OFFSET   = 0x0c,
+	HCR_TOKEN_OFFSET       = 0x14,
+	HCR_STATUS_OFFSET      = 0x18,
+
+	HCR_OPMOD_SHIFT        = 12,
+	HCA_E_BIT              = 22,
+	HCR_GO_BIT             = 23
+};
+
+enum {
+	/* initialization and general commands */
+	CMD_SYS_EN          = 0x1,
+	CMD_SYS_DIS         = 0x2,
+	CMD_MAP_FA          = 0xfff,
+	CMD_UNMAP_FA        = 0xffe,
+	CMD_RUN_FW          = 0xff6,
+	CMD_MOD_STAT_CFG    = 0x34,
+	CMD_QUERY_DEV_LIM   = 0x3,
+	CMD_QUERY_FW        = 0x4,
+	CMD_ENABLE_LAM      = 0xff8,
+	CMD_DISABLE_LAM     = 0xff7,
+	CMD_QUERY_DDR       = 0x5,
+	CMD_QUERY_ADAPTER   = 0x6,
+	CMD_INIT_HCA        = 0x7,
+	CMD_CLOSE_HCA       = 0x8,
+	CMD_INIT_IB         = 0x9,
+	CMD_CLOSE_IB        = 0xa,
+	CMD_QUERY_HCA       = 0xb,
+	CMD_SET_IB          = 0xc,
+	CMD_ACCESS_DDR      = 0x2e,
+	CMD_MAP_ICM         = 0xffa,
+	CMD_UNMAP_ICM       = 0xff9,
+	CMD_MAP_ICM_AUX     = 0xffc,
+	CMD_UNMAP_ICM_AUX   = 0xffb,
+	CMD_SET_ICM_SIZE    = 0xffd,
+
+	/* TPT commands */
+	CMD_SW2HW_MPT 	    = 0xd,
+	CMD_QUERY_MPT 	    = 0xe,
+	CMD_HW2SW_MPT 	    = 0xf,
+	CMD_READ_MTT        = 0x10,
+	CMD_WRITE_MTT       = 0x11,
+	CMD_SYNC_TPT        = 0x2f,
+
+	/* EQ commands */
+	CMD_MAP_EQ          = 0x12,
+	CMD_SW2HW_EQ 	    = 0x13,
+	CMD_HW2SW_EQ 	    = 0x14,
+	CMD_QUERY_EQ        = 0x15,
+
+	/* CQ commands */
+	CMD_SW2HW_CQ 	    = 0x16,
+	CMD_HW2SW_CQ 	    = 0x17,
+	CMD_QUERY_CQ 	    = 0x18,
+	CMD_RESIZE_CQ       = 0x2c,
+
+	/* SRQ commands */
+	CMD_SW2HW_SRQ 	    = 0x35,
+	CMD_HW2SW_SRQ 	    = 0x36,
+	CMD_QUERY_SRQ       = 0x37,
+
+	/* QP/EE commands */
+	CMD_RST2INIT_QPEE   = 0x19,
+	CMD_INIT2RTR_QPEE   = 0x1a,
+	CMD_RTR2RTS_QPEE    = 0x1b,
+	CMD_RTS2RTS_QPEE    = 0x1c,
+	CMD_SQERR2RTS_QPEE  = 0x1d,
+	CMD_2ERR_QPEE       = 0x1e,
+	CMD_RTS2SQD_QPEE    = 0x1f,
+	CMD_SQD2SQD_QPEE    = 0x38,
+	CMD_SQD2RTS_QPEE    = 0x20,
+	CMD_ERR2RST_QPEE    = 0x21,
+	CMD_QUERY_QPEE      = 0x22,
+	CMD_INIT2INIT_QPEE  = 0x2d,
+	CMD_SUSPEND_QPEE    = 0x32,
+	CMD_UNSUSPEND_QPEE  = 0x33,
+	/* special QPs and management commands */
+	CMD_CONF_SPECIAL_QP = 0x23,
+	CMD_MAD_IFC         = 0x24,
+
+	/* multicast commands */
+	CMD_READ_MGM        = 0x25,
+	CMD_WRITE_MGM       = 0x26,
+	CMD_MGID_HASH       = 0x27,
+
+	/* miscellaneous commands */
+	CMD_DIAG_RPRT       = 0x30,
+	CMD_NOP             = 0x31,
+
+	/* debug commands */
+	CMD_QUERY_DEBUG_MSG = 0x2a,
+	CMD_SET_DEBUG_MSG   = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands.  So we can't use the strict timeouts described in the
+ * PRM -- we just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+	CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+	CMD_TIME_CLASS_B = (HZ +  99) /  100 + 1,
+	CMD_TIME_CLASS_C = (HZ +   9) /   10 + 1
+};
+#else
+enum {
+	CMD_TIME_CLASS_A = 60 * HZ,
+	CMD_TIME_CLASS_B = 60 * HZ,
+	CMD_TIME_CLASS_C = 60 * HZ
+};
+#endif
+
+enum {
+	GO_BIT_TIMEOUT = HZ * 10
+};
+
+struct mthca_cmd_context {
+	struct completion done;
+	struct timer_list timer;
+	int               result;
+	int               next;
+	u64               out_param;
+	u16               token;
+	u8                status;
+};
+
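+/*
+ * The HCR is big-endian; rather than byte-swapping every value read
+ * with readl(), swap the constant mask once so the GO bit can be
+ * tested in place.
+ */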
+static inline int go_bit(struct mthca_dev *dev)
+{
+	return readl(dev->hcr + HCR_STATUS_OFFSET) &
+		swab32(1 << HCR_GO_BIT);
+}
+
+static int mthca_cmd_post(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 out_param,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  u16 token,
+			  int event)
+{
+	int err = 0;
+	
+	if (down_interruptible(&dev->cmd.hcr_sem))
+		return -EINTR;
+
+	if (event) {
+		unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+		while (go_bit(dev) && time_before(jiffies, end)) {
+			set_current_state(TASK_RUNNING);
+			schedule();
+		}
+	}
+
+	if (go_bit(dev)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel(cpu_to_be32(in_param >> 32),           dev->hcr + 0 * 4);
+	__raw_writel(cpu_to_be32(in_param & 0xfffffffful),  dev->hcr + 1 * 4);
+	__raw_writel(cpu_to_be32(in_modifier),              dev->hcr + 2 * 4);
+	__raw_writel(cpu_to_be32(out_param >> 32),          dev->hcr + 3 * 4);
+	__raw_writel(cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+	__raw_writel(cpu_to_be32(token << 16),              dev->hcr + 5 * 4);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel(cpu_to_be32((1 << HCR_GO_BIT)                |
+				 (event ? (1 << HCA_E_BIT) : 0)   |
+				 (op_modifier << HCR_OPMOD_SHIFT) |
+				 op),                       dev->hcr + 6 * 4);
+
+out:
+	up(&dev->cmd.hcr_sem);
+	return err;
+}
+
+static int mthca_cmd_poll(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout,
+			  u8 *status)
+{
+	int err = 0;
+	unsigned long end;
+
+	if (down_interruptible(&dev->cmd.poll_sem))
+		return -EINTR;
+
+	err = mthca_cmd_post(dev, in_param,
+			     out_param ? *out_param : 0,
+			     in_modifier, op_modifier,
+			     op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out;
+
+	end = timeout + jiffies;
+	while (go_bit(dev) && time_before(jiffies, end)) {
+		set_current_state(TASK_RUNNING);
+		schedule();
+	}
+
+	if (go_bit(dev)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (out_is_imm) {
+		memcpy_fromio(out_param, dev->hcr + HCR_OUT_PARAM_OFFSET, sizeof (u64));
+		be64_to_cpus(out_param);
+	}
+
+	*status = be32_to_cpu(__raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+
+out:
+	up(&dev->cmd.poll_sem);
+	return err;
+}
+
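+/*
+ * Completion handler for event-driven commands: the low bits of the
+ * token index the context array, and the full token must also match
+ * the value posted with the command, so a stale completion (e.g. for
+ * a command that already timed out) is silently dropped.
+ */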
+void mthca_cmd_event(struct mthca_dev *dev,
+		     u16 token,
+		     u8  status,
+		     u64 out_param)
+{
+	struct mthca_cmd_context *context =
+		&dev->cmd.context[token & dev->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->result    = 0;
+	context->status    = status;
+	context->out_param = out_param;
+
+	context->token += dev->cmd.token_mask + 1;
+
+	complete(&context->done);
+}
+
+static void event_timeout(unsigned long context_ptr)
+{
+	struct mthca_cmd_context *context =
+		(struct mthca_cmd_context *) context_ptr;
+
+	context->result = -EBUSY;
+	complete(&context->done);
+}
+
+static int mthca_cmd_wait(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout,
+			  u8 *status)
+{
+	int err = 0;
+	struct mthca_cmd_context *context;
+
+	if (down_interruptible(&dev->cmd.event_sem))
+		return -EINTR;
+
+	spin_lock(&dev->cmd.context_lock);
+	BUG_ON(dev->cmd.free_head < 0);
+	context = &dev->cmd.context[dev->cmd.free_head];
+	dev->cmd.free_head = context->next;
+	spin_unlock(&dev->cmd.context_lock);
+
+	init_completion(&context->done);
+
+	err = mthca_cmd_post(dev, in_param,
+			     out_param ? *out_param : 0,
+			     in_modifier, op_modifier,
+			     op, context->token, 1);
+	if (err)
+		goto out;
+
+	context->timer.expires  = jiffies + timeout;
+	add_timer(&context->timer);
+
+	wait_for_completion(&context->done);
+	del_timer_sync(&context->timer);
+
+	err = context->result;
+	if (err)
+		goto out;
+
+	*status = context->status;
+	if (*status)
+		mthca_dbg(dev, "Command %02x completed with status %02x\n",
+			  op, *status);
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out:
+	spin_lock(&dev->cmd.context_lock);
+	context->next = dev->cmd.free_head;
+	dev->cmd.free_head = context - dev->cmd.context;
+	spin_unlock(&dev->cmd.context_lock);
+
+	up(&dev->cmd.event_sem);
+	return err;
+}
+
+/* Invoke a command with an output mailbox */
+static int mthca_cmd_box(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout,
+			 u8 *status)
+{
+	if (dev->cmd.use_events)
+		return mthca_cmd_wait(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+	else
+		return mthca_cmd_poll(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+}
+
+/* Invoke a command with no output parameter */
+static int mthca_cmd(struct mthca_dev *dev,
+		     u64 in_param,
+		     u32 in_modifier,
+		     u8 op_modifier,
+		     u16 op,
+		     unsigned long timeout,
+		     u8 *status)
+{
+	return mthca_cmd_box(dev, in_param, 0, in_modifier,
+			     op_modifier, op, timeout, status);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 *out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout,
+			 u8 *status)
+{
+	if (dev->cmd.use_events)
+		return mthca_cmd_wait(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+	else
+		return mthca_cmd_poll(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+}
+
+/*
+ * Switch to using events to issue FW commands (should be called after
+ * the event queue for command events has been initialized).
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+	int i;
+
+	dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+				   sizeof (struct mthca_cmd_context),
+				   GFP_KERNEL);
+	if (!dev->cmd.context)
+		return -ENOMEM;
+
+	for (i = 0; i < dev->cmd.max_cmds; ++i) {
+		dev->cmd.context[i].token = i;
+		dev->cmd.context[i].next = i + 1;
+		init_timer(&dev->cmd.context[i].timer);
+		dev->cmd.context[i].timer.data     =
+			(unsigned long) &dev->cmd.context[i];
+		dev->cmd.context[i].timer.function = event_timeout;
+	}
+
+	dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+	dev->cmd.free_head = 0;
+
+	sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+	spin_lock_init(&dev->cmd.context_lock);
+
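+	/*
+	 * Round max_cmds up to the next power of two and use one less
+	 * than that as the mask for picking a context slot out of a
+	 * completion token.
+	 */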
+	for (dev->cmd.token_mask = 1;
+	     dev->cmd.token_mask < dev->cmd.max_cmds;
+	     dev->cmd.token_mask <<= 1)
+		; /* nothing */
+	--dev->cmd.token_mask;
+
+	dev->cmd.use_events = 1;
+	down(&dev->cmd.poll_sem);
+
+	return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+	int i;
+
+	dev->cmd.use_events = 0;
+
+	for (i = 0; i < dev->cmd.max_cmds; ++i)
+		down(&dev->cmd.event_sem);
+
+	kfree(dev->cmd.context);
+
+	up(&dev->cmd.poll_sem);
+}
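+
+/*
+ * A sketch of the expected call order (not taken from this patch):
+ * the driver polls for command completions at first, switches to
+ * event-driven commands once the command event queue is running, and
+ * switches back before that EQ is torn down:
+ *
+ *	mthca_cmd_use_events(dev);
+ *	... issue FW commands ...
+ *	mthca_cmd_use_polling(dev);
+ */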
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status)
+{
+	u64 out;
+	int ret;
+
+	ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, HZ, status);
+
+	if (*status == MTHCA_CMD_STAT_DDR_MEM_ERR)
+		mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+			   "sladdr=%d, SPD source=%s\n",
+			   (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+			   (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+	return ret;
+}
+
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, HZ, status);
+}
+
+int mthca_MAP_FA(struct mthca_dev *dev, int count,
+		 struct scatterlist *sglist, u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	int lg;
+	int nent = 0;
+	int i, j;
+	int err = 0;
+	int ts = 0;
+
+	inbox = pci_alloc_consistent(dev->pdev, PAGE_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, PAGE_SIZE);
+
+	for (i = 0; i < count; ++i) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(sg_dma_address(sglist + i) | sg_dma_len(sglist + i)) - 1;
+		if (lg < 12) {
+			mthca_warn(dev, "Got FW area not aligned to 4K (%llx/%x).\n",
+				   (unsigned long long) sg_dma_address(sglist + i),
+				   sg_dma_len(sglist + i));
+			err = -EINVAL;
+			goto out;
+		}
+		for (j = 0; j < sg_dma_len(sglist + i) / (1 << lg); ++j) {
+			*((__be64 *) (inbox + nent * 4 + 2)) =
+				cpu_to_be64((sg_dma_address(sglist + i) +
+					     (j << lg)) |
+					    (lg - 12));
+			ts += 1 << (lg - 10);
+			if (++nent == PAGE_SIZE / 16) {
+				err = mthca_cmd(dev, indma, nent, 0, CMD_MAP_FA,
+						CMD_TIME_CLASS_B, status);
+				if (err || *status)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent) {
+		err = mthca_cmd(dev, indma, nent, 0, CMD_MAP_FA,
+				CMD_TIME_CLASS_B, status);
+	}
+
+	mthca_dbg(dev, "Mapped %d KB of host memory for FW.\n", ts);
+
+out:
+	pci_free_consistent(dev->pdev, PAGE_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)
+{
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err = 0;
+	u8 lg;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+
+#define QUERY_FW_START_OFFSET          0x20
+#define QUERY_FW_END_OFFSET            0x28
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET    0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_FW_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_FW,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->fw_ver,   outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * The FW subminor version is stored in more significant bits
+	 * than the minor version, so swap the two here.
+	 */
+	dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+		((dev->fw_ver & 0xffff0000ull) >> 16) |
+		((dev->fw_ver & 0x0000ffffull) << 16);
+
+	MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	dev->cmd.max_cmds = 1 << lg;
+
+	mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+		  (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		MTHCA_GET(dev->fw.arbel.fw_pages,       outbox, QUERY_FW_SIZE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.clr_int_base,   outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_arm_base,    outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+		mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+		/*
+		 * Arbel page size is always 4 KB; round up number of
+		 * system pages needed.
+		 */
+		dev->fw.arbel.fw_pages =
+			(dev->fw.arbel.fw_pages + (1 << (PAGE_SHIFT - 12)) - 1) >>
+			(PAGE_SHIFT - 12);
+
+		mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+			  (unsigned long long) dev->fw.arbel.clr_int_base,
+			  (unsigned long long) dev->fw.arbel.eq_arm_base,
+			  (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+	} else {
+		MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+		MTHCA_GET(dev->fw.tavor.fw_end,   outbox, QUERY_FW_END_OFFSET);
+
+		mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+			  (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+			  (unsigned long long) dev->fw.tavor.fw_start,
+			  (unsigned long long) dev->fw.tavor.fw_end);
+	}
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_FW_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+	u8 info;
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err = 0;
+
+#define ENABLE_LAM_OUT_SIZE         0x100
+#define ENABLE_LAM_START_OFFSET     0x00
+#define ENABLE_LAM_END_OFFSET       0x08
+#define ENABLE_LAM_INFO_OFFSET      0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK    0x3
+
+	outbox = pci_alloc_consistent(dev->pdev, ENABLE_LAM_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_ENABLE_LAM,
+			    CMD_TIME_CLASS_C, status);
+
+	if (err)
+		goto out;
+
+	if (*status == MTHCA_CMD_STAT_LAM_NOT_PRE)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, ENABLE_LAM_END_OFFSET);
+	MTHCA_GET(info,           outbox, ENABLE_LAM_INFO_OFFSET);
+
+	if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) !=
+	    !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ?
+			   "" : "not");
+	}
+	if (info & ENABLE_LAM_INFO_HIDDEN_FLAG)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", 
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	pci_free_consistent(dev->pdev, ENABLE_LAM_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_DISABLE_LAM, CMD_TIME_CLASS_C, status);
+}
+
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status)
+{
+	u8 info;
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err = 0;
+
+#define QUERY_DDR_OUT_SIZE         0x100
+#define QUERY_DDR_START_OFFSET     0x00
+#define QUERY_DDR_END_OFFSET       0x08
+#define QUERY_DDR_INFO_OFFSET      0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK    0x3
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_DDR_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_DDR,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, QUERY_DDR_END_OFFSET);
+	MTHCA_GET(info,           outbox, QUERY_DDR_INFO_OFFSET);
+
+	if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) !=
+	    !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   (info & QUERY_DDR_INFO_HIDDEN_FLAG) ?
+			   "" : "not");
+	}
+	if (info & QUERY_DDR_INFO_HIDDEN_FLAG)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n", 
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_DDR_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim, u8 *status)
+{
+	u32 *outbox;
+	dma_addr_t outdma;
+	u8 field;
+	u16 size;
+	int err;
+
+#define QUERY_DEV_LIM_OUT_SIZE             0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET     0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET      0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET        0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET         0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET       0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET        0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET       0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET        0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET      0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET        0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET         0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET        0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET        0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET         0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET       0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET     0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET       0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET    0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET         0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET     0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET     0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET       0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET        0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET      0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET      0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET        0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET        0x3b
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET       0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET          0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET       0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET         0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET        0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET         0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET    0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET      0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET     0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET       0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET        0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET        0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET         0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET       0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET        0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET   0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET   0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET  0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET  0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET   0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET   0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET   0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET   0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET   0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET   0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET         0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET     0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET      0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET           0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET     0xa0
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_DEV_LIM,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+	dev_lim->max_srq_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+	dev_lim->max_qp_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+	dev_lim->reserved_qps = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+	dev_lim->max_qps = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+	dev_lim->reserved_srqs = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+	dev_lim->max_srqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+	dev_lim->reserved_eecs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+	dev_lim->max_eecs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+	dev_lim->max_cq_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+	dev_lim->reserved_cqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+	dev_lim->max_cqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+	dev_lim->max_mpts = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+	dev_lim->reserved_eqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+	dev_lim->max_eqs = 1 << (field & 0x7);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+	dev_lim->reserved_mtts = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+	dev_lim->max_mrw_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+	dev_lim->reserved_mrws = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+	dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+	dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+	dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+	dev_lim->max_rdma_global = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+	dev_lim->local_ca_ack_delay = field & 0x1f;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+	dev_lim->max_mtu        = field >> 4;
+	dev_lim->max_port_width = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+	dev_lim->max_vl    = field >> 4;
+	dev_lim->num_ports = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+	dev_lim->max_gids = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+	dev_lim->max_pkeys = 1 << (field & 0xf);
+	MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+	dev_lim->reserved_uars = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+	dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+	dev_lim->min_page_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+	dev_lim->max_sg = field;
+	
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+	dev_lim->max_desc_sz = size;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+	dev_lim->max_qp_per_mcg = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+	dev_lim->reserved_mgms = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+	dev_lim->max_mcgs = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+	dev_lim->reserved_pds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+	dev_lim->max_pds = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+	dev_lim->reserved_rdds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+	dev_lim->max_rdds = 1 << (field & 0x3f);
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+	dev_lim->eec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+	dev_lim->qpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+	dev_lim->eeec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+	dev_lim->eqpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+	dev_lim->eqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+	dev_lim->cqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+	dev_lim->srq_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+	dev_lim->uar_scratch_entry_sz = size;
+
+	mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		  dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+	mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		  dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+	mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		  dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+	mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		  dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+	mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		  dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+	mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		  dev_lim->max_qp_per_mcg, dev_lim->reserved_mgms);
+
+	mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+	if (dev->hca_type == ARBEL_NATIVE) {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+		dev_lim->hca.arbel.resize_srq = field & 1;
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET);
+		dev_lim->hca.arbel.mtt_entry_sz = size;
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+		dev_lim->hca.arbel.mpt_entry_sz = size;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+		dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+		MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+			  QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+		MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+			  QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+		dev_lim->hca.arbel.lam_required = field & 1;
+		MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+			  QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+		if (dev_lim->hca.arbel.bmme_flags & 1)
+			mthca_dbg(dev, "Base MM extensions: yes "
+				  "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+				  dev_lim->hca.arbel.bmme_flags,
+				  dev_lim->hca.arbel.max_pbl_sz,
+				  dev_lim->hca.arbel.reserved_lkey);
+		else
+			mthca_dbg(dev, "Base MM extensions: no\n");
+
+		mthca_dbg(dev, "Max ICM size %lld MB\n",
+			  (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+	} else {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+		dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+	}
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter, u8 *status)
+{
+	u32 *outbox;
+	dma_addr_t outdma;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+
+	outbox = pci_alloc_consistent(dev->pdev, QUERY_ADAPTER_OUT_SIZE, &outdma);
+	if (!outbox)
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_ADAPTER,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(adapter->vendor_id, outbox, QUERY_ADAPTER_VENDOR_ID_OFFSET);
+	MTHCA_GET(adapter->device_id, outbox, QUERY_ADAPTER_DEVICE_ID_OFFSET);
+	MTHCA_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET);
+	MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+out:
+	pci_free_consistent(dev->pdev, QUERY_ADAPTER_OUT_SIZE, outbox, outdma);
+	return err;
+}
+
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param,
+		   u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	int err;
+
+#define INIT_HCA_IN_SIZE             	 0x200
+#define INIT_HCA_FLAGS_OFFSET        	 0x014
+#define INIT_HCA_QPC_OFFSET          	 0x020
+#define  INIT_HCA_QPC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define  INIT_HCA_LOG_QP_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x17)
+#define  INIT_HCA_EEC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x20)
+#define  INIT_HCA_LOG_EEC_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x27)
+#define  INIT_HCA_SRQC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define  INIT_HCA_LOG_SRQ_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define  INIT_HCA_CQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define  INIT_HCA_LOG_CQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x37)
+#define  INIT_HCA_EQPC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define  INIT_HCA_EEEC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define  INIT_HCA_EQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define  INIT_HCA_LOG_EQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x67)
+#define  INIT_HCA_RDB_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET         	 0x0b0
+#define  INIT_HCA_UDAV_LKEY_OFFSET   	 (INIT_HCA_UDAV_OFFSET + 0x0)
+#define  INIT_HCA_UDAV_PD_OFFSET     	 (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET        	 0x0c0
+#define  INIT_HCA_MC_BASE_OFFSET         (INIT_HCA_MCAST_OFFSET + 0x00)
+#define  INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define  INIT_HCA_MC_HASH_SZ_OFFSET      (INIT_HCA_MCAST_OFFSET + 0x16)
+#define  INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET              0x0f0
+#define  INIT_HCA_MPT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_MTT_SEG_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x09)
+#define  INIT_HCA_LOG_MPT_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x0b)
+#define  INIT_HCA_MTT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET              0x120
+#define  INIT_HCA_UAR_BASE_OFFSET        (INIT_HCA_UAR_OFFSET + 0x00)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+#define  INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+
+	inbox = pci_alloc_consistent(dev->pdev, INIT_HCA_IN_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, INIT_HCA_IN_SIZE);
+
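+	/*
+	 * Bit 1 of the flags word reflects host byte order: cleared on
+	 * little-endian hosts, set on big-endian ones.
+	 */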
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+	/* QPC/EEC/CQC/EQC/RDB attributes */
+
+	MTHCA_PUT(inbox, param->qpc_base,     INIT_HCA_QPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_qps,  INIT_HCA_LOG_QP_OFFSET);
+	MTHCA_PUT(inbox, param->eec_base,     INIT_HCA_EEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+	MTHCA_PUT(inbox, param->srqc_base,    INIT_HCA_SRQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+	MTHCA_PUT(inbox, param->cqc_base,     INIT_HCA_CQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_cqs,  INIT_HCA_LOG_CQ_OFFSET);
+	MTHCA_PUT(inbox, param->eqpc_base,    INIT_HCA_EQPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eeec_base,    INIT_HCA_EEEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eqc_base,     INIT_HCA_EQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eqs,  INIT_HCA_LOG_EQ_OFFSET);
+	MTHCA_PUT(inbox, param->rdb_base,     INIT_HCA_RDB_BASE_OFFSET);
+
+	/* UD AV attributes */
+
+	/* multicast attributes */
+
+	MTHCA_PUT(inbox, param->mc_base,         INIT_HCA_MC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mc_hash_sz,      INIT_HCA_MC_HASH_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+	/* TPT attributes */
+
+	MTHCA_PUT(inbox, param->mpt_base,   INIT_HCA_MPT_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+
+	/* UAR attributes */
+	{
+		u8 uar_page_sz = PAGE_SHIFT - 12;
+		MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+	}
+
+	err = mthca_cmd(dev, indma, 0, 0, CMD_INIT_HCA,
+			HZ, status);
+
+	pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port, u8 *status)
+{
+	u32 *inbox;
+	dma_addr_t indma;
+	int err;
+	u32 flags;
+
+#define INIT_IB_IN_SIZE          56
+#define INIT_IB_FLAGS_OFFSET     0x00
+#define INIT_IB_FLAG_SIG         (1 << 18)
+#define INIT_IB_FLAG_NG          (1 << 17)
+#define INIT_IB_FLAG_G0          (1 << 16)
+#define INIT_IB_FLAG_1X          (1 << 8)
+#define INIT_IB_FLAG_4X          (1 << 9)
+#define INIT_IB_FLAG_12X         (1 << 11)
+#define INIT_IB_VL_SHIFT         4
+#define INIT_IB_MTU_SHIFT        12
+#define INIT_IB_MAX_GID_OFFSET   0x06
+#define INIT_IB_MAX_PKEY_OFFSET  0x0a
+#define INIT_IB_GUID0_OFFSET     0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET   0x20
+
+	inbox = pci_alloc_consistent(dev->pdev, INIT_IB_IN_SIZE, &indma);
+	if (!inbox)
+		return -ENOMEM;
+
+	memset(inbox, 0, INIT_IB_IN_SIZE);
+
+	flags = 0;
+	flags |= param->enable_1x     ? INIT_IB_FLAG_1X  : 0;
+	flags |= param->enable_4x     ? INIT_IB_FLAG_4X  : 0;
+	flags |= param->set_guid0     ? INIT_IB_FLAG_G0  : 0;
+	flags |= param->set_node_guid ? INIT_IB_FLAG_NG  : 0;
+	flags |= param->set_si_guid   ? INIT_IB_FLAG_SIG : 0;
+	flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+	flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+	MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET);
+
+	MTHCA_PUT(inbox, param->gid_cap,   INIT_IB_MAX_GID_OFFSET);
+	MTHCA_PUT(inbox, param->pkey_cap,  INIT_IB_MAX_PKEY_OFFSET);
+	MTHCA_PUT(inbox, param->guid0,     INIT_IB_GUID0_OFFSET);
+	MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,   INIT_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, indma, port, 0, CMD_INIT_IB,
+			CMD_TIME_CLASS_A, status);
+
+	pci_free_consistent(dev->pdev, INIT_IB_IN_SIZE, inbox, indma);
+	return err;
+}
+
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status)
+{
+	return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, HZ, status);
+}
+
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, HZ, status);
+}
+
+int mthca_SW2HW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, mpt_entry,
+			       MTHCA_MPT_ENTRY_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, mpt_index, 0, CMD_SW2HW_MPT,
+			CMD_TIME_CLASS_B, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_MPT_ENTRY_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_HW2SW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	if (mpt_entry) {
+		outdma = pci_map_single(dev->pdev, mpt_entry,
+					MTHCA_MPT_ENTRY_SIZE,
+					PCI_DMA_FROMDEVICE);
+		if (pci_dma_mapping_error(outdma))
+			return -ENOMEM;
+	}
+
+	err = mthca_cmd_box(dev, 0, outdma, mpt_index, !mpt_entry,
+			    CMD_HW2SW_MPT,
+			    CMD_TIME_CLASS_B, status);
+
+	if (mpt_entry)
+		pci_unmap_single(dev->pdev, outdma,
+				 MTHCA_MPT_ENTRY_SIZE,
+				 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
+		    int num_mtt, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, mtt_entry,
+			       (num_mtt + 2) * 8,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, num_mtt, 0, CMD_WRITE_MTT,
+			CMD_TIME_CLASS_B, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 (num_mtt + 2) * 8, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num, u8 *status)
+{
+	mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+		  unmap ? "Clearing" : "Setting",
+		  (unsigned long long) event_mask, eq_num);
+	return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			 0, CMD_MAP_EQ, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, eq_context,
+			       MTHCA_EQ_CONTEXT_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, eq_num, 0, CMD_SW2HW_EQ,
+			CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_EQ_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_HW2SW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, eq_context,
+				MTHCA_EQ_CONTEXT_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, eq_num, 0,
+			    CMD_HW2SW_EQ,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_EQ_CONTEXT_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_SW2HW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, cq_context,
+			       MTHCA_CQ_CONTEXT_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, cq_num, 0, CMD_SW2HW_CQ,
+			CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_CQ_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_HW2SW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, cq_context,
+				MTHCA_CQ_CONTEXT_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, cq_num, 0,
+			    CMD_HW2SW_CQ,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_CQ_CONTEXT_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_MODIFY_QP(struct mthca_dev *dev, int trans, u32 num,
+		    int is_ee, void *qp_context, u32 optmask,
+		    u8 *status)
+{
+	static const u16 op[] = {
+		[MTHCA_TRANS_RST2INIT]  = CMD_RST2INIT_QPEE,
+		[MTHCA_TRANS_INIT2INIT] = CMD_INIT2INIT_QPEE,
+		[MTHCA_TRANS_INIT2RTR]  = CMD_INIT2RTR_QPEE,
+		[MTHCA_TRANS_RTR2RTS]   = CMD_RTR2RTS_QPEE,
+		[MTHCA_TRANS_RTS2RTS]   = CMD_RTS2RTS_QPEE,
+		[MTHCA_TRANS_SQERR2RTS] = CMD_SQERR2RTS_QPEE,
+		[MTHCA_TRANS_ANY2ERR]   = CMD_2ERR_QPEE,
+		[MTHCA_TRANS_RTS2SQD]   = CMD_RTS2SQD_QPEE,
+		[MTHCA_TRANS_SQD2SQD]   = CMD_SQD2SQD_QPEE,
+		[MTHCA_TRANS_SQD2RTS]   = CMD_SQD2RTS_QPEE,
+		[MTHCA_TRANS_ANY2RST]   = CMD_ERR2RST_QPEE
+	};
+	u8 op_mod = 0;
+
+	dma_addr_t indma;
+	int err;
+
+	if (trans < 0 || trans >= ARRAY_SIZE(op))
+		return -EINVAL;
+
+	if (trans == MTHCA_TRANS_ANY2RST) {
+		indma  = 0;
+		op_mod = 3;	/* don't write outbox, any->reset */
+
+		/* For debugging */
+		qp_context = pci_alloc_consistent(dev->pdev, MTHCA_QP_CONTEXT_SIZE,
+						  &indma);
+		op_mod = 2;	/* write outbox, any->reset */
+	} else {
+		indma = pci_map_single(dev->pdev, qp_context,
+				       MTHCA_QP_CONTEXT_SIZE,
+				       PCI_DMA_TODEVICE);
+		if (pci_dma_mapping_error(indma))
+			return -ENOMEM;
+
+		if (0) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk(" %08x\n", be32_to_cpup(qp_context));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("[%02x] ", i * 4);
+				printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+	}
+
+	if (trans == MTHCA_TRANS_ANY2RST) {
+		err = mthca_cmd_box(dev, 0, indma, (!!is_ee << 24) | num,
+				    op_mod, op[trans], CMD_TIME_CLASS_C, status);
+
+		if (0) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk(" %08x\n", be32_to_cpup(qp_context));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("[%02x] ", i * 4);
+				printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+
+	} else
+		err = mthca_cmd(dev, indma, (!!is_ee << 24) | num,
+				op_mod, op[trans], CMD_TIME_CLASS_C, status);
+
+	if (trans != MTHCA_TRANS_ANY2RST)
+		pci_unmap_single(dev->pdev, indma,
+				 MTHCA_QP_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+	else
+		pci_free_consistent(dev->pdev, MTHCA_QP_CONTEXT_SIZE,
+				    qp_context, indma);
+	return err;
+}
+
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   void *qp_context, u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, qp_context,
+				MTHCA_QP_CONTEXT_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, (!!is_ee << 24) | num, 0,
+			    CMD_QUERY_QPEE,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_QP_CONTEXT_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+			  u8 *status)
+{
+	u8 op_mod;
+
+	switch (type) {
+	case IB_QPT_SMI:
+		op_mod = 0;
+		break;
+	case IB_QPT_GSI:
+		op_mod = 1;
+		break;
+	case IB_QPT_RAW_IPV6:
+		op_mod = 2;
+		break;
+	case IB_QPT_RAW_ETY:
+		op_mod = 3;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+			 CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int port,
+		  void *in_mad, void *response_mad, u8 *status)
+{
+	void *box;
+	dma_addr_t dma;
+	int err;
+
+#define MAD_IFC_BOX_SIZE 512
+
+	box = pci_alloc_consistent(dev->pdev, MAD_IFC_BOX_SIZE, &dma);
+	if (!box)
+		return -ENOMEM;
+
+	memcpy(box, in_mad, 256);
+
+	err = mthca_cmd_box(dev, dma, dma + 256, port, !!ignore_mkey,
+			    CMD_MAD_IFC, CMD_TIME_CLASS_C, status);
+
+	if (!err && !*status)
+		memcpy(response_mad, box + 256, 256);
+
+	pci_free_consistent(dev->pdev, MAD_IFC_BOX_SIZE, box, dma);
+	return err;
+}
+
+int mthca_READ_MGM(struct mthca_dev *dev, int index, void *mgm,
+		   u8 *status)
+{
+	dma_addr_t outdma = 0;
+	int err;
+
+	outdma = pci_map_single(dev->pdev, mgm,
+				MTHCA_MGM_ENTRY_SIZE,
+				PCI_DMA_FROMDEVICE);
+	if (pci_dma_mapping_error(outdma))
+		return -ENOMEM;
+
+	err = mthca_cmd_box(dev, 0, outdma, index, 0,
+			    CMD_READ_MGM,
+			    CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, outdma,
+			 MTHCA_MGM_ENTRY_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	return err;
+}
+
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index, void *mgm,
+		    u8 *status)
+{
+	dma_addr_t indma;
+	int err;
+
+	indma = pci_map_single(dev->pdev, mgm,
+			       MTHCA_MGM_ENTRY_SIZE,
+			       PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd(dev, indma, index, 0, CMD_WRITE_MGM,
+			CMD_TIME_CLASS_A, status);
+
+	pci_unmap_single(dev->pdev, indma,
+			 MTHCA_MGM_ENTRY_SIZE, PCI_DMA_TODEVICE);
+	return err;
+}
+
+int mthca_MGID_HASH(struct mthca_dev *dev, void *gid, u16 *hash,
+		    u8 *status)
+{
+	dma_addr_t indma;
+	u64 imm;
+	int err;
+
+	indma = pci_map_single(dev->pdev, gid, 16, PCI_DMA_TODEVICE);
+	if (pci_dma_mapping_error(indma))
+		return -ENOMEM;
+
+	err = mthca_cmd_imm(dev, indma, &imm, 0, 0, CMD_MGID_HASH,
+			    CMD_TIME_CLASS_A, status);
+	*hash = imm;
+
+	pci_unmap_single(dev->pdev, indma, 16, PCI_DMA_TODEVICE);
+	return err;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_cmd.h	2004-12-13 09:44:45.036090752 -0800
@@ -0,0 +1,265 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_cmd.h 1321 2004-12-10 19:38:54Z roland $
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <ib_verbs.h>
+
+#define MTHCA_CMD_MAILBOX_ALIGN 16UL
+#define MTHCA_CMD_MAILBOX_EXTRA (MTHCA_CMD_MAILBOX_ALIGN - 1)
+
+enum {
+	/* command completed successfully: */
+	MTHCA_CMD_STAT_OK 	      = 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	MTHCA_CMD_STAT_INTERNAL_ERR   = 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	MTHCA_CMD_STAT_BAD_OP 	      = 0x02,
+	/* Parameter not supported or parameter out of range: */
+	MTHCA_CMD_STAT_BAD_PARAM      = 0x03,
+	/* System not enabled or bad system state: */
+	MTHCA_CMD_STAT_BAD_SYS_STATE  = 0x04,
+	/* Attempt to access a reserved or unallocated resource: */
+	MTHCA_CMD_STAT_BAD_RESOURCE   = 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	MTHCA_CMD_STAT_RESOURCE_BUSY  = 0x06,
+	/* memory error: */
+	MTHCA_CMD_STAT_DDR_MEM_ERR    = 0x07,
+	/* Required capability exceeds device limits: */
+	MTHCA_CMD_STAT_EXCEED_LIM     = 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	MTHCA_CMD_STAT_BAD_RES_STATE  = 0x09,
+	/* Index out of range: */
+	MTHCA_CMD_STAT_BAD_INDEX      = 0x0a,
+	/* FW image corrupted: */
+	MTHCA_CMD_STAT_BAD_NVMEM      = 0x0b,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	MTHCA_CMD_STAT_BAD_SEG_PARAM  = 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	MTHCA_CMD_STAT_REG_BOUND      = 0x21,
+	/* HCA local attached memory not present: */
+	MTHCA_CMD_STAT_LAM_NOT_PRE    = 0x22,
+        /* Bad management packet (silently discarded): */
+	MTHCA_CMD_STAT_BAD_PKT 	      = 0x30,
+        /* More outstanding CQEs in CQ than new CQ size: */
+	MTHCA_CMD_STAT_BAD_SIZE       = 0x40
+};
+
+enum {
+	MTHCA_TRANS_INVALID = 0,
+	MTHCA_TRANS_RST2INIT,
+	MTHCA_TRANS_INIT2INIT,
+	MTHCA_TRANS_INIT2RTR,
+	MTHCA_TRANS_RTR2RTS,
+	MTHCA_TRANS_RTS2RTS,
+	MTHCA_TRANS_SQERR2RTS,
+	MTHCA_TRANS_ANY2ERR,
+	MTHCA_TRANS_RTS2SQD,
+	MTHCA_TRANS_SQD2SQD,
+	MTHCA_TRANS_SQD2RTS,
+	MTHCA_TRANS_ANY2RST,
+};
+
+enum {
+	DEV_LIM_FLAG_SRQ = 1 << 6
+};
+
+struct mthca_dev_lim {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int reserved_eecs;
+	int max_eecs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int max_mtu;
+	int max_port_width;
+	int max_vl;
+	int num_ports;
+	int max_gids;
+	int max_pkeys;
+	u32 flags;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int max_sg;
+	int max_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_rdds;
+	int max_rdds;
+	int eec_entry_sz;
+	int qpc_entry_sz;
+	int eeec_entry_sz;
+	int eqpc_entry_sz;
+	int eqc_entry_sz;
+	int cqc_entry_sz;
+	int srq_entry_sz;
+	int uar_scratch_entry_sz;
+	union {
+		struct {
+			int max_avs;
+		} tavor;
+		struct {
+			int resize_srq;
+			int mtt_entry_sz;
+			int mpt_entry_sz;
+			int max_pbl_sz;
+			u8  bmme_flags;
+			u32 reserved_lkey;
+			int lam_required;
+			u64 max_icm_sz;
+		} arbel;
+	} hca;
+};
+
+struct mthca_adapter {
+	u32 vendor_id;
+	u32 device_id;
+	u32 revision_id;
+	u8  inta_pin;
+};
+
+struct mthca_init_hca_param {
+	u64 qpc_base;
+	u8  log_num_qps;
+	u64 eec_base;
+	u8  log_num_eecs;
+	u64 srqc_base;
+	u8  log_num_srqs;
+	u64 cqc_base;
+	u8  log_num_cqs;
+	u64 eqpc_base;
+	u64 eeec_base;
+	u64 eqc_base;
+	u8  log_num_eqs;
+	u64 rdb_base;
+	u64 mc_base;
+	u16 log_mc_entry_sz;
+	u16 mc_hash_sz;
+	u8  log_mc_table_sz;
+	u64 mpt_base;
+	u8  mtt_seg_sz;
+	u8  log_mpt_sz;
+	u64 mtt_base;
+	u64 uar_scratch_base;
+};
+
+struct mthca_init_ib_param {
+	int enable_1x;
+	int enable_4x;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+		     u8  status, u64 out_param);
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status);
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_FA(struct mthca_dev *dev, int count,
+		 struct scatterlist *sglist, u8 *status);
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status);
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status);
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim, u8 *status);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter, u8 *status);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param,
+		   u8 *status);
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port, u8 *status);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, void *mpt_entry,
+		    int mpt_index, u8 *status);
+int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
+		    int num_mtt, u8 *status);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num, u8 *status);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, void *eq_context,
+		   int eq_num, u8 *status);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, void *cq_context,
+		   int cq_num, u8 *status);
+int mthca_MODIFY_QP(struct mthca_dev *dev, int trans, u32 num,
+		    int is_ee, void *qp_context, u32 optmask,
+		    u8 *status);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   void *qp_context, u8 *status);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+			  u8 *status);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int port,
+		  void *in_mad, void *response_mad, u8 *status);
+int mthca_READ_MGM(struct mthca_dev *dev, int index, void *mgm,
+		   u8 *status);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index, void *mgm,
+		    u8 *status);
+int mthca_MGID_HASH(struct mthca_dev *dev, void *gid, u16 *hash,
+		    u8 *status);
+
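+/*
+ * Round a buffer pointer up to MTHCA_CMD_MAILBOX_ALIGN; callers are
+ * expected to allocate MTHCA_CMD_MAILBOX_EXTRA bytes of slack so the
+ * aligned pointer still fits inside the buffer.
+ */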
+#define MAILBOX_ALIGN(x) ((void *) ALIGN((unsigned long) (x), MTHCA_CMD_MAILBOX_ALIGN))
+
+#endif /* MTHCA_CMD_H */



* [PATCH][v3][10/21] Add Mellanox HCA low-level driver (EQ)
  2004-12-13 18:09       ` [PATCH][v3][9/21] Add Mellanox HCA low-level driver (FW commands) Roland Dreier
@ 2004-12-13 18:09         ` Roland Dreier
  2004-12-13 18:09           ` [PATCH][v3][11/21] Add Mellanox HCA low-level driver (initialization) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add event queue code for Mellanox HCA driver.

Signed-off-by: Roland Dreier <roland@topspin.com>


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_eq.c	2004-12-13 09:44:45.329047594 -0800
@@ -0,0 +1,663 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_eq.c 1321 2004-12-10 19:38:54Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+	MTHCA_NUM_ASYNC_EQE = 0x80,
+	MTHCA_NUM_CMD_EQE   = 0x80,
+	MTHCA_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+	u32 flags;
+	u64 start;
+	u32 logsize_usrpage;
+	u32 pd;
+	u8  reserved1[3];
+	u8  intr;
+	u32 lost_count;
+	u32 lkey;
+	u32 reserved2[2];
+	u32 consumer_index;
+	u32 producer_index;
+	u32 reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_EQ_OWNER_SW           ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW           ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_EQ_STATE_FIRED        ( 2 <<  8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 <<  8)
+
+enum {
+	MTHCA_EVENT_TYPE_COMP       	    = 0x00,
+	MTHCA_EVENT_TYPE_PATH_MIG   	    = 0x01,
+	MTHCA_EVENT_TYPE_COMM_EST   	    = 0x02,
+	MTHCA_EVENT_TYPE_SQ_DRAINED 	    = 0x03,
+	MTHCA_EVENT_TYPE_SRQ_LAST_WQE       = 0x13,
+	MTHCA_EVENT_TYPE_CQ_ERROR   	    = 0x04,
+	MTHCA_EVENT_TYPE_WQ_CATAS_ERROR     = 0x05,
+	MTHCA_EVENT_TYPE_EEC_CATAS_ERROR    = 0x06,
+	MTHCA_EVENT_TYPE_PATH_MIG_FAILED    = 0x07,
+	MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR    = 0x11,
+	MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR    = 0x12,
+	MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MTHCA_EVENT_TYPE_PORT_CHANGE        = 0x09,
+	MTHCA_EVENT_TYPE_EQ_OVERFLOW        = 0x0f,
+	MTHCA_EVENT_TYPE_ECC_DETECT         = 0x0e,
+	MTHCA_EVENT_TYPE_CMD                = 0x0a
+};
+
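+/*
+ * Event-type masks used when event classes are mapped to EQs (see
+ * mthca_MAP_EQ() in mthca_cmd.c).
+ */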
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG)           | \
+				(1ULL << MTHCA_EVENT_TYPE_COMM_EST)           | \
+				(1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED)         | \
+				(1ULL << MTHCA_EVENT_TYPE_CQ_ERROR)           | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR)     | \
+				(1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED)    | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR)  | \
+				(1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE)        | \
+				(1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK   ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_SRQ_LAST_WQE))
+#define MTHCA_CMD_EVENT_MASK    (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI     (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT    (2 << 24)
+#define MTHCA_EQ_DB_DISARM_CQ  (3 << 24)
+#define MTHCA_EQ_DB_SET_CI     (4 << 24)
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
+
+struct mthca_eqe {
+	u8 reserved1;
+	u8 type;
+	u8 reserved2;
+	u8 subtype;
+	union {
+		u32 raw[6];
+		struct {
+			u32 cqn;
+		} __attribute__((packed)) comp;
+		struct {
+			u16 reserved1;
+			u16 token;
+			u32 reserved2;
+			u8  reserved3[3];
+			u8  status;
+			u64 out_param;
+		} __attribute__((packed)) cmd;
+		struct {
+			u32 qpn;
+		} __attribute__((packed)) qp;
+		struct {
+			u32 reserved1[2];
+			u32 port;
+		} __attribute__((packed)) port_change;
+	} event;
+	u8 reserved3[3];
+	u8 owner;
+} __attribute__((packed));
+
+#define  MTHCA_EQ_ENTRY_OWNER_SW      (0 << 7)
+#define  MTHCA_EQ_ENTRY_OWNER_HW      (1 << 7)
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+	return dev->mthca_flags & MTHCA_FLAG_SRQ ?
+		MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK :
+		MTHCA_ASYNC_EVENT_MASK;
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, int eqn, int ci)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_SET_CI | eqn);
+	doorbell[1] = cpu_to_be32(ci);
+
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void eq_req_not(struct mthca_dev *dev, int eqn)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_REQ_NOT | eqn);
+	doorbell[1] = 0;
+
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn);
+	doorbell[1] = cpu_to_be32(cqn);
+
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, int entry)
+{
+	return eq->page_list[entry * MTHCA_EQ_ENTRY_SIZE / PAGE_SIZE].buf
+		+ (entry * MTHCA_EQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline int next_eqe_sw(struct mthca_eq *eq)
+{
+	return !(MTHCA_EQ_ENTRY_OWNER_HW &
+		 get_eqe(eq, eq->cons_index)->owner);
+}
+
+static inline void set_eqe_hw(struct mthca_eq *eq, int entry)
+{
+	get_eqe(eq, entry)->owner =  MTHCA_EQ_ENTRY_OWNER_HW;
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+	struct ib_event record;
+
+	mthca_dbg(dev, "Port change to %s for port %d\n",
+		  active ? "active" : "down", port);
+
+	record.device = &dev->ib_dev;
+	record.event  = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+	record.element.port_num = port;
+
+	ib_dispatch_event(&record);
+}
+
+static void mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+	struct mthca_eqe *eqe;
+	int disarm_cqn;
+
+	while (next_eqe_sw(eq)) {
+		int set_ci = 0;
+		eqe = get_eqe(eq, eq->cons_index);
+
+		switch (eqe->type) {
+		case MTHCA_EVENT_TYPE_COMP:
+			disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			disarm_cq(dev, eq->eqn, disarm_cqn);
+			mthca_cq_event(dev, disarm_cqn);
+			break;
+			
+		case MTHCA_EVENT_TYPE_PATH_MIG:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG);
+			break;
+
+		case MTHCA_EVENT_TYPE_COMM_EST:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_COMM_EST);
+			break;
+
+		case MTHCA_EVENT_TYPE_SQ_DRAINED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_SQ_DRAINED);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_FATAL);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_REQ_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_ACCESS_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_CMD:
+			mthca_cmd_event(dev,
+					be16_to_cpu(eqe->event.cmd.token),
+					eqe->event.cmd.status,
+					be64_to_cpu(eqe->event.cmd.out_param));
+			/* cmd_event() may add more commands.
+			 * The card will think the queue has overflowed if
+			 * we don't tell it we've been processing events.
+			 */
+			set_ci = 1;
+			break;
+
+		case MTHCA_EVENT_TYPE_PORT_CHANGE:
+			port_change(dev,
+				    (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+				    eqe->subtype == 0x4);
+			break;
+
+		case MTHCA_EVENT_TYPE_CQ_ERROR:
+		case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+		case MTHCA_EVENT_TYPE_ECC_DETECT:
+		default:
+			mthca_warn(dev, "Unhandled event %02x(%02x) on eqn %d\n",
+				   eqe->type, eqe->subtype, eq->eqn);
+			break;
+		};
+
+		set_eqe_hw(eq, eq->cons_index);
+		eq->cons_index = (eq->cons_index + 1) & (eq->nent - 1);
+
+		if (set_ci) {
+			wmb(); /* see comment below */
+			set_eq_ci(dev, eq->eqn, eq->cons_index);
+			set_ci = 0;
+		}	
+	}
+
+	/*
+	 * This barrier makes sure that all updates to
+	 * ownership bits done by set_eqe_hw() hit memory
+	 * before the consumer index is updated.  set_eq_ci()
+	 * allows the HCA to possibly write more EQ entries,
+	 * and we want to avoid the exceedingly unlikely
+	 * possibility of the HCA writing an entry and then
+	 * having set_eqe_hw() overwrite the owner field.
+	 */
+	wmb();
+	set_eq_ci(dev, eq->eqn, eq->cons_index);
+	eq_req_not(dev, eq->eqn);
+}
+
+static irqreturn_t mthca_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+{
+	struct mthca_dev *dev = dev_ptr;
+	u32 ecr;
+	int work = 0;
+	int i;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	while ((ecr = readl(dev->hcr + MTHCA_ECR_OFFSET + 4)) != 0) {
+		work = 1;
+
+		writel(ecr, dev->hcr + MTHCA_ECR_CLR_OFFSET + 4);
+
+		for (i = 0; i < MTHCA_NUM_EQ; ++i)
+			if (ecr & dev->eq_table.eq[i].ecr_mask)
+				mthca_eq_int(dev, &dev->eq_table.eq[i]);
+	}
+
+	return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_msi_x_interrupt(int irq, void *eq_ptr,
+					 struct pt_regs *regs)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	writel(eq->ecr_mask, dev->hcr + MTHCA_ECR_CLR_OFFSET + 4);
+	mthca_eq_int(dev, eq);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static int __devinit mthca_create_eq(struct mthca_dev *dev,
+				     int nent,
+				     u8 intr,
+				     struct mthca_eq *eq)
+{
+	int npages = (nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+		PAGE_SIZE;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	void *mailbox = NULL;
+	struct mthca_eq_context *eq_context;
+	int err = -ENOMEM;
+	int i;
+	u8 status;
+
+	/* Make sure EQ size is aligned to a power of 2 size. */
+	for (i = 1; i < nent; i <<= 1)
+		; /* nothing */
+	nent = i;
+
+	eq->dev = dev;
+
+	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+				GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = kmalloc(sizeof *eq_context + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out_free;
+	eq_context = MAILBOX_ALIGN(mailbox);
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = pci_alloc_consistent(dev->pdev,
+							    PAGE_SIZE, &t);
+		if (!eq->page_list[i].buf)
+			goto err_out_free;
+
+		dma_list[i] = t;
+		pci_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+		memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+	}
+
+	for (i = 0; i < nent; ++i)
+		set_eqe_hw(eq, i);
+
+	eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+	if (eq->eqn == -1)
+		goto err_out_free;
+
+	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+				  dma_list, PAGE_SHIFT, npages,
+				  0, npages * PAGE_SIZE,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &eq->mr);
+	if (err)
+		goto err_out_free_eq;
+
+	eq->nent = nent;
+
+	memset(eq_context, 0, sizeof *eq_context);
+	eq_context->flags           = cpu_to_be32(MTHCA_EQ_STATUS_OK   |
+						  MTHCA_EQ_OWNER_HW    |
+						  MTHCA_EQ_STATE_ARMED |
+						  MTHCA_EQ_FLAG_TR);
+	eq_context->start           = cpu_to_be64(0);
+	eq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
+						  MTHCA_KAR_PAGE);
+	eq_context->pd              = cpu_to_be32(dev->driver_pd.pd_num);
+	eq_context->intr            = intr;
+	eq_context->lkey            = cpu_to_be32(eq->mr.ibmr.lkey);
+
+	err = mthca_SW2HW_EQ(dev, eq_context, eq->eqn, &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+	if (status) {
+		mthca_warn(dev, "SW2HW_EQ returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_free_mr;
+	}
+
+	kfree(dma_list);
+	kfree(mailbox);
+
+	eq->ecr_mask   = swab32(1 << eq->eqn);
+	eq->cons_index = 0;
+
+	eq_req_not(dev, eq->eqn);
+
+	mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+		  eq->eqn, nent);
+
+	return err;
+
+ err_out_free_mr:
+	mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+	mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			pci_free_consistent(dev->pdev, PAGE_SIZE,
+					    eq->page_list[i].buf,
+					    pci_unmap_addr(&eq->page_list[i],
+							   mapping));
+
+	kfree(eq->page_list);
+	kfree(dma_list);
+	kfree(mailbox);
+
+ err_out:
+	return err;
+}
+
+static void mthca_free_eq(struct mthca_dev *dev,
+			  struct mthca_eq *eq)
+{
+	void *mailbox = NULL;
+	int err;
+	u8 status;
+	int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+		PAGE_SIZE;
+	int i;
+
+	mailbox = kmalloc(sizeof (struct mthca_eq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		return;
+
+	err = mthca_HW2SW_EQ(dev, MAILBOX_ALIGN(mailbox),
+			     eq->eqn, &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+	if (status)
+		mthca_warn(dev, "HW2SW_EQ returned status 0x%02x\n",
+			   status);
+
+	if (0) {
+		mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpup(MAILBOX_ALIGN(mailbox) + i * 4));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+
+	mthca_free_mr(dev, &eq->mr);
+	for (i = 0; i < npages; ++i)
+		pci_free_consistent(dev->pdev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    pci_unmap_addr(&eq->page_list[i], mapping));
+
+	kfree(eq->page_list);
+	kfree(mailbox);
+}
+
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+	int i;
+
+	if (dev->eq_table.have_irq)
+		free_irq(dev->pdev->irq, dev);
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (dev->eq_table.eq[i].have_irq)
+			free_irq(dev->eq_table.eq[i].msi_x_vector,
+				 dev->eq_table.eq + i);
+}
+
+int __devinit mthca_init_eq_table(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+	u8 intr;
+	int i;
+
+	err = mthca_alloc_init(&dev->eq_table.alloc,
+			       dev->limits.num_eqs,
+			       dev->limits.num_eqs - 1,
+			       dev->limits.reserved_eqs);
+	if (err)
+		return err;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI ||
+	    dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		dev->eq_table.clr_mask = 0;
+	} else {
+		dev->eq_table.clr_mask =
+			swab32(1 << (dev->eq_table.inta_pin & 31));
+		dev->eq_table.clr_int  = dev->clr_base +
+			(dev->eq_table.inta_pin < 31 ? 4 : 0);
+	}
+
+	intr = (dev->mthca_flags & MTHCA_FLAG_MSI) ?
+		128 : dev->eq_table.inta_pin;
+
+	err = mthca_create_eq(dev, dev->limits.num_cqs,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_COMP]);
+	if (err)
+		goto err_out_free;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+	if (err)
+		goto err_out_comp;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_CMD]);
+	if (err)
+		goto err_out_async;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		static const char *eq_name[] = {
+			[MTHCA_EQ_COMP]  = DRV_NAME " (comp)",
+			[MTHCA_EQ_ASYNC] = DRV_NAME " (async)",
+			[MTHCA_EQ_CMD]   = DRV_NAME " (cmd)" 
+		};
+
+		for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+			err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+					  mthca_msi_x_interrupt, 0,
+					  eq_name[i], dev->eq_table.eq + i);
+			if (err)
+				goto err_out_cmd;
+			dev->eq_table.eq[i].have_irq = 1;
+		}
+	} else {
+		err = request_irq(dev->pdev->irq, mthca_interrupt, SA_SHIRQ,
+				  DRV_NAME, dev);
+		if (err)
+			goto err_out_cmd;
+		dev->eq_table.have_irq = 1;
+	}
+
+	err = mthca_MAP_EQ(dev, async_mask(dev),
+			   0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+	if (status)
+		mthca_warn(dev, "MAP_EQ for async EQ %d returned status 0x%02x\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, status);
+
+	err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+			   0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+	if (status)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d returned status 0x%02x\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, status);
+
+	return 0;
+
+err_out_cmd:
+	mthca_free_irqs(dev);
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_free:
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+	return err;
+}
+
+void __devexit mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+	u8 status;
+	int i;
+
+	mthca_free_irqs(dev);
+
+	mthca_MAP_EQ(dev, async_mask(dev),
+		     1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+	mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+		     1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
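
The EQ handling above hinges on a simple ownership handshake: each entry carries an owner bit, software only consumes entries it owns, hands every consumed entry back to hardware with set_eqe_hw(), and finally publishes its consumer index through a doorbell (the wmb()/set_eq_ci() pair in mthca_eq_int()).  A minimal userspace model of that protocol, purely illustrative and not part of the patch:

/*
 * Simplified, hypothetical model of the EQ owner-bit protocol; the real
 * driver uses MMIO doorbells and memory barriers instead of printf.
 */
#include <stdio.h>
#include <stdint.h>

#define NENT     8                      /* power of two, like the real EQs */
#define OWNER_HW 0x80

struct eqe {
	uint8_t owner;                  /* top bit set: hardware owns the entry */
	uint8_t type;                   /* event type */
};

static struct eqe eq[NENT];
static unsigned cons_index;

/* "Hardware" posts an event by filling the entry and clearing the owner bit. */
static void hw_post(unsigned slot, uint8_t type)
{
	eq[slot].type  = type;
	eq[slot].owner = 0;
}

/* Software side of mthca_eq_int(): drain software-owned entries, give each
 * back to hardware, then publish the new consumer index (the doorbell). */
static void poll_eq(void)
{
	while (!(eq[cons_index].owner & OWNER_HW)) {
		struct eqe *e = &eq[cons_index];

		printf("event 0x%02x at index %u\n", (unsigned) e->type, cons_index);
		e->owner   = OWNER_HW;                  /* like set_eqe_hw() */
		cons_index = (cons_index + 1) & (NENT - 1);
	}
	printf("doorbell: consumer index now %u\n", cons_index);
}

int main(void)
{
	unsigned i;

	for (i = 0; i < NENT; ++i)
		eq[i].owner = OWNER_HW;         /* all entries start hardware-owned */

	hw_post(0, 0x09);                       /* e.g. port change */
	hw_post(1, 0x0a);                       /* e.g. command interface event */
	poll_eq();
	return 0;
}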



* [PATCH][v3][11/21] Add Mellanox HCA low-level driver (initialization)
  2004-12-13 18:09         ` [PATCH][v3][10/21] Add Mellanox HCA low-level driver (EQ) Roland Dreier
@ 2004-12-13 18:09           ` Roland Dreier
  2004-12-13 18:09             ` [PATCH][v3][12/21] Add Mellanox HCA low-level driver (QP/CQ) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add device initialization code for Mellanox HCA driver.

Signed-off-by: Roland Dreier <roland@topspin.com>
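
The interesting part of mthca_profile.c below is how it carves up the HCA's attached memory: every resource size is a power of two, so sorting the resources by decreasing size and laying them out back to back keeps each one naturally aligned to its own size with no padding.  A small userspace sketch of that packing idea (hypothetical sizes, not the driver's defaults):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct resource {
	const char *name;
	uint64_t    size;       /* total bytes, always a power of two */
	uint64_t    start;      /* assigned offset into HCA memory */
};

/* Sort largest first. */
static int cmp_size_desc(const void *a, const void *b)
{
	const struct resource *x = a, *y = b;

	return (x->size < y->size) - (x->size > y->size);
}

int main(void)
{
	struct resource res[] = {
		{ "MTT",        1ULL << 22, 0 },
		{ "QP context", 1ULL << 20, 0 },
		{ "CQ context", 1ULL << 18, 0 },
		{ "EQ context", 1ULL << 12, 0 },
	};
	size_t i, n = sizeof res / sizeof res[0];
	uint64_t offset = 0;

	qsort(res, n, sizeof res[0], cmp_size_desc);

	for (i = 0; i < n; ++i) {
		/* offset is a sum of larger powers of two, so it is already
		 * a multiple of this resource's size -- no padding needed. */
		res[i].start = offset;
		offset      += res[i].size;
		printf("%-12s %10llu bytes at offset 0x%llx\n", res[i].name,
		       (unsigned long long) res[i].size,
		       (unsigned long long) res[i].start);
	}
	printf("total: %llu bytes\n", (unsigned long long) offset);
	return 0;
}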


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_profile.c	2004-12-13 09:44:45.555014305 -0800
@@ -0,0 +1,215 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_profile.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include "mthca_profile.h"
+
+static int default_profile[MTHCA_RES_NUM] = {
+	[MTHCA_RES_QP]    = 1 << 16,
+	[MTHCA_RES_EQP]   = 1 << 16,
+	[MTHCA_RES_CQ]    = 1 << 16,
+	[MTHCA_RES_EQ]    = 32,
+	[MTHCA_RES_RDB]   = 1 << 18,
+	[MTHCA_RES_MCG]   = 1 << 13,
+	[MTHCA_RES_MPT]   = 1 << 17,
+	[MTHCA_RES_MTT]   = 1 << 20,
+	[MTHCA_RES_UDAV]  = 1 << 15
+};
+
+enum {
+	MTHCA_RDB_ENTRY_SIZE = 32,
+	MTHCA_MTT_SEG_SIZE   = 64
+};
+
+enum {
+	MTHCA_NUM_PDS = 1 << 15
+};
+
+int mthca_make_profile(struct mthca_dev *dev,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca)
+{
+	/* just use default profile for now */
+	struct mthca_resource {
+		u64 size;
+		u64 start;
+		int type;
+		int num;
+		int log_num;
+	};
+
+	u64 total_size = 0;
+	struct mthca_resource *profile;
+	struct mthca_resource tmp;
+	int i, j;
+
+	default_profile[MTHCA_RES_UAR] = dev_lim->uar_size / PAGE_SIZE;
+
+	profile = kmalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+
+	profile[MTHCA_RES_QP].size   = dev_lim->qpc_entry_sz;
+	profile[MTHCA_RES_EEC].size  = dev_lim->eec_entry_sz;
+	profile[MTHCA_RES_SRQ].size  = dev_lim->srq_entry_sz;
+	profile[MTHCA_RES_CQ].size   = dev_lim->cqc_entry_sz;
+	profile[MTHCA_RES_EQP].size  = dev_lim->eqpc_entry_sz;
+	profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+	profile[MTHCA_RES_EQ].size   = dev_lim->eqc_entry_sz;
+	profile[MTHCA_RES_RDB].size  = MTHCA_RDB_ENTRY_SIZE;
+	profile[MTHCA_RES_MCG].size  = MTHCA_MGM_ENTRY_SIZE;
+	profile[MTHCA_RES_MPT].size  = MTHCA_MPT_ENTRY_SIZE;
+	profile[MTHCA_RES_MTT].size  = MTHCA_MTT_SEG_SIZE;
+	profile[MTHCA_RES_UAR].size  = dev_lim->uar_scratch_entry_sz;
+	profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].num      = default_profile[i];
+		profile[i].log_num  = max(ffs(default_profile[i]) - 1, 0);
+		profile[i].size    *= default_profile[i];
+	}
+
+	/* 
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MTHCA_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp            = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = dev->ddr_start + total_size;
+			total_size      += profile[i].size;
+		}
+		if (total_size > dev->fw.tavor.fw_start - dev->ddr_start) {
+			mthca_err(dev, "Profile requires 0x%llx bytes; "
+				  "won't fit between DDR start at 0x%016llx "
+				  "and FW start at 0x%016llx.\n",
+				  (unsigned long long) total_size,
+				  (unsigned long long) dev->ddr_start,
+				  (unsigned long long) dev->fw.tavor.fw_start);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+				  "(size 0x%8llx)\n",
+				  i, profile[i].type, profile[i].log_num,
+				  (unsigned long long) profile[i].start,
+				  (unsigned long long) profile[i].size);
+	}
+
+	mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+		  (int) (total_size >> 10),
+		  (int) ((dev->fw.tavor.fw_start - dev->ddr_start) >> 10),
+		  (int) ((dev->fw.tavor.fw_start - dev->ddr_start - total_size) >> 10));
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MTHCA_RES_QP:
+			dev->limits.num_qps   = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MTHCA_RES_EEC:
+			dev->limits.num_eecs   = profile[i].num;
+			init_hca->eec_base     = profile[i].start;
+			init_hca->log_num_eecs = profile[i].log_num;
+			break;
+		case MTHCA_RES_SRQ:
+			dev->limits.num_srqs   = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_CQ:
+			dev->limits.num_cqs   = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_EQP:
+			init_hca->eqpc_base = profile[i].start;
+			break;
+		case MTHCA_RES_EEEC:
+			init_hca->eeec_base = profile[i].start;
+			break;
+		case MTHCA_RES_EQ:
+			dev->limits.num_eqs   = profile[i].num;
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_RDB:
+			dev->limits.num_rdbs = profile[i].num;
+			init_hca->rdb_base   = profile[i].start;
+			break;
+		case MTHCA_RES_MCG:
+			dev->limits.num_mgms      = profile[i].num >> 1;
+			dev->limits.num_amgms     = profile[i].num >> 1;
+			init_hca->mc_base         = profile[i].start;
+			init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
+			break;
+		case MTHCA_RES_MPT:
+			dev->limits.num_mpts = profile[i].num;
+			init_hca->mpt_base   = profile[i].start;
+			init_hca->log_mpt_sz = profile[i].log_num;
+			break;
+		case MTHCA_RES_MTT:
+			dev->limits.num_mtt_segs = profile[i].num;
+			dev->limits.mtt_seg_size = MTHCA_MTT_SEG_SIZE;
+			dev->mr_table.mtt_base   = profile[i].start;
+			init_hca->mtt_base       = profile[i].start;
+			init_hca->mtt_seg_sz     = ffs(MTHCA_MTT_SEG_SIZE) - 7;
+			break;
+		case MTHCA_RES_UAR:
+			init_hca->uar_scratch_base = profile[i].start;
+			break;
+		case MTHCA_RES_UDAV:
+			dev->av_table.ddr_av_base = profile[i].start;
+			dev->av_table.num_ddr_avs = profile[i].num;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->limits.num_pds = MTHCA_NUM_PDS;
+
+	kfree(profile);
+	return 0;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_profile.h	2004-12-13 09:44:45.581010475 -0800
@@ -0,0 +1,51 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_profile.h 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_RES_QP,
+	MTHCA_RES_EEC,
+	MTHCA_RES_SRQ,
+	MTHCA_RES_CQ,
+	MTHCA_RES_EQP,
+	MTHCA_RES_EEEC,
+	MTHCA_RES_EQ,
+	MTHCA_RES_RDB,
+	MTHCA_RES_MCG,
+	MTHCA_RES_MPT,
+	MTHCA_RES_MTT,
+	MTHCA_RES_UAR,
+	MTHCA_RES_UDAV,
+	MTHCA_RES_NUM
+};
+
+int mthca_make_profile(struct mthca_dev *mdev,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_reset.c	2004-12-13 09:44:45.607006645 -0800
@@ -0,0 +1,221 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_reset.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+	int i;
+	int err = 0;
+	u32 *hca_header    = NULL;
+	u32 *bridge_header = NULL;
+	struct pci_dev *bridge = NULL;
+
+#define MTHCA_RESET_OFFSET 0xf0010
+#define MTHCA_RESET_VALUE  cpu_to_be32(1)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 *
+	 * To make matters worse, for Tavor (PCI-X HCA) we have to
+	 * find the associated bridge device and save off its PCI
+	 * header as well.
+	 */
+
+	if (mdev->hca_type == TAVOR) {
+		/* Look for the bridge -- its device ID will be 2 more
+		   than HCA's device ID. */
+		while ((bridge = pci_get_device(mdev->pdev->vendor,
+						mdev->pdev->device + 2,
+						bridge)) != NULL) {
+			if (bridge->hdr_type    == PCI_HEADER_TYPE_BRIDGE &&
+			    bridge->subordinate == mdev->pdev->bus) {
+				mthca_dbg(mdev, "Found bridge: %s (%s)\n",
+					  pci_pretty_name(bridge), pci_name(bridge));
+				break;
+			}
+		}
+
+		if (!bridge) {
+			/*
+			 * Didn't find a bridge for a Tavor device --
+			 * assume we're in no-bridge mode and hope for
+			 * the best.
+			 */
+			mthca_warn(mdev, "No bridge found for %s (%s)\n",
+				  pci_pretty_name(mdev->pdev), pci_name(mdev->pdev));
+		}
+			
+	}
+
+	/* For Arbel do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mthca_err(mdev, "Couldn't allocate memory to save HCA "
+			  "PCI header, aborting.\n");
+		goto out;
+	}
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't save HCA "
+				  "PCI header, aborting.\n");
+			goto out;
+		}
+	}
+
+	if (bridge) {
+		bridge_header = kmalloc(256, GFP_KERNEL);
+		if (!bridge_header) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't allocate memory to save HCA "
+				  "bridge PCI header, aborting.\n");
+			goto out;
+		}
+
+		for (i = 0; i < 64; ++i) {
+			if (i == 22 || i == 23)
+				continue;
+			if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't save HCA bridge "
+					  "PCI header, aborting.\n");
+				goto out;
+			}
+		}
+	}
+
+	/* actually hit reset */
+	{
+		void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+					      MTHCA_RESET_OFFSET, 4);
+
+		if (!reset) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't map HCA reset register, "
+				  "aborting.\n");
+			goto out;
+		}
+
+		writel(MTHCA_RESET_VALUE, reset);
+		iounmap(reset);
+	}
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	/* Now wait for PCI device to start responding again */
+	{
+		u32 v;
+		int c = 0;
+
+		for (c = 0; c < 100; ++c) {
+			if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't access HCA after reset, "
+					  "aborting.\n");
+				goto out;
+			}
+
+			if (v != 0xffffffff)
+				goto good;
+
+			msleep(100);
+		}
+
+		err = -ENODEV;
+		mthca_err(mdev, "PCI device did not come back after reset, "
+			  "aborting.\n");
+		goto out;
+	}
+
+good:
+	/* Now restore the PCI headers */
+	if (bridge) {
+		/*
+		 * Bridge control register is at 0x3e, so we'll
+		 * naturally restore it last in this loop.
+		 */
+		for (i = 0; i < 16; ++i) {
+			if (i * 4 == PCI_COMMAND)
+				continue;
+
+			if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+					  "aborting.\n", i);
+				goto out;
+			}
+		}
+
+		if (pci_write_config_dword(bridge, PCI_COMMAND,
+					   bridge_header[PCI_COMMAND / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+				  "aborting.\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA reg %x, "
+				  "aborting.\n", i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+			  "aborting.\n");
+		goto out;
+	}
+
+out:
+	if (bridge)
+		pci_dev_put(bridge);
+	kfree(bridge_header);
+	kfree(hca_header);
+
+	return err;
+}



* [PATCH][v3][12/21] Add Mellanox HCA low-level driver (QP/CQ)
  2004-12-13 18:09           ` [PATCH][v3][11/21] Add Mellanox HCA low-level driver (initialization) Roland Dreier
@ 2004-12-13 18:09             ` Roland Dreier
  2004-12-13 18:09               ` [PATCH][v3][13/21] Add Mellanox HCA low-level driver (last bits) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add CQ (completion queue) and QP (queue pair) code for Mellanox HCA driver.

Signed-off-by: Roland Dreier <roland@topspin.com>
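
One non-obvious piece of the CQ code below is mthca_cq_clean(), which drops the pending completions of a QP that is being reset by sweeping backwards through the CQ, copying surviving entries over the discarded ones, and then advancing the consumer index past the hole.  A toy userspace version of that compaction (ignoring the ring wrap-around the real code handles):

#include <stdio.h>

int main(void)
{
	/* Pending completions between cons and prod, keyed by QP number. */
	int qpn[8] = { 7, 3, 7, 5, 3 };
	int cons = 0, prod = 5;
	int doomed_qpn = 3;             /* the QP being reset */
	int i, nfreed = 0;

	for (i = prod; i > cons; --i) {
		if (qpn[i - 1] == doomed_qpn)
			++nfreed;                          /* discard this entry */
		else if (nfreed)
			qpn[i - 1 + nfreed] = qpn[i - 1];  /* slide survivor up */
	}
	cons += nfreed;                 /* skip the freed slots */

	printf("freed %d; surviving completions:", nfreed);
	for (i = cons; i < prod; ++i)
		printf(" qpn %d", qpn[i]);
	printf("\n");
	return 0;
}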


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_cq.c	2004-12-13 09:44:45.891964666 -0800
@@ -0,0 +1,817 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_cq.c 1298 2004-11-29 03:26:10Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
+};
+
+enum {
+	MTHCA_CQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+	u32 flags;
+	u64 start;
+	u32 logsize_usrpage;
+	u32 error_eqn;
+	u32 comp_eqn;
+	u32 pd;
+	u32 lkey;
+	u32 last_notified_index;
+	u32 solicit_producer_index;
+	u32 consumer_index;
+	u32 producer_index;
+	u32 cqn;
+	u32 reserved[3];
+} __attribute__((packed));
+
+#define MTHCA_CQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_CQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED     ( 0 <<  8)
+#define MTHCA_CQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_CQ_STATE_ARMED_SOL    ( 4 <<  8)
+#define MTHCA_EQ_STATE_FIRED        (10 <<  8)
+
+enum {
+	MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+enum {
+	SYNDROME_LOCAL_LENGTH_ERR 	 = 0x01,
+	SYNDROME_LOCAL_QP_OP_ERR  	 = 0x02,
+	SYNDROME_LOCAL_EEC_OP_ERR 	 = 0x03,
+	SYNDROME_LOCAL_PROT_ERR   	 = 0x04,
+	SYNDROME_WR_FLUSH_ERR     	 = 0x05,
+	SYNDROME_MW_BIND_ERR      	 = 0x06,
+	SYNDROME_BAD_RESP_ERR     	 = 0x10,
+	SYNDROME_LOCAL_ACCESS_ERR 	 = 0x11,
+	SYNDROME_REMOTE_INVAL_REQ_ERR 	 = 0x12,
+	SYNDROME_REMOTE_ACCESS_ERR 	 = 0x13,
+	SYNDROME_REMOTE_OP_ERR     	 = 0x14,
+	SYNDROME_RETRY_EXC_ERR 		 = 0x15,
+	SYNDROME_RNR_RETRY_EXC_ERR 	 = 0x16,
+	SYNDROME_LOCAL_RDD_VIOL_ERR 	 = 0x20,
+	SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+	SYNDROME_REMOTE_ABORTED_ERR 	 = 0x22,
+	SYNDROME_INVAL_EECN_ERR 	 = 0x23,
+	SYNDROME_INVAL_EEC_STATE_ERR 	 = 0x24
+};
+
+struct mthca_cqe {
+	u32 my_qpn;
+	u32 my_ee;
+	u32 rqpn;
+	u16 sl_g_mlpath;
+	u16 rlid;
+	u32 imm_etype_pkey_eec;
+	u32 byte_cnt;
+	u32 wqe;
+	u8  opcode;
+	u8  is_send;
+	u8  reserved;
+	u8  owner;
+};
+
+struct mthca_err_cqe {
+	u32 my_qpn;
+	u32 reserved1[3];
+	u8  syndrome;
+	u8  reserved2;
+	u16 db_cnt;
+	u32 reserved3;
+	u32 wqe;
+	u8  opcode;
+	u8  reserved4[2];
+	u8  owner;
+};
+
+#define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)
+
+#define MTHCA_CQ_DB_INC_CI       (1 << 24)
+#define MTHCA_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_CQ_DB_REQ_NOT_SOL  (3 << 24)
+#define MTHCA_CQ_DB_SET_CI       (4 << 24)
+#define MTHCA_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+	if (cq->is_direct)
+		return cq->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+	else
+		return cq->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+			+ (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline int cqe_sw(struct mthca_cq *cq, int i)
+{
+	return !(MTHCA_CQ_ENTRY_OWNER_HW &
+		 get_cqe(cq, i)->owner);
+}
+
+static inline int next_cqe_sw(struct mthca_cq *cq)
+{
+	return cqe_sw(cq, cq->cons_index);
+}
+
+static inline void set_cqe_hw(struct mthca_cq *cq, int entry)
+{
+	get_cqe(cq, entry)->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+static inline void inc_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+				  int nent)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = cpu_to_be32(MTHCA_CQ_DB_INC_CI | cq->cqn);
+	doorbell[1] = cpu_to_be32(nent - 1);
+		
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn)
+{
+	struct mthca_cq *cq;
+
+	spin_lock(&dev->cq_table.lock);
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+	if (cq)
+		atomic_inc(&cq->refcount);
+	spin_unlock(&dev->cq_table.lock);
+
+	if (!cq) {
+		mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		wake_up(&cq->wait);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
+{
+	struct mthca_cq *cq;
+	struct mthca_cqe *cqe;
+	int prod_index;
+	int nfreed = 0;
+
+	spin_lock_irq(&dev->cq_table.lock);
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+	if (cq)
+		atomic_inc(&cq->refcount);
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	if (!cq)
+		return;
+
+	spin_lock_irq(&cq->lock);
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->cons_index;
+	     cqe_sw(cq, prod_index & (cq->ibcq.cqe - 1));
+	     ++prod_index)
+		if (prod_index == cq->cons_index + cq->ibcq.cqe - 1)
+			break;
+
+	if (0)
+		mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+			  qpn, cqn, cq->cons_index, prod_index);
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while (prod_index > cq->cons_index) {
+		cqe = get_cqe(cq, (prod_index - 1) & (cq->ibcq.cqe - 1));
+		if (cqe->my_qpn == cpu_to_be32(qpn))
+			++nfreed;
+		else if (nfreed)
+			memcpy(get_cqe(cq, (prod_index - 1 + nfreed) &
+				       (cq->ibcq.cqe - 1)),
+			       cqe,
+			       MTHCA_CQ_ENTRY_SIZE);
+		--prod_index;
+	}
+
+	if (nfreed) {
+		wmb();
+		inc_cons_index(dev, cq, nfreed);
+		cq->cons_index = (cq->cons_index + nfreed) & (cq->ibcq.cqe - 1);
+	}
+
+	spin_unlock_irq(&cq->lock);
+	if (atomic_dec_and_test(&cq->refcount))
+		wake_up(&cq->wait);
+}
+
+static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+			    struct mthca_qp *qp, int wqe_index, int is_send,
+			    struct mthca_err_cqe *cqe,
+			    struct ib_wc *entry, int *free_cqe)
+{
+	int err;
+	int dbd;
+	u32 new_wqe;
+
+	if (1 && cqe->syndrome != SYNDROME_WR_FLUSH_ERR) {
+		int j;
+		
+		mthca_dbg(dev, "%x/%d: error CQE -> QPN %06x, WQE @ %08x\n",
+			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+			  be32_to_cpu(cqe->wqe));
+
+		for (j = 0; j < 8; ++j)
+			printk(KERN_DEBUG "  [%2x] %08x\n",
+			       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+	}
+
+	/*
+	 * For completions in error, only work request ID, status (and
+	 * freed resource count for RD) have to be set.
+	 */
+	switch (cqe->syndrome) {
+	case SYNDROME_LOCAL_LENGTH_ERR:
+		entry->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case SYNDROME_LOCAL_QP_OP_ERR:
+		entry->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_EEC_OP_ERR:
+		entry->status = IB_WC_LOC_EEC_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_PROT_ERR:
+		entry->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case SYNDROME_WR_FLUSH_ERR:
+		entry->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case SYNDROME_MW_BIND_ERR:
+		entry->status = IB_WC_MW_BIND_ERR;
+		break;
+	case SYNDROME_BAD_RESP_ERR:
+		entry->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case SYNDROME_LOCAL_ACCESS_ERR:
+		entry->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_REQ_ERR:
+		entry->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ACCESS_ERR:
+		entry->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_OP_ERR:
+		entry->status = IB_WC_REM_OP_ERR;
+		break;
+	case SYNDROME_RETRY_EXC_ERR:
+		entry->status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_RNR_RETRY_EXC_ERR:
+		entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_LOCAL_RDD_VIOL_ERR:
+		entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+		entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ABORTED_ERR:
+		entry->status = IB_WC_REM_ABORT_ERR;
+		break;
+	case SYNDROME_INVAL_EECN_ERR:
+		entry->status = IB_WC_INV_EECN_ERR;
+		break;
+	case SYNDROME_INVAL_EEC_STATE_ERR:
+		entry->status = IB_WC_INV_EEC_STATE_ERR;
+		break;
+	default:
+		entry->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe);
+	if (err)
+		return err;
+
+	/*
+	 * If we're at the end of the WQE chain, or we've used up our
+	 * doorbell count, free the CQE.  Otherwise just update it for
+	 * the next poll operation.
+	 */
+	if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+		return 0;
+
+	cqe->db_cnt   = cpu_to_be16(be16_to_cpu(cqe->db_cnt) - dbd);
+	cqe->wqe      = new_wqe;
+	cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+	*free_cqe = 0;
+
+	return 0;
+}
+
+static void dump_cqe(struct mthca_cqe *cqe)
+{
+	int j;
+
+	for (j = 0; j < 8; ++j)
+		printk(KERN_DEBUG "  [%2x] %08x\n",
+		       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+}
+
+static inline int mthca_poll_one(struct mthca_dev *dev,
+				 struct mthca_cq *cq,
+				 struct mthca_qp **cur_qp,
+				 int *freed,
+				 struct ib_wc *entry)
+{
+	struct mthca_wq *wq;
+	struct mthca_cqe *cqe;
+	int wqe_index;
+	int is_error = 0;
+	int is_send;
+	int free_cqe = 1;
+	int err = 0;
+
+	if (!next_cqe_sw(cq))
+		return -EAGAIN;
+
+	rmb();
+
+	cqe = get_cqe(cq, cq->cons_index);
+
+	if (0) {
+		mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+			  be32_to_cpu(cqe->wqe));
+
+		dump_cqe(cqe);
+	}
+
+	if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+	    MTHCA_ERROR_CQE_OPCODE_MASK) {
+		is_error = 1;
+		is_send = cqe->opcode & 1;
+	} else
+		is_send = cqe->is_send & 0x80;
+
+	if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+		if (*cur_qp) {
+			spin_unlock(&(*cur_qp)->lock);
+			if (atomic_dec_and_test(&(*cur_qp)->refcount))
+				wake_up(&(*cur_qp)->wait);
+		}
+
+		spin_lock(&dev->qp_table.lock);
+		*cur_qp = mthca_array_get(&dev->qp_table.qp,
+					  be32_to_cpu(cqe->my_qpn) &
+					  (dev->limits.num_qps - 1));
+		if (*cur_qp)
+			atomic_inc(&(*cur_qp)->refcount);
+		spin_unlock(&dev->qp_table.lock);
+
+		if (!*cur_qp) {
+			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
+			err = -EINVAL;
+			goto out;
+		}
+
+		spin_lock(&(*cur_qp)->lock);
+	}
+
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+			     >> wq->wqe_shift);
+		entry->wr_id = (*cur_qp)->wrid[wqe_index +
+					       (*cur_qp)->rq.max];
+	} else {
+		wq = &(*cur_qp)->rq;
+		wqe_index = be32_to_cpu(cqe->wqe) >> wq->wqe_shift;
+		entry->wr_id = (*cur_qp)->wrid[wqe_index];
+	}
+
+	if (wq->last_comp < wqe_index)
+		wq->cur -= wqe_index - wq->last_comp;
+	else
+		wq->cur -= wq->max - wq->last_comp + wqe_index;
+
+	wq->last_comp = wqe_index;
+
+	if (0)
+		mthca_dbg(dev, "%s completion for QP %06x, index %d (nr %d)\n",
+			  is_send ? "Send" : "Receive",
+			  (*cur_qp)->qpn, wqe_index, wq->max);
+
+	if (is_error) {
+		err = handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+				       (struct mthca_err_cqe *) cqe,
+				       entry, &free_cqe);
+		goto out;
+	}
+
+	if (is_send) {
+		entry->opcode = IB_WC_SEND; /* XXX */
+	} else {
+		entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+		switch (cqe->opcode & 0x1f) {
+		case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV;
+			break;
+		case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+			break;
+		default:
+			entry->wc_flags = 0;
+			entry->opcode = IB_WC_RECV;
+			break;
+		}
+		entry->slid 	   = be16_to_cpu(cqe->rlid);
+		entry->sl   	   = be16_to_cpu(cqe->sl_g_mlpath) >> 12;
+		entry->src_qp 	   = be32_to_cpu(cqe->rqpn) & 0xffffff;
+		entry->dlid_path_bits = be16_to_cpu(cqe->sl_g_mlpath) & 0x7f;
+		entry->pkey_index  = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+		entry->wc_flags   |= be16_to_cpu(cqe->sl_g_mlpath) & 0x80 ?
+					IB_WC_GRH : 0;
+	}
+
+	entry->status = IB_WC_SUCCESS;
+
+ out:
+	if (free_cqe) {
+		set_cqe_hw(cq, cq->cons_index);
+		++(*freed);
+		cq->cons_index = (cq->cons_index + 1) & (cq->ibcq.cqe - 1);
+	}
+
+	return err;
+}
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry)
+{
+	struct mthca_dev *dev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	struct mthca_qp *qp = NULL;
+	unsigned long flags;
+	int err = 0;
+	int freed = 0;
+	int npolled;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+		err = mthca_poll_one(dev, cq, &qp,
+				     &freed, entry + npolled);
+		if (err)
+			break;
+	}
+
+	if (qp) {
+		spin_unlock(&qp->lock);
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+
+	wmb();
+	inc_cons_index(dev, cq, freed);
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+void mthca_arm_cq(struct mthca_dev *dev, struct mthca_cq *cq,
+		  int solicited)
+{
+	u32 doorbell[2];
+
+	doorbell[0] =  cpu_to_be32((solicited ?
+				    MTHCA_CQ_DB_REQ_NOT_SOL :
+				    MTHCA_CQ_DB_REQ_NOT)      |
+				   cq->cqn);
+	doorbell[1] = 0xffffffff;
+
+	mthca_write64(doorbell,
+		      dev->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_cq *cq)
+{
+	int size = nent * MTHCA_CQ_ENTRY_SIZE;
+	dma_addr_t t;
+	void *mailbox = NULL;
+	int npages, shift;
+	u64 *dma_list = NULL;
+	struct mthca_cq_context *cq_context;
+	int err = -ENOMEM;
+	u8 status;
+	int i;
+
+	might_sleep();
+
+	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out;
+
+	cq_context = MAILBOX_ALIGN(mailbox);
+
+	if (size <= MTHCA_MAX_DIRECT_CQ_SIZE) {
+		if (0)
+			mthca_dbg(dev, "Creating direct CQ of size %d\n", size);
+
+		cq->is_direct = 1;
+		npages        = 1;
+		shift         = get_order(size) + PAGE_SHIFT;
+
+		cq->queue.direct.buf = pci_alloc_consistent(dev->pdev,
+							    size, &t);
+		if (!cq->queue.direct.buf)
+			goto err_out;
+
+		pci_unmap_addr_set(&cq->queue.direct, mapping, t);
+
+		memset(cq->queue.direct.buf, 0, size);
+
+		while (t & ((1 << shift) - 1)) {
+			--shift;
+			npages *= 2;
+		}
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_out_free;
+
+		for (i = 0; i < npages; ++i)
+			dma_list[i] = t + i * (1 << shift);
+	} else {
+		cq->is_direct = 0;
+		npages        = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		shift         = PAGE_SHIFT;
+
+		if (0)
+			mthca_dbg(dev, "Creating indirect CQ with %d pages\n", npages);
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_out;
+
+		cq->queue.page_list = kmalloc(npages * sizeof *cq->queue.page_list,
+					      GFP_KERNEL);
+		if (!cq->queue.page_list)
+			goto err_out;
+
+		for (i = 0; i < npages; ++i)
+			cq->queue.page_list[i].buf = NULL;
+
+		for (i = 0; i < npages; ++i) {
+			cq->queue.page_list[i].buf =
+				pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
+			if (!cq->queue.page_list[i].buf)
+				goto err_out_free;
+			
+			dma_list[i] = t;
+			pci_unmap_addr_set(&cq->queue.page_list[i], mapping, t);
+
+			memset(cq->queue.page_list[i].buf, 0, PAGE_SIZE);
+		}
+	}
+
+	for (i = 0; i < nent; ++i)
+		set_cqe_hw(cq, i);
+
+	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+	if (cq->cqn == -1)
+		goto err_out_free;
+
+	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+				  dma_list, shift, npages,
+				  0, size,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &cq->mr);
+	if (err)
+		goto err_out_free_cq;
+
+	spin_lock_init(&cq->lock);
+	atomic_set(&cq->refcount, 1);
+	init_waitqueue_head(&cq->wait);
+
+	memset(cq_context, 0, sizeof *cq_context);
+	cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
+						  MTHCA_CQ_STATE_DISARMED |
+						  MTHCA_CQ_FLAG_TR);
+	cq_context->start           = cpu_to_be64(0);
+	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
+						  MTHCA_KAR_PAGE);
+	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+	cq_context->pd              = cpu_to_be32(dev->driver_pd.pd_num);
+	cq_context->lkey            = cpu_to_be32(cq->mr.ibmr.lkey);
+	cq_context->cqn             = cpu_to_be32(cq->cqn);
+
+	err = mthca_SW2HW_CQ(dev, cq_context, cq->cqn, &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+
+	if (status) {
+		mthca_warn(dev, "SW2HW_CQ returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_free_mr;
+	}
+
+	spin_lock_irq(&dev->cq_table.lock);
+	if (mthca_array_set(&dev->cq_table.cq,
+			    cq->cqn & (dev->limits.num_cqs - 1),
+			    cq)) {
+		spin_unlock_irq(&dev->cq_table.lock);
+		goto err_out_free_mr;
+	}
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	cq->cons_index = 0;
+
+	kfree(dma_list);
+	kfree(mailbox);
+
+	return 0;
+
+ err_out_free_mr:
+	mthca_free_mr(dev, &cq->mr);
+
+ err_out_free_cq:
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+ err_out_free:
+	if (cq->is_direct)
+		pci_free_consistent(dev->pdev, size,
+				    cq->queue.direct.buf,
+				    pci_unmap_addr(&cq->queue.direct, mapping));
+	else {
+		for (i = 0; i < npages; ++i)
+			if (cq->queue.page_list[i].buf)
+				pci_free_consistent(dev->pdev, PAGE_SIZE,
+						    cq->queue.page_list[i].buf,
+						    pci_unmap_addr(&cq->queue.page_list[i],
+								   mapping));
+
+		kfree(cq->queue.page_list);
+	}
+
+ err_out:
+	kfree(dma_list);
+	kfree(mailbox);
+
+	return err;
+}
+
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq)
+{
+	void *mailbox;
+	int err;
+	u8 status;
+
+	might_sleep();
+
+	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox) {
+		mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+		return;
+	}
+
+	err = mthca_HW2SW_CQ(dev, MAILBOX_ALIGN(mailbox), cq->cqn, &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n",
+			   status);
+
+	if (0) {
+		u32 *ctx = MAILBOX_ALIGN(mailbox);
+		int j;
+		
+		printk(KERN_ERR "context for CQN %x\n", cq->cqn);
+		for (j = 0; j < 16; ++j)
+			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+	}
+
+	spin_lock_irq(&dev->cq_table.lock);
+	mthca_array_clear(&dev->cq_table.cq,
+			  cq->cqn & (dev->limits.num_cqs - 1));
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	atomic_dec(&cq->refcount);
+	wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+	mthca_free_mr(dev, &cq->mr);
+
+	if (cq->is_direct)
+		pci_free_consistent(dev->pdev,
+				    cq->ibcq.cqe * MTHCA_CQ_ENTRY_SIZE,
+				    cq->queue.direct.buf,
+				    pci_unmap_addr(&cq->queue.direct,
+						   mapping));
+	else {
+		int i;
+
+		for (i = 0;
+		     i < (cq->ibcq.cqe * MTHCA_CQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+			     PAGE_SIZE;
+		     ++i)
+			pci_free_consistent(dev->pdev, PAGE_SIZE,
+					    cq->queue.page_list[i].buf,
+					    pci_unmap_addr(&cq->queue.page_list[i],
+							   mapping));
+
+		kfree(cq->queue.page_list);
+	}
+
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+	kfree(mailbox);
+}
+
+int __devinit mthca_init_cq_table(struct mthca_dev *dev)
+{
+	int err;
+
+	spin_lock_init(&dev->cq_table.lock);
+
+	err = mthca_alloc_init(&dev->cq_table.alloc,
+			       dev->limits.num_cqs,
+			       (1 << 24) - 1,
+			       dev->limits.reserved_cqs);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->cq_table.cq,
+			       dev->limits.num_cqs);
+	if (err)
+		mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+	return err;
+}
+
+void __devexit mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+	mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+	mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_qp.c	2004-12-13 09:44:45.916960983 -0800
@@ -0,0 +1,1479 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_qp.c 1324 2004-12-13 17:55:08Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_verbs.h>
+#include <ib_cache.h>
+#include <ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+	MTHCA_ACK_REQ_FREQ       = 10,
+	MTHCA_FLIGHT_LIMIT       = 9,
+	MTHCA_UD_HEADER_SIZE     = 72 /* largest UD header possible */
+};
+
+enum {
+	MTHCA_QP_STATE_RST  = 0,
+	MTHCA_QP_STATE_INIT = 1,
+	MTHCA_QP_STATE_RTR  = 2,
+	MTHCA_QP_STATE_RTS  = 3,
+	MTHCA_QP_STATE_SQE  = 4,
+	MTHCA_QP_STATE_SQD  = 5,
+	MTHCA_QP_STATE_ERR  = 6,
+	MTHCA_QP_STATE_DRAINING = 7
+};
+
+enum {
+	MTHCA_QP_ST_RC 	= 0x0,
+	MTHCA_QP_ST_UC 	= 0x1,
+	MTHCA_QP_ST_RD 	= 0x2,
+	MTHCA_QP_ST_UD 	= 0x3,
+	MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {
+	MTHCA_QP_PM_MIGRATED = 0x3,
+	MTHCA_QP_PM_ARMED    = 0x0,
+	MTHCA_QP_PM_REARM    = 0x1
+};
+
+enum {
+	/* qp_context flags */
+	MTHCA_QP_BIT_DE  = 1 <<  8,
+	/* params1 */
+	MTHCA_QP_BIT_SRE = 1 << 15,
+	MTHCA_QP_BIT_SWE = 1 << 14,
+	MTHCA_QP_BIT_SAE = 1 << 13,
+	MTHCA_QP_BIT_SIC = 1 <<  4,
+	MTHCA_QP_BIT_SSC = 1 <<  3,
+	/* params2 */
+	MTHCA_QP_BIT_RRE = 1 << 15,
+	MTHCA_QP_BIT_RWE = 1 << 14,
+	MTHCA_QP_BIT_RAE = 1 << 13,
+	MTHCA_QP_BIT_RIC = 1 <<  4,
+	MTHCA_QP_BIT_RSC = 1 <<  3
+};
+
+struct mthca_qp_path {
+	u32 port_pkey;
+	u8  rnr_retry;
+	u8  g_mylmc;
+	u16 rlid;
+	u8  ackto;
+	u8  mgid_index;
+	u8  static_rate;
+	u8  hop_limit;
+	u32 sl_tclass_flowlabel;
+	u8  rgid[16];
+} __attribute__((packed));
+
+struct mthca_qp_context {
+	u32 flags;
+	u32 sched_queue;
+	u32 mtu_msgmax;
+	u32 usr_page;
+	u32 local_qpn;
+	u32 remote_qpn;
+	u32 reserved1[2];
+	struct mthca_qp_path pri_path;
+	struct mthca_qp_path alt_path;
+	u32 rdd;
+	u32 pd;
+	u32 wqe_base;
+	u32 wqe_lkey;
+	u32 params1;
+	u32 reserved2;
+	u32 next_send_psn;
+	u32 cqn_snd;
+	u32 next_snd_wqe[2];
+	u32 last_acked_psn;
+	u32 ssn;
+	u32 params2;
+	u32 rnr_nextrecvpsn;
+	u32 ra_buff_indx;
+	u32 cqn_rcv;
+	u32 next_rcv_wqe[2];
+	u32 qkey;
+	u32 srqn;
+	u32 rmsn;
+	u32 reserved3[19];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+	u32 opt_param_mask;
+	u32 reserved1;
+	struct mthca_qp_context context;
+	u32 reserved2[62];
+} __attribute__((packed));
+
+enum {
+	MTHCA_QP_OPTPAR_ALT_ADDR_PATH     = 1 << 0,
+	MTHCA_QP_OPTPAR_RRE               = 1 << 1,
+	MTHCA_QP_OPTPAR_RAE               = 1 << 2,
+	MTHCA_QP_OPTPAR_REW               = 1 << 3,
+	MTHCA_QP_OPTPAR_PKEY_INDEX        = 1 << 4,
+	MTHCA_QP_OPTPAR_Q_KEY             = 1 << 5,
+	MTHCA_QP_OPTPAR_RNR_TIMEOUT       = 1 << 6,
+	MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+	MTHCA_QP_OPTPAR_SRA_MAX           = 1 << 8,
+	MTHCA_QP_OPTPAR_RRA_MAX           = 1 << 9,
+	MTHCA_QP_OPTPAR_PM_STATE          = 1 << 10,
+	MTHCA_QP_OPTPAR_PORT_NUM          = 1 << 11,
+	MTHCA_QP_OPTPAR_RETRY_COUNT       = 1 << 12,
+	MTHCA_QP_OPTPAR_ALT_RNR_RETRY     = 1 << 13,
+	MTHCA_QP_OPTPAR_ACK_TIMEOUT       = 1 << 14,
+	MTHCA_QP_OPTPAR_RNR_RETRY         = 1 << 15,
+	MTHCA_QP_OPTPAR_SCHED_QUEUE       = 1 << 16
+};
+
+enum {
+	MTHCA_OPCODE_NOP            = 0x00,
+	MTHCA_OPCODE_RDMA_WRITE     = 0x08,
+	MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+	MTHCA_OPCODE_SEND           = 0x0a,
+	MTHCA_OPCODE_SEND_IMM       = 0x0b,
+	MTHCA_OPCODE_RDMA_READ      = 0x10,
+	MTHCA_OPCODE_ATOMIC_CS      = 0x11,
+	MTHCA_OPCODE_ATOMIC_FA      = 0x12,
+	MTHCA_OPCODE_BIND_MW        = 0x18,
+	MTHCA_OPCODE_INVALID        = 0xff
+};
+
+enum {
+	MTHCA_NEXT_DBD       = 1 << 7,
+	MTHCA_NEXT_FENCE     = 1 << 6,
+	MTHCA_NEXT_CQ_UPDATE = 1 << 3,
+	MTHCA_NEXT_EVENT_GEN = 1 << 2,
+	MTHCA_NEXT_SOLICIT   = 1 << 1,
+
+	MTHCA_MLX_VL15       = 1 << 17,
+	MTHCA_MLX_SLR        = 1 << 16
+};
+
+struct mthca_next_seg {
+	u32 nda_op;		/* [31:6] next WQE [4:0] next opcode */
+	u32 ee_nds;		/* [31:8] next EE  [7] DBD [6] F [5:0] next WQE size */
+	u32 flags;		/* [3] CQ [2] Event [1] Solicit */
+	u32 imm;		/* immediate data */
+} __attribute__((packed));
+
+struct mthca_ud_seg {
+	u32 reserved1;
+	u32 lkey;
+	u64 av_addr;
+	u32 reserved2[4];
+	u32 dqpn;
+	u32 qkey;
+	u32 reserved3[2];
+} __attribute__((packed));
+
+struct mthca_bind_seg {
+	u32 flags;		/* [31] Atomic [30] rem write [29] rem read */
+	u32 reserved;
+	u32 new_rkey;
+	u32 lkey;
+	u64 addr;
+	u64 length;
+} __attribute__((packed));
+
+struct mthca_raddr_seg {
+	u64 raddr;
+	u32 rkey;
+	u32 reserved;
+} __attribute__((packed));
+
+struct mthca_atomic_seg {
+	u64 swap_add;
+	u64 compare;
+} __attribute__((packed));
+
+struct mthca_data_seg {
+	u32 byte_count;
+	u32 lkey;
+	u64 addr;
+} __attribute__((packed));
+
+struct mthca_mlx_seg {
+	u32 nda_op;
+	u32 nds;
+	u32 flags;		/* [17] VL15 [16] SLR [14:12] static rate
+				   [11:8] SL [3] C [2] E */
+	u16 rlid;
+	u16 vcrc;
+} __attribute__((packed));
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+	else
+		return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+			((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + qp->send_wqe_offset +
+			(n << qp->sq.wqe_shift);
+	else
+		return qp->queue.page_list[(qp->send_wqe_offset +
+					    (n << qp->sq.wqe_shift)) >>
+					   PAGE_SHIFT].buf +
+			((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+			 (PAGE_SIZE - 1));
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type)
+{
+	struct mthca_qp *qp;
+	struct ib_event event;
+
+	spin_lock(&dev->qp_table.lock);
+	qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+	if (qp)
+		atomic_inc(&qp->refcount);
+	spin_unlock(&dev->qp_table.lock);
+
+	if (!qp) {
+		mthca_warn(dev, "Async event for bogus QP %08x\n", qpn);
+		return;
+	}
+
+	event.device      = &dev->ib_dev;
+	event.event       = event_type;
+	event.element.qp  = &qp->ibqp;
+	if (qp->ibqp.event_handler)
+		qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+	case IB_QPS_INIT:  return MTHCA_QP_STATE_INIT;
+	case IB_QPS_RTR:   return MTHCA_QP_STATE_RTR;
+	case IB_QPS_RTS:   return MTHCA_QP_STATE_RTS;
+	case IB_QPS_SQD:   return MTHCA_QP_STATE_SQD;
+	case IB_QPS_SQE:   return MTHCA_QP_STATE_SQE;
+	case IB_QPS_ERR:   return MTHCA_QP_STATE_ERR;
+	default:                return -1;
+	}
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+	switch (transport) {
+	case RC:  return MTHCA_QP_ST_RC;
+	case UC:  return MTHCA_QP_ST_UC;
+	case UD:  return MTHCA_QP_ST_UD;
+	case RD:  return MTHCA_QP_ST_RD;
+	case MLX: return MTHCA_QP_ST_MLX;
+	default:  return -1;
+	}
+}
+
+static const struct {
+	int trans;
+	u32 req_param[NUM_TRANS];
+	u32 opt_param[NUM_TRANS];
+} state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_INIT]  = {
+			.trans = MTHCA_TRANS_RST2INIT,
+			.req_param = {
+				[UD]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_ACCESS_FLAGS),
+				[MLX] = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+			},
+			/* bug-for-bug compatibility with VAPI: */
+			.opt_param = {
+				[MLX] = IB_QP_PORT
+			}
+		},
+	},
+	[IB_QPS_INIT]  = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_INIT]  = {
+			.trans = MTHCA_TRANS_INIT2INIT,
+			.opt_param = {
+				[UD]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_PORT       |
+					 IB_QP_ACCESS_FLAGS),
+				[MLX] = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_RTR]   = {
+			.trans = MTHCA_TRANS_INIT2RTR,
+			.req_param = {
+				[RC]  = (IB_QP_AV                  |
+					 IB_QP_PATH_MTU            |
+					 IB_QP_DEST_QPN            |
+					 IB_QP_RQ_PSN              |
+					 IB_QP_MAX_DEST_RD_ATOMIC  |
+					 IB_QP_MIN_RNR_TIMER),
+			},
+			.opt_param = {
+				[UD]  = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_ALT_PATH     |
+					 IB_QP_ACCESS_FLAGS |
+					 IB_QP_PKEY_INDEX),
+				[MLX] = (IB_QP_PKEY_INDEX |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_RTR]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_RTR2RTS,
+			.req_param = {
+				[UD]  = IB_QP_SQ_PSN,
+				[RC]  = (IB_QP_TIMEOUT           |
+					 IB_QP_RETRY_CNT         |
+					 IB_QP_RNR_RETRY         |
+					 IB_QP_SQ_PSN            |
+					 IB_QP_MAX_QP_RD_ATOMIC),
+				[MLX] = IB_QP_SQ_PSN,
+			},
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_CUR_STATE             |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_ACCESS_FLAGS          |
+					 IB_QP_PKEY_INDEX            |
+					 IB_QP_MIN_RNR_TIMER         |
+					 IB_QP_PATH_MIG_STATE),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_RTS]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_RTS2RTS,
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_ACCESS_FLAGS          |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_PATH_MIG_STATE        |
+					 IB_QP_MIN_RNR_TIMER),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.trans = MTHCA_TRANS_RTS2SQD,
+		},
+	},
+	[IB_QPS_SQD]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_SQD2RTS,
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_CUR_STATE             |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_ACCESS_FLAGS          |
+					 IB_QP_MIN_RNR_TIMER         |
+					 IB_QP_PATH_MIG_STATE),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.trans = MTHCA_TRANS_SQD2SQD,
+			.opt_param = {
+				[UD]  = (IB_QP_PKEY_INDEX            |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_AV                    |
+					 IB_QP_TIMEOUT               |
+					 IB_QP_RETRY_CNT             |
+					 IB_QP_RNR_RETRY             |
+					 IB_QP_MAX_QP_RD_ATOMIC      |
+					 IB_QP_MAX_DEST_RD_ATOMIC    |
+					 IB_QP_CUR_STATE             |
+					 IB_QP_ALT_PATH              |
+					 IB_QP_ACCESS_FLAGS          |
+					 IB_QP_PKEY_INDEX            |
+					 IB_QP_MIN_RNR_TIMER         |
+					 IB_QP_PATH_MIG_STATE),
+				[MLX] = (IB_QP_PKEY_INDEX            |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_SQE]   = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+		[IB_QPS_RTS]   = {
+			.trans = MTHCA_TRANS_SQERR2RTS,
+			.opt_param = {
+				[UD]  = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+				[RC]  = (IB_QP_CUR_STATE             |
+					 IB_QP_MIN_RNR_TIMER),
+				[MLX] = (IB_QP_CUR_STATE             |
+					 IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_ERR] = {
+		[IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+		[IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR }
+	}
+};
+
+static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+			int attr_mask)
+{
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+	int err;
+	u8 status;
+	struct mthca_init_ib_param param;
+
+	memset(&param, 0, sizeof param);
+
+	param.enable_1x = 1;
+	param.enable_4x = 1;
+	param.vl_cap    = dev->limits.vl_cap;
+	param.mtu_cap   = dev->limits.mtu_cap;
+	param.gid_cap   = dev->limits.gid_table_len;
+	param.pkey_cap  = dev->limits.pkey_table_len;
+
+	err = mthca_INIT_IB(dev, &param, port, &status);
+	if (err)
+		mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+	if (status)
+		mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
+}
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	void *mailbox = NULL;
+	struct mthca_qp_param *qp_param;
+	struct mthca_qp_context *qp_context;
+	u32 req_param, opt_param;
+	u8 status;
+	int err;
+
+	if (attr_mask & IB_QP_CUR_STATE) {
+		if (attr->cur_qp_state != IB_QPS_RTR &&
+		    attr->cur_qp_state != IB_QPS_RTS &&
+		    attr->cur_qp_state != IB_QPS_SQD &&
+		    attr->cur_qp_state != IB_QPS_SQE)
+			return -EINVAL;
+		else
+			cur_state = attr->cur_qp_state;
+	} else {
+		spin_lock_irq(&qp->lock);
+		cur_state = qp->state;
+		spin_unlock_irq(&qp->lock);
+	}
+
+	if (attr_mask & IB_QP_STATE) {
+		if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR)
+			return -EINVAL;
+		new_state = attr->qp_state;
+	} else
+		new_state = cur_state;
+
+	if (state_table[cur_state][new_state].trans == MTHCA_TRANS_INVALID) {
+		mthca_dbg(dev, "Illegal QP transition "
+			  "%d->%d\n", cur_state, new_state);
+		return -EINVAL;
+	}
+
+	req_param = state_table[cur_state][new_state].req_param[qp->transport];
+	opt_param = state_table[cur_state][new_state].opt_param[qp->transport];
+
+	if ((req_param & attr_mask) != req_param) {
+		mthca_dbg(dev, "QP transition "
+			  "%d->%d missing req attr 0x%08x\n",
+			  cur_state, new_state,
+			  req_param & ~attr_mask);
+		return -EINVAL;
+	}
+
+	if (attr_mask & ~(req_param | opt_param | IB_QP_STATE)) {
+		mthca_dbg(dev, "QP transition (transport %d) "
+			  "%d->%d has extra attr 0x%08x\n",
+			  qp->transport,
+			  cur_state, new_state,
+			  attr_mask & ~(req_param | opt_param |
+						 IB_QP_STATE));
+		return -EINVAL;
+	}
+
+	mailbox = kmalloc(sizeof (*qp_param) + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	qp_param = MAILBOX_ALIGN(mailbox);
+	qp_context = &qp_param->context;
+	memset(qp_param, 0, sizeof *qp_param);
+
+	qp_context->flags      = cpu_to_be32((to_mthca_state(new_state) << 28) |
+					     (to_mthca_st(qp->transport) << 16));
+	qp_context->flags     |= cpu_to_be32(MTHCA_QP_BIT_DE);
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+	else {
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+	/* leave sched_queue as 0 */
+	if (qp->transport == MLX || qp->transport == UD)
+		qp_context->mtu_msgmax = cpu_to_be32((IB_MTU_2048 << 29) |
+						     (11 << 24));
+	else if (attr_mask & IB_QP_PATH_MTU) {
+		qp_context->mtu_msgmax = cpu_to_be32((attr->path_mtu << 29) |
+						     (31 << 24));
+	}
+	qp_context->usr_page   = cpu_to_be32(MTHCA_KAR_PAGE);
+	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
+	if (attr_mask & IB_QP_DEST_QPN) {
+		qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+	}
+
+	if (qp->transport == MLX)
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(to_msqp(qp)->port << 24);
+	else {
+		if (attr_mask & IB_QP_PORT) {
+			qp_context->pri_path.port_pkey |=
+				cpu_to_be32(attr->port_num << 24);
+			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+		}
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(attr->pkey_index);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp_context->pri_path.rnr_retry = attr->rnr_retry << 5;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY);
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		qp_context->pri_path.g_mylmc     = attr->ah_attr.src_path_bits & 0x7f;
+		qp_context->pri_path.rlid        = cpu_to_be16(attr->ah_attr.dlid);
+		qp_context->pri_path.static_rate = (!!attr->ah_attr.static_rate) << 3;
+		if (attr->ah_attr.ah_flags & IB_AH_GRH) {
+			qp_context->pri_path.g_mylmc |= 1 << 7;
+			qp_context->pri_path.mgid_index = attr->ah_attr.grh.sgid_index;
+			qp_context->pri_path.hop_limit = attr->ah_attr.grh.hop_limit;
+			qp_context->pri_path.sl_tclass_flowlabel =
+				cpu_to_be32((attr->ah_attr.sl << 28)                |
+					    (attr->ah_attr.grh.traffic_class << 20) |
+					    (attr->ah_attr.grh.flow_label));
+			memcpy(qp_context->pri_path.rgid,
+			       attr->ah_attr.grh.dgid.raw, 16);
+		} else {
+			qp_context->pri_path.sl_tclass_flowlabel =
+				cpu_to_be32(attr->ah_attr.sl << 28);
+		}
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);	
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		qp_context->pri_path.ackto = attr->timeout;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+	}
+
+	/* XXX alt_path */
+
+	/* leave rdd as 0 */
+	qp_context->pd         = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+	/* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+	qp_context->wqe_lkey   = cpu_to_be32(qp->mr.ibmr.lkey);
+	qp_context->params1    = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+					     (MTHCA_FLIGHT_LIMIT << 24) |
+					     MTHCA_QP_BIT_SRE           |
+					     MTHCA_QP_BIT_SWE           |
+					     MTHCA_QP_BIT_SAE);
+	if (qp->sq.policy == IB_SIGNAL_ALL_WR)
+		qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+	}
+
+	/* XXX initiator resources */
+	if (attr_mask & IB_QP_SQ_PSN)
+		qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+	qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+	/* XXX RDMA/atomic enable, responder resources */
+
+	if (qp->rq.policy == IB_SIGNAL_ALL_WR)
+		qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	/* XXX ra_buff_indx */
+
+	qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+	if (attr_mask & IB_QP_QKEY) {
+		qp_context->qkey = cpu_to_be32(attr->qkey);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+	}
+
+	err = mthca_MODIFY_QP(dev, state_table[cur_state][new_state].trans,
+			      qp->qpn, 0, qp_param, 0, &status);
+	if (status) {
+		mthca_warn(dev, "modify QP %d returned status %02x.\n",
+			   state_table[cur_state][new_state].trans, status);
+		err = -EINVAL;
+	}
+
+	if (!err) {
+		spin_lock_irq(&qp->lock);
+		/* XXX deal with async transitions to ERROR */
+		qp->state = new_state;
+		spin_unlock_irq(&qp->lock);
+	}
+
+	kfree(mailbox);
+
+	if (is_sqp(dev, qp))
+		store_attrs(to_msqp(qp), attr, attr_mask);
+
+	/* 
+	 * If we are moving QP0 to RTR, bring the IB link up; if we
+	 * are moving QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR &&
+		    new_state == IB_QPS_RTR)
+			init_port(dev, to_msqp(qp)->port);
+
+		if (cur_state != IB_QPS_RESET &&
+		    cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET ||
+		     new_state == IB_QPS_ERR))
+			mthca_CLOSE_IB(dev, to_msqp(qp)->port, &status);
+	}
+
+	return err;
+}
+
+/*
+ * Allocate and register buffer for WQEs.  qp->rq.max, sq.max,
+ * rq.max_gs and sq.max_gs must all be assigned.
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and
+ * sq.wqe_shift (as well as send_wqe_offset, is_direct, and
+ * queue)
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+			       struct mthca_pd *pd,
+			       struct mthca_qp *qp)
+{
+	int size;
+	int i;
+	int npages, shift;
+	dma_addr_t t;
+	u64 *dma_list = NULL;
+	int err = -ENOMEM;
+
+	size = sizeof (struct mthca_next_seg) +
+		qp->rq.max_gs * sizeof (struct mthca_data_seg);
+
+	for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
+	     qp->rq.wqe_shift++)
+		; /* nothing */
+
+	size = sizeof (struct mthca_next_seg) +
+		qp->sq.max_gs * sizeof (struct mthca_data_seg);
+	if (qp->transport == MLX)
+		size += 2 * sizeof (struct mthca_data_seg);
+	else if (qp->transport == UD)
+		size += sizeof (struct mthca_ud_seg);
+	else /* bind seg is as big as atomic + raddr segs */
+		size += sizeof (struct mthca_bind_seg);
+
+	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+	     qp->sq.wqe_shift++)
+		; /* nothing */
+
+	qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+				    1 << qp->sq.wqe_shift);
+	size = PAGE_ALIGN(qp->send_wqe_offset +
+			  (qp->sq.max << qp->sq.wqe_shift));
+
+	qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+			   GFP_KERNEL);
+	if (!qp->wrid)
+		goto err_out;
+
+	if (size <= MTHCA_MAX_DIRECT_QP_SIZE) {
+		qp->is_direct = 1;
+		npages = 1;
+		shift = get_order(size) + PAGE_SHIFT;
+
+		if (0)
+			mthca_dbg(dev, "Creating direct QP of size %d (shift %d)\n",
+				  size, shift);
+
+		qp->queue.direct.buf = pci_alloc_consistent(dev->pdev, size, &t);
+		if (!qp->queue.direct.buf)
+			goto err_out;
+
+		pci_unmap_addr_set(&qp->queue.direct, mapping, t);
+
+		memset(qp->queue.direct.buf, 0, size);
+
+		while (t & ((1 << shift) - 1)) {
+			--shift;
+			npages *= 2;
+		}
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_out_free;
+
+		for (i = 0; i < npages; ++i)
+			dma_list[i] = t + i * (1 << shift);
+	} else {
+		qp->is_direct = 0;
+		npages = size / PAGE_SIZE;
+		shift = PAGE_SHIFT;
+
+		if (0)
+			mthca_dbg(dev, "Creating indirect QP with %d pages\n", npages);
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_out;
+
+		qp->queue.page_list = kmalloc(npages *
+					      sizeof *qp->queue.page_list,
+					      GFP_KERNEL);
+		if (!qp->queue.page_list)
+			goto err_out;
+
+		for (i = 0; i < npages; ++i) {
+			qp->queue.page_list[i].buf =
+				pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
+			if (!qp->queue.page_list[i].buf)
+				goto err_out_free;
+
+			memset(qp->queue.page_list[i].buf, 0, PAGE_SIZE);
+
+			pci_unmap_addr_set(&qp->queue.page_list[i], mapping, t);
+			dma_list[i] = t;
+		}
+	}
+
+	err = mthca_mr_alloc_phys(dev, pd->pd_num, dma_list, shift,
+				  npages, 0, size,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &qp->mr);
+	if (err)
+		goto err_out_free;
+
+	kfree(dma_list);
+	return 0;
+
+ err_out_free:
+	if (qp->is_direct) {
+		pci_free_consistent(dev->pdev, size,
+				    qp->queue.direct.buf,
+				    pci_unmap_addr(&qp->queue.direct, mapping));
+	} else
+		for (i = 0; i < npages; ++i) {
+			if (qp->queue.page_list[i].buf)
+				pci_free_consistent(dev->pdev, PAGE_SIZE,
+						    qp->queue.page_list[i].buf,
+						    pci_unmap_addr(&qp->queue.page_list[i],
+								   mapping));
+
+		}
+
+ err_out:
+	kfree(qp->wrid);
+	kfree(dma_list);
+	return err;
+}
+
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+				 struct mthca_pd *pd,
+				 struct mthca_cq *send_cq,
+				 struct mthca_cq *recv_cq,
+				 enum ib_sig_type send_policy,
+				 enum ib_sig_type recv_policy,
+				 struct mthca_qp *qp)
+{
+	int err;
+
+	spin_lock_init(&qp->lock);
+	atomic_set(&qp->refcount, 1);
+	qp->state    	 = IB_QPS_RESET;
+	qp->sq.policy    = send_policy;
+	qp->rq.policy    = recv_policy;
+	qp->rq.cur       = 0;
+	qp->sq.cur       = 0;
+	qp->rq.next      = 0;
+	qp->sq.next      = 0;
+	qp->rq.last_comp = qp->rq.max - 1;
+	qp->sq.last_comp = qp->sq.max - 1;
+	qp->rq.last      = NULL;
+	qp->sq.last      = NULL;
+
+	err = mthca_alloc_wqe_buf(dev, pd, qp);
+	return err;
+}
+
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   enum ib_sig_type recv_policy,
+		   struct mthca_qp *qp)
+{
+	int err;
+
+	switch (type) {
+	case IB_QPT_RC: qp->transport = RC; break;
+	case IB_QPT_UC: qp->transport = UC; break;
+	case IB_QPT_UD: qp->transport = UD; break;
+	default: return -EINVAL;
+	}		
+
+	qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+	if (qp->qpn == -1)
+		return -ENOMEM;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, recv_policy, qp);
+	if (err) {
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+		return err;
+	}
+
+	spin_lock_irq(&dev->qp_table.lock);
+	mthca_array_set(&dev->qp_table.qp,
+			qp->qpn & (dev->limits.num_qps - 1), qp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	return 0;
+}
+
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    enum ib_sig_type recv_policy,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp)
+{
+	int err = 0;
+	u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+
+	sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+	sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+					     &sqp->header_dma, GFP_KERNEL);
+	if (!sqp->header_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&dev->qp_table.lock);
+	if (mthca_array_get(&dev->qp_table.qp, mqpn))
+		err = -EBUSY;
+	else
+		mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	if (err)
+		goto err_out;
+
+	sqp->port = port;
+	sqp->qp.qpn       = mqpn;
+	sqp->qp.transport = MLX;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, recv_policy,
+				    &sqp->qp);
+	if (err)
+		goto err_out_free;
+
+	atomic_inc(&pd->sqp_count);
+
+	return 0;
+
+ err_out_free:
+	spin_lock_irq(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp, mqpn);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+ err_out:
+	dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+			  sqp->header_buf, sqp->header_dma);
+
+	return err;
+}
+
+void mthca_free_qp(struct mthca_dev *dev,
+		   struct mthca_qp *qp)
+{
+	u8 status;
+	int size;
+	int i;
+
+	spin_lock_irq(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp,
+			  qp->qpn & (dev->limits.num_qps - 1));
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	atomic_dec(&qp->refcount);
+	wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+	if (qp->state != IB_QPS_RESET)
+		mthca_MODIFY_QP(dev, MTHCA_TRANS_ANY2RST, qp->qpn, 0, NULL, 0, &status);
+
+	mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn);
+	if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+		mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn);
+
+	mthca_free_mr(dev, &qp->mr);
+
+	size = PAGE_ALIGN(qp->send_wqe_offset +
+			  (qp->sq.max << qp->sq.wqe_shift));
+
+	if (qp->is_direct) {
+		pci_free_consistent(dev->pdev, size,
+				    qp->queue.direct.buf,
+				    pci_unmap_addr(&qp->queue.direct, mapping));
+	} else {
+		for (i = 0; i < size / PAGE_SIZE; ++i) {
+			pci_free_consistent(dev->pdev, PAGE_SIZE,
+					    qp->queue.page_list[i].buf,
+					    pci_unmap_addr(&qp->queue.page_list[i],
+							   mapping));
+		}
+	}
+
+	kfree(qp->wrid);
+
+	if (is_sqp(dev, qp)) {
+		atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+		dma_free_coherent(&dev->pdev->dev,
+				  to_msqp(qp)->header_buf_size,
+				  to_msqp(qp)->header_buf,
+				  to_msqp(qp)->header_dma);
+	}
+	else
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/* Create UD header for an MLX send and build a data segment for it */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+			    int ind, struct ib_send_wr *wr,
+			    struct mthca_mlx_seg *mlx,
+			    struct mthca_data_seg *data)
+{
+	int header_size;
+	int err;
+
+	ib_ud_header_init(256, /* assume a MAD */
+			  sqp->ud_header.grh_present,
+			  &sqp->ud_header);
+
+	err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+	if (err)
+		return err;
+	mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+				  (sqp->ud_header.lrh.destination_lid == 0xffff ?
+				   MTHCA_MLX_SLR : 0) |
+				  (sqp->ud_header.lrh.service_level << 8));
+	mlx->rlid = sqp->ud_header.lrh.destination_lid;
+	mlx->vcrc = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data = wr->imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	if (sqp->ud_header.lrh.destination_lid == 0xffff)
+		sqp->ud_header.lrh.source_lid = 0xffff;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	if (!sqp->qp.ibqp.qp_num)
+		ib_cached_pkey_get(&dev->ib_dev, sqp->port,
+				   sqp->pkey_index,
+				   &sqp->ud_header.bth.pkey);
+	else
+		ib_cached_pkey_get(&dev->ib_dev, sqp->port,
+				   wr->wr.ud.pkey_index,
+				   &sqp->ud_header.bth.pkey);
+	cpu_to_be16s(&sqp->ud_header.bth.pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->ud_header,
+					sqp->header_buf +
+					ind * MTHCA_UD_HEADER_SIZE);
+
+	data->byte_count = cpu_to_be32(header_size);
+	data->lkey       = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+	data->addr       = cpu_to_be64(sqp->header_dma +
+				       ind * MTHCA_UD_HEADER_SIZE);
+
+	return 0;
+}
+
+int mthca_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		    struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	int size0 = 0;
+	u32 f0 = 0;
+	int ind;
+	u8 op0 = 0;
+
+	static const u8 opcode[] = {
+		[IB_WR_SEND]                 = MTHCA_OPCODE_SEND,
+		[IB_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
+		[IB_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
+		[IB_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
+		[IB_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
+		[IB_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
+		[IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+	};
+
+	spin_lock_irqsave(&qp->lock, flags);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.next;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (qp->sq.cur + nreq >= qp->sq.max) {
+			mthca_err(dev, "SQ full (%d posted, %d max, %d nreq)\n",
+				  qp->sq.cur, qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->flags = wr->imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		if (qp->transport == UD) {
+			((struct mthca_ud_seg *) wqe)->lkey =
+				cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+			((struct mthca_ud_seg *) wqe)->av_addr =
+				cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+			((struct mthca_ud_seg *) wqe)->dqpn =
+				cpu_to_be32(wr->wr.ud.remote_qpn);
+			((struct mthca_ud_seg *) wqe)->qkey =
+				cpu_to_be32(wr->wr.ud.remote_qkey);
+
+			wqe += sizeof (struct mthca_ud_seg);
+			size += sizeof (struct mthca_ud_seg) / 16;
+		} else if (qp->transport == MLX) {
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mthca_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mthca_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= ARRAY_SIZE(opcode)) {
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (prev_wqe) {
+			((struct mthca_next_seg *) prev_wqe)->nda_op =
+				cpu_to_be32(((ind << qp->sq.wqe_shift) +
+					     qp->send_wqe_offset) |
+					    opcode[wr->opcode]);
+			smp_wmb();
+			((struct mthca_next_seg *) prev_wqe)->ee_nds =
+				cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);
+		}
+
+		if (!size0) {
+			size0 = size;
+			op0   = opcode[wr->opcode];
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (nreq) {
+		u32 doorbell[2];
+
+		doorbell[0] = cpu_to_be32(((qp->sq.next << qp->sq.wqe_shift) +
+					   qp->send_wqe_offset) | f0 | op0);
+		doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+
+		wmb();
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	qp->sq.cur += nreq;
+	qp->sq.next = ind;
+
+	spin_unlock_irqrestore(&qp->lock, flags);
+	return err;
+}
+
+int mthca_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		       struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	int size0 = 0;
+	int ind;
+	void *wqe;
+	void *prev_wqe;
+
+	spin_lock_irqsave(&qp->lock, flags);
+	
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.next;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (qp->rq.cur + nreq >= qp->rq.max) {
+			mthca_err(dev, "RQ %06x full\n", qp->qpn);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+		prev_wqe = qp->rq.last;
+		qp->rq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds = 
+			cpu_to_be32(MTHCA_NEXT_DBD);
+		((struct mthca_next_seg *) wqe)->flags =
+			(wr->recv_flags & IB_RECV_SIGNALED) ?
+			cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		if (wr->num_sge > qp->rq.max_gs) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32(wr->sg_list[i].length);
+			((struct mthca_data_seg *) wqe)->lkey =
+				cpu_to_be32(wr->sg_list[i].lkey);
+			((struct mthca_data_seg *) wqe)->addr =
+				cpu_to_be64(wr->sg_list[i].addr);
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind] = wr->wr_id;
+
+		if (prev_wqe) {
+			((struct mthca_next_seg *) prev_wqe)->nda_op =
+				cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
+			smp_wmb();
+			((struct mthca_next_seg *) prev_wqe)->ee_nds =
+				cpu_to_be32(MTHCA_NEXT_DBD | size);
+		}
+
+		if (!size0)
+			size0 = size;
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;
+	}
+
+out:
+	if (nreq) {
+		u32 doorbell[2];
+
+		doorbell[0] = cpu_to_be32((qp->rq.next << qp->rq.wqe_shift) | size0);
+		doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq);
+
+		wmb();
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_RECEIVE_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	qp->rq.cur += nreq;
+	qp->rq.next = ind;
+
+	spin_unlock_irqrestore(&qp->lock, flags);
+	return err;
+}
+
+int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
+		       int index, int *dbd, u32 *new_wqe)
+{
+	struct mthca_next_seg *next;
+
+	if (is_send)
+		next = get_send_wqe(qp, index);
+	else
+		next = get_recv_wqe(qp, index);
+
+	*dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+	if (next->ee_nds & cpu_to_be32(0x3f))
+		*new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
+			(next->ee_nds & cpu_to_be32(0x3f));
+	else
+		*new_wqe = 0;
+
+	return 0;
+}
+
+int __devinit mthca_init_qp_table(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+	int i;
+
+	spin_lock_init(&dev->qp_table.lock);
+
+	/*
+	 * We reserve 2 extra QPs per port for the special QPs.  The
+	 * special QP for port 1 has to be even, so round up.
+	 */
+	dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+	err = mthca_alloc_init(&dev->qp_table.alloc,
+			       dev->limits.num_qps,
+			       (1 << 24) - 1,
+			       dev->qp_table.sqp_start +
+			       MTHCA_MAX_PORTS * 2);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->qp_table.qp,
+			       dev->limits.num_qps);
+	if (err) {
+		mthca_alloc_cleanup(&dev->qp_table.alloc);
+		return err;
+	}
+
+	for (i = 0; i < 2; ++i) {
+		err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+					    dev->qp_table.sqp_start + i * 2,
+					    &status);
+		if (err)
+			goto err_out;
+		if (status) {
+			mthca_warn(dev, "CONF_SPECIAL_QP returned "
+				   "status %02x, aborting.\n",
+				   status);
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+	return 0;
+
+ err_out:
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+	mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+	return err;
+}
+
+void __devexit mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+	int i;
+	u8 status;
+
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
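
For reference, the bring-up sequence that the state_table and attribute-mask
checks above enforce looks roughly like this from a consumer's point of view.
This is only a hedged sketch, not part of the patch: it assumes a UD queue
pair qp that has already been created, uses the ib_modify_qp() verb from the
core verbs patches earlier in this series, and fills in placeholder values for
the port, Q_Key and starting PSN.

	struct ib_qp_attr attr;
	int ret;

	memset(&attr, 0, sizeof attr);

	/* RESET -> INIT: for UD the table requires P_Key index, port and Q_Key */
	attr.qp_state   = IB_QPS_INIT;
	attr.pkey_index = 0;
	attr.port_num   = 1;		/* placeholder port */
	attr.qkey       = 0x11111111;	/* placeholder Q_Key */
	ret = ib_modify_qp(qp, &attr,
			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY);
	if (ret)
		return ret;

	/* INIT -> RTR: a UD QP needs no required attributes beyond the new state */
	attr.qp_state = IB_QPS_RTR;
	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret)
		return ret;

	/* RTR -> RTS: the table requires the initial send PSN */
	attr.qp_state = IB_QPS_RTS;
	attr.sq_psn   = 0;		/* placeholder starting PSN */
	return ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);

Each call ends up in mthca_modify_qp() above, which looks up the transition in
state_table, rejects the request if a required attribute is missing from the
mask (or an extra one is present), and only then builds the firmware mailbox.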


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH][v3][13/21] Add Mellanox HCA low-level driver (last bits)
  2004-12-13 18:09             ` [PATCH][v3][12/21] Add Mellanox HCA low-level driver (QP/CQ) Roland Dreier
@ 2004-12-13 18:09               ` Roland Dreier
  2004-12-13 18:09                 ` [PATCH][v3][14/21] Add Mellanox HCA low-level driver (MAD) Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add code for the remaining InfiniBand objects (address vectors, multicast
groups, memory regions and protection domains).

Signed-off-by: Roland Dreier <roland@topspin.com>


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_av.c	2004-12-13 09:44:46.775834455 -0800
@@ -0,0 +1,205 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_av.c 1298 2004-11-29 03:26:10Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_verbs.h>
+#include <ib_cache.h>
+
+#include "mthca_dev.h"
+
+struct mthca_av {
+	u32 port_pd;
+	u8  reserved1;
+	u8  g_slid;
+	u16 dlid;
+	u8  reserved2;
+	u8  gid_index;
+	u8  msg_sr;
+	u8  hop_limit;
+	u32 sl_tclass_flowlabel;
+	u32 dgid[4];
+};
+
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah)
+{
+	u32 index = -1;
+	struct mthca_av *av = NULL;
+
+	ah->on_hca = 0;
+
+	if (!atomic_read(&pd->sqp_count) &&
+	    !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		index = mthca_alloc(&dev->av_table.alloc);
+
+		/* fall back to allocating in host memory */
+		if (index == -1)
+			goto host_alloc;
+
+		av = kmalloc(sizeof *av, GFP_KERNEL);
+		if (!av)
+			goto host_alloc;
+			
+		ah->on_hca = 1;
+		ah->avdma  = dev->av_table.ddr_av_base +
+			index * MTHCA_AV_SIZE;
+	}
+
+ host_alloc:
+	if (!ah->on_hca) {
+		ah->av = pci_pool_alloc(dev->av_table.pool,
+					SLAB_KERNEL, &ah->avdma);
+		if (!ah->av)
+			return -ENOMEM;
+
+		av = ah->av;
+	}
+
+	ah->key = pd->ntmr.ibmr.lkey;
+
+	memset(av, 0, MTHCA_AV_SIZE);
+
+	av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+	av->g_slid  = ah_attr->src_path_bits;
+	av->dlid    = cpu_to_be16(ah_attr->dlid);
+	av->msg_sr  = (3 << 4) | /* 2K message */
+		ah_attr->static_rate;
+	av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		av->g_slid |= 0x80;
+		av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+			ah_attr->grh.sgid_index;
+		av->hop_limit = ah_attr->grh.hop_limit;
+		av->sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+	}
+
+	if (0) {
+		int j;
+		
+		mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+			  av, (unsigned long) ah->avdma);
+		for (j = 0; j < 8; ++j)
+			printk(KERN_DEBUG "  [%2x] %08x\n",
+			       j * 4, be32_to_cpu(((u32 *) av)[j]));
+	}
+
+	if (ah->on_hca) {
+		memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+			    av, MTHCA_AV_SIZE);
+		kfree(av);
+	}
+
+	return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+	if (ah->on_hca)
+		mthca_free(&dev->av_table.alloc,
+ 			   (ah->avdma - dev->av_table.ddr_av_base) /
+			   MTHCA_AV_SIZE);
+	else
+		pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+
+	return 0;
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header)
+{
+	if (ah->on_hca)
+		return -EINVAL;
+
+	header->lrh.service_level   = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+	header->lrh.destination_lid = ah->av->dlid;
+	header->lrh.source_lid      = ah->av->g_slid & 0x7f;
+	if (ah->av->g_slid & 0x80) {
+		header->grh_present = 1;
+		header->grh.traffic_class =
+			(be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
+		header->grh.flow_label    =
+			ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		ib_cached_gid_get(&dev->ib_dev,
+				  be32_to_cpu(ah->av->port_pd) >> 24,
+				  ah->av->gid_index,
+				  &header->grh.source_gid);
+		memcpy(header->grh.destination_gid.raw,
+		       ah->av->dgid, 16);
+	} else {
+		header->grh_present = 0;
+	}
+
+	return 0;
+}
+
+int __devinit mthca_init_av_table(struct mthca_dev *dev)
+{
+	int err;
+
+	err = mthca_alloc_init(&dev->av_table.alloc,
+			       dev->av_table.num_ddr_avs,
+			       dev->av_table.num_ddr_avs - 1,
+			       0);
+	if (err)
+		return err;
+
+	dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+					     MTHCA_AV_SIZE,
+					     MTHCA_AV_SIZE, 0);
+	if (!dev->av_table.pool)
+		goto out_free_alloc;
+
+	if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+					       dev->av_table.ddr_av_base -
+					       dev->ddr_start,
+					       dev->av_table.num_ddr_avs *
+					       MTHCA_AV_SIZE);
+		if (!dev->av_table.av_map)
+			goto out_free_pool;
+	} else
+		dev->av_table.av_map = NULL;
+
+	return 0;
+
+ out_free_pool:
+	pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+	return -ENOMEM;
+}
+
+void __devexit mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+	if (dev->av_table.av_map)
+		iounmap(dev->av_table.av_map);
+	pci_pool_destroy(dev->av_table.pool);
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+}
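
To show where these address vectors fit in, here is a rough consumer-side
sketch of a UD send, not part of the patch: it assumes the ib_create_ah() and
ib_post_send() verbs from the core patches in this series, a previously
created UD QP qp and PD pd, and placeholder values for the destination LID,
QPN, Q_Key and the DMA-mapped buffer.  The AV created here is what
mthca_post_send() in the previous patch references through wr.wr.ud.ah.

	struct ib_ah_attr ah_attr = {
		.dlid     = 0x1b,	/* placeholder destination LID */
		.sl       = 0,
		.port_num = 1,
	};
	struct ib_ah *ah;
	struct ib_sge sge;
	struct ib_send_wr wr, *bad_wr;

	ah = ib_create_ah(pd, &ah_attr);	/* lands in mthca_create_ah() */
	if (IS_ERR(ah))
		return PTR_ERR(ah);

	sge.addr   = buf_dma;			/* placeholder DMA address */
	sge.length = buf_len;			/* placeholder length */
	sge.lkey   = mr_lkey;			/* placeholder memory region key */

	memset(&wr, 0, sizeof wr);
	wr.opcode            = IB_WR_SEND;
	wr.send_flags        = IB_SEND_SIGNALED;
	wr.sg_list           = &sge;
	wr.num_sge           = 1;
	wr.wr.ud.ah          = ah;
	wr.wr.ud.remote_qpn  = 0x123456;	/* placeholder destination QPN */
	wr.wr.ud.remote_qkey = 0x11111111;	/* placeholder Q_Key */

	return ib_post_send(qp, &wr, &bad_wr);

Depending on whether DDR memory is hidden, the AV lives either on the HCA or
in the pci_pool created above; in both cases ah->avdma is the address that
ends up in the UD segment of the work queue entry.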
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_mcg.c	2004-12-13 09:44:46.802830478 -0800
@@ -0,0 +1,365 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_mcg.c 1298 2004-11-29 03:26:10Z roland $
+ */
+
+#include <linux/init.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
+};
+
+struct mthca_mgm {
+	u32 next_gid_index;
+	u32 reserved[3];
+	u8  gid[16];
+	u32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+/*
+ * Caller must hold MCG table semaphore.  gid and mgm parameters must
+ * be properly aligned for the command interface.
+ *
+ * Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * If GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+		    u8 *gid, struct mthca_mgm *mgm,
+		    u16 *hash, int *prev, int *index)
+{
+	void *mailbox;
+	u8 *mgid;
+	int err;
+	u8 status;
+
+	mailbox = kmalloc(16 + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	mgid = MAILBOX_ALIGN(mailbox);
+
+	memcpy(mgid, gid, 16);
+
+	err = mthca_MGID_HASH(dev, mgid, hash, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "MGID_HASH returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (0)
+		mthca_dbg(dev, "Hash for %04x:%04x:%04x:%04x:"
+			  "%04x:%04x:%04x:%04x is %04x\n",
+			  be16_to_cpu(((u16 *) gid)[0]), be16_to_cpu(((u16 *) gid)[1]),
+			  be16_to_cpu(((u16 *) gid)[2]), be16_to_cpu(((u16 *) gid)[3]),
+			  be16_to_cpu(((u16 *) gid)[4]), be16_to_cpu(((u16 *) gid)[5]),
+			  be16_to_cpu(((u16 *) gid)[6]), be16_to_cpu(((u16 *) gid)[7]),
+			  *hash);
+
+	*index = *hash;
+	*prev  = -1;
+
+	do {
+		err = mthca_READ_MGM(dev, *index, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (!memcmp(mgm->gid, zero_gid, 16)) {
+			if (*index != *hash) {
+				mthca_err(dev, "Found zero MGID in AMGM.\n");
+				err = -EINVAL;
+			}
+			goto out;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16))
+			goto out;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 5;
+	} while (*index);
+
+	*index = -1;
+
+ out:
+	kfree(mailbox);
+	return err;
+}
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	void *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int index, prev;
+	int link = 0;
+	int i;
+	int err;
+	u8 status;
+
+	mailbox = kmalloc(sizeof *mgm + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	mgm = MAILBOX_ALIGN(mailbox);
+
+	if (down_interruptible(&dev->mcg_table.sem))
+		return -EINTR;
+
+	err = find_mgm(dev, gid->raw, mgm, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		if (!memcmp(mgm->gid, zero_gid, 16))
+			memcpy(mgm->gid, gid->raw, 16);
+	} else {
+		link = 1;
+
+		index = mthca_alloc(&dev->mcg_table.alloc);
+		if (index == -1) {
+			mthca_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = mthca_READ_MGM(dev, index, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		memcpy(mgm->gid, gid->raw, 16);
+		mgm->next_gid_index = 0;
+	}
+
+	for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+		if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+			mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+			break;
+		}
+
+	if (i == MTHCA_QP_PER_MGM) {
+		mthca_err(dev, "MGM at index %x is full.\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = mthca_WRITE_MGM(dev, index, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+	}
+
+	if (!link)
+		goto out;
+
+	err = mthca_READ_MGM(dev, prev, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "READ_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mgm->next_gid_index = cpu_to_be32(index << 5);
+
+	err = mthca_WRITE_MGM(dev, prev, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+	}
+
+ out:
+	up(&dev->mcg_table.sem);
+	kfree(mailbox);
+	return err;
+}
+
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	void *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int prev, index;
+	int i, loc;
+	int err;
+	u8 status;
+
+	mailbox = kmalloc(sizeof *mgm + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+	if (!mailbox)
+		return -ENOMEM;
+	mgm = MAILBOX_ALIGN(mailbox);
+
+	if (down_interruptible(&dev->mcg_table.sem))
+		return -EINTR;
+
+	err = find_mgm(dev, gid->raw, mgm, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {	
+		mthca_err(dev, "MGID %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
+			  "not found\n",
+			  be16_to_cpu(((u16 *) gid->raw)[0]),
+			  be16_to_cpu(((u16 *) gid->raw)[1]),
+			  be16_to_cpu(((u16 *) gid->raw)[2]),
+			  be16_to_cpu(((u16 *) gid->raw)[3]),
+			  be16_to_cpu(((u16 *) gid->raw)[4]),
+			  be16_to_cpu(((u16 *) gid->raw)[5]),
+			  be16_to_cpu(((u16 *) gid->raw)[6]),
+			  be16_to_cpu(((u16 *) gid->raw)[7]));
+		err = -EINVAL;
+		goto out;
+	}
+
+	for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+		if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+			loc = i;
+		if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+			break;
+	}
+
+	if (loc == -1) {
+		mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mgm->qp[loc]   = mgm->qp[i - 1];
+	mgm->qp[i - 1] = 0;
+
+	err = mthca_WRITE_MGM(dev, index, mgm, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (i != 1)
+		goto out;
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		if (be32_to_cpu(mgm->next_gid_index) >> 5) {
+			err = mthca_READ_MGM(dev,
+					     be32_to_cpu(mgm->next_gid_index) >> 5,
+					     mgm, &status);
+			if (err)
+				goto out;
+			if (status) {
+				mthca_err(dev, "READ_MGM returned status %02x\n",
+					  status);
+				err = -EINVAL;
+				goto out;
+			}
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mthca_WRITE_MGM(dev, index, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+	} else {
+		/* Remove entry from AMGM */
+		index = be32_to_cpu(mgm->next_gid_index) >> 5;
+		err = mthca_READ_MGM(dev, prev, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		mgm->next_gid_index = cpu_to_be32(index << 5);
+
+		err = mthca_WRITE_MGM(dev, prev, mgm, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+ out:
+	up(&dev->mcg_table.sem);
+	kfree(mailbox);
+	return err;
+}
+
+int __devinit mthca_init_mcg_table(struct mthca_dev *dev)
+{
+	int err;
+
+	err = mthca_alloc_init(&dev->mcg_table.alloc,
+			       dev->limits.num_amgms,
+			       dev->limits.num_amgms - 1,
+			       0);
+	if (err)
+		return err;
+
+	init_MUTEX(&dev->mcg_table.sem);
+
+	return 0;
+}
+
+void __devexit mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+	mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_mr.c	2004-12-13 09:44:46.828826648 -0800
@@ -0,0 +1,385 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_mr.c 1298 2004-11-29 03:26:10Z roland $
+ */
+
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+	u32 flags;
+	u32 page_size;
+	u32 key;
+	u32 pd;
+	u64 start;
+	u64 length;
+	u32 lkey;
+	u32 window_count;
+	u32 window_count_limit;
+	u64 mtt_seg;
+	u32 reserved[3];
+} __attribute__((packed));
+
+#define MTHCA_MPT_FLAG_SW_OWNS       (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO           (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL      (1 <<  9)
+#define MTHCA_MPT_FLAG_REGION        (1 <<  8)
+
+#define MTHCA_MTT_FLAG_PRESENT       1
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+static u32 mthca_alloc_mtt(struct mthca_dev *dev, int order)
+{
+	int o;
+	int m;
+	u32 seg;
+
+	spin_lock(&dev->mr_table.mpt_alloc.lock);
+
+	for (o = order; o <= dev->mr_table.max_mtt_order; ++o) {
+		m = 1 << (dev->mr_table.max_mtt_order - o);
+		seg = find_first_bit(dev->mr_table.mtt_buddy[o], m);
+		if (seg < m)
+			goto found;
+	}
+
+	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+	return -1;
+
+ found:
+	clear_bit(seg, dev->mr_table.mtt_buddy[o]);
+
+	while (o > order) {
+		--o;
+		seg <<= 1;
+		set_bit(seg ^ 1, dev->mr_table.mtt_buddy[o]);
+	}
+					  
+	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+
+	seg <<= order;
+
+	return seg;
+}
+
+static void mthca_free_mtt(struct mthca_dev *dev, u32 seg, int order)
+{
+	seg >>= order;
+
+	spin_lock(&dev->mr_table.mpt_alloc.lock);
+
+	while (test_bit(seg ^ 1, dev->mr_table.mtt_buddy[order])) {
+		clear_bit(seg ^ 1, dev->mr_table.mtt_buddy[order]);
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, dev->mr_table.mtt_buddy[order]);
+
+	spin_unlock(&dev->mr_table.mpt_alloc.lock);
+}
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr)
+{
+	void *mailbox;
+	struct mthca_mpt_entry *mpt_entry;
+	int err;
+	u8 status;
+
+	might_sleep();
+
+	mr->order = -1;
+	mr->ibmr.lkey = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (mr->ibmr.lkey == -1)
+		return -ENOMEM;
+	mr->ibmr.rkey = mr->ibmr.lkey;
+
+	mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox) {
+		mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+		return -ENOMEM;
+	}
+	mpt_entry = MAILBOX_ALIGN(mailbox);
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_PHYSICAL    |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+	mpt_entry->page_size = 0;
+	mpt_entry->key       = cpu_to_be32(mr->ibmr.lkey);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	mpt_entry->start     = 0;
+	mpt_entry->length    = ~0ULL;
+
+	memset(&mpt_entry->lkey, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+	err = mthca_SW2HW_MPT(dev, mpt_entry,
+			      mr->ibmr.lkey & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err)
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+	else if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+	}
+
+	kfree(mailbox);
+	return err;
+}
+
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr)
+{
+	void *mailbox;
+	u64 *mtt_entry;
+	struct mthca_mpt_entry *mpt_entry;
+	int err = -ENOMEM;
+	u8 status;
+	int i;
+
+	might_sleep();
+	WARN_ON(buffer_size_shift >= 32);
+
+	mr->ibmr.lkey = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (mr->ibmr.lkey == -1)
+		return -ENOMEM;
+	mr->ibmr.rkey = mr->ibmr.lkey;
+
+	for (i = dev->limits.mtt_seg_size / 8, mr->order = 0;
+	     i < list_len;
+	     i <<= 1, ++mr->order)
+		/* nothing */ ;
+
+	mr->first_seg = mthca_alloc_mtt(dev, mr->order);
+	if (mr->first_seg == -1)
+		goto err_out_mpt_free;
+
+	/*
+	 * If list_len is odd, we add one more dummy entry for
+	 * firmware efficiency.
+	 */
+	mailbox = kmalloc(max(sizeof *mpt_entry,
+			      (size_t) 8 * (list_len + (list_len & 1) + 2)) +
+			  MTHCA_CMD_MAILBOX_EXTRA,
+			  GFP_KERNEL);
+	if (!mailbox)
+		goto err_out_free_mtt;
+
+	mtt_entry = MAILBOX_ALIGN(mailbox);
+
+	mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base +
+				   mr->first_seg * dev->limits.mtt_seg_size);
+	mtt_entry[1] = 0;
+	for (i = 0; i < list_len; ++i)
+		mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] |
+					       MTHCA_MTT_FLAG_PRESENT);
+	if (list_len & 1) {
+		mtt_entry[i + 2] = 0;
+		++list_len;
+	}
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry\n");
+		for (i = 0; i < list_len + 2; ++i)
+			printk(KERN_ERR "[%2d] %016llx\n",
+			       i, (unsigned long long) be64_to_cpu(mtt_entry[i]));
+	}
+
+	err = mthca_WRITE_MTT(dev, mtt_entry, list_len, &status);
+	if (err) {
+		mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+		goto err_out_mailbox_free;
+	}
+	if (status) {
+		mthca_warn(dev, "WRITE_MTT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_mailbox_free;
+	}
+
+	mpt_entry = MAILBOX_ALIGN(mailbox);
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+
+	mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+	mpt_entry->key       = cpu_to_be32(mr->ibmr.lkey);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	mpt_entry->start     = cpu_to_be64(iova);
+	mpt_entry->length    = cpu_to_be64(total_size);
+	memset(&mpt_entry->lkey, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+	mpt_entry->mtt_seg   = cpu_to_be64(dev->mr_table.mtt_base +
+					   mr->first_seg * dev->limits.mtt_seg_size);
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mpt_entry,
+			      mr->ibmr.lkey & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err)
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+	else if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+	}
+
+	kfree(mailbox);
+	return err;
+
+ err_out_mailbox_free:
+	kfree(mailbox);
+
+ err_out_free_mtt:
+	mthca_free_mtt(dev, mr->first_seg, mr->order);
+
+ err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+	return err;
+}
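
For reference, the WRITE_MTT mailbox assembled above has the following layout
(a restatement of the code, not of the firmware documentation):

	mtt_entry[0]		address within the MTT table to start writing at
				(mtt_base + first_seg * mtt_seg_size)
	mtt_entry[1]		reserved, zero
	mtt_entry[2..len+1]	page bus addresses, each OR'd with MTHCA_MTT_FLAG_PRESENT
	(one extra zero entry is appended when list_len is odd, so the
	firmware always sees an even number of entries)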
+
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+	int err;
+	u8 status;
+
+	might_sleep();
+
+	err = mthca_HW2SW_MPT(dev, NULL,
+			      mr->ibmr.lkey & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
+			   status);
+
+	if (mr->order >= 0)
+		mthca_free_mtt(dev, mr->first_seg, mr->order);
+
+	mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);		   
+}
+
+int __devinit mthca_init_mr_table(struct mthca_dev *dev)
+{
+	int err;
+	int i, s;
+
+	err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+			       dev->limits.num_mpts,
+			       ~0, dev->limits.reserved_mrws);
+	if (err)
+		return err;
+
+	err = -ENOMEM;
+
+	for (i = 1, dev->mr_table.max_mtt_order = 0;
+	     i < dev->limits.num_mtt_segs;
+	     i <<= 1, ++dev->mr_table.max_mtt_order)
+		/* nothing */ ;
+
+	dev->mr_table.mtt_buddy = kmalloc((dev->mr_table.max_mtt_order + 1) *
+					  sizeof (long *),
+					  GFP_KERNEL);
+	if (!dev->mr_table.mtt_buddy)
+		goto err_out;
+
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+		dev->mr_table.mtt_buddy[i] = NULL;
+
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i) {
+		s = BITS_TO_LONGS(1 << (dev->mr_table.max_mtt_order - i));
+		dev->mr_table.mtt_buddy[i] = kmalloc(s * sizeof (long),
+						     GFP_KERNEL);
+		if (!dev->mr_table.mtt_buddy[i])
+			goto err_out_free;
+		bitmap_zero(dev->mr_table.mtt_buddy[i],
+			    1 << (dev->mr_table.max_mtt_order - i));
+	}
+
+	set_bit(0, dev->mr_table.mtt_buddy[dev->mr_table.max_mtt_order]);
+
+	for (i = 0; i < dev->mr_table.max_mtt_order; ++i)
+		if (1 << i >= dev->limits.reserved_mtts)
+			break;
+
+	if (i == dev->mr_table.max_mtt_order) {
+		mthca_err(dev, "MTT table of order %d is "
+			  "too small.\n", i);
+		goto err_out_free;
+	}
+
+	(void) mthca_alloc_mtt(dev, i);
+
+	return 0;
+
+ err_out_free:
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+		kfree(dev->mr_table.mtt_buddy[i]);
+
+ err_out:
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+	return err;
+}
+
+void __devexit mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+	int i;
+
+	/* XXX check if any MRs are still allocated? */
+	for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+		kfree(dev->mr_table.mtt_buddy[i]);
+	kfree(dev->mr_table.mtt_buddy);
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
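
(Note the throwaway mthca_alloc_mtt() call at the end of mthca_init_mr_table():
it permanently claims the smallest power-of-two block that covers
dev->limits.reserved_mtts, i.e. the segments the firmware keeps for itself at
the start of the MTT table, so the buddy allocator never hands them out.  For
example, with 5 reserved segments the loop picks order 3 and segments 0-7 stay
allocated for the life of the driver.)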
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_pd.c	2004-12-13 09:44:46.861821787 -0800
@@ -0,0 +1,69 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_pd.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+	int err;
+
+	might_sleep();
+
+	atomic_set(&pd->sqp_count, 0);
+	pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+	if (pd->pd_num == -1)
+		return -ENOMEM;
+
+	err = mthca_mr_alloc_notrans(dev, pd->pd_num,
+				     MTHCA_MPT_FLAG_LOCAL_READ |
+				     MTHCA_MPT_FLAG_LOCAL_WRITE,
+				     &pd->ntmr);
+	if (err)
+		mthca_free(&dev->pd_table.alloc, pd->pd_num);
+
+	return err;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+	might_sleep();
+	mthca_free_mr(dev, &pd->ntmr);
+	mthca_free(&dev->pd_table.alloc, pd->pd_num);
+}
+
+int __devinit mthca_init_pd_table(struct mthca_dev *dev)
+{
+	return mthca_alloc_init(&dev->pd_table.alloc,
+				dev->limits.num_pds,
+				(1 << 24) - 1,
+				dev->limits.reserved_pds);
+}
+
+void __devexit mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+	/* XXX check if any PDs are still allocated? */
+	mthca_alloc_cleanup(&dev->pd_table.alloc);
+}



* [PATCH][v3][14/21] Add Mellanox HCA low-level driver (MAD)
  2004-12-13 18:09               ` [PATCH][v3][13/21] Add Mellanox HCA low-level driver (last bits) Roland Dreier
@ 2004-12-13 18:09                 ` Roland Dreier
  2004-12-13 18:09                   ` [PATCH][v3][15/21] IPoIB IPv4 multicast Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add MAD (management datagram) code for the Mellanox HCA driver.

Signed-off-by: Roland Dreier <roland@topspin.com>


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/hw/mthca/mthca_mad.c	2004-12-13 09:44:47.344750643 -0800
@@ -0,0 +1,314 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: mthca_mad.c 1288 2004-11-24 01:12:39Z roland $
+ */
+
+#include <ib_verbs.h>
+#include <ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	IB_SM_PORT_INFO        = 0x0015,
+	IB_SM_PKEY_TABLE       = 0x0016,
+	IB_SM_SM_INFO          = 0x0020,
+	IB_SM_VENDOR_START     = 0xff00
+};
+
+enum {
+	MTHCA_VENDOR_CLASS1 = 0x9,
+	MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+struct mthca_trap_mad {
+	struct ib_mad *mad;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+static void update_sm_ah(struct mthca_dev *dev,
+			 u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_ah *new_ah;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+
+	if (!dev->send_agent[port_num - 1][0])
+		return;
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = lid;
+	ah_attr.sl       = sl;
+	ah_attr.port_num = port_num;
+
+	new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+			      &ah_attr);
+	if (IS_ERR(new_ah))
+		return;
+
+	spin_lock_irqsave(&dev->sm_lock, flags);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = new_ah;
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+		      u8 port_num,
+		      struct ib_mad *mad)
+{
+	struct ib_event event;
+
+	if ((mad->mad_hdr.mgmt_class  == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class  == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method     == IB_MGMT_METHOD_SET) {
+		if (mad->mad_hdr.attr_id == cpu_to_be16(IB_SM_PORT_INFO)) {
+			update_sm_ah(to_mdev(ibdev), port_num,
+				     be16_to_cpup((__be16 *) (mad->data + 58)),
+				     (*(u8 *) (mad->data + 76)) & 0xf);
+
+			event.device           = ibdev;
+			event.event            = IB_EVENT_LID_CHANGE;
+			event.element.port_num = port_num;
+			ib_dispatch_event(&event);
+		}
+
+		if (mad->mad_hdr.attr_id == cpu_to_be16(IB_SM_PKEY_TABLE)) {
+			event.device           = ibdev;
+			event.event            = IB_EVENT_PKEY_CHANGE;
+			event.element.port_num = port_num;
+			ib_dispatch_event(&event);
+		}
+	}
+}
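
For what it's worth, a consumer sees these synthesized events through the core
event-handler interface.  This is a hedged sketch using the
INIT_IB_EVENT_HANDLER()/ib_register_event_handler() API from the core verbs
support, not code from this patch; IPoIB's ipoib_event() later in this series
does essentially the same thing to trigger a reflush:

static struct ib_event_handler my_handler;

static void my_port_event_handler(struct ib_event_handler *handler,
				  struct ib_event *record)
{
	if (record->event == IB_EVENT_LID_CHANGE ||
	    record->event == IB_EVENT_PKEY_CHANGE)
		printk(KERN_INFO "%s: port %d LID/P_Key changed\n",
		       record->device->name, record->element.port_num);
}

	/* ... then, with a struct ib_device *device in hand: */
	INIT_IB_EVENT_HANDLER(&my_handler, device, my_port_event_handler);
	ib_register_event_handler(&my_handler);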
+
+static void forward_trap(struct mthca_dev *dev,
+			 u8 port_num,
+			 struct ib_mad *mad)
+{
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct mthca_trap_mad *tmad;
+	struct ib_sge      gather_list;
+	struct ib_send_wr *bad_wr, wr = {
+		.opcode      = IB_WR_SEND,
+		.sg_list     = &gather_list,
+		.num_sge     = 1,
+		.send_flags  = IB_SEND_SIGNALED,
+		.wr	     = {
+			 .ud = {
+				 .remote_qpn  = qpn,
+				 .remote_qkey = qpn ? IB_QP1_QKEY : 0,
+				 .timeout_ms  = 0
+			 }
+		 }
+	};
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+	unsigned long flags;
+
+	if (agent) {
+		tmad = kmalloc(sizeof *tmad, GFP_KERNEL);
+		if (!tmad)
+			return;
+
+		tmad->mad = kmalloc(sizeof *tmad->mad, GFP_KERNEL);
+		if (!tmad->mad) {
+			kfree(tmad);
+			return;
+		}
+
+		memcpy(tmad->mad, mad, sizeof *mad);
+
+		wr.wr.ud.mad_hdr = &tmad->mad->mad_hdr;
+		wr.wr_id         = (unsigned long) tmad;
+
+		gather_list.addr   = dma_map_single(agent->device->dma_device,
+						    tmad->mad,
+						    sizeof *tmad->mad,
+						    DMA_TO_DEVICE);
+		gather_list.length = sizeof *tmad->mad;
+		gather_list.lkey   = to_mpd(agent->qp->pd)->ntmr.ibmr.lkey;
+		pci_unmap_addr_set(tmad, mapping, gather_list.addr);
+		
+		/*
+		 * We rely here on the fact that MLX QPs don't use the
+		 * address handle after the send is posted (this is
+		 * wrong following the IB spec strictly, but we know
+		 * it's OK for our devices).
+		 */
+		spin_lock_irqsave(&dev->sm_lock, flags);
+		wr.wr.ud.ah      = dev->sm_ah[port_num - 1];
+		if (wr.wr.ud.ah)
+			ret = ib_post_send_mad(agent, &wr, &bad_wr);
+		else
+			ret = -EINVAL;
+		spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+		if (ret) {
+			dma_unmap_single(agent->device->dma_device,
+					 pci_unmap_addr(tmad, mapping),
+					 sizeof *tmad->mad,
+					 DMA_TO_DEVICE);
+			kfree(tmad->mad);
+			kfree(tmad);
+		}
+	}
+}
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      u16 slid,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad)
+{
+	int err;
+	u8 status;
+
+	/* Forward locally generated traps to the SM */
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+	    slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	/*
+	 * Only handle SM gets, sets and trap represses for SM class
+	 *
+	 * Only handle PMA and Mellanox vendor-specific class gets and
+	 * sets for other classes.
+	 */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || 
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/* 
+		 * Don't process SMInfo queries or vendor-specific
+		 * MADs -- the SMA can't handle them.
+		 */
+		if (be16_to_cpu(in_mad->mad_hdr.attr_id) == IB_SM_SM_INFO ||
+		    be16_to_cpu(in_mad->mad_hdr.attr_id) >= IB_SM_VENDOR_START)
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1     || 
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+
+	err = mthca_MAD_IFC(to_mdev(ibdev),
+			    !!(mad_flags & IB_MAD_IGNORE_MKEY),
+			    port_num, in_mad, out_mad,
+			    &status);
+	if (err) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC failed\n");
+		return IB_MAD_RESULT_FAILURE;
+	}
+	if (status == MTHCA_CMD_STAT_BAD_PKT)
+		return IB_MAD_RESULT_SUCCESS;
+	if (status) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC returned status %02x\n",
+			  status);
+		return IB_MAD_RESULT_FAILURE;
+	}
+
+	if (!out_mad->mad_hdr.status)
+		smp_snoop(ibdev, port_num, in_mad);
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
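
(A quick reminder of what the return flags mean to the MAD layer, roughly as
documented there: IB_MAD_RESULT_SUCCESS alone means the MAD was handled but
produced no response; adding IB_MAD_RESULT_REPLY means out_mad holds a response
that should be sent; adding IB_MAD_RESULT_CONSUMED means the packet was consumed
and should not be processed further; IB_MAD_RESULT_FAILURE tells the layer the
driver could not process the MAD at all.)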
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct mthca_trap_mad *tmad =
+		(void *) (unsigned long) mad_send_wc->wr_id;
+
+	dma_unmap_single(agent->device->dma_device,
+			 pci_unmap_addr(tmad, mapping),
+			 sizeof *tmad->mad,
+			 DMA_TO_DEVICE);
+	kfree(tmad->mad);
+	kfree(tmad);
+}
+
+int mthca_create_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	spin_lock_init(&dev->sm_lock);
+
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q) {
+			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+						      q ? IB_QPT_GSI : IB_QPT_SMI,
+						      NULL, 0, send_handler,
+						      NULL, NULL);
+			if (IS_ERR(agent))
+				goto err;
+			dev->send_agent[p][q] = agent;
+		}
+
+	return 0;
+
+err:
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q])
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+	return PTR_ERR(agent);
+}
+
+void mthca_free_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	for (p = 0; p < dev->limits.num_ports; ++p) {
+		for (q = 0; q <= 1; ++q) {
+			agent = dev->send_agent[p][q];
+			dev->send_agent[p][q] = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+
+		if (dev->sm_ah[p])
+			ib_destroy_ah(dev->sm_ah[p]);
+	}
+}



* [PATCH][v3][15/21] IPoIB IPv4 multicast
  2004-12-13 18:09                 ` [PATCH][v3][14/21] Add Mellanox HCA low-level driver (MAD) Roland Dreier
@ 2004-12-13 18:09                   ` Roland Dreier
  2004-12-13 18:09                     ` [PATCH][v3][16/21] IPoIB IPv6 support Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general, netdev

Add ip_ib_mc_map() to convert IPv4 multicast addresses to IPoIB
hardware addresses.  Also add <linux/if_infiniband.h> so INFINIBAND_ALEN
has a home.

The mapping for multicast addresses is described in
  http://www.ietf.org/internet-drafts/draft-ietf-ipoib-ip-over-infiniband-07.txt

Signed-off-by: Roland Dreier <roland@topspin.com>


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/include/linux/if_infiniband.h	2004-12-13 09:44:47.613711020 -0800
@@ -0,0 +1,29 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef _LINUX_IF_INFINIBAND_H
+#define _LINUX_IF_INFINIBAND_H
+
+#define INFINIBAND_ALEN		20	/* Octets in IPoIB HW addr	*/
+
+#endif /* _LINUX_IF_INFINIBAND_H */
--- linux-bk.orig/include/net/ip.h	2004-12-11 15:16:19.000000000 -0800
+++ linux-bk/include/net/ip.h	2004-12-13 09:44:47.641706896 -0800
@@ -229,6 +229,39 @@
 	buf[3]=addr&0x7F;
 }
 
+/*
+ *	Map a multicast IP onto multicast MAC for type IP-over-InfiniBand.
+ *	Leave P_Key as 0 to be filled in by driver.
+ */
+
+static inline void ip_ib_mc_map(u32 addr, char *buf)
+{
+	buf[0]  = 0;		/* Reserved */
+	buf[1]  = 0xff;		/* Multicast QPN */
+	buf[2]  = 0xff;
+	buf[3]  = 0xff;
+	addr    = ntohl(addr);
+	buf[4]  = 0xff;
+	buf[5]  = 0x12;		/* link local scope */
+	buf[6]  = 0x40;		/* IPv4 signature */
+	buf[7]  = 0x1b;
+	buf[8]  = 0;		/* P_Key */
+	buf[9]  = 0;
+	buf[10] = 0;
+	buf[11] = 0;
+	buf[12] = 0;
+	buf[13] = 0;
+	buf[14] = 0;
+	buf[15] = 0;
+	buf[19] = addr & 0xff;
+	addr  >>= 8;
+	buf[18] = addr & 0xff;
+	addr  >>= 8;
+	buf[17] = addr & 0xff;
+	addr  >>= 8;
+	buf[16] = addr & 0x0f;
+}
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 #include <linux/ipv6.h>
 #endif
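
As a worked example (an illustration, not part of the patch): for the group
224.1.2.3, addr is 0xe0010203 after ntohl(), so ip_ib_mc_map() produces the
20-octet address

  00:ff:ff:ff:ff:12:40:1b:00:00:00:00:00:00:00:00:00:01:02:03

i.e. the fixed prefix, zero P_Key octets for the driver to fill in, and the low
28 bits of the group address in the last four octets (the top nibble is masked
off by the "& 0x0f").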
--- linux-bk.orig/net/ipv4/arp.c	2004-12-11 15:16:28.000000000 -0800
+++ linux-bk/net/ipv4/arp.c	2004-12-13 09:44:47.650705570 -0800
@@ -213,6 +213,9 @@
 	case ARPHRD_IEEE802_TR:
 		ip_tr_mc_map(addr, haddr);
 		return 0;
+	case ARPHRD_INFINIBAND:
+		ip_ib_mc_map(addr, haddr);
+		return 0;
 	default:
 		if (dir) {
 			memcpy(haddr, dev->broadcast, dev->addr_len);



* [PATCH][v3][16/21] IPoIB IPv6 support
  2004-12-13 18:09                   ` [PATCH][v3][15/21] IPoIB IPv4 multicast Roland Dreier
@ 2004-12-13 18:09                     ` Roland Dreier
  2004-12-13 18:09                       ` [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Roland Dreier
  2004-12-14  2:30                       ` [PATCH][v3][16/21] IPoIB IPv6 support YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 2 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general, netdev

Add ipv6_ib_mc_map() to convert IPv6 multicast addresses to IPoIB
hardware addresses, and add support for autoconfiguration for devices
with type ARPHRD_INFINIBAND.

The mapping for multicast addresses is described in
  http://www.ietf.org/internet-drafts/draft-ietf-ipoib-ip-over-infiniband-07.txt

Signed-off-by: Nitin Hande <Nitin.Hande@Sun.Com>
Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/include/net/if_inet6.h	2004-12-11 15:16:37.000000000 -0800
+++ linux-bk/include/net/if_inet6.h	2004-12-13 09:44:48.801536031 -0800
@@ -266,5 +266,20 @@
 {
 	buf[0] = 0x00;
 }
+
+static inline void ipv6_ib_mc_map(struct in6_addr *addr, char *buf)
+{
+	buf[0]  = 0;		/* Reserved */
+	buf[1]  = 0xff;		/* Multicast QPN */
+	buf[2]  = 0xff;
+	buf[3]  = 0xff;
+	buf[4]  = 0xff;
+	buf[5]  = 0x12;		/* link local scope */
+	buf[6]  = 0x60;		/* IPv6 signature */
+	buf[7]  = 0x1b;
+	buf[8]  = 0;		/* P_Key */
+	buf[9]  = 0;
+	memcpy(buf + 10, addr->s6_addr + 6, 10);
+}
 #endif
 #endif
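
Worked example (again just an illustration): for the all-nodes group ff02::1,
the last ten octets of the address (s6_addr[6..15]) are
00 00 00 00 00 00 00 00 00 01, so ipv6_ib_mc_map() yields

  00:ff:ff:ff:ff:12:60:1b:00:00:00:00:00:00:00:00:00:00:00:01

The prefix matches the IPv4 case except for the 0x60 signature octet, followed
by a zero P_Key and the low 80 bits of the IPv6 group address.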
--- linux-bk.orig/net/ipv6/addrconf.c	2004-12-11 15:16:33.000000000 -0800
+++ linux-bk/net/ipv6/addrconf.c	2004-12-13 09:44:48.840530286 -0800
@@ -48,6 +48,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/if_arcnet.h>
+#include <linux/if_infiniband.h>
 #include <linux/route.h>
 #include <linux/inetdevice.h>
 #include <linux/init.h>
@@ -1095,6 +1096,12 @@
 		memset(eui, 0, 7);
 		eui[7] = *(u8*)dev->dev_addr;
 		return 0;
+	case ARPHRD_INFINIBAND:
+		if (dev->addr_len != INFINIBAND_ALEN)
+			return -1;
+		memcpy(eui, dev->dev_addr + 12, 8);
+		eui[0] |= 2;
+		return 0;
 	}
 	return -1;
 }
@@ -1794,6 +1801,7 @@
 	if ((dev->type != ARPHRD_ETHER) && 
 	    (dev->type != ARPHRD_FDDI) &&
 	    (dev->type != ARPHRD_IEEE802_TR) &&
+	    (dev->type != ARPHRD_INFINIBAND) &&
 	    (dev->type != ARPHRD_ARCNET)) {
 		/* Alas, we support only Ethernet autoconfiguration. */
 		return;
--- linux-bk.orig/net/ipv6/ndisc.c	2004-12-11 15:16:13.000000000 -0800
+++ linux-bk/net/ipv6/ndisc.c	2004-12-13 09:44:48.890522921 -0800
@@ -260,6 +260,9 @@
 	case ARPHRD_ARCNET:
 		ipv6_arcnet_mc_map(addr, buf);
 		return 0;
+	case ARPHRD_INFINIBAND:
+		ipv6_ib_mc_map(addr, buf);
+		return 0;
 	default:
 		if (dir) {
 			memcpy(buf, dev->broadcast, dev->addr_len);



* [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver
  2004-12-13 18:09                     ` [PATCH][v3][16/21] IPoIB IPv6 support Roland Dreier
@ 2004-12-13 18:09                       ` Roland Dreier
  2004-12-13 18:09                         ` [PATCH][v3][18/21] Add InfiniBand userspace MAD support Roland Dreier
  2004-12-13 18:44                         ` [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Tom Duffy
  2004-12-14  2:30                       ` [PATCH][v3][16/21] IPoIB IPv6 support YOSHIFUJI Hideaki / 吉藤英明
  1 sibling, 2 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general, netdev


Add a driver that implements the IP-over-InfiniBand (IPoIB) protocol.
This is a network device driver of type ARPHRD_INFINIBAND (and
addr_len INFINIBAND_ALEN bytes).

The ARP/ND implementation for this driver is not completely
straightforward, because InfiniBand requires an additional path lookup
be performed (through an IB-specific mechanism) after a remote
hardware address has been resolved.  We are very open to suggestions
of a better way to handle this than the current implementation.
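
(For reference, the 20-byte hardware address handed to ARP/ND here is, following
the IPoIB drafts, a 4-octet field carrying the QPN in its low three octets (the
top octet is reserved) followed by the 16-octet port GID.  Resolving that
address is only half the job: the driver still has to issue an SA PathRecord
query on the GID, see struct ipoib_path in ipoib.h below, before it can build an
address handle and actually transmit.)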

Although IB has a special multicast group join mode intended to
support IP multicast routing (non member join), no means to identify
different multicast styles has yet been determined, so all joins by
the driver are currently full member joins.  We are looking for
guidance in how to solve this.

The IPoIB protocol/encapsulation is described in the Internet-Drafts
  http://www.ietf.org/internet-drafts/draft-ietf-ipoib-architecture-04.txt
  http://www.ietf.org/internet-drafts/draft-ietf-ipoib-ip-over-infiniband-07.txt

Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/drivers/infiniband/Kconfig	2004-12-13 09:44:43.936252779 -0800
+++ linux-bk/drivers/infiniband/Kconfig	2004-12-13 09:44:49.385450009 -0800
@@ -2,7 +2,6 @@
 
 config INFINIBAND
 	tristate "InfiniBand support"
-	default n
 	---help---
 	  Core support for InfiniBand (IB).  Make sure to also select
 	  any protocols you wish to use as well as drivers for your
@@ -10,4 +9,6 @@
 
 source "drivers/infiniband/hw/mthca/Kconfig"
 
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
 endmenu
--- linux-bk.orig/drivers/infiniband/Makefile	2004-12-13 09:44:43.909256756 -0800
+++ linux-bk/drivers/infiniband/Makefile	2004-12-13 09:44:49.342456343 -0800
@@ -1,2 +1,3 @@
 obj-$(CONFIG_INFINIBAND)		+= core/
 obj-$(CONFIG_INFINIBAND_MTHCA)		+= hw/mthca/
+obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/Kconfig	2004-12-13 09:44:49.470437489 -0800
@@ -0,0 +1,33 @@
+config INFINIBAND_IPOIB
+	tristate "IP-over-InfiniBand"
+	depends on INFINIBAND && NETDEVICES && INET
+	---help---
+	  Support for the IP-over-InfiniBand protocol (IPoIB). This
+	  transports IP packets over InfiniBand so you can use your IB
+	  device as a fancy NIC.
+
+	  The IPoIB protocol is defined by the IETF ipoib working
+	  group: <http://www.ietf.org/html.charters/ipoib-charter.html>.
+
+config INFINIBAND_IPOIB_DEBUG
+	bool "IP-over-InfiniBand debugging"
+	depends on INFINIBAND_IPOIB
+	---help---
+	  This option causes debugging code to be compiled into the
+	  IPoIB driver.  The output can be turned on via the
+	  debug_level and mcast_debug_level module parameters (which
+	  can also be set after the driver is loaded through sysfs).
+
+	  This option also creates an "ipoib_debugfs" filesystem, which
+	  can be mounted to expose debugging information about the IB
+	  multicast groups used by the IPoIB driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+	bool "IP-over-InfiniBand data path debugging"
+	depends on INFINIBAND_IPOIB_DEBUG
+	---help---
+	  This option compiles debugging code into the data path
+	  of the IPoIB driver.  The output can be turned on via the
+	  data_debug_level module parameter; however, even with output
+	  turned off, this debugging code will have some performance
+	  impact.
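
(Usage, for what it's worth: the filesystem registers itself as type
"ipoib_debugfs", so something along the lines of "mount -t ipoib_debugfs none
/mnt" followed by reading the per-interface <ifname>_mcg file, e.g. ib0_mcg,
dumps the multicast groups that interface has joined.  The mount point and
interface name here are only examples.)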
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/Makefile	2004-12-13 09:44:49.426443970 -0800
@@ -0,0 +1,11 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+obj-$(CONFIG_INFINIBAND_IPOIB)			+= ib_ipoib.o
+
+ib_ipoib-y					:= ipoib_main.o \
+						   ipoib_ib.o \
+						   ipoib_multicast.o \
+						   ipoib_verbs.o \
+						   ipoib_vlan.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
+
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib.h	2004-12-13 09:44:49.497433512 -0800
@@ -0,0 +1,340 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: ipoib.h 1323 2004-12-11 02:36:04Z roland $
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <linux/pci.h>
+#include <linux/config.h>
+#include <linux/kref.h>
+#include <linux/if_infiniband.h>
+
+#include <net/neighbour.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#include <ib_verbs.h>
+#include <ib_pack.h>
+#include <ib_sa.h>
+
+/* constants */
+
+enum {
+	IPOIB_PACKET_SIZE         = 2048,
+	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
+
+	IPOIB_ENCAP_LEN 	  = 4,
+
+	IPOIB_RX_RING_SIZE 	  = 128,
+	IPOIB_TX_RING_SIZE 	  = 64,
+
+	IPOIB_NUM_WC 		  = 4,
+
+	IPOIB_MAX_PATH_REC_QUEUE  = 3,
+	IPOIB_MAX_MCAST_QUEUE     = 3,
+
+	IPOIB_FLAG_TX_FULL 	  = 0,
+	IPOIB_FLAG_OPER_UP 	  = 1,
+	IPOIB_FLAG_ADMIN_UP 	  = 2,
+	IPOIB_PKEY_ASSIGNED 	  = 3,
+	IPOIB_PKEY_STOP 	  = 4,
+	IPOIB_FLAG_SUBINTERFACE   = 5,
+	IPOIB_MCAST_RUN 	  = 6,
+	IPOIB_STOP_REAPER         = 7,
+
+	IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+	IPOIB_MCAST_FLAG_FOUND 	  = 0,	/* used in set_multicast_list */
+	IPOIB_MCAST_FLAG_SENDONLY = 1,
+	IPOIB_MCAST_FLAG_BUSY 	  = 2,	/* joining or already joined */
+	IPOIB_MCAST_FLAG_ATTACHED = 3,
+};
+
+/* structs */
+
+struct ipoib_header {
+	u16 proto;
+	u16 reserved;
+};
+
+struct ipoib_pseudoheader {
+	u8  hwaddr[INFINIBAND_ALEN];
+};
+
+struct ipoib_mcast;
+
+struct ipoib_buf {
+	struct sk_buff *skb;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+/*
+ * Device private locking: tx_lock protects members used in TX fast
+ * path (and we use LLTX so upper layers don't do extra locking).
+ * lock protects everything else.  lock nests inside of tx_lock (ie
+ * tx_lock must be acquired first if needed).
+ */
+struct ipoib_dev_priv {
+	spinlock_t lock;
+
+	struct net_device *dev;
+
+	unsigned long flags;
+
+	struct semaphore mcast_mutex;
+	struct semaphore vlan_mutex;
+
+	struct rb_root  path_tree;
+	struct list_head path_list;
+
+	struct ipoib_mcast *broadcast;
+	struct list_head multicast_list;
+	struct rb_root multicast_tree;
+
+	struct work_struct pkey_task;
+	struct work_struct mcast_task;
+	struct work_struct flush_task;
+	struct work_struct restart_task;
+	struct work_struct ah_reap_task;
+
+	struct ib_device *ca;
+	u8            	  port;
+	u16           	  pkey;
+	struct ib_pd  	 *pd;
+	struct ib_mr  	 *mr;
+	struct ib_cq  	 *cq;
+	struct ib_qp  	 *qp;
+	u32           	  qkey;
+
+	union ib_gid local_gid;
+	u16          local_lid;
+
+	unsigned int admin_mtu;
+	unsigned int mcast_mtu;
+
+	struct ipoib_buf *rx_ring;
+
+	spinlock_t tx_lock;
+	struct ipoib_buf *tx_ring;
+	unsigned tx_head;
+	unsigned tx_tail;
+
+	struct ib_wc ibwc[IPOIB_NUM_WC];
+
+	struct list_head dead_ahs;
+
+	struct ib_event_handler event_handler;
+
+	struct net_device_stats stats;
+
+	struct net_device *parent;
+	struct list_head child_intfs;
+	struct list_head list;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+	struct list_head fs_list;
+	struct dentry *mcg_dentry;
+#endif
+};
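
A minimal sketch of the documented locking rule (tx_lock taken first, lock
nested inside it); this illustrates the comment above struct ipoib_dev_priv and
is not an excerpt from the driver:

	unsigned long flags;

	spin_lock_irqsave(&priv->tx_lock, flags);
	spin_lock(&priv->lock);
	/* ... touch TX ring state and path/multicast state together ... */
	spin_unlock(&priv->lock);
	spin_unlock_irqrestore(&priv->tx_lock, flags);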
+
+struct ipoib_ah {
+	struct net_device *dev;
+	struct ib_ah      *ah;
+	struct list_head   list;
+	struct kref        ref;
+	unsigned           last_send;
+};
+
+struct ipoib_path {
+	struct net_device    *dev;
+	struct ib_sa_path_rec pathrec;
+	struct ipoib_ah      *ah;
+	struct sk_buff_head   queue;
+
+	struct list_head      neigh_list;
+
+	int                   query_id;
+	struct ib_sa_query   *query;
+	struct completion     done;
+
+	struct rb_node        rb_node;
+	struct list_head      list;
+};
+
+struct ipoib_neigh {
+	struct ipoib_ah    *ah;
+	struct sk_buff_head queue;
+
+	struct neighbour   *neighbour;
+
+	struct list_head    list;
+};
+
+static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
+{
+	return (struct ipoib_neigh **) (neigh->ha + 24 -
+					(offsetof(struct neighbour, ha) & 4));
+}
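
(What to_ipoib_neigh() is doing: neighbour->ha is a 32-byte array but IPoIB only
uses the first 20 bytes for the hardware address, so the driver stashes a
struct ipoib_neigh pointer in the tail of ha.  The
"+ 24 - (offsetof(struct neighbour, ha) & 4)" arithmetic picks whichever of
ha + 20 or ha + 24 is 8-byte aligned, depending on whether ha itself starts on
an odd 4-byte boundary within struct neighbour.)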
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+/* functions */
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+				 struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+	kref_put(&ah->ref, ipoib_free_ah);
+}
+
+int ipoib_add_pkey_attr(struct net_device *dev);
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(void *dev_ptr);
+
+void ipoib_flush_paths(struct net_device *dev);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_ib_dev_flush(void *dev);
+void ipoib_ib_dev_cleanup(struct net_device *dev);
+
+int ipoib_ib_dev_open(struct net_device *dev);
+int ipoib_ib_dev_up(struct net_device *dev);
+int ipoib_ib_dev_down(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev);
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct net_device *dev);
+
+void ipoib_mcast_join_task(void *dev_ptr);
+void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
+		      struct sk_buff *skb);
+
+void ipoib_mcast_restart_task(void *dev_ptr);
+int ipoib_mcast_start_thread(struct net_device *dev);
+int ipoib_mcast_stop_thread(struct net_device *dev);
+
+void ipoib_mcast_dev_down(struct net_device *dev);
+void ipoib_mcast_dev_flush(struct net_device *dev);
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
+void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+				  union ib_gid *gid,
+				  unsigned long *created,
+				  unsigned int *queuelen,
+				  unsigned int *complete,
+				  unsigned int *send_only);
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
+		       union ib_gid *mgid);
+int ipoib_mcast_detach(struct net_device *dev, u16 mlid,
+		       union ib_gid *mgid);
+
+int ipoib_qp_create(struct net_device *dev);
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct net_device *dev);
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
+
+void ipoib_pkey_poll(void *dev);
+int ipoib_pkey_dev_delay_open(struct net_device *dev);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_create_debug_file(struct net_device *dev);
+void ipoib_delete_debug_file(struct net_device *dev);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline int ipoib_create_debug_file(struct net_device *dev) { return 0; }
+static inline void ipoib_delete_debug_file(struct net_device *dev) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+
+#define ipoib_printk(level, priv, format, arg...)	\
+	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
+#define ipoib_warn(priv, format, arg...)		\
+	ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int debug_level;
+
+#define ipoib_dbg(priv, format, arg...)			\
+	do {					        \
+		if (debug_level > 0)			\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do {					        \
+		if (mcast_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...)			\
+	do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do {					        \
+		if (data_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+
+#define IPOIB_GID_FMT		"%x:%x:%x:%x:%x:%x:%x:%x"
+
+#define IPOIB_GID_ARG(gid)	be16_to_cpup((__be16 *) ((gid).raw +  0)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  2)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  4)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  6)), \
+				be16_to_cpup((__be16 *) ((gid).raw +  8)), \
+				be16_to_cpup((__be16 *) ((gid).raw + 10)), \
+				be16_to_cpup((__be16 *) ((gid).raw + 12)), \
+				be16_to_cpup((__be16 *) ((gid).raw + 14))
+
+#endif /* _IPOIB_H */
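
The GID printing helpers at the end of the header are meant to be used like
this (a hypothetical call site, but only using fields declared above):

	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "local GID " IPOIB_GID_FMT ", LID 0x%04x\n",
		  IPOIB_GID_ARG(priv->local_gid), priv->local_lid);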
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib_fs.c	2004-12-13 09:44:49.522429829 -0800
@@ -0,0 +1,276 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include "ipoib.h"
+
+enum {
+	IPOIB_MAGIC = 0x49504942 /* "IPIB" */
+};
+
+static DECLARE_MUTEX(ipoib_fs_mutex);
+static struct dentry *ipoib_root;
+static struct super_block *ipoib_sb;
+static LIST_HEAD(ipoib_device_list);
+
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter;
+	loff_t n = *pos;
+
+	iter = ipoib_mcast_iter_init(file->private);
+	if (!iter)
+		return NULL;
+
+	while (n--) {
+		if (ipoib_mcast_iter_next(iter)) {
+			ipoib_mcast_iter_free(iter);
+			return NULL;
+		}
+	}
+
+	return iter;
+}
+
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (ipoib_mcast_iter_next(iter)) {
+		ipoib_mcast_iter_free(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+	char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+	union ib_gid mgid;
+	int i, n;
+	unsigned long created;
+	unsigned int queuelen, complete, send_only;
+
+	if (iter) {
+		ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+				      &complete, &send_only);
+
+		for (n = 0, i = 0; i < sizeof mgid / 2; ++i) {
+			n += sprintf(gid_buf + n, "%x",
+				     be16_to_cpu(((u16 *)mgid.raw)[i]));
+			if (i < sizeof mgid / 2 - 1)
+				gid_buf[n++] = ':';
+		}
+	}
+
+	seq_printf(file, "GID: %*s", -(1 + (int) sizeof gid_buf), gid_buf);
+
+	seq_printf(file,
+		   " created: %10ld queuelen: %4d complete: %d send_only: %d\n",
+		   created, queuelen, complete, send_only);
+
+	return 0;
+}
+
+static struct seq_operations ipoib_seq_ops = {
+	.start = ipoib_mcg_seq_start,
+	.next  = ipoib_mcg_seq_next,
+	.stop  = ipoib_mcg_seq_stop,
+	.show  = ipoib_mcg_seq_show,
+};
+
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &ipoib_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->u.generic_ip;
+
+	return 0;
+}
+
+static struct file_operations ipoib_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ipoib_mcg_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static struct inode *ipoib_get_inode(void)
+{
+	struct inode *inode = new_inode(ipoib_sb);
+
+	if (inode) {
+		inode->i_mode 	 = S_IFREG | S_IRUGO;
+		inode->i_uid 	 = 0;
+		inode->i_gid 	 = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks  = 0;
+		inode->i_atime 	 = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_fop     = &ipoib_fops;
+	}
+
+	return inode;
+}
+
+static int __ipoib_create_debug_file(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct dentry *dentry;
+	struct inode *inode;
+	char name[IFNAMSIZ + sizeof "_mcg"];
+
+	snprintf(name, sizeof name, "%s_mcg", dev->name);
+
+	dentry = d_alloc_name(ipoib_root, name);
+	if (!dentry)
+		return -ENOMEM;
+
+	inode = ipoib_get_inode();
+	if (!inode) {
+		dput(dentry);
+		return -ENOMEM;
+	}
+
+	inode->u.generic_ip = dev;
+	priv->mcg_dentry = dentry;
+
+	d_add(dentry, inode);
+
+	return 0;
+}
+
+int ipoib_create_debug_file(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	down(&ipoib_fs_mutex);
+
+	list_add_tail(&priv->fs_list, &ipoib_device_list);
+
+	if (!ipoib_sb) {
+		up(&ipoib_fs_mutex);
+		return 0;
+	}
+
+	up(&ipoib_fs_mutex);
+
+	return __ipoib_create_debug_file(dev);
+}
+
+void ipoib_delete_debug_file(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	down(&ipoib_fs_mutex);
+	list_del(&priv->fs_list);
+	if (!ipoib_sb) {
+		up(&ipoib_fs_mutex);
+		return;
+	}
+	up(&ipoib_fs_mutex);
+
+	if (priv->mcg_dentry) {
+		d_drop(priv->mcg_dentry);
+		simple_unlink(ipoib_root->d_inode, priv->mcg_dentry);
+	}
+}
+
+static int ipoib_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr ipoib_files[] = {
+		{ "" }
+	};
+	struct ipoib_dev_priv *priv;
+	int ret;
+
+	ret = simple_fill_super(sb, IPOIB_MAGIC, ipoib_files);
+	if (ret)
+		return ret;
+
+	ipoib_root = sb->s_root;
+
+	down(&ipoib_fs_mutex);
+
+	ipoib_sb = sb;
+
+	list_for_each_entry(priv, &ipoib_device_list, fs_list) {
+		ret = __ipoib_create_debug_file(priv->dev);
+		if (ret)
+			break;
+	}
+
+	up(&ipoib_fs_mutex);
+
+	return ret;
+}
+
+static struct super_block *ipoib_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, ipoib_fill_super);
+}
+
+static void ipoib_kill_sb(struct super_block *sb)
+{
+	down(&ipoib_fs_mutex);
+	ipoib_sb = NULL;
+	up(&ipoib_fs_mutex);
+
+	kill_litter_super(sb);
+}
+
+static struct file_system_type ipoib_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ipoib_debugfs",
+	.get_sb		= ipoib_get_sb,
+	.kill_sb	= ipoib_kill_sb,
+};
+
+int ipoib_register_debugfs(void)
+{
+	return register_filesystem(&ipoib_fs_type);
+}
+
+void ipoib_unregister_debugfs(void)
+{
+	unregister_filesystem(&ipoib_fs_type);
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2004-12-13 09:44:49.547426147 -0800
@@ -0,0 +1,627 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: ipoib_ib.c 1323 2004-12-11 02:36:04Z roland $
+ */
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+
+#include <ib_cache.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+		 "Enable data path debug tracing if > 0");
+#endif
+
+#define	IPOIB_OP_RECV	(1ul << 31)
+
+static DECLARE_MUTEX(pkey_sem);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+				 struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+	struct ipoib_ah *ah;
+
+	ah = kmalloc(sizeof *ah, GFP_KERNEL);
+	if (!ah)
+		return NULL;
+
+	ah->dev       = dev;
+	ah->last_send = 0;
+	kref_init(&ah->ref);
+
+	ah->ah = ib_create_ah(pd, attr);
+	if (IS_ERR(ah->ah)) {
+		kfree(ah);
+		ah = NULL;
+	} else
+		ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
+
+	return ah;
+}
+
+void ipoib_free_ah(struct kref *kref)
+{
+	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
+	struct ipoib_dev_priv *priv = netdev_priv(ah->dev);
+
+	unsigned long flags;
+
+	if (ah->last_send <= priv->tx_tail) {
+		ipoib_dbg(priv, "Freeing ah %p\n", ah->ah);
+		ib_destroy_ah(ah->ah);
+		kfree(ah);
+	} else {
+		spin_lock_irqsave(&priv->lock, flags);
+		list_add_tail(&ah->list, &priv->dead_ahs);
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
+}
+
+static inline int ipoib_ib_receive(struct ipoib_dev_priv *priv,
+				   unsigned int wr_id,
+				   dma_addr_t addr)
+{
+	struct ib_sge list = {
+		.addr    = addr,
+		.length  = IPOIB_BUF_SIZE,
+		.lkey    = priv->mr->lkey,
+	};
+	struct ib_recv_wr param = {
+		.wr_id 	    = wr_id | IPOIB_OP_RECV,
+		.sg_list    = &list,
+		.num_sge    = 1,
+		.recv_flags = IB_RECV_SIGNALED
+	};
+	struct ib_recv_wr *bad_wr;
+
+	return ib_post_recv(priv->qp, &param, &bad_wr);
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	dma_addr_t addr;
+	int ret;
+
+	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
+	if (!skb) {
+		ipoib_warn(priv, "failed to allocate receive buffer\n");
+
+		priv->rx_ring[id].skb = NULL;
+		return -ENOMEM;
+	}
+	skb_reserve(skb, 4);	/* 16 byte align IP header */
+	priv->rx_ring[id].skb = skb;
+	addr = dma_map_single(priv->ca->dma_device,
+			      skb->data, IPOIB_BUF_SIZE,
+			      DMA_FROM_DEVICE);
+	pci_unmap_addr_set(&priv->rx_ring[id], mapping, addr);
+
+	ret = ipoib_ib_receive(priv, id, addr);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_ib_receive failed for buf %d (%d)\n",
+			   id, ret);
+		priv->rx_ring[id].skb = NULL;
+	}
+
+	return ret;
+}
+
+static int ipoib_ib_post_receives(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) {
+		if (ipoib_ib_post_receive(dev, i)) {
+			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static void ipoib_ib_handle_wc(struct net_device *dev,
+			       struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+
+	ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+		       wr_id, wc->opcode, wc->status);
+
+	if (wr_id & IPOIB_OP_RECV) {
+		wr_id &= ~IPOIB_OP_RECV;
+
+		if (wr_id < IPOIB_RX_RING_SIZE) {
+			struct sk_buff *skb = priv->rx_ring[wr_id].skb;
+
+			priv->rx_ring[wr_id].skb = NULL;
+
+			dma_unmap_single(priv->ca->dma_device,
+					 pci_unmap_addr(&priv->rx_ring[wr_id],
+							mapping),
+					 IPOIB_BUF_SIZE,
+					 DMA_FROM_DEVICE);
+
+			if (wc->status != IB_WC_SUCCESS) {
+				if (wc->status != IB_WC_WR_FLUSH_ERR)
+					ipoib_warn(priv, "failed recv event "
+						   "(status=%d, wrid=%d vend_err %x)\n",
+						   wc->status, wr_id, wc->vendor_err);
+				dev_kfree_skb_any(skb);
+				return;
+			}
+
+			ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+				       wc->byte_len, wc->slid);
+
+			skb_put(skb, wc->byte_len);
+			skb_pull(skb, IB_GRH_BYTES);
+
+			if (wc->slid != priv->local_lid ||
+			    wc->src_qp != priv->qp->qp_num) {
+				skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+
+				skb_pull(skb, IPOIB_ENCAP_LEN);
+
+				dev->last_rx = jiffies;
+				++priv->stats.rx_packets;
+				priv->stats.rx_bytes += skb->len;
+
+				skb->dev = dev;
+				/* XXX get correct PACKET_ type here */
+				skb->pkt_type = PACKET_HOST;
+				netif_rx_ni(skb);
+			} else {
+				ipoib_dbg_data(priv, "dropping loopback packet\n");
+				dev_kfree_skb_any(skb);
+			}
+
+			/* repost receive */
+			if (ipoib_ib_post_receive(dev, wr_id))
+				ipoib_warn(priv, "ipoib_ib_post_receive failed "
+					   "for buf %d\n", wr_id);
+		} else
+			ipoib_warn(priv, "completion event with wrid %d\n",
+				   wr_id);
+
+	} else {
+		struct ipoib_buf *tx_req;
+		unsigned long flags;
+
+		if (wr_id >= IPOIB_TX_RING_SIZE) {
+			ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+				   wr_id, IPOIB_TX_RING_SIZE);
+			return;
+		}
+
+		ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+
+		tx_req = &priv->tx_ring[wr_id];
+
+		dma_unmap_single(priv->ca->dma_device,
+				 pci_unmap_addr(tx_req, mapping),
+				 tx_req->skb->len,
+				 DMA_TO_DEVICE);
+
+		++priv->stats.tx_packets;
+		priv->stats.tx_bytes += tx_req->skb->len;
+
+		dev_kfree_skb_any(tx_req->skb);
+
+		spin_lock_irqsave(&priv->tx_lock, flags);
+		++priv->tx_tail;
+		if (priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 2)
+			netif_wake_queue(dev);
+		spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+		if (wc->status != IB_WC_SUCCESS &&
+		    wc->status != IB_WC_WR_FLUSH_ERR)
+			ipoib_warn(priv, "failed send event "
+				   "(status=%d, wrid=%d vend_err %x)\n",
+				   wc->status, wr_id, wc->vendor_err);
+	}
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+	struct net_device *dev = (struct net_device *) dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int n, i;
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	do {
+		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+		for (i = 0; i < n; ++i)
+			ipoib_ib_handle_wc(dev, priv->ibwc + i);
+	} while (n == IPOIB_NUM_WC);
+}
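
(Note the ordering in ipoib_ib_completion(): the CQ is re-armed with
ib_req_notify_cq() before the drain loop, so a completion that arrives after the
final ib_poll_cq() batch still generates another event instead of being lost;
the loop keeps polling in batches of IPOIB_NUM_WC until a partial batch shows
the queue is empty.)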
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+			    unsigned int wr_id,
+			    struct ib_ah *address, u32 qpn,
+			    dma_addr_t addr, int len)
+{
+	struct ib_sge list = {
+		.addr    = addr,
+		.length  = len,
+		.lkey    = priv->mr->lkey,
+	};
+	struct ib_send_wr param = {
+		.wr_id = wr_id,
+		.opcode = IB_WR_SEND,
+		.sg_list = &list,
+		.num_sge = 1,
+		.wr = {
+			.ud = {
+				 .remote_qpn = qpn,
+				 .remote_qkey = priv->qkey,
+				 .ah = address
+			 },
+		},
+		.send_flags = IB_SEND_SIGNALED,
+	};
+	struct ib_send_wr *bad_wr;
+
+	return ib_post_send(priv->qp, &param, &bad_wr);
+}
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_buf *tx_req;
+	dma_addr_t addr;
+
+	if (skb->len > dev->mtu + INFINIBAND_ALEN) {
+		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+			   skb->len, dev->mtu + INFINIBAND_ALEN);
+		++priv->stats.tx_dropped;
+		++priv->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	if (!(skb = skb_unshare(skb, GFP_ATOMIC))) {
+		ipoib_warn(priv, "failed to unshare sk_buff. Dropping\n");
+		++priv->stats.tx_dropped;
+		++priv->stats.tx_errors;
+		return;
+	}
+
+	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+		       skb->len, address, qpn);
+
+	/*
+	 * We put the skb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)];
+	tx_req->skb = skb;
+	addr = dma_map_single(priv->ca->dma_device,
+			      skb->data, skb->len,
+			      DMA_TO_DEVICE);
+	pci_unmap_addr_set(tx_req, mapping, addr);
+
+	if (post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 1),
+		      address->ah, qpn, addr, skb->len)) {
+		ipoib_warn(priv, "post_send failed\n");
+		++priv->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+	} else {
+		dev->trans_start = jiffies;
+
+		address->last_send = priv->tx_head;
+		++priv->tx_head;
+
+		if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {
+			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+			netif_stop_queue(dev);
+		}
+	}
+}
+
+void __ipoib_reap_ah(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_ah *ah, *tah;
+	LIST_HEAD(remove_list);
+
+	spin_lock_irq(&priv->lock);
+	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+		if (ah->last_send <= priv->tx_tail) {
+			list_del(&ah->list);
+			list_add_tail(&ah->list, &remove_list);
+		}
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(ah, tah, &remove_list, list) {
+		ipoib_dbg(priv, "Reaping ah %p\n", ah->ah);
+		ib_destroy_ah(ah->ah);
+		kfree(ah);
+	}
+}
+
+void ipoib_reap_ah(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	__ipoib_reap_ah(dev);
+
+	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+}
+
+int ipoib_ib_dev_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	ret = ipoib_qp_create(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_qp_create returned %d\n", ret);
+		return -1;
+	}
+
+	ret = ipoib_ib_post_receives(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+		return -1;
+	}
+
+	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+
+	return 0;
+}
+
+int ipoib_ib_dev_up(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+
+	return ipoib_mcast_start_thread(dev);
+}
+
+int ipoib_ib_dev_down(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "downing ib_dev\n");
+
+	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+	netif_carrier_off(dev);
+
+	/* Shutdown the P_Key thread if still active */
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		down(&pkey_sem);
+		set_bit(IPOIB_PKEY_STOP, &priv->flags);
+		cancel_delayed_work(&priv->pkey_task);
+		up(&pkey_sem);
+		flush_workqueue(ipoib_workqueue);
+	}
+
+	ipoib_mcast_stop_thread(dev);
+
+	/*
+	 * Flush the multicast groups first so we stop any multicast joins. The
+	 * completion thread may have already died and we may deadlock waiting
+	 * for the completion thread to finish some multicast joins.
+	 */
+	ipoib_mcast_dev_flush(dev);
+
+	/* Delete broadcast and local addresses since they will be recreated */
+	ipoib_mcast_dev_down(dev);
+
+	ipoib_flush_paths(dev);
+
+	return 0;
+}
+
+static int recvs_pending(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+		if (priv->rx_ring[i].skb)
+			return 1;
+
+	return 0;
+}
+
+int ipoib_ib_dev_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+	int i;
+
+	/* Kill the existing QP and allocate a new one */
+	qp_attr.qp_state = IB_QPS_ERR;
+	attr_mask        = IB_QP_STATE;
+	if (ib_modify_qp(priv->qp, &qp_attr, attr_mask))
+		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+	/* Wait for all sends and receives to complete */
+	while (priv->tx_head != priv->tx_tail || recvs_pending(dev))
+		yield();
+
+	ipoib_dbg(priv, "All sends and receives done.\n");
+
+	qp_attr.qp_state = IB_QPS_RESET;
+	attr_mask        = IB_QP_STATE;
+	if (ib_modify_qp(priv->qp, &qp_attr, attr_mask))
+		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+	/* Wait for all AHs to be reaped */
+	set_bit(IPOIB_STOP_REAPER, &priv->flags);
+	cancel_delayed_work(&priv->ah_reap_task);
+	flush_workqueue(ipoib_workqueue);
+	while (!list_empty(&priv->dead_ahs)) {
+		__ipoib_reap_ah(dev);
+		yield();
+	}
+
+	for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+		if (priv->rx_ring[i].skb)
+			ipoib_warn(priv, "Recv skb still around @ %d\n", i);
+
+	return 0;
+}
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	priv->ca = ca;
+	priv->port = port;
+	priv->qp = NULL;
+
+	if (ipoib_transport_dev_init(dev, ca)) {
+		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+		return -ENODEV;
+	}
+
+	if (dev->flags & IFF_UP) {
+		if (ipoib_ib_dev_open(dev)) {
+			ipoib_transport_dev_cleanup(dev);
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
+void ipoib_ib_dev_flush(void *_dev)
+{
+	struct net_device *dev = (struct net_device *)_dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv;
+
+	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		return;
+
+	ipoib_dbg(priv, "flushing\n");
+
+	ipoib_ib_dev_down(dev);
+
+	/*
+	 * The device could have been brought down between the start and when
+	 * we get here, don't bring it back up if it's not configured up
+	 */
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		ipoib_ib_dev_up(dev);
+
+	/* Flush any child interfaces too */
+	list_for_each_entry(cpriv, &priv->child_intfs, list)
+		ipoib_ib_dev_flush(&cpriv->dev);
+}
+
+void ipoib_ib_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "cleaning up ib_dev\n");
+
+	ipoib_mcast_stop_thread(dev);
+
+	/* Delete the broadcast address and the local address */
+	ipoib_mcast_dev_down(dev);
+
+	ipoib_transport_dev_cleanup(dev);
+}
+
+/*
+ * Delayed P_Key Assignment Interim Support
+ *
+ * The following is an initial implementation of the delayed P_Key
+ * assignment mechanism.  It uses the same approach implemented for the
+ * multicast group join.  The single goal of this implementation is to
+ * quickly address Bug #2507.  This implementation will probably be
+ * removed when the P_Key change async notification is available.
+ */
+int ipoib_open(struct net_device *dev);
+
+static void ipoib_pkey_dev_check_presence(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	u16 pkey_index = 0;
+
+	if (ib_cached_pkey_find(priv->ca, priv->port, priv->pkey, &pkey_index))
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	else
+		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+void ipoib_pkey_poll(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_pkey_dev_check_presence(dev);
+
+	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+		ipoib_open(dev);
+	else {
+		down(&pkey_sem);
+		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->pkey_task,
+					   HZ);
+		up(&pkey_sem);
+	}
+}
+
+int ipoib_pkey_dev_delay_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* Look for the interface pkey value in the IB Port P_Key table and */
+	/* set the interface pkey assignment flag                           */
+	ipoib_pkey_dev_check_presence(dev);
+
+	/* P_Key value not assigned yet - start polling */
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		down(&pkey_sem);
+		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
+		queue_delayed_work(ipoib_workqueue,
+				   &priv->pkey_task,
+				   HZ);
+		up(&pkey_sem);
+		return 1;
+	}
+
+	return 0;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib_main.c	2004-12-13 09:44:49.573422317 -0800
@@ -0,0 +1,1023 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: ipoib_main.c 1323 2004-12-11 02:36:04Z roland $
+ */
+
+#include "ipoib.h"
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h>	/* For ARPHRD_xxx */
+
+#include <linux/ip.h>
+#include <linux/in.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int debug_level;
+
+module_param(debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
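+/*
+ * Default 20-byte IPoIB hardware broadcast address: the multicast QPN
+ * field (0x00ffffff) followed by the IPv4 broadcast MGID.  The P_Key
+ * octets of the MGID are filled in per port once the P_Key is known.
+ */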
+static const u8 ipv4_bcast_addr[] = {
+	0x00, 0xff, 0xff, 0xff,
+	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+
+static struct ib_client ipoib_client = {
+	.name   = "ipoib",
+	.add    = ipoib_add_one,
+	.remove = ipoib_remove_one
+};
+
+int ipoib_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "bringing up interface\n");
+
+	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	if (ipoib_pkey_dev_delay_open(dev))
+		return 0;
+
+	if (ipoib_ib_dev_open(dev))
+		return -EINVAL;
+
+	if (ipoib_ib_dev_up(dev))
+		return -EINVAL;
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring up any child interfaces too */
+		down(&priv->vlan_mutex);
+		list_for_each_entry(cpriv, &priv->child_intfs, list) {
+			int flags;
+
+			flags = cpriv->dev->flags;
+			if (flags & IFF_UP)
+				continue;
+
+			dev_change_flags(cpriv->dev, flags | IFF_UP);
+		}
+		up(&priv->vlan_mutex);
+	}
+
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+static int ipoib_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "stopping interface\n");
+
+	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	netif_stop_queue(dev);
+
+	ipoib_ib_dev_down(dev);
+	ipoib_ib_dev_stop(dev);
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring down any child interfaces too */
+		down(&priv->vlan_mutex);
+		list_for_each_entry(cpriv, &priv->child_intfs, list) {
+			int flags;
+
+			flags = cpriv->dev->flags;
+			if (!(flags & IFF_UP))
+				continue;
+
+			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+		}
+		up(&priv->vlan_mutex);
+	}
+
+	return 0;
+}
+
+static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
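+	/* MTU is bounded by the UD payload size less the encapsulation header */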
+	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+		return -EINVAL;
+
+	priv->admin_mtu = new_mtu;
+
+	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+	return 0;
+}
+
+static struct ipoib_path *__path_find(struct net_device *dev,
+				      union ib_gid *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->path_tree.rb_node;
+	struct ipoib_path *path;
+	int ret;
+
+	while (n) {
+		path = rb_entry(n, struct ipoib_path, rb_node);
+
+		ret = memcmp(path->pathrec.dgid.raw, gid->raw,
+			     sizeof (union ib_gid));
+
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return path;
+	}
+
+	return NULL;
+}
+
+static int __path_add(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->path_tree.rb_node;
+	struct rb_node *pn = NULL;
+	struct ipoib_path *tpath;
+	int ret;
+
+	while (*n) {
+		pn = *n;
+		tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&path->rb_node, pn, n);
+	rb_insert_color(&path->rb_node, &priv->path_tree);
+
+	list_add_tail(&path->list, &priv->path_list);
+
+	return 0;
+}
+
+static void __path_free(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh, *tn;
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&path->queue)))
+		dev_kfree_skb_irq(skb);
+
+	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+		if (neigh->ah)
+			ipoib_put_ah(neigh->ah);
+		*to_ipoib_neigh(neigh->neighbour) = NULL;
+		neigh->neighbour->ops->destructor = NULL;
+		kfree(neigh);
+	}
+
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+
+	rb_erase(&path->rb_node, &priv->path_tree);
+	list_del(&path->list);
+	kfree(path);
+}
+
+void ipoib_flush_paths(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list)
+		__path_free(dev, path);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void path_rec_completion(int status,
+				struct ib_sa_path_rec *pathrec,
+				void *path_ptr)
+{
+	struct ipoib_path *path = path_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(path->dev);
+	struct ipoib_ah *ah = NULL;
+	struct ipoib_neigh *neigh;
+	struct sk_buff_head skqueue;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	ipoib_dbg(priv, "status %d, LID 0x%04x for GID " IPOIB_GID_FMT "\n",
+		  status, be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
+
+	if (status == IB_WC_SUCCESS) {
+		struct ib_ah_attr av = {
+			.dlid 	       = be16_to_cpu(pathrec->dlid),
+			.sl 	       = pathrec->sl,
+			.src_path_bits = 0,
+			.static_rate   = 0,
+			.ah_flags      = 0,
+			.port_num      = priv->port
+		};
+
+		ah = ipoib_create_ah(path->dev, priv->pd, &av);
+	}
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	path->ah = ah;
+
+	if (ah) {
+		path->pathrec = *pathrec;
+
+		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+		skb_queue_head_init(&skqueue);
+
+		while ((skb = __skb_dequeue(&path->queue)))
+			__skb_queue_tail(&skqueue, skb);
+
+		list_for_each_entry(neigh, &path->neigh_list, list) {
+			neigh->ah = path->ah;
+			kref_get(&path->ah->ref);
+
+			while ((skb = __skb_dequeue(&neigh->queue)))
+				__skb_queue_tail(&skqueue, skb);
+		}
+	} else
+		path->query = NULL;
+
+
+	complete(&path->done);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	while ((skb = __skb_dequeue(&skqueue))) {
+		skb->dev = path->dev;
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed "
+				   "to requeue packet\n");
+	}
+}
+
+static struct ipoib_path *path_rec_create(struct net_device *dev,
+					  union ib_gid *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+
+	path = kmalloc(sizeof *path, GFP_ATOMIC);
+	if (!path)
+		return NULL;
+
+	path->dev = dev;
+	path->pathrec.dlid = 0;
+
+	skb_queue_head_init(&path->queue);
+
+	INIT_LIST_HEAD(&path->neigh_list);
+	path->query = NULL;
+	init_completion(&path->done);
+
+	memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid));
+	path->pathrec.sgid      = priv->local_gid;
+	path->pathrec.pkey      = cpu_to_be16(priv->pkey);
+	path->pathrec.numb_path = 1;
+
+	__path_add(dev, path);
+
+	return path;
+}
+
+static int path_rec_start(struct net_device *dev,
+			  struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	path->query_id =
+		ib_sa_path_rec_get(priv->ca, priv->port,
+				   &path->pathrec,
+				   IB_SA_PATH_REC_DGID		|
+				   IB_SA_PATH_REC_SGID		|
+				   IB_SA_PATH_REC_NUMB_PATH	|
+				   IB_SA_PATH_REC_PKEY,
+				   1000, GFP_ATOMIC,
+				   path_rec_completion,
+				   path, &path->query);
+	if (path->query_id < 0) {
+		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
+		path->query = NULL;
+		return path->query_id;
+	}
+
+	return 0;
+}
+
+static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+	struct ipoib_neigh *neigh; 
+
+	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+	if (!neigh) {
+		++priv->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	skb_queue_head_init(&neigh->queue);
+	neigh->neighbour = skb->dst->neighbour;
+	*to_ipoib_neigh(skb->dst->neighbour) = neigh;
+
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
+
+	path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4));
+	if (!path) {
+		path = path_rec_create(dev,
+				       (union ib_gid *) (skb->dst->neighbour->ha + 4));
+		if (!path)
+			goto err;
+	}
+
+	list_add_tail(&neigh->list, &path->neigh_list);
+
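+	/* A nonzero DLID means the path record has already been resolved */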
+	if (path->pathrec.dlid) {
+		neigh->ah = path->ah;
+		kref_get(&path->ah->ref);
+
+		ipoib_send(dev, skb, path->ah,
+			   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+	} else if (!path->query) {
+		neigh->ah  = NULL;
+		__skb_queue_tail(&neigh->queue, skb);
+		if (path_rec_start(dev, path))
+			goto err;
+	}
+
+	spin_unlock(&priv->lock);
+	return;
+
+err:
+	*to_ipoib_neigh(skb->dst->neighbour) = NULL;
+	list_del(&neigh->list);
+	neigh->neighbour->ops->destructor = NULL;
+	kfree(neigh);
+
+	++priv->stats.tx_dropped;
+	dev_kfree_skb_any(skb);
+
+	spin_unlock(&priv->lock);
+}
+
+static void path_lookup(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
+
+	/* Look up path record for unicasts */
+	if (skb->dst->neighbour->ha[4] != 0xff) {
+		neigh_add_path(skb, dev);
+		return;
+	}
+
+	/* Add in the P_Key for multicasts */
+	skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
+	skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
+	ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb);
+}
+
+static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+			     struct ipoib_pseudoheader *phdr)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+
+	/*
+	 * We can only be called from ipoib_start_xmit, so we're
+	 * inside tx_lock -- no need to save/restore flags.
+	 */
+	spin_lock(&priv->lock);
+
+	path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4));
+	if (!path) {
+		path = path_rec_create(dev,
+				       (union ib_gid *) (phdr->hwaddr + 4));
+		if (path) {
+			__skb_queue_tail(&path->queue, skb);
+
+			if (path_rec_start(dev, path))
+				__path_free(dev, path);
+		} else {
+			++priv->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+		}
+
+		spin_unlock(&priv->lock);
+		return;
+	}
+
+	ipoib_dbg(priv, "Send unicast ARP to %04x\n", be16_to_cpu(path->pathrec.dlid));
+
+	ipoib_send(dev, skb, path->ah,
+		   be32_to_cpup((__be32 *) phdr->hwaddr));
+
+	spin_unlock(&priv->lock);
+}
+
+static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh;
+	unsigned long flags;
+
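+	/*
+	 * We use NETIF_F_LLTX and do our own TX locking, so if the lock
+	 * is contended just ask the stack to retry the packet later.
+	 */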
+	local_irq_save(flags);
+	if (!spin_trylock(&priv->tx_lock)) { 
+		local_irq_restore(flags);
+		return NETDEV_TX_LOCKED; 
+	} 
+
+	if (skb->dst && skb->dst->neighbour) {
+		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
+			path_lookup(skb, dev);
+			goto out;
+		}
+
+		neigh = *to_ipoib_neigh(skb->dst->neighbour);
+
+		if (likely(neigh->ah)) {
+			ipoib_send(dev, skb, neigh->ah,
+				   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+			goto out;
+		}
+
+		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+			__skb_queue_tail(&neigh->queue, skb);
+		else
+			goto err;
+	} else {
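+		/*
+		 * No neighbour structure: the destination address was
+		 * prepended as a pseudoheader by ipoib_hard_header().
+		 */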
+		struct ipoib_pseudoheader *phdr =
+			(struct ipoib_pseudoheader *) skb->data;
+		skb_pull(skb, sizeof *phdr);
+
+		if (phdr->hwaddr[4] == 0xff) {
+			/* Add in the P_Key for multicast */
+			phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+			phdr->hwaddr[9] = priv->pkey & 0xff;
+
+			ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb);
+		} else {
+			/* unicast GID -- should be ARP reply */
+
+			if (be16_to_cpup((u16 *) skb->data) != ETH_P_ARP) {
+				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
+					   IPOIB_GID_FMT "\n",
+					   skb->dst ? "neigh" : "dst",
+					   be16_to_cpup((u16 *) skb->data),
+					   be32_to_cpup((u32 *) phdr->hwaddr),
+					   IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4)));
+				dev_kfree_skb_any(skb);
+				++priv->stats.tx_dropped;
+				goto out;
+			}
+
+			/* put the pseudoheader back on and try to send */
+			unicast_arp_send(skb, dev, phdr);
+		}
+	}
+
+	goto out;
+
+err:
+	++priv->stats.tx_dropped;
+	dev_kfree_skb_any(skb);
+
+out:
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	return NETDEV_TX_OK;
+}
+
+struct net_device_stats *ipoib_get_stats(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	return &priv->stats;
+}
+
+static void ipoib_timeout(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_warn(priv, "transmit timeout: latency %ld\n",
+		   jiffies - dev->trans_start);
+	/* XXX reset QP, etc. */
+}
+
+static int ipoib_hard_header(struct sk_buff *skb,
+			     struct net_device *dev,
+			     unsigned short type,
+			     void *daddr, void *saddr, unsigned len)
+{
+	struct ipoib_header *header;
+
+	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
+
+	header->proto = htons(type);
+	header->reserved = 0;
+
+	/*
+	 * If we don't have a neighbour structure, stuff the
+	 * destination address onto the front of the skb so we can
+	 * figure out where to send the packet later.
+	 */
+	if (!skb->dst || !skb->dst->neighbour) {
+		struct ipoib_pseudoheader *phdr =
+			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
+		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
+	}
+
+	return 0;
+}
+
+static void ipoib_set_mcast_list(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	schedule_work(&priv->restart_task);
+}
+
+static void ipoib_neigh_destructor(struct neighbour *n)
+{
+	struct ipoib_neigh *neigh = *to_ipoib_neigh(n);
+	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+	unsigned long flags;
+
+	ipoib_dbg(priv,
+		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+		  be32_to_cpup((__be32 *) n->ha),
+		  IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4))));
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (neigh) {
+		if (neigh->ah)
+			ipoib_put_ah(neigh->ah);
+		list_del(&neigh->list);
+		*to_ipoib_neigh(n) = NULL;
+		kfree(neigh);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int ipoib_neigh_setup(struct neighbour *neigh)
+{
+	/*
+	 * Is this kosher?  I can't find anybody in the kernel that
+	 * sets neigh->destructor, so we should be able to set it here
+	 * without trouble.
+	 */
+	neigh->ops->destructor = ipoib_neigh_destructor;
+
+	return 0;
+}
+
+static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
+{
+	parms->neigh_setup = ipoib_neigh_setup;
+
+	return 0;
+}
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* Allocate RX/TX "rings" to hold queued skbs */
+
+	priv->rx_ring =	kmalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf),
+				GFP_KERNEL);
+	if (!priv->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+		       ca->name, IPOIB_RX_RING_SIZE);
+		goto out;
+	}
+	memset(priv->rx_ring, 0,
+	       IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf));
+
+	priv->tx_ring = kmalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf),
+				GFP_KERNEL);
+	if (!priv->tx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+		       ca->name, IPOIB_TX_RING_SIZE);
+		goto out_rx_ring_cleanup;
+	}
+	memset(priv->tx_ring, 0,
+	       IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf));
+
+	/* priv->tx_head & tx_tail are already 0 */
+
+	if (ipoib_ib_dev_init(dev, ca, port))
+		goto out_tx_ring_cleanup;
+
+	return 0;
+
+out_tx_ring_cleanup:
+	kfree(priv->tx_ring);
+
+out_rx_ring_cleanup:
+	kfree(priv->rx_ring);
+
+out:
+	return -ENOMEM;
+}
+
+void ipoib_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
+
+	ipoib_delete_debug_file(dev);
+
+	/* Delete any child interfaces first */
+	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		unregister_netdev(cpriv->dev);
+		ipoib_dev_cleanup(cpriv->dev);
+		free_netdev(cpriv->dev);
+	}
+
+	ipoib_ib_dev_cleanup(dev);
+
+	if (priv->rx_ring) {
+		kfree(priv->rx_ring);
+		priv->rx_ring = NULL;
+	}
+
+	if (priv->tx_ring) {
+		kfree(priv->tx_ring);
+		priv->tx_ring = NULL;
+	}
+}
+
+static void ipoib_setup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	dev->open 		 = ipoib_open;
+	dev->stop 		 = ipoib_stop;
+	dev->change_mtu 	 = ipoib_change_mtu;
+	dev->hard_start_xmit 	 = ipoib_start_xmit;
+	dev->get_stats 		 = ipoib_get_stats;
+	dev->tx_timeout 	 = ipoib_timeout;
+	dev->hard_header 	 = ipoib_hard_header;
+	dev->set_multicast_list  = ipoib_set_mcast_list;
+	dev->neigh_setup         = ipoib_neigh_setup_dev;
+
+	dev->watchdog_timeo 	 = HZ;
+
+	dev->rebuild_header 	 = NULL;
+	dev->set_mac_address 	 = NULL;
+	dev->header_cache_update = NULL;
+
+	dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
+	
+	/*
+	 * We add in INFINIBAND_ALEN to allow for the destination
+	 * address "pseudoheader" for skbs without neighbour struct.
+	 */
+	dev->hard_header_len 	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
+	dev->addr_len 		 = INFINIBAND_ALEN;
+	dev->type 		 = ARPHRD_INFINIBAND;
+	dev->tx_queue_len 	 = IPOIB_TX_RING_SIZE * 2;
+	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+
+	/* MTU will be reset when mcast join happens */
+	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
+	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
+
+	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+	netif_carrier_off(dev);
+
+	SET_MODULE_OWNER(dev);
+
+	priv->dev = dev;
+
+	spin_lock_init(&priv->lock);
+	spin_lock_init(&priv->tx_lock);
+
+	init_MUTEX(&priv->mcast_mutex);
+	init_MUTEX(&priv->vlan_mutex);
+
+	INIT_LIST_HEAD(&priv->path_list);
+	INIT_LIST_HEAD(&priv->child_intfs);
+	INIT_LIST_HEAD(&priv->dead_ahs);
+	INIT_LIST_HEAD(&priv->multicast_list);
+
+	INIT_WORK(&priv->pkey_task,    ipoib_pkey_poll,          priv->dev);
+	INIT_WORK(&priv->mcast_task,   ipoib_mcast_join_task,    priv->dev);
+	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush,       priv->dev);
+	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev);
+	INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah,            priv->dev);
+}
+
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
+{
+	struct net_device *dev;
+
+	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
+			   ipoib_setup);
+	if (!dev)
+		return NULL;
+
+	return netdev_priv(dev);
+}
+
+static ssize_t show_pkey(struct class_device *cdev, char *buf)
+{
+	struct ipoib_dev_priv *priv =
+		netdev_priv(container_of(cdev, struct net_device, class_dev));
+
+	return sprintf(buf, "0x%04x\n", priv->pkey);
+}
+static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+
+static ssize_t create_child(struct class_device *cdev,
+			    const char *buf, size_t count)
+{
+	int pkey;
+	int ret;
+
+	if (sscanf(buf, "%i", &pkey) != 1)
+		return -EINVAL;
+
+	if (pkey < 0 || pkey > 0xffff)
+		return -EINVAL;
+
+	ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev),
+			     pkey);
+
+	return ret ? ret : count;
+}
+static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);
+
+static ssize_t delete_child(struct class_device *cdev,
+			    const char *buf, size_t count)
+{
+	int pkey;
+	int ret;
+
+	if (sscanf(buf, "%i", &pkey) != 1)
+		return -EINVAL;
+
+	if (pkey < 0 || pkey > 0xffff)
+		return -EINVAL;
+
+	ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev),
+				pkey);
+
+	return ret ? ret : count;
+
+}
+static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);
+
+int ipoib_add_pkey_attr(struct net_device *dev)
+{
+	return class_device_create_file(&dev->class_dev,
+					&class_device_attr_pkey);
+}
+
+static struct net_device *ipoib_add_port(const char *format,
+					 struct ib_device *hca, u8 port)
+{
+	struct ipoib_dev_priv *priv;
+	int result = -ENOMEM;
+
+	priv = ipoib_intf_alloc(format);
+	if (!priv)
+		goto alloc_mem_failed;
+
+	SET_NETDEV_DEV(priv->dev, hca->dma_device);
+
+	result = ib_query_pkey(hca, port, 0, &priv->pkey);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto alloc_mem_failed;
+	}
+
+	priv->dev->broadcast[8] = priv->pkey >> 8;
+	priv->dev->broadcast[9] = priv->pkey & 0xff;
+
+	result = ib_query_gid(hca, port, 0, &priv->local_gid);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto alloc_mem_failed;
+	} else
+		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+
+	result = ipoib_dev_init(priv->dev, hca, port);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+
+	INIT_IB_EVENT_HANDLER(&priv->event_handler,
+			      priv->ca, ipoib_event);
+	result = ib_register_event_handler(&priv->event_handler);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+		       "port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto event_failed;
+	}
+
+	result = register_netdev(priv->dev);
+	if (result) {
+		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
+		       hca->name, port, result);
+		goto register_failed;
+	}
+
+	if (ipoib_create_debug_file(priv->dev))
+		goto debug_failed;
+
+	if (ipoib_add_pkey_attr(priv->dev))
+		goto sysfs_failed;
+	if (class_device_create_file(&priv->dev->class_dev,
+				     &class_device_attr_create_child))
+		goto sysfs_failed;
+	if (class_device_create_file(&priv->dev->class_dev,
+				     &class_device_attr_delete_child))
+		goto sysfs_failed;
+
+	return priv->dev;
+
+sysfs_failed:
+	ipoib_delete_debug_file(priv->dev);
+
+debug_failed:
+	unregister_netdev(priv->dev);
+
+register_failed:
+	ib_unregister_event_handler(&priv->event_handler);
+
+event_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+	free_netdev(priv->dev);
+
+alloc_mem_failed:
+	return ERR_PTR(result);
+}
+
+static void ipoib_add_one(struct ib_device *device)
+{
+	struct list_head *dev_list;
+	struct net_device *dev;
+	struct ipoib_dev_priv *priv;
+	int s, e, p;
+
+	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
+	if (!dev_list)
+		return;
+
+	INIT_LIST_HEAD(dev_list);
+
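+	/* Switches expose a single port 0; CAs use ports 1..phys_port_cnt */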
+	if (device->node_type == IB_NODE_SWITCH) {
+		s = 0;
+		e = 0;
+	} else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	for (p = s; p <= e; ++p) {
+		dev = ipoib_add_port("ib%d", device, p);
+		if (!IS_ERR(dev)) {
+			priv = netdev_priv(dev);
+			list_add_tail(&priv->list, dev_list);
+		}
+	}
+
+	ib_set_client_data(device, &ipoib_client, dev_list);
+}
+
+static void ipoib_remove_one(struct ib_device *device)
+{
+	struct ipoib_dev_priv *priv, *tmp;
+	struct list_head *dev_list;
+
+	dev_list = ib_get_client_data(device, &ipoib_client);
+
+	list_for_each_entry_safe(priv, tmp, dev_list, list) {
+		ib_unregister_event_handler(&priv->event_handler);
+
+		unregister_netdev(priv->dev);
+		ipoib_dev_cleanup(priv->dev);
+		free_netdev(priv->dev);
+	}
+}
+
+static int __init ipoib_init_module(void)
+{
+	int ret;
+
+	ret = ipoib_register_debugfs();
+	if (ret)
+		return ret;
+
+	/*
+	 * We create our own workqueue mainly because we want to be
+	 * able to flush it when devices are being removed.  We can't
+	 * use schedule_work()/flush_scheduled_work() because both
+	 * unregister_netdev() and linkwatch_event take the rtnl lock,
+	 * so flush_scheduled_work() can deadlock during device
+	 * removal.
+	 */
+	ipoib_workqueue = create_singlethread_workqueue("ipoib");
+	if (!ipoib_workqueue) {
+		ret = -ENOMEM;
+		goto err_fs;
+	}
+
+	ret = ib_register_client(&ipoib_client);
+	if (ret)
+		goto err_wq;
+
+	return 0;
+
+err_wq:
+	destroy_workqueue(ipoib_workqueue);
+
+err_fs:
+	ipoib_unregister_debugfs();
+
+	return ret;
+}
+
+static void __exit ipoib_cleanup_module(void)
+{
+	ipoib_unregister_debugfs();
+	ib_unregister_client(&ipoib_client);
+	destroy_workqueue(ipoib_workqueue);
+}
+
+module_init(ipoib_init_module);
+module_exit(ipoib_cleanup_module);
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2004-12-13 09:44:49.603417898 -0800
@@ -0,0 +1,954 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: ipoib_multicast.c 1323 2004-12-11 02:36:04Z roland $
+ */
+
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int mcast_debug_level;
+
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+		 "Enable multicast debug tracing if > 0");
+#endif
+
+static DECLARE_MUTEX(mcast_mutex);
+
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+	struct ib_sa_mcmember_rec mcmember;
+	struct ipoib_ah          *ah;
+
+	struct rb_node    rb_node;
+	struct list_head  list;
+	struct completion done;
+	
+	int                 query_id;
+	struct ib_sa_query *query;
+
+	unsigned long created;
+	unsigned long backoff;
+
+	unsigned long flags;
+	unsigned char logcount;
+
+	struct list_head  neigh_list;
+
+	struct sk_buff_head pkt_queue;
+
+	struct net_device *dev;
+};
+
+struct ipoib_mcast_iter {
+	struct net_device *dev;
+	union ib_gid       mgid;
+	unsigned long      created;
+	unsigned int       queuelen;
+	unsigned int       complete;
+	unsigned int       send_only;
+};
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh, *tmp;
+	unsigned long flags;
+
+	ipoib_dbg_mcast(netdev_priv(dev),
+			"deleting multicast group " IPOIB_GID_FMT "\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
+		ipoib_put_ah(neigh->ah);
+		*to_ipoib_neigh(neigh->neighbour) = NULL;
+		neigh->neighbour->ops->destructor = NULL;
+		kfree(neigh);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	if (mcast->ah)
+		ipoib_put_ah(mcast->ah);
+
+	while (!skb_queue_empty(&mcast->pkt_queue)) {
+		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+		skb->dev = dev;
+		dev_kfree_skb_any(skb);
+	}
+
+	kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
+					     int can_sleep)
+{
+	struct ipoib_mcast *mcast;
+
+	mcast = kmalloc(sizeof (*mcast), can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+	if (!mcast)
+		return NULL;
+
+	memset(mcast, 0, sizeof (*mcast));
+
+	init_completion(&mcast->done);
+
+	mcast->dev = dev;
+	mcast->created = jiffies;
+	mcast->backoff = HZ;
+	mcast->logcount = 0;
+
+	INIT_LIST_HEAD(&mcast->list);
+	INIT_LIST_HEAD(&mcast->neigh_list);
+	skb_queue_head_init(&mcast->pkt_queue);
+
+	mcast->ah    = NULL;
+	mcast->query = NULL;
+
+	return mcast;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_gid *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->multicast_tree.rb_node;
+
+	while (n) {
+		struct ipoib_mcast *mcast;
+		int ret;
+
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		ret = memcmp(mgid->raw, mcast->mcmember.mgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return mcast;
+	}
+
+	return NULL;
+}
+
+static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
+
+	while (*n) {
+		struct ipoib_mcast *tmcast;
+		int ret;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
+
+		ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+
+	return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+				   struct ib_sa_mcmember_rec *mcmember)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	mcast->mcmember = *mcmember;
+
+	/* Set the cached Q_Key before we attach if it's the broadcast group */
+	if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+		    sizeof (union ib_gid)))
+		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+
+	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+			ipoib_warn(priv, "multicast group " IPOIB_GID_FMT
+				   " already attached\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+			return 0;
+		}
+
+		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
+					 &mcast->mcmember.mgid);
+		if (ret < 0) {
+			ipoib_warn(priv, "couldn't attach QP to multicast group "
+				   IPOIB_GID_FMT "\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+			return ret;
+		}
+	}
+
+	{
+		struct ib_ah_attr av = {
+			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
+			.port_num      = priv->port,
+			.sl	       = mcast->mcmember.sl,
+			.src_path_bits = 0,
+			.static_rate   = 0,
+			.ah_flags      = IB_AH_GRH,
+			.grh	       = {
+				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
+				.hop_limit     = mcast->mcmember.hop_limit,
+				.sgid_index    = 0,
+				.traffic_class = mcast->mcmember.traffic_class
+			}
+		};
+
+		av.grh.dgid = mcast->mcmember.mgid;
+
+		mcast->ah = ipoib_create_ah(dev, priv->pd, &av);
+		if (!mcast->ah) {
+			ipoib_warn(priv, "ipoib_create_ah failed\n");
+		} else {
+			ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT
+					" AV %p, LID 0x%04x, SL %d\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid),
+					mcast->ah->ah,
+					be16_to_cpu(mcast->mcmember.mlid),
+					mcast->mcmember.sl);
+		}
+	}
+
+	/* actually send any queued packets */
+	while (!skb_queue_empty(&mcast->pkt_queue)) {
+		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+		skb->dev = dev;
+
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+	}
+
+	return 0;
+}
+
+static void
+ipoib_mcast_sendonly_join_complete(int status,
+				   struct ib_sa_mcmember_rec *mcmember,
+				   void *mcast_ptr)
+{
+	struct ipoib_mcast *mcast = mcast_ptr;
+	struct net_device *dev = mcast->dev;
+
+	if (!status)
+		ipoib_mcast_join_finish(mcast, mcmember);
+	else {
+		if (mcast->logcount++ < 20)
+			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
+					IPOIB_GID_FMT ", status %d\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid), status);
+
+		/* Flush out any queued packets */
+		while (!skb_queue_empty(&mcast->pkt_queue)) {
+			struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+			skb->dev = dev;
+
+			dev_kfree_skb_any(skb);
+		}
+
+		/* Clear the busy flag so we try again */
+		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+	}
+
+	complete(&mcast->done);
+}
+
+static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+#if 0				/* Some SMs don't support send-only yet */
+		.join_state = 4
+#else
+		.join_state = 1
+#endif
+	};
+	int ret = 0;
+
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+		return -ENODEV;
+	}
+
+	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+		return -EBUSY;
+	}
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = be16_to_cpu(priv->pkey);
+
+	ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec,
+				     IB_SA_MCMEMBER_REC_MGID		|
+				     IB_SA_MCMEMBER_REC_PORT_GID	|
+				     IB_SA_MCMEMBER_REC_PKEY		|
+				     IB_SA_MCMEMBER_REC_JOIN_STATE,
+				     1000, GFP_ATOMIC,
+				     ipoib_mcast_sendonly_join_complete,
+				     mcast, &mcast->query);
+	if (ret < 0) {
+		ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n",
+			   ret);
+	} else {
+		ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT
+				", starting join\n",
+				IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+		mcast->query_id = ret;
+	}
+
+	return ret;
+}
+
+static void ipoib_mcast_join_complete(int status,
+				      struct ib_sa_mcmember_rec *mcmember,
+				      void *mcast_ptr)
+{
+	struct ipoib_mcast *mcast = mcast_ptr;
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "join completion for " IPOIB_GID_FMT
+			" (status %d)\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid), status);
+
+	if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) {
+		mcast->backoff = HZ;
+		down(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_work(ipoib_workqueue, &priv->mcast_task);
+		up(&mcast_mutex);
+		complete(&mcast->done);
+		return;
+	}
+
+	if (status == -EINTR) {
+		complete(&mcast->done);
+		return;
+	}
+
+	if (status && mcast->logcount++ < 20) {
+		if (status == -ETIMEDOUT || status == -EINTR) {
+			ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
+					", status %d\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid),
+					status);
+		} else {
+			ipoib_warn(priv, "multicast join failed for "
+				   IPOIB_GID_FMT ", status %d\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid),
+				   status);
+		}
+	}
+
+	mcast->backoff *= 2;
+	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+	mcast->query = NULL;
+
+	down(&mcast_mutex);
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) {
+		if (status == -ETIMEDOUT)
+			queue_work(ipoib_workqueue, &priv->mcast_task);
+		else
+			queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+					   mcast->backoff * HZ);
+	} else
+		complete(&mcast->done);
+	up(&mcast_mutex);
+
+	return;
+}
+
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+			     int create)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+		.join_state = 1
+	};
+	ib_sa_comp_mask comp_mask;
+	int ret = 0;
+
+	ipoib_dbg_mcast(priv, "joining MGID " IPOIB_GID_FMT "\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = be16_to_cpu(priv->pkey);
+
+	comp_mask =
+		IB_SA_MCMEMBER_REC_MGID		|
+		IB_SA_MCMEMBER_REC_PORT_GID	|
+		IB_SA_MCMEMBER_REC_PKEY		|
+		IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+	if (create) {
+		comp_mask |=
+			IB_SA_MCMEMBER_REC_QKEY		|
+			IB_SA_MCMEMBER_REC_SL		|
+			IB_SA_MCMEMBER_REC_FLOW_LABEL	|
+			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+		rec.qkey	  = priv->broadcast->mcmember.qkey;
+		rec.sl		  = priv->broadcast->mcmember.sl;
+		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
+		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+	}
+
+	ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec, comp_mask,
+				     mcast->backoff * 1000, GFP_ATOMIC,
+				     ipoib_mcast_join_complete,
+				     mcast, &mcast->query);
+
+	if (ret < 0) {
+		ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret);
+
+		mcast->backoff *= 2;
+		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+		down(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task,
+					   mcast->backoff * HZ);
+		up(&mcast_mutex);
+	} else
+		mcast->query_id = ret;
+}
+
+void ipoib_mcast_join_task(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		return;
+
+	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+		ipoib_warn(priv, "ib_query_gid() failed\n");
+	else
+		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+	if (!priv->broadcast) {
+		priv->broadcast = ipoib_mcast_alloc(dev, 1);
+		if (!priv->broadcast) {
+			ipoib_warn(priv, "failed to allocate broadcast group\n");
+			down(&mcast_mutex);
+			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+				queue_delayed_work(ipoib_workqueue,
+						   &priv->mcast_task, HZ);
+			up(&mcast_mutex);
+			return;
+		}
+
+		memcpy(priv->broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+		       sizeof (union ib_gid));
+
+		spin_lock_irq(&priv->lock);
+		__ipoib_mcast_add(dev, priv->broadcast);
+		spin_unlock_irq(&priv->lock);
+	}
+
+	if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		ipoib_mcast_join(dev, priv->broadcast, 0);
+		return;
+	}
+
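+	/*
+	 * Join the remaining groups one at a time; the join completion
+	 * handler requeues this task to move on to the next group.
+	 */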
+	while (1) {
+		struct ipoib_mcast *mcast = NULL;
+
+		spin_lock_irq(&priv->lock);
+		list_for_each_entry(mcast, &priv->multicast_list, list) {
+			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+				/* Found the next unjoined group */
+				break;
+			}
+		}
+		spin_unlock_irq(&priv->lock);
+
+		if (&mcast->list == &priv->multicast_list) {
+			/* All done */
+			break;
+		}
+
+		ipoib_mcast_join(dev, mcast, 1);
+		return;
+	}
+
+	{
+		struct ib_port_attr attr;
+
+		if (!ib_query_port(priv->ca, priv->port, &attr))
+			priv->local_lid = attr.lid;
+		else
+			ipoib_warn(priv, "ib_query_port failed\n");
+	}
+
+	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
+		IPOIB_ENCAP_LEN;
+	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	netif_carrier_on(dev);
+}
+
+int ipoib_mcast_start_thread(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "starting multicast thread\n");
+
+	down(&mcast_mutex);
+	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_work(ipoib_workqueue, &priv->mcast_task);
+	up(&mcast_mutex);
+
+	return 0;
+}
+
+int ipoib_mcast_stop_thread(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_mcast *mcast;
+
+	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+	down(&mcast_mutex);
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	cancel_delayed_work(&priv->mcast_task);
+	up(&mcast_mutex);
+
+	flush_workqueue(ipoib_workqueue);
+
+	if (priv->broadcast && priv->broadcast->query) {
+		ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query);
+		priv->broadcast->query = NULL;
+		ipoib_dbg_mcast(priv, "waiting for bcast\n");
+		wait_for_completion(&priv->broadcast->done);
+	}
+
+	list_for_each_entry(mcast, &priv->multicast_list, list) {
+		if (mcast->query) {
+			ib_sa_cancel_query(mcast->query_id, mcast->query);
+			mcast->query = NULL;
+			ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid));
+			wait_for_completion(&mcast->done);
+		}
+	}
+
+	return 0;
+}
+
+int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+		.join_state = 1
+	};
+	int ret = 0;
+
+	if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags))
+		return 0;
+
+	ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
+			IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = be16_to_cpu(priv->pkey);
+
+	/* Remove ourselves from the multicast group */
+	ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
+				 &mcast->mcmember.mgid);
+	if (ret)
+		ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
+
+	/*
+	 * Just make one shot at leaving and don't wait for a reply;
+	 * if we fail, too bad.
+	 */
+	ret = ib_sa_mcmember_rec_delete(priv->ca, priv->port, &rec,
+					IB_SA_MCMEMBER_REC_MGID		|
+					IB_SA_MCMEMBER_REC_PORT_GID	|
+					IB_SA_MCMEMBER_REC_PKEY		|
+					IB_SA_MCMEMBER_REC_JOIN_STATE,
+					0, GFP_ATOMIC, NULL,
+					mcast, &mcast->query);
+	if (ret < 0)
+		ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed "
+			   "for leave (result = %d)\n", ret);
+
+	return 0;
+}
+
+void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
+		      struct sk_buff *skb)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_mcast *mcast;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	mcast = __ipoib_mcast_find(dev, mgid);
+	if (!mcast) {
+		/* Let's create a new send only group now */
+		ipoib_dbg_mcast(priv, "setting up send only multicast group for "
+				IPOIB_GID_FMT "\n", IPOIB_GID_ARG(*mgid));
+
+		mcast = ipoib_mcast_alloc(dev, 0);
+		if (!mcast) {
+			ipoib_warn(priv, "unable to allocate memory for "
+				   "multicast structure\n");
+			dev_kfree_skb_any(skb);
+			goto out;
+		}
+
+		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+		mcast->mcmember.mgid = *mgid;
+		__ipoib_mcast_add(dev, mcast);
+		list_add_tail(&mcast->list, &priv->multicast_list);
+	}
+
+	if (!mcast->ah) {
+		if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+			skb_queue_tail(&mcast->pkt_queue, skb);
+		else
+			dev_kfree_skb_any(skb);
+
+		if (mcast->query)
+			ipoib_dbg_mcast(priv, "no address vector, "
+					"but multicast join already started\n");
+		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			ipoib_mcast_sendonly_join(mcast);
+
+		/*
+		 * If lookup completes between here and out:, don't
+		 * want to send packet twice.
+		 */
+		mcast = NULL;
+	}
+
+out:
+	if (mcast && mcast->ah) {
+		if (skb->dst            &&
+		    skb->dst->neighbour &&
+		    !*to_ipoib_neigh(skb->dst->neighbour)) {
+			struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+
+			if (neigh) {
+				kref_get(&mcast->ah->ref);
+				neigh->ah  	= mcast->ah;
+				neigh->neighbour = skb->dst->neighbour;
+				*to_ipoib_neigh(skb->dst->neighbour) = neigh;
+				list_add_tail(&neigh->list, &mcast->neigh_list);
+			}
+		}
+
+		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+void ipoib_mcast_dev_flush(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	LIST_HEAD(remove_list);
+	struct ipoib_mcast *mcast, *tmcast, *nmcast;
+	unsigned long flags;
+
+	ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		nmcast = ipoib_mcast_alloc(dev, 0);
+		if (nmcast) {
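+			/* Keep only the send-only flag so the new entry starts out unjoined */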
+			nmcast->flags =
+				mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY);
+
+			nmcast->mcmember.mgid = mcast->mcmember.mgid;
+
+			/* Add the new group in before the to-be-destroyed group */
+			list_add_tail(&nmcast->list, &mcast->list);
+			list_del_init(&mcast->list);
+
+			rb_replace_node(&mcast->rb_node, &nmcast->rb_node,
+					&priv->multicast_tree);
+
+			list_add_tail(&mcast->list, &remove_list);
+		} else {
+			ipoib_warn(priv, "could not reallocate multicast group "
+				   IPOIB_GID_FMT "\n",
+				   IPOIB_GID_ARG(mcast->mcmember.mgid));
+		}
+	}
+
+	if (priv->broadcast) {
+		nmcast = ipoib_mcast_alloc(dev, 0);
+		if (nmcast) {
+			nmcast->mcmember.mgid = priv->broadcast->mcmember.mgid;
+
+			rb_replace_node(&priv->broadcast->rb_node,
+					&nmcast->rb_node,
+					&priv->multicast_tree);
+
+			list_add_tail(&priv->broadcast->list, &remove_list);
+		}
+
+		priv->broadcast = nmcast;
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	list_for_each_entry(mcast, &remove_list, list) {
+		ipoib_mcast_leave(dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+}
+
+void ipoib_mcast_dev_down(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned long flags;
+
+	/* Delete broadcast since it will be recreated */
+	if (priv->broadcast) {
+		ipoib_dbg_mcast(priv, "deleting broadcast group\n");
+
+		spin_lock_irqsave(&priv->lock, flags);
+		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		ipoib_mcast_leave(dev, priv->broadcast);
+		ipoib_mcast_free(priv->broadcast);
+		priv->broadcast = NULL;
+	}
+}
+
+void ipoib_mcast_restart_task(void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct dev_mc_list *mclist;
+	struct ipoib_mcast *mcast, *tmcast;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	ipoib_dbg_mcast(priv, "restarting multicast task\n");
+
+	ipoib_mcast_stop_thread(dev);
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	/*
+	 * Unfortunately, the networking core only gives us a list of all of
+	 * the multicast hardware addresses. We need to figure out which ones
+ * are new and which ones have been removed.
+	 */
+
+	/* Clear out the found flag */
+	list_for_each_entry(mcast, &priv->multicast_list, list)
+		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+	/* Mark all of the entries that are found or don't exist */
+	for (mclist = dev->mc_list; mclist; mclist = mclist->next) {
+		union ib_gid mgid;
+
+		memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);
+
+		/* Add in the P_Key */
+		mgid.raw[4] = (priv->pkey >> 8) & 0xff;
+		mgid.raw[5] = priv->pkey & 0xff;
+
+		mcast = __ipoib_mcast_find(dev, &mgid);
+		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			struct ipoib_mcast *nmcast;
+
+			/* Not found or send-only group, let's add a new entry */
+			ipoib_dbg_mcast(priv, "adding multicast entry for mgid "
+					IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid));
+
+			nmcast = ipoib_mcast_alloc(dev, 0);
+			if (!nmcast) {
+				ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+				continue;
+			}
+
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+			nmcast->mcmember.mgid = mgid;
+
+			if (mcast) {
+				/* Destroy the send only entry */
+				list_del(&mcast->list);
+				list_add_tail(&mcast->list, &remove_list);
+
+				rb_replace_node(&mcast->rb_node,
+						&nmcast->rb_node,
+						&priv->multicast_tree);
+			} else
+				__ipoib_mcast_add(dev, nmcast);
+
+			list_add_tail(&nmcast->list, &priv->multicast_list);
+		}
+
+		if (mcast)
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+	}
+
+	/* Remove all of the entries that no longer exist */
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			ipoib_dbg_mcast(priv, "deleting multicast group " IPOIB_GID_FMT "\n",
+					IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+			rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+			/* Move to the remove list */
+			list_del(&mcast->list);
+			list_add_tail(&mcast->list, &remove_list);
+		}
+	}
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	/* We have to cancel outside of the spinlock */
+	list_for_each_entry(mcast, &remove_list, list) {
+		ipoib_mcast_leave(mcast->dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		ipoib_mcast_start_thread(dev);
+}
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
+{
+	struct ipoib_mcast_iter *iter;
+
+	iter = kmalloc(sizeof *iter, GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	iter->dev = dev;
+	memset(iter->mgid.raw, 0, sizeof iter->mgid);
+
+	if (ipoib_mcast_iter_next(iter)) {
+		ipoib_mcast_iter_free(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter)
+{
+	kfree(iter);
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+	struct rb_node *n;
+	struct ipoib_mcast *mcast;
+	int ret = 1;
+
+	spin_lock_irq(&priv->lock);
+
+	n = rb_first(&priv->multicast_tree);
+
+	while (n) {
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+			   sizeof (union ib_gid)) < 0) {
+			iter->mgid      = mcast->mcmember.mgid;
+			iter->created   = mcast->created;
+			iter->queuelen  = skb_queue_len(&mcast->pkt_queue);
+			iter->complete  = !!mcast->ah;
+			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+			ret = 0;
+
+			break;
+		}
+
+		n = rb_next(n);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+			   union ib_gid *mgid,
+			   unsigned long *created,
+			   unsigned int *queuelen,
+			   unsigned int *complete,
+			   unsigned int *send_only)
+{
+	*mgid      = iter->mgid;
+	*created   = iter->created;
+	*queuelen  = iter->queuelen;
+	*complete  = iter->complete;
+	*send_only = iter->send_only;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2004-12-13 09:44:49.634413332 -0800
@@ -0,0 +1,243 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: ipoib_verbs.c 1308 2004-12-03 01:31:40Z roland $
+ */
+
+#include <ib_cache.h>
+
+#include "ipoib.h"
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr *qp_attr;
+	int attr_mask;
+	int ret;
+	u16 pkey_index;
+
+	ret = -ENOMEM;
+	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+	if (!qp_attr)
+		goto out;
+
+	if (ib_cached_pkey_find(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		ret = -ENXIO;
+		goto out;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	/* set correct QKey for QP */
+	qp_attr->qkey = priv->qkey;
+	attr_mask = IB_QP_QKEY;
+	ret = ib_modify_qp(priv->qp, qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+		goto out;
+	}
+
+	/* attach QP to multicast group */
+	down(&priv->mcast_mutex);
+	ret = ib_attach_mcast(priv->qp, mgid, mlid);
+	up(&priv->mcast_mutex);
+	if (ret)
+		ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
+
+out:
+	kfree(qp_attr);
+	return ret;
+}
+
+int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	down(&priv->mcast_mutex);
+	ret = ib_detach_mcast(priv->qp, mgid, mlid);
+	up(&priv->mcast_mutex);
+	if (ret)
+		ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+
+	return ret;
+}
+
+int ipoib_qp_create(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+	u16 pkey_index;
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+
+	/*
+	 * Search through the port P_Key table for the requested pkey value.
+	 * The port has to be assigned to the respective IB partition in
+	 * advance.
+	 */
+	ret = ib_cached_pkey_find(priv->ca, priv->port, priv->pkey, &pkey_index);
+	if (ret) {
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		return ret;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qkey = 0;
+	qp_attr.port_num = priv->port;
+	qp_attr.pkey_index = pkey_index;
+	attr_mask =
+	    IB_QP_QKEY |
+	    IB_QP_PORT |
+	    IB_QP_PKEY_INDEX |
+	    IB_QP_STATE;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	/* Can't set this in an INIT->RTR transition */
+	attr_mask &= ~IB_QP_PORT;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	attr_mask |= IB_QP_SQ_PSN;
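+	/* Can't set this in an RTR->RTS transition */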
+	attr_mask &= ~IB_QP_PKEY_INDEX;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	ib_destroy_qp(priv->qp);
+	priv->qp = NULL;
+
+	return -EINVAL;
+}
+
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr init_attr = {
+		.cap = {
+			.max_send_wr  = IPOIB_TX_RING_SIZE,
+			.max_recv_wr  = IPOIB_RX_RING_SIZE,
+			.max_send_sge = 1,
+			.max_recv_sge = 1
+		},
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.rq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type     = IB_QPT_UD
+	};
+
+	priv->pd = ib_alloc_pd(priv->ca);
+	if (IS_ERR(priv->pd)) {
+		printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+		return -ENODEV;
+	}
+
+	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
+				IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 1);
+	if (IS_ERR(priv->cq)) {
+		printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
+		goto out_free_pd;
+	}
+
+	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
+		goto out_free_cq;
+
+	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(priv->mr)) {
+		printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+		goto out_free_cq;
+	}
+
+	init_attr.send_cq = priv->cq;
+	init_attr.recv_cq = priv->cq;
+
+	priv->qp = ib_create_qp(priv->pd, &init_attr);
+	if (IS_ERR(priv->qp)) {
+		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+		goto out_free_mr;
+	}
+
+	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
+	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
+	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
+
+	return 0;
+
+out_free_mr:
+	ib_dereg_mr(priv->mr);
+
+out_free_cq:
+	ib_destroy_cq(priv->cq);
+
+out_free_pd:
+	ib_dealloc_pd(priv->pd);
+	return -ENODEV;
+}
+
+void ipoib_transport_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (priv->qp) {
+		if (ib_destroy_qp(priv->qp))
+			ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+		priv->qp = NULL;
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	}
+
+	if (ib_dereg_mr(priv->mr))
+		ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+	if (ib_destroy_cq(priv->cq))
+		ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+	if (ib_dealloc_pd(priv->pd))
+		ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(handler, struct ipoib_dev_priv, event_handler);
+
+	if (record->event == IB_EVENT_PORT_ACTIVE ||
+	    record->event == IB_EVENT_LID_CHANGE  ||
+	    record->event == IB_EVENT_SM_CHANGE) {
+		ipoib_dbg(priv, "Port active event\n");
+		schedule_work(&priv->flush_task);
+	}
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/ulp/ipoib/ipoib_vlan.c	2004-12-13 09:44:49.660409502 -0800
@@ -0,0 +1,166 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id: ipoib_vlan.c 1271 2004-11-18 22:11:29Z roland $
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+static ssize_t show_parent(struct class_device *class_dev, char *buf)
+{
+	struct net_device *dev =
+		container_of(class_dev, struct net_device, class_dev);
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%s\n", priv->parent->name);
+}
+static CLASS_DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv;
+	char intf_name[IFNAMSIZ];
+	int result;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	down(&ppriv->vlan_mutex);
+
+	/*
+	 * First ensure this isn't a duplicate. We check the parent device and
+	 * then all of the child interfaces to make sure the Pkey doesn't match.
+	 */
+	if (ppriv->pkey == pkey) {
+		result = -ENOTUNIQ;
+		goto err;
+	}
+
+	list_for_each_entry(priv, &ppriv->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			result = -ENOTUNIQ;
+			goto err;
+		}
+	}
+
+	snprintf(intf_name, sizeof intf_name, "%s.%04x",
+		 ppriv->dev->name, pkey);
+	priv = ipoib_intf_alloc(intf_name);
+	if (!priv) {
+		result = -ENOMEM;
+		goto err;
+	}
+
+	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+	priv->pkey = pkey;
+
+	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+	priv->dev->broadcast[8] = pkey >> 8;
+	priv->dev->broadcast[9] = pkey & 0xff;
+
+	result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+	if (result < 0) {
+		ipoib_warn(ppriv, "failed to initialize subinterface: "
+			   "device %s, port %d",
+			   ppriv->ca->name, ppriv->port);
+		goto device_init_failed;
+	}
+
+	result = register_netdev(priv->dev);
+	if (result) {
+		ipoib_warn(priv, "failed to initialize; error %i", result);
+		goto register_failed;
+	}
+
+	priv->parent = ppriv->dev;
+
+	if (ipoib_create_debug_file(priv->dev))
+		goto debug_failed;
+
+	if (ipoib_add_pkey_attr(priv->dev))
+		goto sysfs_failed;
+
+	if (class_device_create_file(&priv->dev->class_dev,
+				     &class_device_attr_parent))
+		goto sysfs_failed;
+
+	list_add_tail(&priv->list, &ppriv->child_intfs);
+
+	up(&ppriv->vlan_mutex);
+
+	return 0;
+
+sysfs_failed:
+	ipoib_delete_debug_file(priv->dev);
+
+debug_failed:
+	unregister_netdev(priv->dev);
+
+register_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+	free_netdev(priv->dev);
+
+err:
+	up(&ppriv->vlan_mutex);
+	return result;
+}
+
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv, *tpriv;
+	int ret = -ENOENT;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	down(&ppriv->vlan_mutex);
+	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			unregister_netdev(priv->dev);
+			ipoib_dev_cleanup(priv->dev);
+
+			list_del(&priv->list);
+
+			free_netdev(priv->dev);
+
+			ret = 0;
+			break;
+		}
+	}
+	up(&ppriv->vlan_mutex);
+
+	return ret;
+}



* [PATCH][v3][18/21] Add InfiniBand userspace MAD support
  2004-12-13 18:09                       ` [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Roland Dreier
@ 2004-12-13 18:09                         ` Roland Dreier
  2004-12-13 18:09                           ` [PATCH][v3][19/21] Document InfiniBand ioctl use Roland Dreier
  2004-12-13 18:44                         ` [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Tom Duffy
  1 sibling, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add a driver that provides a character special device for each
InfiniBand port.  This device allows userspace to send and receive
MADs via write() and read() (with some control operations implemented
as ioctls).

All operations are 32/64 clean and have been tested with 32-bit
userspace running on a ppc64 kernel.
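
As a rough sketch of the intended usage (illustrative only -- the
device node path and error handling are assumptions here; see the
documentation added later in this series):

	struct ib_user_mad_reg_req req = { .qpn = 1 /* GSI */ };
	int fd = open("/dev/infiniband/umad0", O_RDWR);

	if (fd < 0 || ioctl(fd, IB_USER_MAD_REGISTER_AGENT, &req))
		perror("umad setup");
	/* MADs can now be sent/received on fd with write()/read() */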

Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/drivers/infiniband/core/Makefile	2004-12-13 09:44:43.579305364 -0800
+++ linux-bk/drivers/infiniband/core/Makefile	2004-12-13 09:44:50.192331140 -0800
@@ -1,22 +1,12 @@
 EXTRA_CFLAGS += -Idrivers/infiniband/include
 
-obj-$(CONFIG_INFINIBAND) += \
-    ib_core.o \
-    ib_mad.o \
-    ib_sa.o
+obj-$(CONFIG_INFINIBAND) +=	ib_core.o ib_mad.o ib_sa.o ib_umad.o
 
-ib_core-objs := \
-    packer.o \
-    ud_header.o \
-    verbs.o \
-    sysfs.o \
-    device.o \
-    fmr_pool.o \
-    cache.o
+ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+				device.o fmr_pool.o cache.o
 
-ib_mad-objs := \
-    mad.o \
-    smi.o \
-    agent.o
+ib_mad-y :=			mad.o smi.o agent.o
 
-ib_sa-objs := sa_query.o
+ib_sa-y :=			sa_query.o
+
+ib_umad-y :=			user_mad.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/core/user_mad.c	2004-12-13 09:44:50.232325248 -0800
@@ -0,0 +1,694 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/kref.h>
+
+#include <asm/uaccess.h>
+
+#include <ib_mad.h>
+#include <ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	IB_UMAD_MAX_PORTS  = 256,
+	IB_UMAD_MAX_AGENTS = 32
+};
+
+struct ib_umad_port {
+	int                    devnum;
+	struct cdev            dev;
+	struct class_device    class_dev;
+	struct ib_device      *ib_dev;
+	struct ib_umad_device *umad_dev;
+	u8                     port_num;
+};
+
+struct ib_umad_device {
+	int                  start_port, end_port;
+	struct kref          ref;
+	struct ib_umad_port  port[0];
+};
+
+struct ib_umad_file {
+	struct ib_umad_port *port;
+	spinlock_t           recv_lock;
+	struct list_head     recv_list;
+	wait_queue_head_t    recv_wait;
+	struct rw_semaphore  agent_mutex;
+	struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+	struct ib_mr        *mr[IB_UMAD_MAX_AGENTS];
+};
+
+struct ib_umad_packet {
+	struct ib_user_mad mad;
+	struct ib_ah      *ah;
+	struct list_head   list;
+	DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+static dev_t base_dev;
+static spinlock_t map_lock;
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
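+/*
+ * Add a packet to a file's receive list if the MAD agent it belongs
+ * to is still registered with that file.  Returns nonzero if the
+ * agent has already been unregistered; the caller then frees the
+ * packet.
+ */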
+static int queue_packet(struct ib_umad_file *file,
+			struct ib_mad_agent *agent,
+			struct ib_umad_packet *packet)
+{
+	int ret = 1;
+
+	down_read(&file->agent_mutex);
+	for (packet->mad.id = 0;
+	     packet->mad.id < IB_UMAD_MAX_AGENTS;
+	     packet->mad.id++)
+		if (agent == file->agent[packet->mad.id]) {
+			spin_lock_irq(&file->recv_lock);
+			list_add_tail(&packet->list, &file->recv_list);
+			spin_unlock_irq(&file->recv_lock);
+			wake_up_interruptible(&file->recv_wait);
+			ret = 0;
+			break;
+		}
+
+	up_read(&file->agent_mutex);
+
+	return ret;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *send_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet =
+		(void *) (unsigned long) send_wc->wr_id;
+
+	dma_unmap_single(agent->device->dma_device,
+			 pci_unmap_addr(packet, mapping),
+			 sizeof packet->mad.data,
+			 DMA_TO_DEVICE);
+	ib_destroy_ah(packet->ah);
+
+	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+		packet->mad.status = ETIMEDOUT;
+
+		if (!queue_packet(file, agent, packet))
+			return;
+	}
+		
+	kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet;
+
+	if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+		goto out;
+
+	packet = kmalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		goto out;
+
+	memset(packet, 0, sizeof *packet);
+
+	memcpy(packet->mad.data, mad_recv_wc->recv_buf.mad, sizeof packet->mad.data);
+	packet->mad.status        = 0;
+	packet->mad.qpn 	  = cpu_to_be32(mad_recv_wc->wc->src_qp);
+	packet->mad.lid 	  = cpu_to_be16(mad_recv_wc->wc->slid);
+	packet->mad.sl  	  = mad_recv_wc->wc->sl;
+	packet->mad.path_bits 	  = mad_recv_wc->wc->dlid_path_bits;
+	packet->mad.grh_present   = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+	if (packet->mad.grh_present) {
+		/* XXX parse GRH */
+		packet->mad.gid_index 	  = 0;
+		packet->mad.hop_limit 	  = 0;
+		packet->mad.traffic_class = 0;
+		memset(packet->mad.gid, 0, 16);
+		packet->mad.flow_label 	  = 0;
+	}
+
+	if (queue_packet(file, agent, packet))
+		kfree(packet);
+
+out:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	ssize_t ret;
+
+	if (count < sizeof (struct ib_user_mad))
+		return -EINVAL;
+
+	spin_lock_irq(&file->recv_lock);
+
+	while (list_empty(&file->recv_list)) {
+		spin_unlock_irq(&file->recv_lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->recv_wait,
+					     !list_empty(&file->recv_list)))
+			return -ERESTARTSYS;
+
+		spin_lock_irq(&file->recv_lock);
+	}
+
+	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+	list_del(&packet->list);
+
+	spin_unlock_irq(&file->recv_lock);
+
+	if (copy_to_user(buf, &packet->mad, sizeof packet->mad))
+		ret = -EFAULT;
+	else
+		ret = sizeof packet->mad;
+
+	kfree(packet);
+	return ret;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	struct ib_mad_agent *agent;
+	struct ib_ah_attr ah_attr;
+	struct ib_sge      gather_list;
+	struct ib_send_wr *bad_wr, wr = {
+		.opcode      = IB_WR_SEND,
+		.sg_list     = &gather_list,
+		.num_sge     = 1,
+		.send_flags  = IB_SEND_SIGNALED,
+	};
+	int ret;
+
+	if (count < sizeof (struct ib_user_mad))
+		return -EINVAL;
+
+	packet = kmalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		return -ENOMEM;
+
+	if (copy_from_user(&packet->mad, buf, sizeof packet->mad)) {
+		kfree(packet);
+		return -EFAULT;
+	}
+
+	if (packet->mad.id < 0 || packet->mad.id >= IB_UMAD_MAX_AGENTS) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	down_read(&file->agent_mutex);
+
+	agent = file->agent[packet->mad.id];
+	if (!agent) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+
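+	/*
+	 * Replace the upper 32 bits of the transaction ID with the
+	 * agent's hi_tid so the MAD layer can route any response back
+	 * to this agent; the lower 32 bits supplied by userspace are
+	 * preserved.
+	 */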
+	((struct ib_mad_hdr *) packet->mad.data)->tid =
+		cpu_to_be64(((u64) agent->hi_tid) << 32 |
+			    (be64_to_cpu(((struct ib_mad_hdr *) packet->mad.data)->tid) &
+			     0xffffffff));
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid          = be16_to_cpu(packet->mad.lid);
+	ah_attr.sl            = packet->mad.sl;
+	ah_attr.src_path_bits = packet->mad.path_bits;
+	ah_attr.port_num      = file->port->port_num;
+	/* XXX handle GRH */
+
+	packet->ah = ib_create_ah(agent->qp->pd, &ah_attr);
+	if (IS_ERR(packet->ah)) {
+		ret = PTR_ERR(packet->ah);
+		goto err_up;
+	}
+
+	gather_list.addr = dma_map_single(agent->device->dma_device,
+					  packet->mad.data,
+					  sizeof packet->mad.data,
+					  DMA_TO_DEVICE);
+	gather_list.length = sizeof packet->mad.data;
+	gather_list.lkey   = file->mr[packet->mad.id]->lkey;
+	pci_unmap_addr_set(packet, mapping, gather_list.addr);
+
+	wr.wr.ud.mad_hdr     = (struct ib_mad_hdr *) packet->mad.data;
+	wr.wr.ud.ah          = packet->ah;
+	wr.wr.ud.remote_qpn  = be32_to_cpu(packet->mad.qpn);
+	wr.wr.ud.remote_qkey = be32_to_cpu(packet->mad.qkey);
+	wr.wr.ud.timeout_ms  = packet->mad.timeout_ms;
+
+	wr.wr_id            = (unsigned long) packet;
+
+	ret = ib_post_send_mad(agent, &wr, &bad_wr);
+	if (ret) {
+		dma_unmap_single(agent->device->dma_device,
+				 pci_unmap_addr(packet, mapping),
+				 sizeof packet->mad.data,
+				 DMA_TO_DEVICE);
+		goto err_up;
+	}
+
+	up_read(&file->agent_mutex);
+
+	return sizeof packet->mad;
+
+err_up:
+	up_read(&file->agent_mutex);
+
+err:
+	kfree(packet);
+	return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	/* we will always be able to post a MAD send */
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	poll_wait(filp, &file->recv_wait, wait);
+
+	if (!list_empty(&file->recv_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, unsigned long arg)
+{
+	struct ib_user_mad_reg_req ureq;
+	struct ib_mad_reg_req req;
+	struct ib_mad_agent *agent;
+	int agent_id;
+	int ret;
+
+	down_write(&file->agent_mutex);
+
+	if (copy_from_user(&ureq, (void __user *) arg, sizeof ureq)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ureq.qpn != 0 && ureq.qpn != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+		if (!file->agent[agent_id])
+			goto found;
+
+	ret = -ENOMEM;
+	goto out;
+
+found:
+	req.mgmt_class         = ureq.mgmt_class;
+	req.mgmt_class_version = ureq.mgmt_class_version;
+	memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask);
+	memcpy(req.oui,         ureq.oui,         sizeof req.oui);
+
+	agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+				      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+				      &req, 0, send_handler, recv_handler,
+				      file);
+	if (IS_ERR(agent)) {
+		ret = PTR_ERR(agent);
+		goto out;
+	}
+
+	file->agent[agent_id] = agent;
+
+	file->mr[agent_id] = ib_get_dma_mr(agent->qp->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(file->mr[agent_id])) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	if (put_user(agent_id,
+		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+		ret = -EFAULT;
+		goto err_mr;
+	}
+
+	ret = 0;
+	goto out;
+
+err_mr:
+	ib_dereg_mr(file->mr[agent_id]);
+
+err:
+	file->agent[agent_id] = NULL;
+	ib_unregister_mad_agent(agent);
+
+out:
+	up_write(&file->agent_mutex);
+	return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, unsigned long arg)
+{
+	u32 id;
+	int ret = 0;
+
+	down_write(&file->agent_mutex);
+
+	if (get_user(id, (u32 __user *) arg)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !file->agent[id]) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ib_dereg_mr(file->mr[id]);
+	ib_unregister_mad_agent(file->agent[id]);
+	file->agent[id] = NULL;
+
+out:
+	up_write(&file->agent_mutex);
+	return ret;
+}
+
+static int ib_umad_ioctl(struct inode *inode, struct file *filp,
+			 unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(filp->private_data, arg);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(filp->private_data, arg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port =
+		container_of(inode->i_cdev, struct ib_umad_port, dev);
+	struct ib_umad_file *file;
+
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	memset(file, 0, sizeof *file);
+
+	spin_lock_init(&file->recv_lock);
+	init_rwsem(&file->agent_mutex);
+	INIT_LIST_HEAD(&file->recv_list);
+	init_waitqueue_head(&file->recv_wait);
+
+	file->port = port;
+	filp->private_data = file;
+
+	return 0;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_file *file = filp->private_data;
+	int i;
+
+	for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+		if (file->agent[i]) {
+			ib_dereg_mr(file->mr[i]);
+			ib_unregister_mad_agent(file->agent[i]);
+		}
+
+	kfree(file);
+
+	return 0;
+}
+
+static struct file_operations umad_fops = {
+	.owner 	 = THIS_MODULE,
+	.read 	 = ib_umad_read,
+	.write 	 = ib_umad_write,
+	.poll 	 = ib_umad_poll,
+	.ioctl 	 = ib_umad_ioctl,
+	.open 	 = ib_umad_open,
+	.release = ib_umad_close
+};
+
+static struct ib_client umad_client = {
+	.name   = "umad",
+	.add    = ib_umad_add_one,
+	.remove = ib_umad_remove_one
+};
+
+static ssize_t show_dev(struct class_device *class_dev, char *buf)
+{
+	struct ib_umad_port *port =
+		container_of(class_dev, struct ib_umad_port, class_dev);
+
+	return print_dev_t(buf, port->dev.dev);
+}
+static CLASS_DEVICE_ATTR(dev, S_IRUGO, show_dev, NULL);
+
+static ssize_t show_ibdev(struct class_device *class_dev, char *buf)
+{
+	struct ib_umad_port *port =
+		container_of(class_dev, struct ib_umad_port, class_dev);
+
+	return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static CLASS_DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct class_device *class_dev, char *buf)
+{
+	struct ib_umad_port *port =
+		container_of(class_dev, struct ib_umad_port, class_dev);
+
+	return sprintf(buf, "%d\n", port->port_num);
+}
+static CLASS_DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static void ib_umad_release_dev(struct kref *ref)
+{
+	struct ib_umad_device *dev =
+		container_of(ref, struct ib_umad_device, ref);
+
+	kfree(dev);
+}
+
+static void ib_umad_release_port(struct class_device *class_dev)
+{
+	struct ib_umad_port *port =
+		container_of(class_dev, struct ib_umad_port, class_dev);
+
+	cdev_del(&port->dev);
+	clear_bit(port->devnum, dev_map);
+	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+}
+
+static struct class umad_class = {
+	.name    = "infiniband_mad",
+	.release = ib_umad_release_port
+};
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev;
+	int s, e, i;
+
+	if (device->node_type == IB_NODE_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	umad_dev = kmalloc(sizeof *umad_dev +
+			   (e - s + 1) * sizeof (struct ib_umad_port),
+			   GFP_KERNEL);
+	if (!umad_dev)
+		return;
+
+	memset(umad_dev, 0, sizeof *umad_dev +
+	       (e - s + 1) * sizeof (struct ib_umad_port));
+
+	kref_init(&umad_dev->ref);
+
+	umad_dev->start_port = s;
+	umad_dev->end_port   = e;
+
+	for (i = s; i <= e; ++i) {
+		umad_dev->port[i - s].umad_dev = umad_dev;
+		kref_get(&umad_dev->ref);
+
+		spin_lock(&map_lock);
+		umad_dev->port[i - s].devnum =
+			find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+		if (umad_dev->port[i - s].devnum >= IB_UMAD_MAX_PORTS) {
+			spin_unlock(&map_lock);
+			goto err;
+		}
+		set_bit(umad_dev->port[i - s].devnum, dev_map);
+		spin_unlock(&map_lock);
+
+		umad_dev->port[i - s].ib_dev   = device;
+		umad_dev->port[i - s].port_num = i;
+
+		cdev_init(&umad_dev->port[i - s].dev, &umad_fops);
+		umad_dev->port[i - s].dev.owner = THIS_MODULE;
+		kobject_set_name(&umad_dev->port[i - s].dev.kobj,
+				 "umad%d", umad_dev->port[i - s].devnum);
+		if (cdev_add(&umad_dev->port[i - s].dev, base_dev +
+			     umad_dev->port[i - s].devnum, 1))
+			goto err;
+
+		umad_dev->port[i - s].class_dev.class = &umad_class;
+		umad_dev->port[i - s].class_dev.dev   = device->dma_device;
+		snprintf(umad_dev->port[i - s].class_dev.class_id,
+			 BUS_ID_SIZE, "umad%d", umad_dev->port[i - s].devnum);
+		if (class_device_register(&umad_dev->port[i - s].class_dev))
+			goto err_class;
+
+		if (class_device_create_file(&umad_dev->port[i - s].class_dev,
+					     &class_device_attr_dev))
+			goto err_class;
+		if (class_device_create_file(&umad_dev->port[i - s].class_dev,
+					     &class_device_attr_ibdev))
+			goto err_class;
+		if (class_device_create_file(&umad_dev->port[i - s].class_dev,
+					     &class_device_attr_port))
+			goto err_class;
+	}
+
+	ib_set_client_data(device, &umad_client, umad_dev);
+
+	return;
+
+err_class:
+	cdev_del(&umad_dev->port[i - s].dev);
+	clear_bit(umad_dev->port[i - s].devnum, dev_map);
+
+err:
+	while (--i >= s)
+		class_device_unregister(&umad_dev->port[i - s].class_dev);
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	int i;
+
+	if (!umad_dev)
+		return;
+
+	for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
+		class_device_unregister(&umad_dev->port[i].class_dev);
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static int __init ib_umad_init(void)
+{
+	int ret;
+
+	spin_lock_init(&map_lock);
+
+	ret = alloc_chrdev_region(&base_dev, 0, IB_UMAD_MAX_PORTS,
+				  "infiniband_mad");
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't get device number\n");
+		goto out;
+	}
+
+	ret = class_register(&umad_class);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+		goto out_chrdev;
+	}
+
+	ret = class_create_file(&umad_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+	ret = ib_register_client(&umad_client);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+		goto out_class;
+	}
+		
+	return 0;
+
+out_class:
+	class_unregister(&umad_class);
+
+out_chrdev:
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS);
+
+out:
+	return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+	ib_unregister_client(&umad_client);
+	class_unregister(&umad_class);
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/drivers/infiniband/include/ib_user_mad.h	2004-12-13 09:44:50.261320976 -0800
@@ -0,0 +1,112 @@
+/*
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available at
+ * <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ * license, available in the LICENSE.TXT file accompanying this
+ * software.  These details are also available at
+ * <http://openib.org/license.html>.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION	2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad - MAD packet
+ * @data - Contents of MAD
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ *   received (transaction ID in data[] will be set to TID of original
+ *   request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ *
+ * All multi-byte quantities are stored in network (big endian) byte order.
+ */
+struct ib_user_mad {
+	__u8	data[256];
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	qpn;
+	__u32   qkey;
+	__u16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__u32	flow_label;
+};
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be received
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs; otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui - Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ */
+struct ib_user_mad_reg_req {
+	__u32	id;
+	__u32	method_mask[4];
+	__u8	qpn;
+	__u8	mgmt_class;
+	__u8	mgmt_class_version;
+	__u8    oui[3];
+};
+
+#define IB_IOCTL_MAGIC		0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT	_IOWR(IB_IOCTL_MAGIC, 1, \
+					      struct ib_user_mad_reg_req)
+
+#define IB_USER_MAD_UNREGISTER_AGENT	_IOW(IB_IOCTL_MAGIC, 2, __u32)
+
+#endif /* IB_USER_MAD_H */



* [PATCH][v3][19/21] Document InfiniBand ioctl use
  2004-12-13 18:09                         ` [PATCH][v3][18/21] Add InfiniBand userspace MAD support Roland Dreier
@ 2004-12-13 18:09                           ` Roland Dreier
  2004-12-13 18:10                             ` [PATCH][v3][20/21] Add InfiniBand Documentation files Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add the 0x1b ioctl magic number used by ib_umad module to
Documentation/ioctl-number.txt.

Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/Documentation/ioctl-number.txt	2004-12-11 15:16:34.000000000 -0800
+++ linux-bk/Documentation/ioctl-number.txt	2004-12-13 09:44:50.671260584 -0800
@@ -72,6 +72,7 @@
 0x09	all	linux/md.h
 0x12	all	linux/fs.h
 		linux/blkpg.h
+0x1b	all	InfiniBand Subsystem	<http://www.openib.org/>
 0x20	all	drivers/cdrom/cm206.h
 0x22	all	scsi/sg.h
 '#'	00-3F	IEEE 1394 Subsystem	Block for the entire subsystem



* [PATCH][v3][20/21] Add InfiniBand Documentation files
  2004-12-13 18:09                           ` [PATCH][v3][19/21] Document InfiniBand ioctl use Roland Dreier
@ 2004-12-13 18:10                             ` Roland Dreier
  2004-12-13 18:10                               ` [PATCH][v3][21/21] InfiniBand MAINTAINERS entry Roland Dreier
                                                 ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:10 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add files to Documentation/infiniband that describe the tree under
/sys/class/infiniband, the IPoIB driver and the userspace MAD access driver.

Signed-off-by: Roland Dreier <roland@topspin.com>


--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/Documentation/infiniband/ipoib.txt	2004-12-13 09:44:50.964217426 -0800
@@ -0,0 +1,56 @@
+IP OVER INFINIBAND
+
+  The ib_ipoib driver is an implementation of the IP over InfiniBand
+  protocol as specified by the latest Internet-Drafts issued by the
+  IETF ipoib working group.  It is a "native" implementation in the
+  sense of setting the interface type to ARPHRD_INFINIBAND and the
+  hardware address length to 20 (earlier proprietary implementations
+  masqueraded to the kernel as ethernet interfaces).
+
+Partitions and P_Keys
+
+  When the IPoIB driver is loaded, it creates one interface for each
+  port using the P_Key at index 0.  To create an interface with a
+  different P_Key, write the desired P_Key into the main interface's
+  /sys/class/net/<intf name>/create_child file.  For example:
+
+    echo 0x8001 > /sys/class/net/ib0/create_child
+
+  This will create an interface named ib0.8001 with P_Key 0x8001.  To
+  remove a subinterface, use the "delete_child" file:
+
+    echo 0x8001 > /sys/class/net/ib0/delete_child
+
+  The P_Key for any interface is given by the "pkey" file, and the
+  main interface for a subinterface is in "parent."
+
+Debugging Information
+
+  By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
+  to 'y', tracing messages are compiled into the driver.  They are
+  turned on by setting the module parameters debug_level and
+  mcast_debug_level to 1.  These parameters can be controlled at
+  runtime through files in /sys/module/ib_ipoib/.
+
+  CONFIG_INFINIBAND_IPOIB_DEBUG also enables the "ipoib_debugfs"
+  virtual filesystem.  By mounting this filesystem, for example with
+
+    mkdir -p /ipoib_debugfs
+    mount -t ipoib_debugfs none /ipoib_debugfs
+
+  it is possible to get statistics about multicast groups from the
+  files /ipoib_debugfs/ib0_mcg and so on.
+
+  The performance impact of this option is negligible, so it is safe
+  to leave it enabled with debug_level set to 0 for normal operation.
+
+  CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
+  the data path when data_debug_level is set to 1.  However, even with
+  the output disabled, enabling this configuration option will affect
+  performance, because it adds tests to the fast path.
+
+References
+
+  IETF IP over InfiniBand (ipoib) Working Group
+    http://ietf.org/html.charters/ipoib-charter.html
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/Documentation/infiniband/sysfs.txt	2004-12-13 09:44:51.008210945 -0800
@@ -0,0 +1,63 @@
+SYSFS FILES
+
+  For each InfiniBand device, the InfiniBand drivers create the
+  following files under /sys/class/infiniband/<device name>:
+
+    node_guid      - Node GUID
+    sys_image_guid - System image GUID
+
+  In addition, there is a "ports" subdirectory, with one subdirectory
+  for each port.  For example, if mthca0 is a 2-port HCA, there will
+  be two directories:
+
+    /sys/class/infiniband/mthca0/ports/1
+    /sys/class/infiniband/mthca0/ports/2
+
+  (A switch will only have a single "0" subdirectory for switch port
+  0; no subdirectory is created for normal switch ports)
+
+  In each port subdirectory, the following files are created:
+
+    cap_mask       - Port capability mask
+    lid            - Port LID
+    lid_mask_count - Port LID mask count
+    sm_lid         - Subnet manager LID for port's subnet
+    sm_sl          - Subnet manager SL for port's subnet
+    state          - Port state (DOWN, INIT, ARMED, ACTIVE or ACTIVE_DEFER)
+
+  There is also a "counters" subdirectory, with files
+
+    VL15_dropped
+    excessive_buffer_overrun_errors
+    link_downed
+    link_error_recovery
+    local_link_integrity_errors
+    port_rcv_constraint_errors
+    port_rcv_data
+    port_rcv_errors
+    port_rcv_packets
+    port_rcv_remote_physical_errors
+    port_rcv_switch_relay_errors
+    port_xmit_constraint_errors
+    port_xmit_data
+    port_xmit_discards
+    port_xmit_packets
+    symbol_error
+
+  Each of these files contains the corresponding value from the port's
+  Performance Management PortCounters attribute, as described in
+  section 16.1.3.5 of the InfiniBand Architecture Specification.
+
+  The "pkeys" and "gids" subdirectories contain one file for each
+  entry in the port's P_Key or GID table respectively.  For example,
+  ports/1/pkeys/10 contains the value at index 10 in port 1's P_Key
+  table.
+
+MTHCA
+
+  The Mellanox HCA driver also creates the files:
+
+    hw_rev   - Hardware revision number
+    fw_ver   - Firmware version
+    hca_type - HCA type: "MT23108", "MT25208 (MT23108 compat mode)",
+               or "MT25208"
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-bk/Documentation/infiniband/user_mad.txt	2004-12-13 09:44:51.441147165 -0800
@@ -0,0 +1,81 @@
+USERSPACE MAD ACCESS
+
+Device files
+
+  Each port of each InfiniBand device has a "umad" device attached.
+  For example, a two-port HCA will have two devices, while a switch
+  will have one device (for switch port 0).
+
+Creating MAD agents
+
+  A MAD agent can be created by filling in a struct ib_user_mad_reg_req
+  and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
+  descriptor for the appropriate device file.  If the registration
+  request succeeds, a 32-bit id will be returned in the structure.
+  For example:
+
+	struct ib_user_mad_reg_req req = { /* ... */ };
+	ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
+	if (!ret)
+		my_agent = req.id;
+	else
+		perror("agent register");
+
+  Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
+  ioctl.  Also, all agents registered through a file descriptor will
+  be unregistered when the descriptor is closed.
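+
+  For example (a minimal sketch, assuming my_agent holds the id
+  returned by the registration ioctl above):
+
+	__u32 agent_id = my_agent;
+
+	if (ioctl(fd, IB_USER_MAD_UNREGISTER_AGENT, &agent_id))
+		perror("agent unregister");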
+
+Receiving MADs
+
+  MADs are received using read().  The buffer passed to read() must be
+  large enough to hold at least one struct ib_user_mad.  For example:
+
+	struct ib_user_mad mad;
+	ret = read(fd, &mad, sizeof mad);
+	if (ret != sizeof mad)
+		perror("read");
+
+  In addition to the actual MAD contents, the other struct ib_user_mad
+  fields will be filled in with information on the received MAD.  For
+  example, the remote LID will be in mad.lid.
+
+  If a send times out, a receive will be generated with mad.status set
+  to ETIMEDOUT.  Otherwise when a MAD has been successfully received,
+  mad.status will be 0.
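+
+  For instance (an illustrative sketch; remember that multi-byte
+  fields such as lid are in network byte order):
+
+	if (mad.status == 0)
+		printf("MAD from LID 0x%04x\n", ntohs(mad.lid));
+	else if (mad.status == ETIMEDOUT)
+		printf("send timed out\n");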
+
+  poll()/select() may be used to wait until a MAD can be read.
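+
+  For example (a minimal sketch):
+
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
+		ret = read(fd, &mad, sizeof mad);
+		if (ret != sizeof mad)
+			perror("read");
+	}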
+
+Sending MADs
+
+  MADs are sent using write().  The agent ID for sending should be
+  filled into the id field of the MAD, the destination LID should be
+  filled into the lid field, and so on.  For example:
+
+	struct ib_user_mad mad;
+
+	/* fill in mad.data */
+
+	mad.id  = my_agent;	/* req.id from agent registration */
+	mad.lid = my_dest;	/* in network byte order... */
+	/* etc. */
+
+	ret = write(fd, &mad, sizeof mad);
+	if (ret != sizeof mad)
+		perror("write");
+
+/dev files
+
+  To create the appropriate character device files automatically with
+  udev, a rule like
+
+    KERNEL="umad*", NAME="infiniband/%k"
+
+  can be used.  This will create a device node named
+
+    /dev/infiniband/umad0
+
+  for the first port, and so on.  The InfiniBand device and port
+  associated with this device can be determined from the files
+
+    /sys/class/infiniband_mad/umad0/ibdev
+    /sys/class/infiniband_mad/umad0/port



* [PATCH][v3][21/21] InfiniBand MAINTAINERS entry
  2004-12-13 18:10                             ` [PATCH][v3][20/21] Add InfiniBand Documentation files Roland Dreier
@ 2004-12-13 18:10                               ` Roland Dreier
  2004-12-13 18:48                               ` [openib-general] [PATCH][v3][20/21] Add InfiniBand Documentation files Tom Duffy
  2004-12-14  2:42                               ` YOSHIFUJI Hideaki / 吉藤英明
  2 siblings, 0 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:10 UTC (permalink / raw)
  To: linux-kernel; +Cc: openib-general

Add OpenIB maintainers information to MAINTAINERS.

Signed-off-by: Roland Dreier <roland@topspin.com>


--- linux-bk.orig/MAINTAINERS	2004-12-11 15:16:16.000000000 -0800
+++ linux-bk/MAINTAINERS	2004-12-13 09:44:51.816091929 -0800
@@ -1081,6 +1081,17 @@
 L:	linux-fbdev-devel@lists.sourceforge.net
 S:	Maintained
 
+INFINIBAND SUBSYSTEM
+P:	Roland Dreier
+M:	roland@topspin.com
+P:	Sean Hefty
+M:	mshefty@ichips.intel.com
+P:	Hal Rosenstock
+M:	halr@voltaire.com
+L:	openib-general@openib.org
+W:	http://www.openib.org/
+S:	Supported
+
 INPUT (KEYBOARD, MOUSE, JOYSTICK) DRIVERS
 P:	Vojtech Pavlik
 M:	vojtech@suse.cz



* Re: [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver
  2004-12-13 18:09                       ` [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Roland Dreier
  2004-12-13 18:09                         ` [PATCH][v3][18/21] Add InfiniBand userspace MAD support Roland Dreier
@ 2004-12-13 18:44                         ` Tom Duffy
  2004-12-13 18:49                           ` Roland Dreier
  1 sibling, 1 reply; 29+ messages in thread
From: Tom Duffy @ 2004-12-13 18:44 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Linux Kernel Mailing List, netdev, openib-general

On Mon, 2004-12-13 at 10:09 -0800, Roland Dreier wrote:
> --- linux-bk.orig/drivers/infiniband/Kconfig	2004-12-13 09:44:43.936252779 -0800
> +++ linux-bk/drivers/infiniband/Kconfig	2004-12-13 09:44:49.385450009 -0800
> @@ -2,7 +2,6 @@
>  
>  config INFINIBAND
>  	tristate "InfiniBand support"
> -	default n
>  	---help---
>  	  Core support for InfiniBand (IB).  Make sure to also select
>  	  any protocols you wish to use as well as drivers for your

Is there a reason why you put this in in an earlier patch and then take
it out later?

-tduffy


* Re: [openib-general] [PATCH][v3][20/21] Add InfiniBand Documentation files
  2004-12-13 18:10                             ` [PATCH][v3][20/21] Add InfiniBand Documentation files Roland Dreier
  2004-12-13 18:10                               ` [PATCH][v3][21/21] InfiniBand MAINTAINERS entry Roland Dreier
@ 2004-12-13 18:48                               ` Tom Duffy
  2004-12-13 19:00                                 ` Roland Dreier
  2004-12-14  2:42                               ` YOSHIFUJI Hideaki / 吉藤英明
  2 siblings, 1 reply; 29+ messages in thread
From: Tom Duffy @ 2004-12-13 18:48 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Linux Kernel Mailing List, openib-general

On Mon, 2004-12-13 at 10:10 -0800, Roland Dreier wrote:
> Add files to Documentation/infiniband that describe the tree under
> /sys/class/infiniband, the IPoIB driver and the userspace MAD access driver.

Do you want to add in Hal's ipoib faq as well?

-tduffy


* Re: [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver
  2004-12-13 18:44                         ` [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Tom Duffy
@ 2004-12-13 18:49                           ` Roland Dreier
  2004-12-13 18:54                             ` Tom Duffy
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 18:49 UTC (permalink / raw)
  To: Tom Duffy; +Cc: Linux Kernel Mailing List, netdev, openib-general

    Tom> Is there a reason why you put this in in an earlier patch and
    Tom> then take it out later?

I guess the reasons are stupidity and bad patch scripts...

Doesn't hurt for now, will be fixed in future versions.

 - R.


* Re: [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver
  2004-12-13 18:49                           ` Roland Dreier
@ 2004-12-13 18:54                             ` Tom Duffy
  2004-12-13 19:00                               ` Roland Dreier
  0 siblings, 1 reply; 29+ messages in thread
From: Tom Duffy @ 2004-12-13 18:54 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Linux Kernel Mailing List, netdev, openib-general

On Mon, 2004-12-13 at 10:49 -0800, Roland Dreier wrote:
>     Tom> Is there a reason why you put this in in an earlier patch and
>     Tom> then take it out later?
> 
> I guess the reasons are stupidity and bad patch scripts...
> 
> Doesn't hurt for now, will be fixed in future versions.

Speaking of nits, there are also some formatting issues with the
Makefiles that change in the later patches...

But, the end result is you get the "correct" formatting if you apply all
the patches.

-tduffy


* Re: [openib-general] [PATCH][v3][20/21] Add InfiniBand Documentation files
  2004-12-13 18:48                               ` [openib-general] [PATCH][v3][20/21] Add InfiniBand Documentation files Tom Duffy
@ 2004-12-13 19:00                                 ` Roland Dreier
  0 siblings, 0 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 19:00 UTC (permalink / raw)
  To: Tom Duffy; +Cc: Linux Kernel Mailing List, openib-general

    Tom> Do you want to add in Hal's ipoib faq as well?

Good idea.  I'll fix up the formatting and add that to the next
version as well.

 - R.


* Re: [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver
  2004-12-13 18:54                             ` Tom Duffy
@ 2004-12-13 19:00                               ` Roland Dreier
  0 siblings, 0 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-13 19:00 UTC (permalink / raw)
  To: Tom Duffy; +Cc: Linux Kernel Mailing List, netdev, openib-general

    Tom> Speaking of nits, there are also some formatting issues with
    Tom> the Makefiles that changes in the later patches...

Thanks, I've fixed those intermediate versions as well.

 - R.


* Re: [PATCH][v3][16/21] IPoIB IPv6 support
  2004-12-13 18:09                     ` [PATCH][v3][16/21] IPoIB IPv6 support Roland Dreier
  2004-12-13 18:09                       ` [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Roland Dreier
@ 2004-12-14  2:30                       ` YOSHIFUJI Hideaki / 吉藤英明
  2004-12-14  2:36                         ` Roland Dreier
  1 sibling, 1 reply; 29+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-12-14  2:30 UTC (permalink / raw)
  To: roland; +Cc: linux-kernel, openib-general, netdev, yoshfuji

In article <20041213109.iziHvQZqtmP83gmx@topspin.com> (at Mon, 13 Dec 2004 10:09:46 -0800), Roland Dreier <roland@topspin.com> says:

>  }
> @@ -1794,6 +1801,7 @@
>  	if ((dev->type != ARPHRD_ETHER) && 
>  	    (dev->type != ARPHRD_FDDI) &&
>  	    (dev->type != ARPHRD_IEEE802_TR) &&
> +	    (dev->type != ARPHRD_INFINIBAND) &&
>  	    (dev->type != ARPHRD_ARCNET)) {
>  		/* Alas, we support only Ethernet autoconfiguration. */
>  		return;

Please put ARPHRD_INFINIBAND after ARPHRD_ARCNET like other places.

--yoshfuji


* Re: [PATCH][v3][16/21] IPoIB IPv6 support
  2004-12-14  2:30                       ` [PATCH][v3][16/21] IPoIB IPv6 support YOSHIFUJI Hideaki / 吉藤英明
@ 2004-12-14  2:36                         ` Roland Dreier
  0 siblings, 0 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-14  2:36 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki / 吉藤英明
  Cc: linux-kernel, openib-general, netdev

    YOSHIFUJI> Please put ARPHRD_INFINIBAND after ARPHRD_ARCNET like
    YOSHIFUJI> other places.

Thanks, I've made this update.

Thanks,
  Roland



* Re: [PATCH][v3][20/21] Add InfiniBand Documentation files
  2004-12-13 18:10                             ` [PATCH][v3][20/21] Add InfiniBand Documentation files Roland Dreier
  2004-12-13 18:10                               ` [PATCH][v3][21/21] InfiniBand MAINTAINERS entry Roland Dreier
  2004-12-13 18:48                               ` [openib-general] [PATCH][v3][20/21] Add InfiniBand Documentation files Tom Duffy
@ 2004-12-14  2:42                               ` YOSHIFUJI Hideaki / 吉藤英明
  2004-12-14  2:58                                 ` Roland Dreier
  2 siblings, 1 reply; 29+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-12-14  2:42 UTC (permalink / raw)
  To: roland; +Cc: linux-kernel, openib-general, yoshfuji

In article <200412131010.qyAMW5NxoiM4CntC@topspin.com> (at Mon, 13 Dec 2004 10:10:04 -0800), Roland Dreier <roland@topspin.com> says:

> Add files to Documentation/infiniband that describe the tree under
> /sys/class/infiniband, the IPoIB driver and the userspace MAD access driver.
> 
> Signed-off-by: Roland Dreier <roland@topspin.com>

LICENSE.TXT is missing. Please add one.

-- 
Hideaki YOSHIFUJI @ USAGI Project <yoshfuji@linux-ipv6.org>
GPG FP: 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA


* Re: [PATCH][v3][20/21] Add InfiniBand Documentation files
  2004-12-14  2:42                               ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-12-14  2:58                                 ` Roland Dreier
  2004-12-14  3:16                                   ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 1 reply; 29+ messages in thread
From: Roland Dreier @ 2004-12-14  2:58 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki / 吉藤英明
  Cc: linux-kernel, openib-general

    YOSHIFUJI> LICENSE.TXT is missing. Please add one.

I think it would be even better to rewrite the copyright header so it
doesn't refer to external files.  I will see if this is possible.

Thanks,
  Roland


* Re: [PATCH][v3][20/21] Add InfiniBand Documentation files
  2004-12-14  2:58                                 ` Roland Dreier
@ 2004-12-14  3:16                                   ` YOSHIFUJI Hideaki / 吉藤英明
  0 siblings, 0 replies; 29+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-12-14  3:16 UTC (permalink / raw)
  To: roland; +Cc: linux-kernel, openib-general, yoshfuji

In article <52u0qp37s1.fsf@topspin.com> (at Mon, 13 Dec 2004 18:58:38 -0800), Roland Dreier <roland@topspin.com> says:

>     YOSHIFUJI> LICENSE.TXT is missing. Please add one.
> 
> I think it would be even better to rewrite the copyright header so it
> doesn't refer to external files.  I will see if this is possible.

Yes, I agree.

--yoshfuji


* Re: [openib-general] [PATCH][v3][5/21] Add InfiniBand MAD (management datagram) support
       [not found] <20041213109.3tK6alRLJABxH4bu@topspin.com>
  2004-12-13 18:09 ` [PATCH][v3][6/21] Add InfiniBand SA (Subnet Administration) query support Roland Dreier
@ 2004-12-14 17:10 ` Tom Duffy
  2004-12-14 17:43   ` Roland Dreier
  1 sibling, 1 reply; 29+ messages in thread
From: Tom Duffy @ 2004-12-14 17:10 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Linux Kernel Mailing List, openib-general

On Mon, 2004-12-13 at 10:09 -0800, Roland Dreier wrote:
> Add support for handling InfiniBand MADs (management datagrams),
> including sending and receiving MADs as well as passing MADs on to
> local agents.
> 
> This is required for an SM (subnet manager) to discover and configure
> the host, since the SM's query MADs must be passed to the local SMA
> (subnet management agent).  In addition, this support is used by upper
> level protocols to send queries to and receive responses from the SM.

This one didn't make it to lkml.  I think it was over the size limit.

-tduffy



* Re: [openib-general] [PATCH][v3][5/21] Add InfiniBand MAD (management datagram) support
  2004-12-14 17:10 ` [openib-general] [PATCH][v3][5/21] Add InfiniBand MAD (management datagram) support Tom Duffy
@ 2004-12-14 17:43   ` Roland Dreier
  0 siblings, 0 replies; 29+ messages in thread
From: Roland Dreier @ 2004-12-14 17:43 UTC (permalink / raw)
  To: Tom Duffy; +Cc: Linux Kernel Mailing List, openib-general

    Tom> This one didn't make it to lkml.  I think it was over the
    Tom> size limit.

Ugh, looks like it was just over 100K.  I'll split it next time.

 - R.



Thread overview: 29+ messages
     [not found] <20041213109.3tK6alRLJABxH4bu@topspin.com>
2004-12-13 18:09 ` [PATCH][v3][6/21] Add InfiniBand SA (Subnet Administration) query support Roland Dreier
2004-12-13 18:09   ` [PATCH][v3][7/21] Add Mellanox HCA low-level driver Roland Dreier
2004-12-13 18:09     ` [PATCH][v3][8/21] Add Mellanox HCA low-level driver (midlayer interface) Roland Dreier
2004-12-13 18:09       ` [PATCH][v3][9/21] Add Mellanox HCA low-level driver (FW commands) Roland Dreier
2004-12-13 18:09         ` [PATCH][v3][10/21] Add Mellanox HCA low-level driver (EQ) Roland Dreier
2004-12-13 18:09           ` [PATCH][v3][11/21] Add Mellanox HCA low-level driver (initialization) Roland Dreier
2004-12-13 18:09             ` [PATCH][v3][12/21] Add Mellanox HCA low-level driver (QP/CQ) Roland Dreier
2004-12-13 18:09               ` [PATCH][v3][13/21] Add Mellanox HCA low-level driver (last bits) Roland Dreier
2004-12-13 18:09                 ` [PATCH][v3][14/21] Add Mellanox HCA low-level driver (MAD) Roland Dreier
2004-12-13 18:09                   ` [PATCH][v3][15/21] IPoIB IPv4 multicast Roland Dreier
2004-12-13 18:09                     ` [PATCH][v3][16/21] IPoIB IPv6 support Roland Dreier
2004-12-13 18:09                       ` [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Roland Dreier
2004-12-13 18:09                         ` [PATCH][v3][18/21] Add InfiniBand userspace MAD support Roland Dreier
2004-12-13 18:09                           ` [PATCH][v3][19/21] Document InfiniBand ioctl use Roland Dreier
2004-12-13 18:10                             ` [PATCH][v3][20/21] Add InfiniBand Documentation files Roland Dreier
2004-12-13 18:10                               ` [PATCH][v3][21/21] InfiniBand MAINTAINERS entry Roland Dreier
2004-12-13 18:48                               ` [openib-general] [PATCH][v3][20/21] Add InfiniBand Documentation files Tom Duffy
2004-12-13 19:00                                 ` Roland Dreier
2004-12-14  2:42                               ` YOSHIFUJI Hideaki / 吉藤英明
2004-12-14  2:58                                 ` Roland Dreier
2004-12-14  3:16                                   ` YOSHIFUJI Hideaki / 吉藤英明
2004-12-13 18:44                         ` [openib-general] [PATCH][v3][17/21] Add IPoIB (IP-over-InfiniBand) driver Tom Duffy
2004-12-13 18:49                           ` Roland Dreier
2004-12-13 18:54                             ` Tom Duffy
2004-12-13 19:00                               ` Roland Dreier
2004-12-14  2:30                       ` [PATCH][v3][16/21] IPoIB IPv6 support YOSHIFUJI Hideaki / 吉藤英明
2004-12-14  2:36                         ` Roland Dreier
2004-12-14 17:10 ` [openib-general] [PATCH][v3][5/21] Add InfiniBand MAD (management datagram) support Tom Duffy
2004-12-14 17:43   ` Roland Dreier
